In [53]:
import requests
import json
import pandas as pd
from math import *
import numpy as np
import tensorflow as tf
import time
import collections
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display
from random import randint
We can choose which database to train on. To test our neural network we created three databases: one on the theme "animals", another on "cars", and one with random videos. We want to see whether the results differ depending on the dataset. More databases can easily be created with the notebook "create_videos_database".
In [54]:
folder = os.path.join('sql_database_animaux')
#folder = os.path.join('sql_database_voitures')
#folder = os.path.join('sql_database_random')
In [55]:
videos_database = pd.read_sql('videos', 'sqlite:///' + os.path.join(folder, 'videos.sqlite'), index_col='index')
videos_database = videos_database.drop_duplicates('id')
videos_database = videos_database.reset_index(drop=True)
display(videos_database)
print("Length of the video database :",len(videos_database))
For our training data we use the Bag of Words and Term Frequency-Inverse Document Frequency (TF-IDF) methods. They are usually used in sentiment analysis, but they should also give interesting results in our case because we expect that certain words in a video title attract more viewers.
We also append the normalized subscriber count to each feature vector.
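As a quick illustration of these two steps, here is a toy example (the three titles and subscriber counts are invented, not taken from our database) showing what the TF-IDF vectors look like and how a normalized scalar can be appended as one extra feature column, using the same sklearn and numpy calls as the pipeline below:
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# toy example: three invented video titles with invented subscriber counts
toy_titles = ['funny cat compilation', 'cute cat video', 'fast car review']
toy_subs = np.array([1200, 300, 45000])

toy_vect = TfidfVectorizer()
toy_tfidf = toy_vect.fit_transform(toy_titles)
print(toy_vect.get_feature_names())   # vocabulary extracted from the titles
print(toy_tfidf.toarray())            # one row per title; words shared by several titles (e.g. 'cat') get a lower weight

# append the subscriber count, normalized by its maximum, as one extra feature column
toy_features = np.append(toy_tfidf.todense(), (toy_subs/toy_subs.max()).reshape(-1, 1), 1)
print(toy_features.shape)             # (3, number_of_words + 1)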
In [60]:
#maximal number of words to extract; it is also the maximal size of our vectors
#we played a little with this value and 2000 seems to give good results
nwords = 2000
#the stopwords are words such as "the" or "is" that appear everywhere and do not carry any information
#we don't want those words in our vocabulary
#we get them from the file "stopwords.txt" found on the internet
stopwords = [line.rstrip('\n') for line in open('stopwords.txt')]
#print('stopwords:',stopwords)
def compute_bag_of_words(text, nwords):
    vectorizer = CountVectorizer(max_features=nwords)
    vectors = vectorizer.fit_transform(text)
    vocabulary = vectorizer.get_feature_names()
    return vectors, vocabulary
#we gather all the titles in a list to extract the words from them
concatenated_titles = []
for titles in videos_database['title']:
    concatenated_titles += [' ', titles]
#create a vocabulary from the titles
title_bow, titles_vocab = compute_bag_of_words(concatenated_titles, nwords)
del concatenated_titles
titles_list = videos_database['title'].tolist()
#we apply the TF-IDF method to the titles
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word', stop_words=stopwords, vocabulary=titles_vocab)
vect.fit(titles_list)
#create a sparse TF-IDF matrix
titles_tfidf = vect.transform(titles_list)
del titles_list
train_data = titles_tfidf.todense()
print(train_data.shape)
def print_most_frequent(bow, vocab, n=100):
    #sort the words by their total count over all titles, then print the most frequent first
    idx = np.argsort(bow.sum(axis=0))
    for i in range(n):
        j = idx[0, -1-i]
        print(vocab[j], ': ', bow.sum(axis=0)[0, j])
print('most used words:')
print_most_frequent(title_bow,titles_vocab)
#print(len(title_vocab))
#print(train_data.shape)
In [61]:
#add the subscriber count to train_data
subsCountTemp = videos_database['subsCount'].tolist()
maxSubs = max(subsCountTemp)
print('maximal number of subscribers:', maxSubs)
#divide all the subscriber counts by the maximal number of subscribers
#so that the values are in the same range as the values produced by the tf-idf algorithm
subsCount = []
for x in subsCountTemp:
    subsCount.append(x/maxSubs)
del subsCountTemp
#add the subs to our train_data
subsCount = np.asarray(subsCount)
subsCount = np.reshape(subsCount, [len(subsCount),1]);
train_data = np.append(train_data, np.array(subsCount), 1)
del subsCount
print(train_data.shape)
In [62]:
nbr_labels = 8
nbr_video = len(videos_database['title'])
#one-hot labels: one class per order of magnitude of the view count
train_labels = np.zeros([train_data.shape[0],nbr_labels])
for i in range(nbr_video):
    views = int(videos_database['viewCount'][i])
    if views < 99:
        train_labels[i] = [1,0,0,0,0,0,0,0]
    elif views < 999:
        train_labels[i] = [0,1,0,0,0,0,0,0]
    elif views < 9999:
        train_labels[i] = [0,0,1,0,0,0,0,0]
    elif views < 99999:
        train_labels[i] = [0,0,0,1,0,0,0,0]
    elif views < 999999:
        train_labels[i] = [0,0,0,0,1,0,0,0]
    elif views < 9999999:
        train_labels[i] = [0,0,0,0,0,1,0,0]
    elif views < 99999999:
        train_labels[i] = [0,0,0,0,0,0,1,0]
    else:
        train_labels[i] = [0,0,0,0,0,0,0,1]
print('train_labels shape :', train_labels.shape)
In [63]:
testset = 100
test_data = np.zeros([testset,train_data.shape[1]])
test_labels = np.zeros([testset,nbr_labels])
#move "testset" randomly chosen videos from the training set to the test set
for i in range(len(test_data)):
    x = randint(0, len(train_data)-1)
    test_data[i] = train_data[x]
    test_labels[i] = train_labels[x]
    train_data = np.delete(train_data, x, axis=0)
    train_labels = np.delete(train_labels, x, axis=0)
print('train data shape  ', train_data.shape)
print('train labels shape', train_labels.shape)
print('test data shape   ', test_data.shape)
print('test labels shape ', test_labels.shape)
In [51]:
# Define computational graph (CG)
batch_size = testset # batch size
d = train_data.shape[1] # data dimensionality
nc = nbr_labels # number of classes
# CG inputs
xin = tf.placeholder(tf.float32,[batch_size,d]);
y_label = tf.placeholder(tf.float32,[batch_size,nc]);
# 1st Fully Connected layer
nfc1 = 300
Wfc1 = tf.Variable(tf.truncated_normal([d,nfc1], stddev=tf.sqrt(5./tf.to_float(d+nfc1)) ));
bfc1 = tf.Variable(tf.zeros([nfc1]));
y = tf.matmul(xin, Wfc1);
y += bfc1;
# ReLU activation
y = tf.nn.relu(y)
# dropout (the second argument is the keep probability, so only 25% of the activations are kept)
y = tf.nn.dropout(y, 0.25)
# 2nd layer
nfc2 = nc
#nfc2 = 100
Wfc2 = tf.Variable(tf.truncated_normal([nfc1,nfc2], stddev=tf.sqrt(5./tf.to_float(nfc1+nc)) ));
bfc2 = tf.Variable(tf.zeros([nfc2]));
y = tf.matmul(y, Wfc2);
y += bfc2;
#y = tf.nn.relu(y)
# 3rd layer
#nfc3 = nc
#Wfc3 = tf.Variable(tf.truncated_normal([nfc2,nfc3], stddev=tf.sqrt(5./tf.to_float(nfc1+nc)) ));
#bfc3 = tf.Variable(tf.zeros([nfc3]));
#y = tf.matmul(y, Wfc3);
#y += bfc3;
# Softmax
y = tf.nn.softmax(y);
# Loss
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(y), 1))
# L2 Regularization
reg_loss = tf.nn.l2_loss(Wfc1)
reg_loss += tf.nn.l2_loss(bfc1)
reg_loss += tf.nn.l2_loss(Wfc2)
reg_loss += tf.nn.l2_loss(bfc2)
reg_par = 4*1e-3
total_loss = cross_entropy + reg_par*reg_loss
# Optimization scheme
train_step = tf.train.AdamOptimizer(0.001).minimize(total_loss)
# Accuracy
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_label,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
In [52]:
# Run Computational Graph
n = train_data.shape[0]
indices = collections.deque()
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(10001):
    # Batch extraction
    if len(indices) < batch_size:
        indices.extend(np.random.permutation(n))
    idx = [indices.popleft() for i in range(batch_size)]
    batch_x, batch_y = train_data[idx,:], train_labels[idx]
    # Run CG for variable training
    _,acc_train,total_loss_o = sess.run([train_step,accuracy,total_loss], feed_dict={xin: batch_x, y_label: batch_y})
    # Run CG for test set every 100 iterations
    if not i%100:
        print('\nIteration i=',i,', train accuracy=',acc_train,', loss=',total_loss_o)
        acc_test = sess.run(accuracy, feed_dict={xin: test_data, y_label: test_labels})
        print('test accuracy=',acc_test)
random dataset:
"cars" dataset:
"animals" dataset:
We can see that we get better results when the videos all belong to a given theme. Unfortunately we could not use really big datasets because of the limited memory of the virtual machine. It is still a good result considering that this neural network does not take the thumbnail of the video into account!
Without L2 regularization and dropout we can push the training accuracy up to 0.95, but the test accuracy drops.
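For reference, here is a minimal sketch of the two changes to the computational graph above that produce this overfitting behaviour (everything else stays identical); it only reuses variables already defined in the graph:
In [ ]:
# disable dropout: a keep probability of 1.0 keeps every activation
y = tf.nn.dropout(y, 1.0)   # equivalent to simply removing the dropout line

# disable L2 regularization: with reg_par = 0 the total loss reduces to the cross-entropy
reg_par = 0.0
total_loss = cross_entropy + reg_par*reg_loss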