In this data-exploitation step, we look for a relationship between the thumbnail of a YouTube video and the number of views of this video. To implement this image-processing algorithm, we use a Convolutional Neural Network made of one convolutional layer, one ReLU activation and one fully connected layer: $$ y=\textrm{softmax}(\textrm{ReLU}(x\ast W_1+b_1)W_2+b_2) $$
I/ Preparation of the data for the CNN algorithm
I.1/ Get the images in the right dimension and format.
I.2/ Create the train data from the images
I.3/ Create the test data, images picked in the train data randomly
I.4/ Create the labels (equivalent to the classes) of the train data and of the test data
I.5/ Remove from the train data the images used to create the test data.
II/ Model 1 : random video
The videos are collected without a specific theme, using random keywords (see the file CREATE_VIDEO_DATABASE).
II.1/ Definition of the computational graph
II.2/ Run the computational graph
III/ Model 2 : video with a same thematic
The videos are collected around a specific theme (the main subject is "cars").
III.1/ Definition of the computational graph
III.2/ Run the computational graph
IV/ Exploitation of the results
In [3]:
import os
import requests
import json
import pandas as pd
from math import *
import tensorflow as tf
import time
import collections
import datetime
import numpy as np
import random
# Location of the scraped dataset.
#   model 1 : videos_data_CNN1/youtube_fame
#   model 2 : videos_data_CNN2/youtube_fame
folder = os.path.join('videos_data_CNN2', 'youtube_fame')

# Thumbnail table; its 'imag' column is decoded into images in section I.1.
# (original note -- model 1 : imag1000 -- model 2 : imag)
imag = pd.read_sql(
    'imag',
    'sqlite:///{}'.format(os.path.join(folder, 'imag.sqlite')),
)

# Video table; its 'viewCount' column is used to build the labels in I.4.
data_video = pd.read_sql(
    'videos',
    'sqlite:///{}'.format(os.path.join(folder, 'videos.sqlite')),
)

# Number of rows read from the thumbnail table.
print(len(imag))
I/Preparation of the data for the CNN algorithm
In [4]:
# I.1/ Decode the stored thumbnails into float32 arrays of shape (90, 120, 3).
# A row whose buffer does not hold exactly 90*120*3 values is not reshaped;
# its row index is recorded in ind_wrong_size instead.
expected_shape = (90, 120, 3)
expected_len = expected_shape[0] * expected_shape[1] * expected_shape[2]  # 32400

Images = []
ind_wrong_size = []
thumbnails = imag['imag']
print(len(thumbnails))

# Row 0 is deliberately skipped, exactly as in the original loop
# (NOTE(review): the reason is not visible in this notebook -- confirm).
for i in range(1, len(thumbnails)):
    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    pixels = np.frombuffer(thumbnails[i], dtype=np.float32)
    if pixels.size == expected_len:
        # frombuffer yields a read-only view on the stored bytes; copy so
        # each image owns writable memory, as it did with np.fromstring.
        Images.append(pixels.reshape(expected_shape).copy())
    else:
        ind_wrong_size.append(i)

# Sanity probes: print the shape of two images when the dataset is large
# enough (the unguarded version raised IndexError on small datasets).
for probe in (10, 100):
    if probe < len(Images):
        print(Images[probe].shape)
print(len(ind_wrong_size))
In [5]:
# I.2/ Build the training matrix: one centred, unit-norm grey-level row
# per thumbnail.
nbr_video = len(Images)
test_size = 50
height_video = Images[0].shape[0]
width_video = Images[0].shape[1]
size_video = Images[0].shape[2]  # length of the last image axis (colour channels)

# creation of the train data:
# all images stacked in one float64 array (nbr_video, height, width, channels)
train_data_orig = np.asarray(Images, dtype=np.float64)
print('train_data_orig shape:', train_data_orig.shape)

# Collapse the channel axis with a Euclidean norm (one intensity per pixel)
# and flatten every image, row-major, into a vector of height*width values.
train_data = np.linalg.norm(train_data_orig, axis=3).reshape(
    nbr_video, height_video * width_video)
# Centre each image on its own mean ...
train_data -= train_data.mean(axis=1, keepdims=True)
# ... and scale it to unit Euclidean norm.
row_norms = np.linalg.norm(train_data, axis=1)[:, np.newaxis]
# A constant-colour thumbnail is all zeros after centring; dividing by its
# zero norm would fill the row with NaN, so such rows are left at zero.
row_norms[row_norms == 0] = 1.0
train_data /= row_norms
print('train_data shape:', train_data.shape)
In [6]:
# I.3/ creation of the test data: test_size distinct images drawn at random
# among the images used for the train data.
# random.sample draws without replacement and raises ValueError when
# test_size > nbr_video, where the manual rejection loop never terminated.
indices = np.sort(random.sample(range(nbr_video), test_size))
print('Indices = ', indices)

# Original images of the test set, in increasing index order.
test_data_orig = np.zeros([test_size, height_video, width_video, size_video])
for row, source in enumerate(indices):
    test_data_orig[row] = Images[source]
print('test_data_orig shape:', test_data_orig.shape)

# Same preprocessing as the train data: channel norm -> flatten -> centre
# -> unit Euclidean norm.
test_data = np.linalg.norm(test_data_orig, axis=3).reshape(
    test_size, height_video * width_video)
test_data -= test_data.mean(axis=1, keepdims=True)
row_norms = np.linalg.norm(test_data, axis=1)[:, np.newaxis]
# A constant-colour thumbnail has zero norm after centring; keep its row
# at zero instead of dividing by zero (which would produce NaN).
row_norms[row_norms == 0] = 1.0
test_data /= row_norms
print('test_data shape:', test_data.shape)
In [7]:
# I.4/ One-hot labels: the class of a video is the order of magnitude of its
# view count.
#   class 0: < 10**3, class 1: < 10**4, ..., class 5: < 10**8, class 6: the rest
nbr_labels = 7
# Exclusive upper bounds of classes 0..nbr_labels-2. The original thresholds
# (999, 9999, ...) placed e.g. a video with exactly 999 views in the
# "thousands" class; powers of ten give the intended decade bins.
view_edges = [10 ** power for power in range(3, 3 + nbr_labels - 1)]

# creation of the train labels
# The original guard `i != ind_wrong_size` compared an int with a list and
# was therefore always true; it is dropped without changing which rows are
# labelled.
# NOTE(review): Images (I.1) skips row 0 and every wrong-size row of `imag`,
# yet view counts are read here from data_video rows 0..nbr_video-1. Image k
# and label k may therefore come from different videos -- confirm how the
# two tables are aligned before trusting the labels.
view_counts = [int(data_video['viewCount'][i]) for i in range(nbr_video)]

# Extreme view counts, with the same starting values as the original scan.
max_view = max([0] + view_counts)
min_view = min([99999999] + view_counts)

# np.digitize returns, for each count, how many edges are <= that count,
# which is exactly the class index.
view_classes = np.digitize(view_counts, view_edges)
train_labels = np.zeros([nbr_video, nbr_labels])
train_labels[np.arange(nbr_video), view_classes] = 1

# Creation of the test labels: copy the rows of the images picked in I.3.
test_labels = train_labels[np.asarray(indices, dtype=int)]
In [8]:
# I.5/ Remove from the train data and labels the videos used for the test.
# np.delete takes all row indices at once; deleting them one by one copied
# both arrays once per test image.
held_out_rows = np.asarray(indices, dtype=int)
train_data = np.delete(train_data, held_out_rows, axis=0)
train_labels = np.delete(train_labels, held_out_rows, axis=0)
print(train_data.shape)
print(train_labels.shape)
II/ Model 1
random database
In [109]:
# II.1/ Computational graph of model 1:
#   y = softmax(ReLU(x * Wcl + b1) W2 + b2)
tf.reset_default_graph()

# Define computational graph (CG)
# The placeholders have a fixed first dimension and the whole test set is fed
# in a single run in II.2, so the batch size has to equal the test-set size.
# test_size is used instead of len(indices) because `indices` is a mutable
# notebook variable that may no longer hold the test split.
batch_size = test_size   # batch size
d = train_data.shape[1]  # data dimensionality (height_video * width_video)
nc = nbr_labels          # number of classes

# CG inputs
xin = tf.placeholder(tf.float32, [batch_size, d])
print('xin=', xin, xin.get_shape())
y_label = tf.placeholder(tf.float32, [batch_size, nc])
print('y_label=', y_label, y_label.get_shape())

# Convolutional layer
K = 5  # size of the patch
F = 8  # number of filters
Wcl = tf.get_variable("Wcl", shape=[K, K, 1, F],
                      initializer=tf.contrib.layers.xavier_initializer())
print('Wcl=', Wcl.get_shape())
# Each input row was flattened row-major from a (height_video, width_video)
# image in I.2, so it must be unflattened in that same order; the former
# [-1, 120, 90, 1] shape re-wrapped the 120-pixel rows into 90-pixel rows.
x_2d = tf.reshape(xin, [-1, height_video, width_video, 1])
print('x_2d=', x_2d.get_shape())
# One bias per filter: a bias of size nc cannot broadcast against the F
# feature maps when F != nc.
b1 = tf.Variable(tf.zeros([F]))
x = tf.nn.conv2d(x_2d, Wcl, strides=[1, 1, 1, 1], padding='SAME')
print('x=', x.get_shape())
x += b1

# ReLU activation
x = tf.nn.relu(x)
print('x', x.get_shape())

# Fully Connected layer
# 'SAME' padding with stride 1 keeps the spatial size, so every image yields
# d values per filter.
nfc = d * F
x = tf.reshape(x, [batch_size, nfc])
print('x', x.get_shape())
W2 = tf.get_variable("W2", shape=[nfc, nc],
                     initializer=tf.contrib.layers.xavier_initializer())
print('W2=', W2.get_shape())
b2 = tf.Variable(tf.zeros([nc]))
print('b2', b2.get_shape())
y = tf.matmul(x, W2)
print('y', y.get_shape())
y += b2

# Softmax
y = tf.nn.softmax(y)
print('y', y.get_shape())

# Loss: mean cross-entropy; probabilities are clipped at 1e-15 to avoid log(0)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(tf.maximum(y, 1e-15)), 1))
total_loss = cross_entropy

# Optimization scheme
train_step = tf.train.GradientDescentOptimizer(0.3).minimize(total_loss)

# Accuracy: fraction of samples whose most probable class is the labelled one
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_label, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
In [110]:
# II.2/ Train model 1 and report train/test accuracy every 1000 iterations.
today = datetime.datetime.now()
print(today)

# Run Computational Graph
n = train_data.shape[0]
if n == 0:
    raise ValueError('train_data is empty: nothing to train on')

# Queue of shuffled training-row indices. It has its own name so that it
# does not overwrite `indices`, the test-split indices built in I.3.
batch_queue = collections.deque()

# Prefer the non-deprecated initializer when this TensorFlow version has it.
make_init = getattr(tf, 'global_variables_initializer', None) or tf.initialize_all_variables
init = make_init()
sess = tf.Session()  # left open: later cells may keep using `sess`
sess.run(init)

for i in range(10001):
    # Batch extraction: refill with fresh permutations until a full batch is
    # available (a single refill is not enough when n < batch_size).
    while len(batch_queue) < batch_size:
        batch_queue.extend(np.random.permutation(n))
    idx = [batch_queue.popleft() for _ in range(batch_size)]
    batch_x, batch_y = train_data[idx, :], train_labels[idx]

    # Run CG for variable training
    _, acc_train, total_loss_o = sess.run(
        [train_step, accuracy, total_loss],
        feed_dict={xin: batch_x, y_label: batch_y})

    # Run CG for test set
    if not i % 1000:
        print('\nIteration i=', i, ', train accuracy=', acc_train, ', loss=', total_loss_o)
        acc_test = sess.run(accuracy, feed_dict={xin: test_data, y_label: test_labels})
        print('test accuracy=', acc_test)

# NOTE(review): the export lost the cell's indentation; the elapsed time is
# assumed to be reported once, after the training loop -- confirm.
today2 = datetime.datetime.now()
print('time=', today2, 'delta=', today2 - today)
III/ Model 2
Test with a specific theme: the videos are chosen around the subject "cars".
In [9]:
# III.1/ Computational graph of model 2 (same architecture as model 1, with
# F = 7 filters):
#   y = softmax(ReLU(x * Wcl + b1) W2 + b2)
tf.reset_default_graph()

# Define computational graph (CG)
# The placeholders have a fixed first dimension and the whole test set is fed
# in a single run in III.2, so the batch size has to equal the test-set size.
# test_size is used instead of len(indices) because `indices` is rebound to a
# batch queue by the training cells and may no longer hold the test split.
batch_size = test_size   # batch size
d = train_data.shape[1]  # data dimensionality (height_video * width_video)
nc = nbr_labels          # number of classes

# CG inputs
xin = tf.placeholder(tf.float32, [batch_size, d])
print('xin=', xin, xin.get_shape())
y_label = tf.placeholder(tf.float32, [batch_size, nc])
print('y_label=', y_label, y_label.get_shape())

# Convolutional layer
K = 5  # size of the patch
F = 7  # number of filters
Wcl = tf.get_variable("Wcl", shape=[K, K, 1, F],
                      initializer=tf.contrib.layers.xavier_initializer())
print('Wcl=', Wcl.get_shape())
# Each input row was flattened row-major from a (height_video, width_video)
# image in I.2, so it must be unflattened in that same order; the former
# [-1, 120, 90, 1] shape re-wrapped the 120-pixel rows into 90-pixel rows.
x_2d = tf.reshape(xin, [-1, height_video, width_video, 1])
print('x_2d=', x_2d.get_shape())
# One bias per filter. The former size nc only worked here because F happens
# to equal nc.
b1 = tf.Variable(tf.zeros([F]))
x = tf.nn.conv2d(x_2d, Wcl, strides=[1, 1, 1, 1], padding='SAME')
print('x=', x.get_shape())
x += b1

# ReLU activation
x = tf.nn.relu(x)
print('x', x.get_shape())

# Fully Connected layer
# 'SAME' padding with stride 1 keeps the spatial size, so every image yields
# d values per filter.
nfc = d * F
x = tf.reshape(x, [batch_size, nfc])
print('x', x.get_shape())
W2 = tf.get_variable("W2", shape=[nfc, nc],
                     initializer=tf.contrib.layers.xavier_initializer())
print('W2=', W2.get_shape())
b2 = tf.Variable(tf.zeros([nc]))
print('b2', b2.get_shape())
y = tf.matmul(x, W2)
print('y', y.get_shape())
y += b2

# Softmax
y = tf.nn.softmax(y)
print('y', y.get_shape())

# Loss: mean cross-entropy; probabilities are clipped at 1e-15 to avoid log(0)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(tf.maximum(y, 1e-15)), 1))
total_loss = cross_entropy

# Optimization scheme
train_step = tf.train.GradientDescentOptimizer(0.3).minimize(total_loss)

# Accuracy: fraction of samples whose most probable class is the labelled one
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_label, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
In [10]:
# III.2/ Train model 2 and report train/test accuracy every 1000 iterations.
today = datetime.datetime.now()
print(today)

# Run Computational Graph
n = train_data.shape[0]
if n == 0:
    raise ValueError('train_data is empty: nothing to train on')

# Queue of shuffled training-row indices. It has its own name so that it
# does not overwrite `indices`, the test-split indices built in I.3.
batch_queue = collections.deque()

# Prefer the non-deprecated initializer when this TensorFlow version has it.
make_init = getattr(tf, 'global_variables_initializer', None) or tf.initialize_all_variables
init = make_init()
sess = tf.Session()  # left open: later cells may keep using `sess`
sess.run(init)

for i in range(10001):
    # Batch extraction: refill with fresh permutations until a full batch is
    # available (a single refill is not enough when n < batch_size).
    while len(batch_queue) < batch_size:
        batch_queue.extend(np.random.permutation(n))
    idx = [batch_queue.popleft() for _ in range(batch_size)]
    batch_x, batch_y = train_data[idx, :], train_labels[idx]

    # Run CG for variable training
    _, acc_train, total_loss_o = sess.run(
        [train_step, accuracy, total_loss],
        feed_dict={xin: batch_x, y_label: batch_y})

    # Run CG for test set
    if not i % 1000:
        print('\nIteration i=', i, ', train accuracy=', acc_train, ', loss=', total_loss_o)
        acc_test = sess.run(accuracy, feed_dict={xin: test_data, y_label: test_labels})
        print('test accuracy=', acc_test)

# NOTE(review): the export lost the cell's indentation; the elapsed time is
# assumed to be reported once, after the training loop -- confirm.
today2 = datetime.datetime.now()
print('time=', today2, 'delta=', today2 - today)
IV/ Exploitation of the results
We can see that the train and test accuracies of the two models stay low: 0.18 for the train and 0.31 for the test for model 1, and 0.34 for the train and 0.38 for the test for model 2. This can be explained by the fact that the thumbnail of a YouTube video is essentially arbitrary and depends on the choice of the YouTuber. However, we can see that for model 2, where the videos were chosen around a theme ("cars"), both the train and the test accuracy are higher than for the random model 1.
The low train accuracy can be explained by the fact that the dataset is too small (around 1000 images for model 1, around 600 for model 2) or that the CNN is not deep enough. We can also observe that the train accuracy fluctuates a lot and does not stabilize during the session (the number of iterations could be increased), whereas the test accuracy does not change after 1000 iterations.
One way to include the number of subscribers of a channel in the CNN would be to normalize the number of views by the number of subscribers, in order to increase the train accuracy.
In [ ]: