Attempt to process the overviews of the films:
https://pandas.pydata.org/pandas-docs/version/0.21/generated/pandas.DataFrame.replace.html
In [1]:
%matplotlib inline
import configparser
import os
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
from imdb import IMDb
import tmdbsimple as tmdb
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import OrderedDict
from pygsp import graphs, filters, plotting
plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'
In [2]:
df = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')
In [3]:
df.head()
Out[3]:
In [4]:
tokens = word_tokenize(df['overview'][34])
In [5]:
tokens
Out[5]:
We can also convert all characters to lower case:
In [6]:
tokens[0].lower()
Out[6]:
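We could lowercase every token in one pass as well; a minimal sketch of the idea (not applied to the full dataset here):
lower_tokens = [token.lower() for token in tokens]
lower_tokens[:10]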
In [7]:
print('Initial number of tokens is {}'.format(len(tokens)))
In [8]:
clean_tokens = tokens[:]
sr = stopwords.words('english')
unwanted = "!.,--1234567890<>A?():"
sr.extend(unwanted)
sr.extend(['The', "'s"])
for token in tokens:
    if token in sr:
        clean_tokens.remove(token)
In [9]:
print(sr)
In [10]:
clean_tokens
Out[10]:
In [11]:
print('Number of tokens after cleaning is {}'.format(len(clean_tokens)))
In [12]:
freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))
In [13]:
freq.plot(20, cumulative=False)
In [14]:
freq.items()
Out[14]:
In [15]:
freq.keys()
Out[15]:
In [16]:
val = list(freq.values())
val
Out[16]:
In [17]:
#for key,val in freq.items():
# print('key is {} and val is {}'.format(key, val))
In [18]:
overviewDic = {}
sr = stopwords.words('english')
unwanted = "!.,--1234567890<>A?()\'':"
sr.extend(unwanted)
sr.extend(['The', "'s", 'But', 'When', 'In', 'As', 'After', '-', '"', "''", '...', '--', '’', '–', '``', "n't", 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'])
sr.extend(['get', 'set', 'go', 'based', 'meets', 'around', 'named', 'comes', 'even', 'come', 'decides', 'however', 'goes', 'also', 'behind', 'turns', 'must', 'finds', 'put', 'sets', 'takes'])
for i in range(0, len(df)):
    # avoid repetitions in the list
    tokens = list(set(word_tokenize(df['overview'][i].lower())))
    clean_tokens = tokens[:]
    for token in tokens:
        if token in sr:
            clean_tokens.remove(token)
    #for token in tokens:
    #    if token in stopwords.words('english'):
    #        clean_tokens.remove(token)
    #    elif token in unwanted:
    #        clean_tokens.remove(token)
    overviewDic.setdefault(i, [])
    overviewDic[i] = clean_tokens
Example with the first film (Finding Nemo):
In [19]:
overviewDic[0]
Out[19]:
In [20]:
ls = []
for i in range(0, len(overviewDic)):
    ls.extend(overviewDic[i])
In [21]:
len(ls)
Out[21]:
In [22]:
CleanedWords = pd.DataFrame(ls)
CleanedWords.head()
Out[22]:
In [23]:
CleanedWords.to_csv('Saved_Datasets/MostCommonWords.csv', index=False)
In [24]:
freq = nltk.FreqDist(ls)
In [25]:
freq.plot(30, cumulative=False)
In [26]:
freq.most_common(50)
Out[26]:
In [27]:
mostCommon = freq.most_common()
mostCommon[0][0]
Out[27]:
In [28]:
NB_WORDS = 100
words = []
for i in range(0, NB_WORDS):
    words.append(mostCommon[i][0])
List of the most common words; below we relate them to the movies that were successful.
In [29]:
print(words)
In [30]:
wordSuccess = np.zeros(len(mostCommon), dtype=float) #words
for i in range(0, len(df)):
    SuccessTokens = list(set(word_tokenize(df['overview'][i].lower())))
    for j in range(0, len(mostCommon)): #words
        if mostCommon[j][0] in SuccessTokens:
            if df['success'][i] == 1:
                wordSuccess[j] += (1/mostCommon[j][1])*100
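Since every overview was tokenized through set() (each movie contributes a word at most once), mostCommon[j][1] is essentially the number of movies whose cleaned overview contains word j. The score accumulated above can therefore be read, approximately, as
$$\text{wordSuccess}_j \approx 100 \cdot \frac{\#\{\text{successful movies whose overview contains word } j\}}{\#\{\text{movies whose overview contains word } j\}}$$
i.e. the percentage of the movies mentioning the word that were successful.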
In [31]:
print(words[1])
print(wordSuccess[1])
In [32]:
plt.bar(words[:15], wordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');
plt.savefig('images/TopSuccessWords.png', dpi =300, bbox_inches='tight')
In [33]:
NEIGHBORS = 100
#sort the order of the weights
sort_order = np.argsort(-wordSuccess, axis = 0)
topWord = []
topWordSuccess = np.zeros(NEIGHBORS)
for i in range(0, NEIGHBORS):
    topWord.append(mostCommon[sort_order[i]][0])
    topWordSuccess[i] = wordSuccess[sort_order[i]]
In [34]:
print(topWord)
print(topWordSuccess)
In [35]:
plt.bar(topWord[:15], topWordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');
These words all have a 100% success rate, but they don't appear very often; see the graph below.
In [36]:
nbCounts = []
for i in range(0, len(topWord)):
    nbCounts.append(mostCommon[sort_order[i]][1])
In [37]:
plt.bar(topWord[:15], nbCounts[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Counts');
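One way to avoid rewarding these rare words is to keep only the words that occur often enough before ranking them. A minimal sketch of that idea (the threshold of 100 mirrors the one applied in the next cell):
frequent = [(word, count) for (word, count) in mostCommon if count >= 100]
len(frequent)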
In [38]:
wordSuccess = np.zeros(len(mostCommon), dtype=float)
for i in range(0, len(df)):
    SuccessTokens = list(set(word_tokenize(df['overview'][i].lower())))
    for j in range(0, len(mostCommon)): #words
        if mostCommon[j][0] in SuccessTokens:
            if mostCommon[j][1] >= 100:
                if df['success'][i] == 1:
                    wordSuccess[j] += (1/mostCommon[j][1])*100
In [39]:
NEIGHBORS = 100
#sort the order of the weights
sort_order = np.argsort(-wordSuccess, axis = 0)
topWord = []
topWordSuccess = np.zeros(NEIGHBORS)
for i in range(0, NEIGHBORS):
    topWord.append(mostCommon[sort_order[i]][0])
    topWordSuccess[i] = wordSuccess[sort_order[i]]
In [40]:
plt.bar(topWord[:15], topWordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');
In [41]:
print(topWord)
In [42]:
last = nltk.FreqDist(dict(freq.most_common()[-30:]))
last.plot()
In [43]:
last
Out[43]:
In [44]:
NEIGHBORS = 100
#sort the order of the weights
sort_order = np.argsort(wordSuccess, axis = 0)
lowWord = []
lowWordSuccess = np.zeros(NEIGHBORS)
for i in range(0, NEIGHBORS):
    lowWord.append(mostCommon[sort_order[i]][0])
    lowWordSuccess[i] = wordSuccess[sort_order[i]]
In [45]:
print(lowWord)
In [46]:
plt.bar(lowWord[:15], lowWordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');
In [47]:
TextW = np.zeros(shape=(len(df), len(df)), dtype=int)
for i in range(0, len(df)):
    for j in range(i, len(df)):
        counts = 0
        for k in range(0, len(overviewDic[j])):
            if overviewDic[j][k] in overviewDic[i]:
                counts = counts + 1
        TextW[i][j] = counts
In [48]:
plt.spy(TextW)
Out[48]:
In [49]:
#ensure the matrix is symmetric
bigger = TextW.transpose() > TextW
TextW = TextW - TextW*bigger + TextW.transpose()*bigger
#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(TextW, 0)
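Since taking the larger of W[i][j] and W[j][i] is just an element-wise maximum, the same symmetrization can also be written more compactly (an equivalent sketch, not the form used in the rest of this notebook):
TextW_sym = np.maximum(TextW, TextW.transpose())
np.fill_diagonal(TextW_sym, 0)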
In [50]:
plt.spy(TextW)
Out[50]:
In [51]:
print('The mean value: {}'.format(TextW.mean()))
print('The max value: {}'.format(TextW.max()))
print('The min value: {}'.format(TextW.min()))
$\textbf{Reminder:}$ These are the most common words from the dataset
In [52]:
print(words)
In [53]:
commonWordDic = {}
for i in range(0, len(df)):
    commonWordDic.setdefault(i, [])
    for k in range(0, len(overviewDic[i])):
        if overviewDic[i][k] in words:
            commonWordDic[i].append(overviewDic[i][k])
In [54]:
TextW = np.zeros(shape=(len(df), len(df)), dtype=float)
WmaxNorm = np.zeros(shape=(len(df), len(df)), dtype=float)
maxLen = 0
for i in range(0, len(df)):
    for j in range(i, len(df)):
        maxLen = max([len(commonWordDic[i]), len(commonWordDic[j])])
        if maxLen != 0:
            #compute the similarity
            for k in range(0, len(commonWordDic[i])):
                if commonWordDic[i][k] in commonWordDic[j]:
                    TextW[i][j] += 1
            #normalization by division by the maximum length
            WmaxNorm[i][j] = TextW[i][j]/maxLen
        else:
            #assign a similarity of 1 since neither film has one of the most frequent plot words
            TextW[i][j] = 1
            WmaxNorm[i][j] = 1
In [55]:
#ensure the matrix is symmetric
bigger = WmaxNorm.transpose() > WmaxNorm
WmaxNorm = WmaxNorm - WmaxNorm*bigger + WmaxNorm.transpose()*bigger
#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(WmaxNorm, 0)
Do the same for TextW:
In [56]:
#ensure the matrix is symmetric
bigger = TextW.transpose() > TextW
TextW = TextW - TextW*bigger + TextW.transpose()*bigger
#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(TextW, 0)
In [57]:
plt.spy(WmaxNorm)
Out[57]:
In [58]:
plt.hist(WmaxNorm.reshape(-1), bins=50);
In [59]:
print('The mean value is: {}'.format(WmaxNorm.mean()))
print('The max value is: {}'.format(WmaxNorm.max()))
print('The min value is: {}'.format(WmaxNorm.min()))
Plot the degree distribution
In [60]:
degrees = np.zeros(len(WmaxNorm))
#reminder: the degree of a node in a weighted graph is the sum of its edge weights
for i in range(0, len(WmaxNorm)):
    degrees[i] = sum(WmaxNorm[i])
plt.hist(degrees, bins=50);
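Equivalently, the degree vector can be computed in a single vectorized call; a quick sketch that should give the same values:
degrees_vec = WmaxNorm.sum(axis=1)
print(np.allclose(degrees, degrees_vec))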
In [61]:
print('The mean value is: {}'.format(degrees.mean()))
print('The max value is: {}'.format(degrees.max()))
print('The min value is: {}'.format(degrees.min()))
In [122]:
np.percentile(TextW, 75)
Out[122]:
In [99]:
plt.hist(TextW.reshape(-1), bins=50);
In [102]:
print('The mean value is: {}'.format(TextW.mean()))
print('The max value is: {}'.format(TextW.max()))
print('The min value is: {}'.format(TextW.min()))
In [104]:
#number of entries of TextW equal to 8
np.sum(TextW == 8)
Out[104]:
Normalization:
$$W_{ij} = \begin{cases} 0, & \text{number of shared common words between } i \text{ and } j = 0 \\ 1, & \text{number of shared common words between } i \text{ and } j \geq 1 \end{cases}$$
In [142]:
WNormPerc = np.zeros(shape=(len(df), len(df)), dtype=float)
for i in range(0, len(df)):
    for j in range(i, len(df)):
        if TextW[i][j] >= 1:
            WNormPerc[i][j] = 1
        #else:
        #    WNormPerc[i][j] = WNormPerc[i][j]/2
In [143]:
#ensure the matrix is symmetric
bigger = WNormPerc.transpose() > WNormPerc
WNormPerc = WNormPerc - WNormPerc*bigger + WNormPerc.transpose()*bigger
#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(WNormPerc, 0)
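The same binary adjacency matrix can also be obtained directly from the shared-word counts; a vectorized sketch (assuming TextW still holds the symmetrized counts computed above):
WNormPerc_vec = (TextW >= 1).astype(float)
np.fill_diagonal(WNormPerc_vec, 0)
print(np.allclose(WNormPerc, WNormPerc_vec))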
In [144]:
plt.spy(WNormPerc)
Out[144]:
In [145]:
plt.hist(WNormPerc.reshape(-1), bins=50);
In [70]:
NormW = pd.DataFrame(WmaxNorm)
NormW.head()
Out[70]:
In [71]:
NormW.to_csv('Saved_Datasets/NormalizedTextW.csv', index=False)
With pygsp
In [127]:
G = graphs.Graph(WmaxNorm)
G.compute_laplacian('normalized')
Manually (with numpy/scipy)
In [128]:
#reminder: L = D - W for weighted graphs
laplacian = np.diag(degrees) - WmaxNorm
#computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(WmaxNorm, normed = True)
plt.spy(laplacian_norm);
In [129]:
laplacian_norm = sparse.csr_matrix(laplacian_norm)
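As a sanity check, the two normalized Laplacians can be compared directly; both pygsp's 'normalized' Laplacian and scipy's normed Laplacian are the symmetric normalized Laplacian, so they should agree up to numerical precision (and possibly the handling of isolated nodes):
print(np.allclose(G.L.toarray(), laplacian_norm.toarray()))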
With pygsp
In [130]:
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);
In [131]:
print('The value of the second smallest eigenvalue is: {}'.format(G.e[1]))
Manually (with numpy/scipy)
In [132]:
eigenvalues, eigenvectors = sparse.linalg.eigsh(laplacian_norm, k = 10, which = 'SM')
In [133]:
plt.plot(eigenvalues, '.-', markersize=15);
plt.xlabel('')
plt.ylabel('Eigenvalues')
plt.show()
In [134]:
genres = preprocessing.LabelEncoder().fit_transform(df['success'])
x = eigenvectors[:, 1]
y = eigenvectors[:, 2]
plt.scatter(x, y, c=genres, cmap='RdBu', alpha=0.5);
In [135]:
#Note: eigenvalues and their respective eigenvectors are already sorted from smallest to biggest
#plot on the eigenvectors 2 and 3 (set_coordinates takes Nx2 or Nx3 array size)
G.set_coordinates(G.U[:, 1:3])
G.plot()
In [136]:
G.plot_signal(genres, vertex_size=20)
NEIGHBORS = 300
sort_order = np.argsort(weightsNorm, axis=1)
sorted_weights = np.zeros((len(weightsNorm), len(weightsNorm)))
for i in range(0, len(weightsNorm)):
    for j in range(0, len(weightsNorm)):
        if (j >= len(weightsNorm) - NEIGHBORS):
            #copy the k strongest edges for each node
            sorted_weights[i, sort_order[i, j]] = weightsNorm[i, sort_order[i, j]]
        else:
            #set the other edges to zero
            sorted_weights[i, sort_order[i, j]] = 0
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger
In [82]:
#plt.spy(sorted_weights)
In [83]:
#plt.hist(sorted_weights.reshape(-1), bins=50);
In [84]:
NormW = pd.DataFrame(WNormPerc)
NormW.head()
Out[84]:
In [85]:
NormW.to_csv('Saved_Datasets/TextWSparsePerc.csv', index=False)
With pygsp
In [146]:
G = graphs.Graph(WNormPerc)
G.compute_laplacian('normalized')
Manually (with numpy/scipy)
In [147]:
#reminder: L = D - W for weighted graphs
degrees = WNormPerc.sum(axis=1)
laplacian = np.diag(degrees) - WNormPerc
#computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(WNormPerc, normed = True)
plt.spy(laplacian_norm);
In [148]:
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);
In [149]:
G.set_coordinates(G.U[:, 1:3])
G.plot()
In [150]:
G.plot_signal(genres, vertex_size=20)
In [ ]: