In [68]:
%matplotlib inline
import configparser
import os
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
from imdb import IMDb
import tmdbsimple as tmdb
from pygsp import graphs, filters, plotting
# Notebook-wide display settings: wide default figure size, and pygsp plots drawn with matplotlib.
plt.rcParams.update({'figure.figsize': (17, 5)})
plotting.BACKEND = 'matplotlib'
In [69]:
# Load the movie dataset from the saved CSV file.
df = pd.read_csv(filepath_or_buffer='Saved_Datasets/NewFeaturesDataset.csv')
In [70]:
# Report how many rows (movies) were loaded.
print('There are {} movies'.format(df.shape[0]))
In [71]:
# Show the raw genre string stored under index label 1.
df.loc[1, 'genres']
Out[71]:
In [72]:
# Preview the first rows of the dataset.
df.head(n=5)
Out[72]:
In [73]:
# Replace the '|' separator in the genre strings with ',' so they can be split on commas below.
# regex=False is spelled out because '|' is the regex alternation operator and the default
# value of `regex` differs between pandas versions; the replacement must be a literal one.
df['genres'] = df['genres'].str.replace('|', ',', regex=False)
In [74]:
# Worked example: split the genre string at index 1 and show the pieces and their count.
i = 1
newgenres = df['genres'][i].split(",")
print(newgenres, len(newgenres), sep='\n')
In [75]:
# Build three lookups in a single pass over the dataframe:
#   Diffgenres : flat list of every genre occurrence (may contain duplicates at this point)
#   genres     : row index -> list of the genres of that movie
#   movies_dic : row index -> value of the 'id' column
Diffgenres = []
genres = {}
movies_dic = {}
for i in range(len(df)):
    movies_dic[i] = df['id'][i]
    raw_genres = df['genres'][i]
    # A missing value read from CSV is a float NaN, not the string 'NaN': the plain string
    # comparison never matched and the subsequent .split() raised AttributeError.
    if pd.isna(raw_genres) or raw_genres == 'NaN':
        newgenres = []
    else:
        newgenres = raw_genres.split(",")
    genres[i] = list(newgenres)
    Diffgenres.extend(newgenres)
In [76]:
# Deduplicate the genre names. They are sorted because the iteration order of a set of
# strings can change from one kernel session to the next, which would reshuffle the columns
# of every matrix, plot and CSV that is built from Diffgenres.
Diffgenres = sorted(set(Diffgenres))
print('There are {} different genres'.format(len(Diffgenres)))
print(Diffgenres)
In [77]:
# Preview the dataframe again now that the genre separator has been replaced.
df.head(n=5)
Out[77]:
Each movie is encoded as a binary vector with one element per genre: an element is 1 if the movie has the genre found at that index of `Diffgenres`, and 0 otherwise.
Quick example:
In [78]:
# Worked example: encode the first genre of the first movie as a 0/1 vector over Diffgenres.
print(genres[0][0])
vector = np.asarray(np.array(Diffgenres) == genres[0][0], dtype=int)
print(vector)
In [79]:
# Movie-by-genre indicator matrix: row i accumulates one unit for each entry of genres[i],
# placed in the column of Diffgenres that carries the same name.
genre_names = np.array(Diffgenres)
genreArray = np.zeros((len(df), len(Diffgenres)), dtype=int)
for i in range(len(df)):
    for movie_genre in genres[i]:
        genreArray[i] += (movie_genre == genre_names)
In [80]:
# Show the encoding of the first movie and the total number of matrix entries.
print(genreArray[0], genreArray.size, sep='\n')
Observe the result in the dataframe
In [81]:
# Wrap the indicator matrix in a dataframe whose columns are the genre names.
Genres = pd.DataFrame(data=genreArray, columns=Diffgenres)
Genres.head(n=10)
Out[81]:
In [82]:
#Genres.iloc[120:150]
In [83]:
# Sparsity pattern of the genre indicators for rows 120 to 149.
plt.spy(Genres.iloc[120:150])
Out[83]:
In [84]:
# Number of movies flagged with each genre, as a (1, n_genres) integer row vector whose
# columns follow Diffgenres. A vectorised column sum replaces the uninitialised array that
# was filled by summing every column element by element in Python.
freqGenre = np.asarray((Genres[Diffgenres] == 1).sum(axis=0), dtype=int).reshape(1, -1)
Display of the number of times a genre appears in the dataframe
In [85]:
# One-row table of the per-genre movie counts, labelled by genre name.
NbGenre = pd.DataFrame(data=freqGenre, columns=Diffgenres)
NbGenre
Out[85]:
In [86]:
# Persist the per-genre counts (without the row index).
NbGenre.to_csv(path_or_buf='Saved_Datasets/NbGenre.csv', index=False)
In [87]:
# Bar chart of the number of movies per genre, saved to disk as a PNG.
ax = plt.gca()
ax.bar(Diffgenres, freqGenre[0], align='center')
# slanted tick labels keep the genre names readable
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_xlabel('Genres')
ax.set_ylabel('Counts')
plt.savefig('images/GenreFreq.png', dpi=300, bbox_inches='tight')
In [88]:
# Genre co-occurrence matrix: entry (i, j) is the number of movies flagged with both genre i
# and genre j; the diagonal is forced to zero. A single matrix product of the 0/1 indicator
# table with its transpose replaces the nested Python loops and gives the same counts.
has_genre = (Genres[Diffgenres] == 1).to_numpy().astype(int)
assosGenre = has_genre.T @ has_genre
np.fill_diagonal(assosGenre, 0)
In [89]:
#ensure the matrix is symmetric
assosGenreSym = assosGenre.transpose() > assosGenre
assosGenre = assosGenre - assosGenre*assosGenreSym + assosGenre.transpose()*assosGenreSym
In [90]:
# Sparsity pattern of the genre co-occurrence matrix.
plt.spy(assosGenre)
Out[90]:
In [91]:
# Co-occurrence counts as a table labelled by genre name on both axes.
NbGenreAssos = pd.DataFrame(data=assosGenre, index=Diffgenres, columns=Diffgenres)
NbGenreAssos
Out[91]:
In [92]:
# Persist the co-occurrence table. index=False drops the row labels from the file;
# the rows are in the same genre order as the column header.
NbGenreAssos.to_csv(path_or_buf='Saved_Datasets/NbGenreAssos.csv', index=False)
In [93]:
# For every genre, list all the other genres ordered from the most to the least frequently
# co-occurring one.
assosRank = {}
# argsort sorts in ascending order, so sorting the NEGATED counts yields a DESCENDING ranking.
# A stable sort (mergesort) keeps tied genres in column order, making the ranking reproducible.
rank = np.argsort(-assosGenre, axis=1, kind='mergesort')
for i, genre in enumerate(Diffgenres):
    # the genre itself is left out of its own ranking
    assosRank[genre] = [Diffgenres[k] for k in rank[i] if Diffgenres[k] != genre]
Display of the ranking of the other genres with which each genre is most often associated
In [94]:
# Table of the rankings: one column per genre, rows labelled 1 .. n_genres - 1.
ranking = np.arange(1, len(Diffgenres), dtype=int)
Rankdf = pd.DataFrame(data=assosRank, index=ranking)
Rankdf
Out[94]:
In [95]:
# Persist the genre ranking table (without the rank index column).
Rankdf.to_csv(path_or_buf='Saved_Datasets/GenreRanking.csv', index=False)
In [96]:
# Genre encoding of the first movie.
genreArray[0, :]
Out[96]:
In [97]:
# genreSuccess  : (1, n_genres) number of movies with success == 1 that are flagged with the genre
# genreSuccessPc: the same count as a percentage of freqGenre (movies flagged with the genre)
# A vectorised mask-and-sum replaces the nested Python loops over every (genre, movie) pair.
is_success = np.asarray(df['success'] == 1)
genreSuccess = (
    ((genreArray == 1) & is_success[:, np.newaxis])
    .sum(axis=0)
    .astype(float)
    .reshape(1, -1)
)
genreSuccessPc = (genreSuccess / freqGenre) * 100
In [98]:
# Bar chart of the success rate of each genre, in percent.
ax = plt.gca()
ax.bar(Diffgenres, genreSuccessPc[0], align='center')
# slanted tick labels keep the genre names readable
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_xlabel('Genres')
ax.set_ylabel('Success rate [%]');
Keep in mind that the per-genre success counts do not add up to the number of successful movies: a movie with several genres is counted once for each of its genres.
In [99]:
# Total of the per-genre success counts, followed by the number of rows with success 1 and 0.
print(genreSuccess.sum())
print('The number of films that are succesful: {}'.format((df['success'] == 1).sum()))
print('The number of films that are unsuccesful: {}'.format((df['success'] == 0).sum()))
In this case, the similarity is the number of genres that the movie has in common:
$\mathbf{W}(u,v) = \sum_{k} u_k v_k = u^\top v \ \in [0, 20]$, where $u$ and $v$ are the binary genre vectors of the two movies.
In [100]:
# Movie-to-movie similarity: entry (u, v) is the dot product of the genre vectors of movies
# u and v. The matrix product creates the array itself, so no pre-allocation is needed
# (the previous uninitialised n x n buffer was discarded immediately).
weights = genreArray @ genreArray.T
# fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(weights, 0)
In [101]:
# Sparsity pattern of the genre-similarity matrix.
plt.spy(weights)
Out[101]:
In [102]:
# Distribution of the strictly positive similarity values.
plt.hist(weights[weights > 0].ravel(), bins=50);
In [103]:
# How many entries of the similarity matrix take the values 0, 1 and 7.
print('There are {} weights equal to zero'.format(np.count_nonzero(weights == 0)))
print('There are {} weights equal to one'.format(np.count_nonzero(weights == 1)))
print('There are {} weights equal to seven'.format(np.count_nonzero(weights == 7)))
In [104]:
# Summary statistics of the similarity matrix (the zeroed diagonal is included).
meanW, maxW, minW = np.mean(weights), np.max(weights), np.min(weights)
print('The mean value of the similarity in terms of genre is: {}'.format(meanW))
print('The max value of the similarity is: {}'.format(maxW))
print('The min value of the similarity is: {}'.format(minW))
In [105]:
# Genre encoding of the second movie and the sum of its entries.
print(genreArray[1], genreArray[1].sum(), sep='\n')
In [106]:
# Normalised similarity: the raw similarity of two movies divided by the larger of their two
# genre-vector sums. The pairwise maximum is built once with an outer operation instead of
# recomputing both row sums inside an n x n Python loop.
genre_counts = genreArray.sum(axis=1)
max_counts = np.maximum.outer(genre_counts, genre_counts)
# Pairs whose two genre vectors both sum to zero would be 0/0; they get weight 0 rather than NaN.
weightsNorm = np.divide(
    weights,
    max_counts,
    out=np.zeros(weights.shape, dtype=float),
    where=max_counts > 0,
)
# no self-connections
np.fill_diagonal(weightsNorm, 0)
In [107]:
# Gaussian weighting of the raw similarities, using their own standard deviation and mean.
sigma, mu = weights.std(), weights.mean()
print(sigma, mu, sep='\n')
# NOTE(review): the kernel is centred on the mean similarity, so the pairs whose similarity
# is closest to `mu` receive the largest weight, not the pairs sharing the most genres;
# confirm this is the intended behaviour.
Wgauss = np.exp(-np.square(weights - mu) / (2 * sigma**2))
# no self-connections
np.fill_diagonal(Wgauss, 0)
Maximum normalization
In [108]:
# Sparsity pattern of the normalised similarity matrix.
plt.spy(weightsNorm)
Out[108]:
In [109]:
# Distribution of all entries of the normalised similarity matrix.
plt.hist(weightsNorm.ravel(), bins=50);
In [110]:
# Summary statistics of the normalised similarity matrix.
print('The mean value is: {}'.format(np.mean(weightsNorm)))
print('The max value is: {}'.format(np.max(weightsNorm)))
print('The min value is: {}'.format(np.min(weightsNorm)))
Plot the degree distribution
In [111]:
# reminder: the degree of a node in a weighted graph is the sum of the weights of its edges,
# i.e. the row sum of the weight matrix (computed in one vectorised call instead of a
# Python-level sum over every row).
degrees = weightsNorm.sum(axis=1)
plt.hist(degrees, bins=50);
In [112]:
# Summary statistics of the node degrees.
print('The mean value is: {}'.format(np.mean(degrees)))
print('The max value is: {}'.format(np.max(degrees)))
print('The min value is: {}'.format(np.min(degrees)))
Gaussian normalization
In [113]:
# Sparsity pattern of the Gaussian-weighted matrix.
plt.spy(Wgauss)
Out[113]:
In [114]:
# Wrap the normalised similarity matrix in a dataframe so it can be previewed and exported.
NormW = pd.DataFrame(data=weightsNorm)
NormW.head(n=5)
Out[114]:
In [115]:
# Persist the normalised similarity matrix (without the row index).
NormW.to_csv(path_or_buf='Saved_Datasets/NormalizedGenreW.csv', index=False)
With pygsp
In [116]:
# Build a pygsp graph from the normalised similarity matrix and request its normalized Laplacian.
G = graphs.Graph(weightsNorm)
G.compute_laplacian(lap_type='normalized')
Normally
In [117]:
# Combinatorial Laplacian, L = D - W, with the degrees placed on the diagonal.
laplacian = np.diagflat(degrees) - weightsNorm
# Normalized Laplacian computed by scipy directly from the weight matrix.
laplacian_norm = scipy.sparse.csgraph.laplacian(weightsNorm, normed=True)
plt.spy(laplacian_norm);
In [118]:
# Convert the normalized Laplacian to CSR sparse format.
laplacian_norm = scipy.sparse.csr_matrix(laplacian_norm)
With pygsp
In [119]:
# Recompute the graph Fourier basis with pygsp and plot the first ten entries of G.e.
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[:10]);
Normally
In [120]:
# Ten eigenpairs of the normalized Laplacian with the smallest-magnitude eigenvalues ('SM').
eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(laplacian_norm, k=10, which='SM')
In [121]:
# Plot the computed eigenvalues in the order returned by the solver.
fig, ax = plt.subplots()
ax.plot(eigenvalues, '.-', markersize=15)
ax.set_xlabel('')
ax.set_ylabel('Eigenvalues')
plt.show()
In [122]:
# Scatter the movies on eigenvector columns 1 and 2, coloured by the encoded 'success' label.
# NOTE(review): `genres` is rebound here from the per-movie genre dict to the success labels;
# re-running an earlier cell that indexes genres[i][j] after this one will no longer work.
genres = preprocessing.LabelEncoder().fit_transform(df['success'])
x, y = eigenvectors[:, 1], eigenvectors[:, 2]
plt.scatter(x, y, c=genres, cmap='RdBu', alpha=0.5);
In [123]:
# Use columns 1 and 2 of G.U as the 2-D coordinates of the nodes, then draw the graph.
G.set_coordinates(G.U[:, 1:3])
G.plot()
In [124]:
# Draw the graph with `genres` as the node signal; at this point `genres` holds the
# label-encoded 'success' column assigned in the scatter-plot cell.
G.plot_signal(genres, vertex_size=20)
In [125]:
# Sparsify the Gaussian graph: every node keeps only its NEIGHBORS strongest edges.
NEIGHBORS = 300
# column indices of each row of Wgauss, ordered from the weakest to the strongest weight
sort_order = np.argsort(Wgauss, axis=1)
n_nodes = len(Wgauss)
# the last NEIGHBORS columns of sort_order are the strongest edges of each node
# (max(...) keeps the slice valid when NEIGHBORS is at least the number of nodes)
strongest = sort_order[:, max(n_nodes - NEIGHBORS, 0):]
node_rows = np.arange(n_nodes)[:, np.newaxis]
# copy the strongest edges of each node in one indexing step; every other entry stays zero
sorted_weights = np.zeros((n_nodes, n_nodes))
sorted_weights[node_rows, strongest] = Wgauss[node_rows, strongest]
# ensure the matrix is symmetric: an edge kept by either endpoint is kept for both
sorted_weights = np.maximum(sorted_weights, sorted_weights.T)
In [126]:
# Sparsity pattern of the sparsified weight matrix.
plt.spy(sorted_weights)
Out[126]:
In [127]:
# Distribution of all entries of the sparsified weight matrix.
plt.hist(sorted_weights.ravel(), bins=50);
In [128]:
# Wrap the sparsified weight matrix in a dataframe so it can be previewed and exported.
NormW = pd.DataFrame(data=sorted_weights)
NormW.head(n=5)
Out[128]:
In [129]:
# Persist the sparsified weight matrix (without the row index).
NormW.to_csv(path_or_buf='Saved_Datasets/NormalizedGenreWSparse.csv', index=False)
With pygsp
In [130]:
# Rebuild the pygsp graph from the sparsified weights and request its normalized Laplacian.
G = graphs.Graph(sorted_weights)
G.compute_laplacian(lap_type='normalized')
Other
In [131]:
# reminder: L = D - W for weighted graphs.
# D must hold the degrees of the matrix being analysed: the `degrees` array computed earlier
# is the row sum of weightsNorm, so it does not belong to the sparsified graph. The row sums
# of sorted_weights are used instead.
laplacian = np.diag(sorted_weights.sum(axis=1)) - sorted_weights
# computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(sorted_weights, normed=True)
plt.spy(laplacian_norm);
In [132]:
# Recompute the graph Fourier basis of the sparsified graph and plot the first ten entries of G.e.
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[:10]);
In [133]:
# Use columns 1 and 2 of G.U as the 2-D coordinates of the nodes, then draw the graph.
G.set_coordinates(G.U[:, 1:3])
G.plot()
In [134]:
# Draw the sparsified graph with `genres` as the node signal; `genres` holds the
# label-encoded 'success' column assigned in the scatter-plot cell.
G.plot_signal(genres, vertex_size=20)
In [ ]: