In [48]:
%matplotlib inline
import configparser
import os
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
from imdb import IMDb
import tmdbsimple as tmdb
from pygsp import graphs, filters, plotting
plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'
In [49]:
def read_weight_csv(filename):
    """Load one of the pre-computed normalized weight-matrix CSVs."""
    return pd.read_csv('Saved_Datasets/' + filename, encoding='latin-1')

dfGenre = read_weight_csv('NormalizedGenreWSparse.csv')
dfActor = read_weight_csv('NormalizedActorW.csv')
dfDirector = read_weight_csv('NormalizedDirectorW.csv')
dfText = read_weight_csv('NormalizedTextW.csv')
dfTenures = read_weight_csv('NormSparsActTenuresW.csv')
dfProd = read_weight_csv('NormalizedCompaniesW.csv')
In [50]:
# Report the number of rows in each weight frame — they should all agree.
named_frames = [('dfGenre', dfGenre), ('dfActor', dfActor),
                ('dfDirector', dfDirector), ('dfText', dfText),
                ('dfTenures', dfTenures), ('dfProd', dfProd)]
for label, frame in named_frames:
    print('There are {} movies in {}'.format(len(frame), label))
In [51]:
# Quick look at the actor-similarity weight frame
dfActor.head()
Out[51]:
In [52]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same ndarray and works on every pandas version.
WGenre = dfGenre.values
WActors = dfActor.values
WDirectors = dfDirector.values
WText = dfText.values
WTenures = dfTenures.values
WProd = dfProd.values

# Combined similarity: equal-weight average of the six weight matrices
WTot = (WGenre + WActors + WDirectors + WText + WTenures + WProd)/6
Check that the combined weight matrix has the expected (n_movies × n_movies) size
In [53]:
# Expect a square matrix (one row/column per movie)
print(WTot.shape)
In [54]:
# Wrap the combined weights in a DataFrame for saving / inspection
NormW = pd.DataFrame(WTot)
NormW.head()
Out[54]:
In [55]:
# Persist the combined weight matrix for use by later notebooks
NormW.to_csv('Saved_Datasets/NormalizedTotW.csv', index=False)
Visualize the graph
In [56]:
# Sparsity pattern of the combined weight matrix
plt.spy(WTot)
Out[56]:
Plot a histogram of the weights
In [57]:
# Distribution of all pairwise weights (matrix flattened to 1-D)
plt.hist(WTot.reshape(-1), bins=50);
In [58]:
# Summary statistics of the combined edge weights
for stat_name, stat_value in [('mean', WTot.mean()),
                              ('max', WTot.max()),
                              ('min', WTot.min())]:
    print('The {} value is: {}'.format(stat_name, stat_value))
In [59]:
# Number of undirected edges with weight above 0.5. The matrix is symmetric,
# so each edge appears twice — hence the division by 2. The vectorized
# ndarray.sum() replaces the much slower nested Python sum(sum(...)).
(WTot > 0.5).sum()/2
Out[59]:
NEIGHBORS = 300

# For each node keep only its NEIGHBORS strongest edges; zero the rest.
# argsort sorts ascending, so the last NEIGHBORS columns index the largest
# weights in each row.
sort_order = np.argsort(WTot, axis = 1)
sorted_weights = np.zeros((len(WTot), len(WTot)))
for i in range(0, len(WTot)):
    keep = sort_order[i, len(WTot) - NEIGHBORS:]
    sorted_weights[i, keep] = WTot[i, keep]

# Symmetrize by taking the elementwise maximum of W and W.T, so an edge
# survives if either endpoint ranked it among its NEIGHBORS strongest.
# (The original export mangled these two statements onto one line and lost
# the `*` operators; reconstructed here.)
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights * bigger + sorted_weights.transpose() * bigger
In [60]:
#WTot = sorted_weights
In [61]:
#plt.spy(WTot)
In [62]:
#plt.hist(WTot.reshape(-1), bins=50);
In [63]:
# Inspect one row of the weight matrix: its values, its length (n_movies),
# and its row sum (the node's weighted degree)
print(WTot[1])
print(len(WTot[1]))
print(sum(WTot[1]))
In [64]:
# reminder: the degree of a node in a weighted graph is the sum of its weights
# Vectorized row sum replaces the Python loop (same float64 result, much faster).
degrees = WTot.sum(axis=1)
plt.hist(degrees, bins=50);
In [65]:
# Summary statistics of the node degrees
for stat_name, stat_value in [('mean', degrees.mean()),
                              ('max', degrees.max()),
                              ('min', degrees.min())]:
    print('The {} value is: {}'.format(stat_name, stat_value))
In [66]:
#reminder: L = D - W for weighted graphs
laplacian = np.diag(degrees) - WTot

#computation of the normalized Laplacian
# NOTE: the top-of-file `import scipy.sparse.linalg` does not reliably make
# the scipy.sparse.csgraph submodule available, so import it explicitly here.
from scipy.sparse import csgraph
laplacian_norm = csgraph.laplacian(WTot, normed = True)
plt.spy(laplacian_norm);
In [67]:
# Convert to CSR sparse format so sparse.linalg.eigsh can be applied below
laplacian_norm = sparse.csr_matrix(laplacian_norm)
In [68]:
# 10 smallest-magnitude eigenpairs of the normalized Laplacian.
# NOTE(review): which='SM' can converge slowly for large matrices; shift-invert
# (sigma=0, which='LM') is a common alternative — confirm before changing.
eigenvalues, eigenvectors = sparse.linalg.eigsh(laplacian_norm, k = 10, which = 'SM')
In [69]:
# Plot the 10 smallest eigenvalues of the normalized Laplacian
plt.plot(eigenvalues, '.-', markersize=15);
plt.xlabel('Eigenvalue index')  # was an empty label — figures should stand alone
plt.ylabel('Eigenvalues')
plt.show()
In [70]:
# Sanity-check shapes: D, W and L must all be (n x n)
print(np.diag(degrees).shape)
In [71]:
print(WTot.shape)
In [72]:
print(laplacian.shape)
In [73]:
# Build a PyGSP graph from the combined weight matrix
G = graphs.Graph(WTot)
In [74]:
G.compute_laplacian('normalized')
In [75]:
#plt.spy(G.L)
In [76]:
# Full eigendecomposition of the graph Laplacian (can be expensive for large n)
G.compute_fourier_basis(recompute=True)
In [77]:
# First 10 Laplacian eigenvalues — compare with the scipy eigsh result above
plt.plot(G.e[0:10]);
In [78]:
# Load the per-movie feature/label dataset built in a previous notebook
dfNewFeats = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')
In [79]:
# Should match the number of movies in the weight matrices
len(dfNewFeats)
Out[79]:
In [80]:
dfNewFeats.head()
Out[80]:
In [81]:
# NOTE(review): despite the name, `genres` encodes the 'success' column,
# not movie genres; kept as-is because later cells reference this name.
genres = preprocessing.LabelEncoder().fit_transform(dfNewFeats['success'])
# Embed movies on the 2nd and 3rd Laplacian eigenvectors, colored by label
x = eigenvectors[:, 1]
y = eigenvectors[:, 2]
plt.scatter(x, y, c=genres, cmap='RdBu', alpha=0.5);
In [82]:
# Class balance: number of movies with success label 0
len(dfNewFeats[dfNewFeats['success'] == 0])
Out[82]:
In [83]:
#Note: eigenvalues and their respective eigenvectors are already sorted from smallest to biggest
#plot on the eigenvectors 2 and 3 (set_coordinates takes Nx2 or Nx3 array size)
G.set_coordinates(G.U[:,1:3])
G.plot()
In [84]:
# Same embedding, with each vertex colored by its encoded label signal
G.plot_signal(genres, vertex_size=20)
In [85]:
#G.plot_signal(G.U[:, 1], vertex_size=50)
#Note: The signal is equal to the coordinate along the defined eigenvector axis (1 or 2)
In [86]:
# Train/test split labels saved by a previous notebook
dfTrainLabels = pd.read_csv('Saved_Datasets/Train.csv')
dfTestLabels = pd.read_csv('Saved_Datasets/Test.csv')
In [87]:
# Sanity check: the two splits together should cover the full dataset
len(dfTrainLabels) + len(dfTestLabels)
Out[87]:
In [88]:
# Last rows of the training labels
dfTrainLabels.iloc[-5:]
Out[88]:
In [89]:
# First rows of the test labels
dfTestLabels.iloc[:5]
Out[89]:
In [90]:
#dfNewFeats = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')
In [91]:
#len(dfNewFeats)
In [92]:
#dfNewFeats.head()
In [ ]: