In [1]:
%matplotlib inline
import configparser
import os
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
import scipy.sparse.csgraph
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
import tmdbsimple as tmdb
from itertools import chain
import statistics as stat
import math
from pygsp import graphs, filters, plotting
plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'
In [2]:
dataset = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')
In [3]:
Actors = pd.read_csv('Saved_Datasets/Actorsv3Dataset.csv')
In [4]:
Actors.iloc[0:3]
Out[4]:
In [5]:
dataset.head(5)
Out[5]:
In [14]:
#Extract list of different tenures
all_act_tenures = list(Actors['total_tenure'])
diff_act_tenures = list(set(all_act_tenures))
freqTenure = []
for i in diff_act_tenures:
    freqTenure.append(all_act_tenures.count(i))
print(freqTenure)
print(diff_act_tenures)
In [122]:
# Sanity check of suspiciously long tenures (reported career span / movie count ---> corrected values)
print(Actors[Actors['total_tenure'] == 102]) # James Russel 1915-2016, 2 movies ---> The Boy (2016), 1 movie
print(Actors[Actors['total_tenure'] == 117]) # Shohreh Aghdashloo 1900-2016, 28 movies ---> 1976-2016, 26 movies
print(Actors[Actors['total_tenure'] == 98])  # Fredro Starr 1918-2015, 14 movies ---> 1993-2015, 13 movies
print(Actors[Actors['total_tenure'] == 92])  # William Black 2009, 2 movies ---> 2009, 1 movie
print(Actors[Actors['total_tenure'] == 83])  # Norma Shearer 1920-2002, 49 movies ---> 1919-1942, 45 movies (not for the movie we have in the dataset)
print(Actors[Actors['total_tenure'] == 82])  # Barbara E. Robertson 1934-2015, 4 movies ---> 2001-2015, 3 movies
print(Actors[Actors['total_tenure'] == 80])  # Mary McCormack 1937-2016, 28 movies ---> 1995-2016, 27 movies
# Ronald Reagan 1937-2016, 85 movies ---> 1937-1964, 63 movies
print(Actors[Actors['total_tenure'] == 79])  # Michael Moreland 1936-2014, 3 movies ---> 1999-2014, 2 movies
# Tom Payne 1937-2015, 12 movies ---> 2007-2015, 11 movies
print(Actors[Actors['total_tenure'] == 78])  # Jackie Long 1938-2015, 14 movies ---> 2005-2015, 13 movies
print(Actors[Actors['total_tenure'] == 76])  # The Kid Stays in the Picture (2002), but the actor was already dead
print(Actors[Actors['total_tenure'] == 74])  # For a movie in which the actor was already dead
print(Actors[Actors['total_tenure'] == 72])  # Correct!
In [123]:
Actors[Actors['Name']=='Shohreh Aghdashloo']
Out[123]:
In [42]:
plt.bar(diff_act_tenures, freqTenure, align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Total tenures of actors in years');
plt.ylabel('Number of Actors with corresponding total tenure');
plt.savefig('images/tot_tenures_frequency_distri.png', dpi=300, bbox_inches='tight')
In [16]:
all_sum_tenures = list(dataset['total_tenure'])
diff_all_sum_tenures = list(set(all_sum_tenures))
freqSumtenure = []
for i in diff_all_sum_tenures:
    freqSumtenure.append(all_sum_tenures.count(i))
print(diff_all_sum_tenures)
print(freqSumtenure)
In [8]:
fig=plt.figure(figsize=(15, 4))
plt.bar(diff_all_sum_tenures, freqSumtenure, align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Total tenure of movies in years');
plt.ylabel('Number of movies with corresponding total tenure');
plt.savefig('images/sum_tenures_frequency_distri.png', dpi=300, bbox_inches='tight')
In [18]:
all_average_tenures = list(dataset['average_tenure'])
diff_all_average_tenures = list(set(all_average_tenures))
diff_all_average_tenures = sorted(diff_all_average_tenures)
freqAvgtenure = []
for i in diff_all_average_tenures:
    freqAvgtenure.append(all_average_tenures.count(i))
In [19]:
fig=plt.figure(figsize=(15, 4))
plt.bar(diff_all_average_tenures, freqAvgtenure, align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Average tenure of movies in years');
plt.ylabel('Number of movies with corresponding average tenure');
plt.savefig('images/avg_tenures_frequency_distri.png', dpi=300, bbox_inches='tight')
In [5]:
#W = np.ndarray(shape=(10, 10), dtype=int)
# Pairwise absolute differences of total tenures; start from zeros so the
# unfilled lower triangle does not contain uninitialized memory
W = np.zeros(shape=(len(df_ten), len(df_ten)), dtype=int)
for i in range(0, len(df_ten)):
    for j in range(i, len(df_ten)):
        W[i][j] = abs(df_ten['total tenures'][i] - df_ten['total tenures'][j])
In [9]:
# Mirror the filled upper triangle so that W is symmetric (elementwise maximum of W and its transpose)
bigger = W.transpose() > W
W = W - W*bigger + W.transpose()*bigger
np.fill_diagonal(W, 0)
plt.spy(W)
Out[9]:
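The three lines above make W symmetric by taking, for each pair (i, j), the larger of W[i][j] and W[j][i], then zeroing the diagonal. A minimal NumPy sketch of the same operation, equivalent here because only the upper triangle was filled:

W = np.maximum(W, W.T)
np.fill_diagonal(W, 0)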
In [104]:
plt.hist(W.reshape(-1),bins=50);
In [10]:
# Map tenure distances to edge weights with a Gaussian kernel centred on the mean distance
sigma = np.std(W)
print(sigma)
mu = np.mean(W)
print(mu)
#1/(sigma*math.sqrt(2*math.pi))*
Wnorm = np.exp(-((W-mu)**2)/(2*sigma**2))
np.fill_diagonal(Wnorm, 0)
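Note that this kernel gives the largest weights to pairs whose tenure distance is close to the mean distance mu, not to pairs with identical tenures. The more common choice is a Gaussian similarity kernel centred at zero, exp(-d^2 / (2*sigma^2)), so that a distance of 0 maps to weight 1. A minimal sketch of that alternative, for comparison only (Wnorm_alt is a hypothetical name, not used below):

Wnorm_alt = np.exp(-(W.astype(float)**2) / (2 * sigma**2))
np.fill_diagonal(Wnorm_alt, 0)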
In [12]:
sum(np.diag(Wnorm))
Out[12]:
In [23]:
np.sum(Wnorm > 0.99)
Out[23]:
In [106]:
plt.hist(Wnorm.reshape(-1),bins=50);
In [13]:
#Compute degree distribution
degrees = np.zeros(len(Wnorm))
for i in range(0, len(Wnorm)):
    degrees[i] = sum(Wnorm[i])  # equivalently: degrees = Wnorm.sum(axis=1)
plt.hist(degrees, bins=50);
In [14]:
plt.spy(Wnorm)
Out[14]:
In [18]:
sum(np.diag(Wnorm[:500, :500]))
Out[18]:
In [68]:
len(Wnorm)
Out[68]:
In [19]:
laplacian_norm = scipy.sparse.csgraph.laplacian(Wnorm, normed = True)
print(laplacian_norm)
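As a reminder, the normalized Laplacian returned here is L_norm = I - D^(-1/2) W D^(-1/2), with D the diagonal degree matrix. A minimal sketch that rebuilds it by hand, assuming all degrees are nonzero (which holds for this dense Wnorm); L_by_hand is a hypothetical name:

deg = Wnorm.sum(axis=1)
D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))
L_by_hand = np.eye(len(Wnorm)) - D_inv_sqrt @ Wnorm @ D_inv_sqrt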
In [34]:
G = graphs.Graph(Wnorm)
G.compute_laplacian('normalized')
In [35]:
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);
In [39]:
G.set_coordinates(G.U[:,1:3])
G.plot()
In [83]:
labels = preprocessing.LabelEncoder().fit_transform(dataset['success'])
G.plot_signal(labels, vertex_size=20)
In [48]:
maxW = W.max()
print(maxW)
In [126]:
NEIGHBORS = 1000
#sort the order of the weights
sort_order = np.argsort(Wnorm, axis=1)
#declaration of a sorted weight matrix
sorted_weights = np.zeros((len(Wnorm), len(Wnorm)))
for i in range(0, len(Wnorm)):
    for j in range(0, len(Wnorm)):
        if (j >= len(Wnorm) - NEIGHBORS):
            #copy the k strongest edges for each node
            sorted_weights[i, sort_order[i,j]] = Wnorm[i, sort_order[i,j]]
        else:
            #set the other edges to zero
            sorted_weights[i, sort_order[i,j]] = 0
#ensure the matrix is symmetric
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger
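The double loop above keeps, for each node, the NEIGHBORS strongest edges and sets the rest to zero before symmetrizing. A minimal vectorized sketch of the same idea (requires NumPy >= 1.15 for np.put_along_axis; sparsified is a hypothetical name):

k = NEIGHBORS
idx = np.argsort(Wnorm, axis=1)[:, -k:]            # indices of the k strongest edges per row
mask = np.zeros_like(Wnorm, dtype=bool)
np.put_along_axis(mask, idx, True, axis=1)         # keep only those entries
sparsified = np.where(mask, Wnorm, 0.0)
sparsified = np.maximum(sparsified, sparsified.T)  # symmetrize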
In [127]:
plt.spy(sorted_weights)
Out[127]:
In [128]:
plt.hist(sorted_weights.reshape(-1), bins=50);
In [129]:
#Compute degree distribution
degrees_spars = np.zeros(len(sorted_weights))
for i in range(0, len(sorted_weights)):
    degrees_spars[i] = sum(sorted_weights[i])
plt.hist(degrees_spars, bins=50);
In [131]:
NormW = pd.DataFrame(Wnorm)
NormW.head()
Out[131]:
In [133]:
NormSparsW = pd.DataFrame(sorted_weights)
NormSparsW.head()
Out[133]:
In [141]:
NormW.to_csv('Saved_Datasets/NormActTenuresW.csv', index=False)
In [142]:
NormSparsW.to_csv('Saved_Datasets/NormSparsActTenuresW.csv', index=False)
In [136]:
#reminder: L = D - W for weighted graphs (combinatorial Laplacian, computed here for reference)
laplacian = np.diag(degrees) - Wnorm
#computation of the normalized Laplacian of the sparsified graph
laplacian_norm = scipy.sparse.csgraph.laplacian(sorted_weights, normed = True)
plt.spy(laplacian_norm);
In [137]:
eigenvalues, eigenvectors = sparse.linalg.eigsh(laplacian_norm, k = 10, which = 'SM')
In [138]:
plt.plot(eigenvalues, '.-', markersize=15);
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalues')
plt.show()
In [139]:
labels = preprocessing.LabelEncoder().fit_transform(dataset['success'])
x = eigenvectors[:, 2]
y = eigenvectors[:, 3]
plt.scatter(x, y, c=labels, cmap='RdBu', alpha=0.5);
In [7]:
val_75 = np.percentile(W,75)
print(val_75)
In [10]:
W_diff_norm = np.zeros(shape=(len(df_ten), len(df_ten)), dtype=float)
for i in range(0, len(df_ten)):
    for j in range(i, len(df_ten)):
        if W[i][j] == 0:
            W_diff_norm[i][j] = 1
        elif W[i][j] <= val_75:
            W_diff_norm[i][j] = 1 - (W[i][j]) / (val_75)
        else:
            W_diff_norm[i][j] = 0
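This loop implements a thresholded linear kernel: a distance of 0 maps to weight 1, distances up to the 75th percentile are mapped linearly down towards 0, and anything above the threshold gets no edge. A minimal vectorized sketch of the same mapping (W_lin is a hypothetical name, not used below):

W_lin = np.where(W <= val_75, 1.0 - W / val_75, 0.0)
np.fill_diagonal(W_lin, 0)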
In [11]:
# Mirror the filled upper triangle so that W_diff_norm is symmetric (elementwise maximum with its transpose)
bigger = W_diff_norm.transpose() > W_diff_norm
W_diff_norm = W_diff_norm - W_diff_norm*bigger + W_diff_norm.transpose()*bigger
np.fill_diagonal(W_diff_norm, 0)
In [12]:
plt.spy(W_diff_norm)
Out[12]:
In [13]:
DiffNormW = pd.DataFrame(W_diff_norm)
DiffNormW.head()
Out[13]:
In [14]:
DiffNormW.to_csv('Saved_Datasets/DiffNorm75ActTenW.csv', index=False)
In [15]:
plt.hist(W_diff_norm.reshape(-1),bins=50);
In [16]:
G = graphs.Graph(W_diff_norm)
G.compute_laplacian('normalized')
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);
In [17]:
labels = preprocessing.LabelEncoder().fit_transform(df_ten['success'])
G.set_coordinates(G.U[:,1:3])
G.plot_signal(labels, vertex_size=20)
In [21]:
NEIGHBORS = 200
#sort the order of the weights
sort_order = np.argsort(W_diff_norm, axis=1)
#declaration of a sorted weight matrix
sorted_weights = np.zeros((len(W_diff_norm), len(W_diff_norm)))
for i in range(0, len(W_diff_norm)):
    for j in range(0, len(W_diff_norm)):
        if (j >= len(W_diff_norm) - NEIGHBORS):
            #copy the k strongest edges for each node
            sorted_weights[i, sort_order[i,j]] = W_diff_norm[i, sort_order[i,j]]
        else:
            #set the other edges to zero
            sorted_weights[i, sort_order[i,j]] = 0
#ensure the matrix is symmetric
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger
In [22]:
plt.spy(sorted_weights)
Out[22]:
In [23]:
plt.hist(sorted_weights.reshape(-1),bins=50);
In [26]:
G = graphs.Graph(sorted_weights)
G.compute_laplacian('normalized')
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);
In [30]:
labels = preprocessing.LabelEncoder().fit_transform(df_ten['success'])
G.set_coordinates(G.U[:,1:3])
G.plot_signal(labels, vertex_size=20)
In [ ]: