In [1]:
%matplotlib inline

import configparser
import os

import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
import scipy.sparse.csgraph
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
import tmdbsimple as tmdb
from itertools import chain
import statistics as stat
import math
from pygsp import graphs, filters, plotting

plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'

1. Loading the datasets


In [2]:
dataset = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')

In [3]:
Actors = pd.read_csv('Saved_Datasets/Actorsv3Dataset.csv')

In [4]:
Actors.iloc[0:3]


Out[4]:
tmdb_id Name date total_tenure nb_total_movies movies_in_dataset Realease_date_of_movies_in_dataset Actors_tenure_in_movies
0 15295 Vicky Haughton ['2000', '2010'] 11 5 ['Whale Rider'] ['2003'] [4]
1 16940 Jeremy Irons ['1974', '2016'] 43 90 ['Kingdom of Heaven', 'Eragon', 'Dungeons & Dr... ['2005', '2006', '2000', '2008', '2011', '2012... [32, 33, 27, 35, 38, 39, 40, 43, 43]
2 41087 Leslie Mann ['1996', '2016'] 21 31 ['Knocked Up', 'I Love You Phillip Morris', '1... ['2007', '2009', '2009', '2009', '2011', '2011... [12, 14, 14, 14, 16, 16, 15, 17, 19, 19]

In [5]:
dataset.head(5)


Out[5]:
id budget genres imdb_id overview production_companies release_date revenue title director_name actor_names actors_ids actors_tenures total_tenure average_tenure Metacritic ROI success
0 12 94000000 Animation|Family 266543 Nemo, an adventurous young clownfish, is unexp... Pixar Animation Studios 2003-05-30 940335536 Finding Nemo Andrew Stanton ['Albert Brooks', 'Ellen DeGeneres', 'Alexande... [14, 5293, 12, 13, 18] [18, 24, 2, 28, 14] 86 17.20 90 2.639 4
1 16 12800000 Drama|Crime|Music 168629 Selma, a Czech immigrant on the verge of blind... Fine Line Features 2000-05-17 40031879 Dancer in the Dark Lars von Trier ['Björk', 'Catherine Deneuve', 'David Morse', ... [6748, 47, 52, 50, 53] [49, 19, 21, 44, 15] 148 29.60 61 2.127 3
2 22 140000000 Adventure|Fantasy|Action 325980 Jack Sparrow, a freewheeling 17th-century pira... Walt Disney Pictures 2003-09-07 655011224 Pirates of the Caribbean: The Curse of the Bla... Gore Verbinski ['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo... [1709, 116, 114, 118, 85] [7, 9, 7, 22, 20] 65 13.00 63 2.639 4
3 24 30000000 Action|Crime 266697 An assassin is shot at the altar by her ruthle... Miramax Films 2003-10-10 180949000 Kill Bill: Vol. 1 Quentin Tarantino ['Uma Thurman', 'Lucy Liu', 'Vivica A. Fox', '... [140, 589, 2535, 139, 141] [12, 26, 15, 17, 39] 109 21.80 69 2.639 4
4 25 72000000 Drama|War 418763 Jarhead is a film about a US Marine Anthony Sw... Universal Pictures 2005-04-11 96889998 Jarhead Sam Mendes ['Jamie Foxx', 'Scott MacDonald', 'Lucas Black... [133, 134, 1350483, 155] [11, 14, 0, 12] 37 9.25 58 0.346 2

2. Data exploration

2.1 Distribution of the actors' total tenures


In [14]:
#Extract the distinct total tenures and count how many actors have each
all_act_tenures = list(Actors['total_tenure'])

diff_act_tenures = list(set(all_act_tenures))

freqTenure = []
for i in diff_act_tenures:
    freqTenure.append(all_act_tenures.count(i))

print(freqTenure)    
print(diff_act_tenures)


[48, 342, 52, 76, 90, 97, 99, 102, 121, 127, 126, 164, 159, 143, 143, 141, 145, 150, 150, 104, 110, 121, 132, 110, 103, 106, 88, 78, 78, 75, 80, 80, 66, 64, 68, 59, 53, 53, 57, 38, 33, 35, 22, 31, 27, 23, 20, 27, 25, 21, 18, 16, 15, 13, 11, 11, 10, 16, 8, 4, 8, 5, 13, 4, 2, 2, 8, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 71, 72, 74, 76, 78, 79, 80, 82, 83, 92, 98, 102, 117]
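
For reference, the same frequency table can be obtained in one vectorized call, since np.unique returns the sorted distinct values together with their counts:

diff_act_tenures, freqTenure = np.unique(Actors['total_tenure'].values, return_counts=True)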

In [122]:
#Inspect actors with suspiciously long total tenures: listed career span / movie count ---> corrected span / count
print(Actors[Actors['total_tenure'] == 102]) #James Russell 1915-2016, 2 movies -----> The Boy (2016), 1 movie
print(Actors[Actors['total_tenure'] == 117]) #Shohreh Aghdashloo 1900-2016, 28 movies ----> 1976-2016, 26 movies
print(Actors[Actors['total_tenure'] == 98])  #Fredro Starr 1918-2015, 14 movies ---> 1993-2015, 13 movies
print(Actors[Actors['total_tenure'] == 92])  #William Black 2009, 2 movies -------> 2009, 1 movie
print(Actors[Actors['total_tenure'] == 83])  #Norma Shearer 1920-2002, 49 movies ------> 1919-1942, 45 movies (not for the movie we have in the dataset)
print(Actors[Actors['total_tenure'] == 82])  #Barbara E. Robertson 1934-2015, 4 movies -----> 2001-2015, 3 movies
print(Actors[Actors['total_tenure'] == 80])  #Mary McCormack 1937-2016, 28 movies ----> 1995-2016, 27 movies
                                             #Ronald Reagan 1937-2016, 85 movies ---> 1937-1964, 63 movies
print(Actors[Actors['total_tenure'] == 79])  #Michael Moreland 1936-2014, 3 movies ----> 1999-2014, 2 movies
                                             #Tom Payne 1937-2015, 12 movies ---> 2007-2015, 11 movies
print(Actors[Actors['total_tenure'] == 78])  #Jackie Long 1938-2015, 14 movies ---> 2005-2015, 13 movies
print(Actors[Actors['total_tenure'] == 76])  #The Kid Stays in the Picture (2002), but the actor was already dead
print(Actors[Actors['total_tenure'] == 74])  #for a film in which the actor was already dead
print(Actors[Actors['total_tenure'] == 72])  #correct!


      tmdb_id           Name           date  total_tenure  nb_total_movies  \
1802  1029403  James Russell  [1915,  2016]           102                2   

     movies_in_dataset Realease_date_of_movies_in_dataset  \
1802       ['The Boy']                           ['2016']   

     Actors_tenure_in_movies  
1802                   [102]  
      tmdb_id                Name           date  total_tenure  \
2437    21041  Shohreh Aghdashloo  [1900,  2016]           117   

      nb_total_movies   movies_in_dataset Realease_date_of_movies_in_dataset  \
2437               28  ['The Lake House']                           ['2006']   

     Actors_tenure_in_movies  
2437                   [107]  
      tmdb_id          Name           date  total_tenure  nb_total_movies  \
1386    59568  Fredro Starr  [1918,  2015]            98               14   

            movies_in_dataset Realease_date_of_movies_in_dataset  \
1386  ['Save the Last Dance']                           ['2001']   

     Actors_tenure_in_movies  
1386                    [84]  
      tmdb_id           Name           date  total_tenure  nb_total_movies  \
3685   938880  William Black  [1918,  2009]            92                2   

                 movies_in_dataset Realease_date_of_movies_in_dataset  \
3685  ['Capitalism: A Love Story']                           ['2009']   

     Actors_tenure_in_movies  
3685                    [92]  
      tmdb_id           Name           date  total_tenure  nb_total_movies  \
2136    88867  Norma Shearer  [1920,  2002]            83               49   

                     movies_in_dataset Realease_date_of_movies_in_dataset  \
2136  ['The Kid Stays in the Picture']                           ['2002']   

     Actors_tenure_in_movies  
2136                    [83]  
      tmdb_id                  Name           date  total_tenure  \
3891   145204  Barbara E. Robertson  [1934,  2015]            82   

      nb_total_movies movies_in_dataset Realease_date_of_movies_in_dataset  \
3891                4   ['The Company']                           ['2003']   

     Actors_tenure_in_movies  
3891                    [70]  
      tmdb_id            Name           date  total_tenure  nb_total_movies  \
363      1980  Mary McCormack  [1937,  2016]            80               28   
4250    18802   Ronald Reagan  [1937,  2016]            80               85   

                      movies_in_dataset Realease_date_of_movies_in_dataset  \
363   ['K-PAX', '1408', 'Full Frontal']           ['2001', '2007', '2002']   
4250          ['An Inconvenient Truth']                           ['2006']   

     Actors_tenure_in_movies  
363             [65, 71, 66]  
4250                    [70]  
      tmdb_id              Name           date  total_tenure  nb_total_movies  \
80    1047642  Michael Moreland  [1936,  2014]            79                3   
1197    30319         Tom Payne  [1937,  2015]            79               12   

       movies_in_dataset Realease_date_of_movies_in_dataset  \
80    ['Under the Skin']                           ['2014']   
1197   ['The Physician']                           ['2013']   

     Actors_tenure_in_movies  
80                      [79]  
1197                    [77]  
      tmdb_id         Name           date  total_tenure  nb_total_movies  \
1775    66526  Jackie Long  [1938,  2015]            78               14   

     movies_in_dataset Realease_date_of_movies_in_dataset  \
1775           ['ATL']                           ['2006']   

     Actors_tenure_in_movies  
1775                    [69]  
      tmdb_id              Name           date  total_tenure  nb_total_movies  \
4108    38233  Ernest Hemingway  [1937,  2012]            76                5   

                     movies_in_dataset Realease_date_of_movies_in_dataset  \
4108  ['The Kid Stays in the Picture']                           ['2002']   

     Actors_tenure_in_movies  
4108                    [66]  
      tmdb_id        Name           date  total_tenure  nb_total_movies  \
2374    30111  Bud Abbott  [1940,  2013]            74               47   

           movies_in_dataset Realease_date_of_movies_in_dataset  \
2374  ['Behind the Burly Q']                           ['2010']   

     Actors_tenure_in_movies  
2374                    [71]  
      tmdb_id             Name           date  total_tenure  nb_total_movies  \
1262    14730  Angela Lansbury  [1944,  2015]            72               91   

              movies_in_dataset Realease_date_of_movies_in_dataset  \
1262  ["Mr. Popper's Penguins"]                           ['2011']   

     Actors_tenure_in_movies  
1262                    [68]  

In [123]:
Actors[Actors['Name']=='Shohreh Aghdashloo']


Out[123]:
tmdb_id Name date total_tenure nb_total_movies movies_in_dataset Realease_date_of_movies_in_dataset Actors_tenure_in_movies
2437 21041 Shohreh Aghdashloo [1900, 2016] 117 28 ['The Lake House'] ['2006'] [107]

In [42]:
plt.bar(diff_act_tenures, freqTenure, align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Total tenures of actors in years');
plt.ylabel('Number of Actors with corresponding total tenure');
plt.savefig('images/tot_tenures_frequency_distri.png', dpi=300, bbox_inches='tight')


2.2 Distribution of the movies' tenures


In [16]:
all_sum_tenures = list(dataset['total_tenure'])
diff_all_sum_tenures = list(set(all_sum_tenures))

freqSumtenure = []
for i in diff_all_sum_tenures:
    freqSumtenure.append(all_sum_tenures.count(i))
    
print(diff_all_sum_tenures)    
print(freqSumtenure)


[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 170, 171, 172, 175, 176, 178, 179, 180, 181, 183, 190, 196, 201, 202, 205, 207, 212, 219, 233]
[2, 2, 4, 4, 6, 3, 5, 3, 1, 3, 5, 4, 4, 3, 2, 2, 5, 2, 6, 4, 5, 3, 7, 7, 3, 5, 6, 3, 9, 9, 9, 5, 6, 12, 11, 11, 15, 9, 20, 16, 14, 13, 11, 15, 18, 18, 14, 13, 15, 22, 11, 19, 26, 12, 27, 32, 23, 23, 29, 26, 32, 16, 24, 32, 28, 23, 30, 34, 32, 27, 26, 37, 26, 31, 37, 37, 23, 35, 30, 29, 35, 32, 35, 33, 32, 30, 32, 44, 32, 32, 27, 39, 21, 22, 29, 28, 23, 28, 15, 30, 31, 33, 21, 15, 21, 16, 18, 23, 17, 17, 23, 36, 25, 13, 25, 16, 20, 16, 26, 15, 13, 12, 15, 7, 16, 11, 19, 13, 10, 12, 13, 14, 12, 3, 12, 13, 8, 14, 11, 6, 5, 9, 13, 4, 8, 2, 3, 7, 4, 2, 6, 6, 5, 4, 8, 3, 4, 4, 2, 2, 5, 2, 2, 4, 2, 3, 1, 3, 1, 3, 3, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]

In [8]:
fig=plt.figure(figsize=(15, 4))

plt.bar(diff_all_sum_tenures, freqSumtenure, align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Total tenure of movies in years');
plt.ylabel('Number of movies with corresponding total tenure');
plt.savefig('images/sum_tenures_frequency_distri.png', dpi=300, bbox_inches='tight')



In [18]:
all_average_tenures = list(dataset['average_tenure'])
diff_all_average_tenures = list(set(all_average_tenures))
diff_all_average_tenures = sorted(diff_all_average_tenures)

freqAvgtenure = []
for i in diff_all_average_tenures:
    freqAvgtenure.append(all_average_tenures.count(i))

In [19]:
fig=plt.figure(figsize=(15, 4))
plt.bar(diff_all_average_tenures, freqAvgtenure, align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Average tenure of movies in years');
plt.ylabel('Number of movies with corresponding average tenure');
plt.savefig('images/avg_tenures_frequency_distri.png', dpi=300, bbox_inches='tight')


3. Data exploitation

3.1 Computation of the weight matrix
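
The weight between two movies is defined as the absolute difference of their total tenures (the 'total_tenure' column):

W[i][j] = |total_tenure[i] - total_tenure[j]|

Only the upper triangle is filled in the cell below; the matrix is made symmetric in the next cell.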


In [5]:
#df_ten is assumed to be the movie-level `dataset` loaded above (it provides 'total_tenure' and 'success')
df_ten = dataset

#fill the upper triangle with the absolute differences of total tenures
W = np.zeros(shape=(len(df_ten), len(df_ten)), dtype=int)
for i in range(0, len(df_ten)):
    for j in range(i, len(df_ten)):
        W[i][j] = abs(df_ten['total_tenure'][i] - df_ten['total_tenure'][j])
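
For reference, the same matrix can be built without the double loop via NumPy broadcasting; an equivalent sketch:

t = df_ten['total_tenure'].values
W_broadcast = np.abs(t[:, None] - t[None, :])  #pairwise |t_i - t_j|, symmetric with a zero diagonal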

In [9]:
#mirror the filled upper triangle: keep the larger of W[i][j] and W[j][i]
bigger = W.transpose() > W
W = W - W*bigger + W.transpose()*bigger
np.fill_diagonal(W, 0)
plt.spy(W)


Out[9]:
<matplotlib.image.AxesImage at 0x1275c8828>

In [104]:
plt.hist(W.reshape(-1),bins=50);


3.2 Gaussian Normalization of W
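
The distances in W are mapped to edge weights with a Gaussian kernel built from the empirical mean and standard deviation of W:

Wnorm[i][j] = exp(-(W[i][j] - mu)^2 / (2 * sigma^2))

with mu = np.mean(W) and sigma = np.std(W); the diagonal is then set back to 0 so that no movie is connected to itself. Note that the kernel is centred on mu rather than on 0, so the largest weights go to pairs whose tenure difference is close to the mean difference.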


In [10]:
sigma = np.std(W)
print(sigma)
mu = np.mean(W)
print(mu)
#1/(sigma*math.sqrt(2*math.pi))*
Wnorm = np.exp(-((W-mu)**2)/(2*sigma**2))
np.fill_diagonal(Wnorm, 0)


29.4314557582
38.4201782888

In [12]:
sum(np.diag(Wnorm))


Out[12]:
0.0

In [23]:
np.sum(Wnorm > 0.99)


Out[23]:
655292

In [106]:
plt.hist(Wnorm.reshape(-1),bins=50);



In [13]:
#Compute degree distribution 
degrees = np.zeros(len(Wnorm)) 
for i in range(0, len(Wnorm)):
    degrees[i] = sum(Wnorm[i])

plt.hist(degrees, bins=50);
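
The explicit loop above is equivalent to a single row sum over the weight matrix:

degrees = Wnorm.sum(axis=1)  #weighted degree of each node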



In [14]:
plt.spy(Wnorm)


Out[14]:
<matplotlib.image.AxesImage at 0x141011c18>

In [18]:
sum(np.diag(Wnorm[:500, :500]))


Out[18]:
0.0

In [68]:
len(Wnorm)


Out[68]:
2621

In [19]:
laplacian_norm = scipy.sparse.csgraph.laplacian(Wnorm, normed = True)
print(laplacian_norm)


[[  1.00000000e+00  -4.22955374e-04  -4.27552009e-04 ...,  -2.87823217e-04
   -3.64949350e-04  -3.74725856e-04]
 [ -4.22955374e-04   1.00000000e+00  -1.86906604e-04 ...,  -2.63733425e-06
   -5.54686345e-04  -5.59871020e-04]
 [ -4.27552009e-04  -1.86906604e-04   1.00000000e+00 ...,  -5.46847947e-04
   -5.03985112e-04  -5.05091364e-04]
 ..., 
 [ -2.87823217e-04  -2.63733425e-06  -5.46847947e-04 ...,   1.00000000e+00
   -1.33181089e-04  -1.25406677e-04]
 [ -3.64949350e-04  -5.54686345e-04  -5.03985112e-04 ...,  -1.33181089e-04
    1.00000000e+00  -2.21574262e-04]
 [ -3.74725856e-04  -5.59871020e-04  -5.05091364e-04 ...,  -1.25406677e-04
   -2.21574262e-04   1.00000000e+00]]

In [34]:
G = graphs.Graph(Wnorm)
G.compute_laplacian('normalized')

In [35]:
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);



In [39]:
G.set_coordinates(G.U[:,1:3])
G.plot()



In [83]:
labels = preprocessing.LabelEncoder().fit_transform(dataset['success'])
G.plot_signal(labels, vertex_size=20)



In [48]:
maxW = W.max()
print(maxW)


233

Sparsification of the graph


In [126]:
NEIGHBORS = 1000

#sort the order of the weights
sort_order = np.argsort(Wnorm, axis = 1)

#declaration of a sorted weight matrix
sorted_weights = np.zeros((len(Wnorm), len(Wnorm)))

for i in range (0, len(Wnorm)):  
    for j in range(0, len(Wnorm)):
        if (j >= len(Wnorm) - NEIGHBORS):
            #copy the k strongest edges for each node
            sorted_weights[i, sort_order[i,j]] = Wnorm[i,sort_order[i,j]]
        else:
            #set the other edges to zero
            sorted_weights[i, sort_order[i,j]] = 0

#ensure the matrix is symmetric
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger
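
For reference, the double loop keeps, for each node, its NEIGHBORS strongest edges and zeroes out the rest; a vectorized sketch of the same operation (an equivalent formulation, not a replacement for the cell above):

k = NEIGHBORS
order = np.argsort(Wnorm, axis=1)                    #weakest edges first in each row
rows = np.arange(len(Wnorm))[:, None]
knn_weights = Wnorm.copy()
knn_weights[rows, order[:, :len(Wnorm) - k]] = 0     #drop all but the k strongest edges per row
knn_weights = np.maximum(knn_weights, knn_weights.T) #symmetrize by keeping the larger weight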

In [127]:
plt.spy(sorted_weights)


Out[127]:
<matplotlib.image.AxesImage at 0x1383034a8>

In [128]:
plt.hist(sorted_weights.reshape(-1), bins=50);



In [129]:
#Compute degree distribution 
degrees_spars = np.zeros(len(sorted_weights)) 
for i in range(0, len(sorted_weights)):
    degrees_spars[i] = sum(sorted_weights[i])

plt.hist(degrees_spars, bins=50);



In [131]:
NormW = pd.DataFrame(Wnorm)
NormW.head()


Out[131]:
0 1 2 3 4 5 6 7 8 9 ... 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620
0 0.000000 0.725465 0.839317 0.871749 0.937433 0.925518 0.998836 0.586162 0.839317 0.804369 ... 0.484721 0.668366 0.705575 0.855873 0.987966 0.708768 0.728614 0.461914 0.728614 0.748150
1 0.725465 0.000000 0.317538 0.999806 0.047800 0.043933 0.901262 0.562334 0.996166 0.351152 ... 0.783118 0.925518 0.445627 0.998560 0.804369 0.948405 0.423524 0.003663 0.958399 0.967380
2 0.839317 0.317538 0.000000 0.982189 0.939248 0.950067 0.801483 0.688668 0.992630 0.465031 ... 0.886894 0.983185 0.301435 0.987966 0.685439 0.993271 0.545144 0.869262 0.996625 0.998836
3 0.871749 0.999806 0.982189 0.000000 0.521585 0.501462 0.708768 0.968718 0.465031 0.992630 ... 0.822132 0.647914 0.998560 0.445627 0.822132 0.606763 0.999898 0.128862 0.586162 0.565608
4 0.937433 0.047800 0.939248 0.521585 0.000000 0.445627 0.270699 0.996166 0.562334 0.959901 ... 0.899022 0.745054 0.043933 0.541889 0.190691 0.705575 0.988786 0.914806 0.685439 0.665110

5 rows × 2621 columns


In [133]:
NormSparsW = pd.DataFrame(sorted_weights)
NormSparsW.head()


Out[133]:
0 1 2 3 4 5 6 7 8 9 ... 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620
0 0.000000 0.000000 0.000000 0.000000 0.937433 0.925518 0.998836 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.00000 0.000000 0.987966 0.000000 0.000000 0.000000 0.000000 0.000000
1 0.000000 0.000000 0.000000 0.999806 0.000000 0.000000 0.901262 0.000000 0.996166 0.000000 ... 0.783118 0.925518 0.00000 0.998560 0.804369 0.948405 0.000000 0.000000 0.958399 0.967380
2 0.000000 0.000000 0.000000 0.982189 0.939248 0.950067 0.000000 0.000000 0.992630 0.000000 ... 0.886894 0.983185 0.00000 0.987966 0.000000 0.993271 0.000000 0.869262 0.996625 0.998836
3 0.000000 0.999806 0.982189 0.000000 0.000000 0.000000 0.000000 0.968718 0.000000 0.992630 ... 0.000000 0.000000 0.99856 0.000000 0.000000 0.000000 0.999898 0.000000 0.000000 0.000000
4 0.937433 0.000000 0.939248 0.000000 0.000000 0.000000 0.000000 0.996166 0.000000 0.959901 ... 0.899022 0.000000 0.00000 0.000000 0.000000 0.000000 0.988786 0.914806 0.000000 0.000000

5 rows × 2621 columns


In [141]:
NormW.to_csv('Saved_Datasets/NormActTenuresW.csv', index=False)

In [142]:
NormSparsW.to_csv('Saved_Datasets/NormSparsActTenuresW.csv', index=False)

Compute Laplacian and graph embedding
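
The cell below keeps the combinatorial Laplacian L = D - W of the full graph as a reminder and computes the normalized Laplacian of the sparsified graph, which is the one actually used afterwards:

L_norm = I - D^{-1/2} W D^{-1/2}

where D is the diagonal matrix of weighted degrees. The eigenvectors of L_norm associated with the smallest eigenvalues (eigsh with which='SM') are then used as 2-D coordinates, and the points are coloured by the encoded 'success' label.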


In [136]:
#reminder: L = D - W for weighted graphs (combinatorial Laplacian of the full graph; not used below)
laplacian = np.diag(degrees) - Wnorm

#normalized Laplacian of the sparsified graph
laplacian_norm = scipy.sparse.csgraph.laplacian(sorted_weights, normed = True)

plt.spy(laplacian_norm);



In [137]:
eigenvalues, eigenvectors =  sparse.linalg.eigsh(laplacian_norm, k = 10, which = 'SM')

In [138]:
plt.plot(eigenvalues, '.-', markersize=15);
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalues')
plt.show()



In [139]:
labels = preprocessing.LabelEncoder().fit_transform(dataset['success'])

x = eigenvectors[:, 2]
y = eigenvectors[:, 3]
plt.scatter(x, y, c=labels, cmap='RdBu', alpha=0.5);


3.3 3rd quartile (75th percentile) normalization
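
Here the raw distances are rescaled linearly up to the third quartile val_75 = np.percentile(W, 75) and cut off beyond it:

W_diff_norm[i][j] = 1 - W[i][j] / val_75   if W[i][j] <= val_75, and 0 otherwise

so identical total tenures give a weight of 1 and pairs whose tenure difference exceeds the third quartile are disconnected.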


In [7]:
val_75 = np.percentile(W,75)
print(val_75)


32.0

In [10]:
W_diff_norm = np.zeros(shape=(len(df_ten), len(df_ten)), dtype=float)
for i in range(0,len(df_ten)):
    for j in range(i,len(df_ten)):
        if W[i][j] == 0:
            W_diff_norm[i][j] = 1
        elif W[i][j] <= val_75:  
            W_diff_norm[i][j] = 1-(W[i][j])/(val_75)
        else:
            W_diff_norm[i][j] = 0

In [11]:
bigger = W_diff_norm.transpose() > W_diff_norm
W_diff_norm = W_diff_norm - W_diff_norm*bigger + W_diff_norm.transpose()*bigger
np.fill_diagonal(W_diff_norm, 0)

In [12]:
plt.spy(W_diff_norm)


Out[12]:
<matplotlib.image.AxesImage at 0x121199390>

In [13]:
DiffNormW = pd.DataFrame(W_diff_norm)
DiffNormW.head()


Out[13]:
0 1 2 3 4 5 6 7 8 9 ... 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620
0 0.00000 0.0 0.34375 0.28125 0.000 0.00000 0.00000 0.75000 0.34375 0.40625 ... 0.90625 0.62500 0.00000 0.31250 0.00000 0.56250 0.53125 0.0000 0.53125 0.50000
1 0.00000 0.0 0.00000 0.00000 0.000 0.00000 0.21875 0.00000 0.00000 0.00000 ... 0.00000 0.00000 0.96875 0.00000 0.40625 0.00000 0.00000 0.0000 0.00000 0.00000
2 0.34375 0.0 0.00000 0.00000 0.125 0.09375 0.00000 0.59375 0.00000 0.93750 ... 0.25000 0.00000 0.00000 0.00000 0.00000 0.00000 0.81250 0.0000 0.00000 0.00000
3 0.28125 0.0 0.00000 0.00000 0.000 0.00000 0.56250 0.03125 0.93750 0.00000 ... 0.37500 0.65625 0.00000 0.96875 0.37500 0.71875 0.00000 0.0000 0.75000 0.78125
4 0.00000 0.0 0.12500 0.00000 0.000 0.96875 0.00000 0.00000 0.00000 0.06250 ... 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.1875 0.00000 0.00000

5 rows × 2621 columns


In [14]:
DiffNormW.to_csv('Saved_Datasets/DiffNorm75ActTenW.csv', index=False)

In [15]:
plt.hist(W_diff_norm.reshape(-1),bins=50);



In [16]:
G = graphs.Graph(W_diff_norm)
G.compute_laplacian('normalized')
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);



In [17]:
labels = preprocessing.LabelEncoder().fit_transform(df_ten['success'])
G.set_coordinates(G.U[:,1:3])
G.plot_signal(labels, vertex_size=20)


Sparsification of the weight matrix


In [21]:
NEIGHBORS = 200

#sort the order of the weights
sort_order = np.argsort(W_diff_norm, axis = 1)

#declaration of a sorted weight matrix
sorted_weights = np.zeros((len(W_diff_norm), len(W_diff_norm)))

for i in range (0, len(W_diff_norm)):  
    for j in range(0, len(W_diff_norm)):
        if (j >= len(W_diff_norm) - NEIGHBORS):
            #copy the k strongest edges for each node
            sorted_weights[i, sort_order[i,j]] = W_diff_norm[i,sort_order[i,j]]
        else:
            #set the other edges to zero
            sorted_weights[i, sort_order[i,j]] = 0

#ensure the matrix is symmetric
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger

In [22]:
plt.spy(sorted_weights)


Out[22]:
<matplotlib.image.AxesImage at 0x120a58358>

In [23]:
plt.hist(sorted_weights.reshape(-1),bins=50);



In [26]:
G = graphs.Graph(sorted_weights)
G.compute_laplacian('normalized')
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);



In [30]:
labels = preprocessing.LabelEncoder().fit_transform(df_ten['success'])
G.set_coordinates(G.U[:,1:3])
G.plot_signal(labels, vertex_size=20)


