Treat the genres of the movies

Attribute a new number to each new genre and replace in the dataframe



In [68]:

    
%matplotlib inline

import configparser
import os

import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
from imdb import IMDb
import tmdbsimple as tmdb
from pygsp import graphs, filters, plotting

plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'



In [69]:

    
df = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')



In [70]:

    
print('There are {} movies'.format(len(df)))









    



There are 2621 movies



In [71]:

    
df['genres'][1]









    Out[71]:





'Drama|Crime|Music'



In [72]:

    
df.head()
#df.iloc[100:150]









    Out[72]:







  
    
      
      id
      budget
      genres
      imdb_id
      overview
      production_companies
      release_date
      revenue
      title
      director_name
      actor_names
      Metacritic
      ROI
      success
    
  
  
    
      0
      12
      94000000
      Animation|Family
      266543
      Nemo, an adventurous young clownfish, is unexp...
      Pixar Animation Studios
      2003-05-30
      940335536
      Finding Nemo
      Andrew Stanton
      ['Albert Brooks', 'Ellen DeGeneres', 'Alexande...
      90
      2.639
      4
    
    
      1
      16
      12800000
      Drama|Crime|Music
      168629
      Selma, a Czech immigrant on the verge of blind...
      Fine Line Features
      2000-05-17
      40031879
      Dancer in the Dark
      Lars von Trier
      ['Björk', 'Catherine Deneuve', 'David Morse', ...
      61
      2.127
      3
    
    
      2
      22
      140000000
      Adventure|Fantasy|Action
      325980
      Jack Sparrow, a freewheeling 17th-century pira...
      Walt Disney Pictures
      2003-09-07
      655011224
      Pirates of the Caribbean: The Curse of the Bla...
      Gore Verbinski
      ['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo...
      63
      2.639
      4
    
    
      3
      24
      30000000
      Action|Crime
      266697
      An assassin is shot at the altar by her ruthle...
      Miramax Films
      2003-10-10
      180949000
      Kill Bill: Vol. 1
      Quentin Tarantino
      ['Uma Thurman', 'Lucy Liu', 'Vivica A. Fox', '...
      69
      2.639
      4
    
    
      4
      25
      72000000
      Drama|War
      418763
      Jarhead is a film about a US Marine Anthony Sw...
      Universal Pictures
      2005-04-11
      96889998
      Jarhead
      Sam Mendes
      ['Jamie Foxx', 'Scott MacDonald', 'Lucas Black...
      58
      0.346
      2

1. Parsing example



In [73]:

    
df['genres'] = df['genres'].str.replace('|', ',')



In [74]:

    
i = 1
newgenres = df['genres'][i].split(",")

print(newgenres)
print(len(newgenres))









    



['Drama', 'Crime', 'Music']
3

2. Determine different genres

Determine the number of different genres and what they are.



In [75]:

    
Diffgenres = [];
genres = {}
movies_dic = {}

for i in range(0, len(df)):
    
    movies_dic[i] = df['id'][i]
    
    if df['genres'][i] == 'NaN':
        newgenres = []
    else:
        newgenres = df['genres'][i].split(",")
        
        genres.setdefault(i, [])
        
        for j in range (0, len(newgenres)):
            Diffgenres.append(newgenres[j])
            genres[i].append(newgenres[j])



In [76]:

    
Diffgenres = set(Diffgenres)
Diffgenres = list(Diffgenres)

print('There are {} different genres'.format(len(Diffgenres)))
print(Diffgenres)









    



There are 20 different genres
['TV Movie', 'War', 'Music', 'Drama', 'Thriller', 'Western', 'Comedy', 'Foreign', 'Crime', 'Mystery', 'Horror', 'History', 'Adventure', 'Animation', 'Action', 'Romance', 'Fantasy', 'Science Fiction', 'Family', 'Documentary']



In [77]:

    
df.head()









    Out[77]:







  
    
      
      id
      budget
      genres
      imdb_id
      overview
      production_companies
      release_date
      revenue
      title
      director_name
      actor_names
      Metacritic
      ROI
      success
    
  
  
    
      0
      12
      94000000
      Animation,Family
      266543
      Nemo, an adventurous young clownfish, is unexp...
      Pixar Animation Studios
      2003-05-30
      940335536
      Finding Nemo
      Andrew Stanton
      ['Albert Brooks', 'Ellen DeGeneres', 'Alexande...
      90
      2.639
      4
    
    
      1
      16
      12800000
      Drama,Crime,Music
      168629
      Selma, a Czech immigrant on the verge of blind...
      Fine Line Features
      2000-05-17
      40031879
      Dancer in the Dark
      Lars von Trier
      ['Björk', 'Catherine Deneuve', 'David Morse', ...
      61
      2.127
      3
    
    
      2
      22
      140000000
      Adventure,Fantasy,Action
      325980
      Jack Sparrow, a freewheeling 17th-century pira...
      Walt Disney Pictures
      2003-09-07
      655011224
      Pirates of the Caribbean: The Curse of the Bla...
      Gore Verbinski
      ['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo...
      63
      2.639
      4
    
    
      3
      24
      30000000
      Action,Crime
      266697
      An assassin is shot at the altar by her ruthle...
      Miramax Films
      2003-10-10
      180949000
      Kill Bill: Vol. 1
      Quentin Tarantino
      ['Uma Thurman', 'Lucy Liu', 'Vivica A. Fox', '...
      69
      2.639
      4
    
    
      4
      25
      72000000
      Drama,War
      418763
      Jarhead is a film about a US Marine Anthony Sw...
      Universal Pictures
      2005-04-11
      96889998
      Jarhead
      Sam Mendes
      ['Jamie Foxx', 'Scott MacDonald', 'Lucas Black...
      58
      0.346
      2

3. Create vector of genres for each movie and a dataframe

Binary vector where the elements are 1 if the film has the genre corresponding to the index of the film. Otherwise the elements are zero.

Quick example:



In [78]:

    
print(genres[0][0])
len(genres[0][0])

vector = (genres[0][0] == np.array(Diffgenres)).astype(int)
print(vector)









    



Animation
[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]



In [79]:

    
genreArray = np.ndarray(shape=(len(df), len(Diffgenres)), dtype=int)

for i in range(0, len(df)):
    vector = np.zeros(len(Diffgenres))
    
    for j in range(0, len(genres[i])):
        vector += (genres[i][j] == np.array(Diffgenres)).astype(int)
    
    genreArray[i] = vector



In [80]:

    
print(genreArray[0])
print(genreArray.size)









    



[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0]
52420

Observe the result in the dataframe



In [81]:

    
Genres = pd.DataFrame(genreArray, columns=Diffgenres)
Genres.head(10)









    Out[81]:







  
    
      
      TV Movie
      War
      Music
      Drama
      Thriller
      Western
      Comedy
      Foreign
      Crime
      Mystery
      Horror
      History
      Adventure
      Animation
      Action
      Romance
      Fantasy
      Science Fiction
      Family
      Documentary
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
    
    
      1
      0
      0
      1
      1
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      1
      0
      1
      0
      0
      0
    
    
      3
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
    
    
      4
      0
      1
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      5
      0
      0
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      6
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      1
      0
    
    
      7
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      1
      0
      0
    
    
      8
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      1
      0
      1
      0
      0
      0
    
    
      9
      0
      0
      0
      1
      1
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0

VIsual example of the genres



In [82]:

    
#Genres.iloc[120:150]



In [83]:

    
plt.spy(Genres[120:150])









    Out[83]:





<matplotlib.image.AxesImage at 0x1d2b06a0>

3.1 Determine most frequent genres



In [84]:

    
freqGenre = np.ndarray(shape=(1, len(Diffgenres)), dtype=int)

for i in range(0, len(Diffgenres)):
    freqGenre[0][i] = sum(Genres[Diffgenres[i]] == 1)

Display of the number of times a genre appears in the dataframe



In [85]:

    
NbGenre = pd.DataFrame(freqGenre, columns=Diffgenres)
NbGenre









    Out[85]:







  
    
      
      TV Movie
      War
      Music
      Drama
      Thriller
      Western
      Comedy
      Foreign
      Crime
      Mystery
      Horror
      History
      Adventure
      Animation
      Action
      Romance
      Fantasy
      Science Fiction
      Family
      Documentary
    
  
  
    
      0
      1
      74
      81
      1212
      785
      26
      910
      6
      419
      230
      278
      88
      491
      160
      684
      457
      261
      297
      293
      36



In [86]:

    
NbGenre.to_csv('Saved_Datasets/NbGenre.csv', index=False)



In [87]:

    
plt.bar(Diffgenres, freqGenre[0], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Genres');
plt.ylabel('Counts');
plt.savefig('images/GenreFreq.png', dpi =300, bbox_inches='tight')

3.2 Determine genres that are most commonly associated with each other

We only consider two genres together at the moment



In [88]:

    
assosGenre = np.ndarray(shape=(len(Diffgenres), len(Diffgenres)), dtype=int)

for i in range(0, len(Diffgenres)):
    for j in range(0, len(Diffgenres)):
        if i != j:
            assosGenre[i][j] = sum((Genres[Diffgenres[i]] == 1) & (Genres[Diffgenres[j]] == 1))
        else:
            assosGenre[i][j] = 0



In [89]:

    
#ensure the matrix is symmetric
assosGenreSym = assosGenre.transpose() > assosGenre
assosGenre = assosGenre - assosGenre*assosGenreSym + assosGenre.transpose()*assosGenreSym



In [90]:

    
plt.spy(assosGenre)









    Out[90]:





<matplotlib.image.AxesImage at 0x103c30b8>



In [91]:

    
NbGenreAssos = pd.DataFrame(assosGenre, columns=Diffgenres, index = Diffgenres)
NbGenreAssos









    Out[91]:







  
    
      
      TV Movie
      War
      Music
      Drama
      Thriller
      Western
      Comedy
      Foreign
      Crime
      Mystery
      Horror
      History
      Adventure
      Animation
      Action
      Romance
      Fantasy
      Science Fiction
      Family
      Documentary
    
  
  
    
      TV Movie
      0
      0
      0
      1
      1
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      War
      0
      0
      1
      56
      18
      1
      4
      0
      2
      1
      1
      26
      14
      1
      31
      10
      2
      2
      0
      0
    
    
      Music
      0
      1
      0
      48
      0
      0
      36
      0
      3
      0
      1
      0
      3
      7
      3
      33
      7
      0
      14
      6
    
    
      Drama
      1
      56
      48
      0
      355
      13
      274
      3
      230
      121
      53
      81
      108
      5
      177
      307
      56
      58
      36
      2
    
    
      Thriller
      1
      18
      0
      355
      0
      7
      50
      1
      268
      161
      174
      18
      114
      4
      330
      34
      40
      134
      4
      0
    
    
      Western
      0
      1
      0
      13
      7
      0
      4
      0
      5
      2
      0
      1
      12
      2
      11
      0
      2
      1
      2
      0
    
    
      Comedy
      0
      4
      36
      274
      50
      4
      0
      1
      92
      18
      22
      3
      142
      96
      143
      262
      80
      42
      181
      5
    
    
      Foreign
      0
      0
      0
      3
      1
      0
      1
      0
      2
      0
      1
      0
      0
      0
      1
      0
      0
      0
      1
      2
    
    
      Crime
      1
      2
      3
      230
      268
      5
      92
      2
      0
      74
      21
      7
      32
      1
      181
      20
      6
      9
      2
      0
    
    
      Mystery
      0
      1
      0
      121
      161
      2
      18
      0
      74
      0
      61
      2
      25
      1
      44
      21
      13
      29
      3
      0
    
    
      Horror
      0
      1
      1
      53
      174
      0
      22
      1
      21
      61
      0
      0
      11
      0
      45
      6
      24
      38
      1
      0
    
    
      History
      0
      26
      0
      81
      18
      1
      3
      0
      7
      2
      0
      0
      12
      0
      20
      11
      0
      0
      0
      0
    
    
      Adventure
      0
      14
      3
      108
      114
      12
      142
      0
      32
      25
      11
      12
      0
      87
      280
      34
      140
      136
      144
      1
    
    
      Animation
      0
      1
      7
      5
      4
      2
      96
      0
      1
      1
      0
      0
      87
      0
      21
      4
      39
      19
      141
      0
    
    
      Action
      0
      31
      3
      177
      330
      11
      143
      1
      181
      44
      45
      20
      280
      21
      0
      24
      103
      177
      37
      1
    
    
      Romance
      0
      10
      33
      307
      34
      0
      262
      0
      20
      21
      6
      11
      34
      4
      24
      0
      40
      17
      27
      0
    
    
      Fantasy
      0
      2
      7
      56
      40
      2
      80
      0
      6
      13
      24
      0
      140
      39
      103
      40
      0
      46
      94
      0
    
    
      Science Fiction
      0
      2
      0
      58
      134
      1
      42
      0
      9
      29
      38
      0
      136
      19
      177
      17
      46
      0
      31
      0
    
    
      Family
      0
      0
      14
      36
      4
      2
      181
      1
      2
      3
      1
      0
      144
      141
      37
      27
      94
      31
      0
      3
    
    
      Documentary
      0
      0
      6
      2
      0
      0
      5
      2
      0
      0
      0
      0
      1
      0
      1
      0
      0
      0
      3
      0



In [92]:

    
NbGenreAssos.to_csv('Saved_Datasets/NbGenreAssos.csv', index=False)

Determining ranking of genre associations

1 indicates most frequently associated and 19 is least frequently associated.

$\textbf{Reminder:}$ This is only the case of our dataset and may not represent reality



In [93]:

    
assosRank = {} 
rank = np.argsort(-assosGenre, axis=1) #negative for ascending order

Diffgenres[rank[0][1]]

for i in range(0, len(Diffgenres)):
    for j in range(0, len(Diffgenres)):
        assosRank.setdefault(Diffgenres[i], [])
        
        #Only if not comparing with the same genre
        if Diffgenres[i] != Diffgenres[rank[i][j]]:
            assosRank[Diffgenres[i]].append(Diffgenres[rank[i][j]])

Display of the ranking of the other genres with which each genre is most often associated



In [94]:

    
ranking = np.linspace(1, len(Diffgenres)-1, num=len(Diffgenres)-1, endpoint=True, retstep=False, dtype=int)

Rankdf = pd.DataFrame(assosRank, index=ranking)
Rankdf









    Out[94]:







  
    
      
      Action
      Adventure
      Animation
      Comedy
      Crime
      Documentary
      Drama
      Family
      Fantasy
      Foreign
      History
      Horror
      Music
      Mystery
      Romance
      Science Fiction
      TV Movie
      Thriller
      War
      Western
    
  
  
    
      1
      Thriller
      Action
      Family
      Drama
      Thriller
      Music
      Thriller
      Comedy
      Adventure
      Drama
      Drama
      Thriller
      Drama
      Thriller
      Drama
      Action
      Drama
      Drama
      Drama
      Drama
    
    
      2
      Adventure
      Family
      Comedy
      Romance
      Drama
      Comedy
      Romance
      Adventure
      Action
      Documentary
      War
      Mystery
      Comedy
      Drama
      Comedy
      Adventure
      Thriller
      Action
      Action
      Adventure
    
    
      3
      Crime
      Comedy
      Adventure
      Family
      Action
      Family
      Comedy
      Animation
      Family
      Crime
      Action
      Drama
      Romance
      Crime
      Fantasy
      Thriller
      Crime
      Crime
      History
      Action
    
    
      4
      Drama
      Fantasy
      Fantasy
      Action
      Comedy
      Drama
      Crime
      Fantasy
      Comedy
      Thriller
      Thriller
      Action
      Family
      Horror
      Thriller
      Drama
      Science Fiction
      Horror
      Thriller
      Thriller
    
    
      5
      Science Fiction
      Science Fiction
      Action
      Adventure
      Mystery
      Foreign
      Action
      Action
      Drama
      Comedy
      Adventure
      Science Fiction
      Fantasy
      Action
      Adventure
      Fantasy
      Fantasy
      Mystery
      Adventure
      Crime
    
    
      6
      Comedy
      Thriller
      Science Fiction
      Animation
      Adventure
      Action
      Mystery
      Drama
      Science Fiction
      Action
      Romance
      Fantasy
      Animation
      Science Fiction
      Music
      Comedy
      Romance
      Science Fiction
      Romance
      Comedy
    
    
      7
      Fantasy
      Drama
      Music
      Crime
      Horror
      Adventure
      Adventure
      Science Fiction
      Thriller
      Family
      Crime
      Comedy
      Documentary
      Adventure
      Family
      Horror
      Action
      Adventure
      Comedy
      Mystery
    
    
      8
      Horror
      Animation
      Drama
      Fantasy
      Romance
      TV Movie
      History
      Romance
      Romance
      Horror
      Comedy
      Crime
      Action
      Romance
      Action
      Family
      Animation
      Comedy
      Science Fiction
      Fantasy
    
    
      9
      Mystery
      Romance
      Thriller
      Thriller
      Science Fiction
      Science Fiction
      Science Fiction
      Music
      Animation
      Science Fiction
      Mystery
      Adventure
      Crime
      Comedy
      Mystery
      Mystery
      Adventure
      Fantasy
      Fantasy
      Family
    
    
      10
      Family
      Crime
      Romance
      Science Fiction
      History
      Fantasy
      War
      Thriller
      Horror
      Fantasy
      Western
      Romance
      Adventure
      Fantasy
      Crime
      Animation
      History
      Romance
      Crime
      Animation
    
    
      11
      War
      Mystery
      Western
      Music
      Fantasy
      Romance
      Fantasy
      Mystery
      Mystery
      Romance
      Science Fiction
      War
      War
      Family
      Science Fiction
      Romance
      Mystery
      War
      Mystery
      War
    
    
      12
      Romance
      War
      Mystery
      Horror
      Western
      Animation
      Horror
      Documentary
      Music
      Animation
      Fantasy
      Music
      Horror
      Western
      History
      Crime
      Family
      History
      Music
      Science Fiction
    
    
      13
      Animation
      Western
      War
      Mystery
      Music
      Mystery
      Music
      Crime
      Crime
      TV Movie
      Animation
      Foreign
      Science Fiction
      History
      War
      War
      Foreign
      Western
      Western
      History
    
    
      14
      History
      History
      Crime
      Documentary
      War
      Horror
      Family
      Western
      War
      History
      TV Movie
      Family
      TV Movie
      War
      Horror
      Western
      Comedy
      Family
      Horror
      Romance
    
    
      15
      Western
      Horror
      TV Movie
      Western
      Foreign
      Crime
      Western
      Horror
      Western
      Western
      Horror
      TV Movie
      Foreign
      Animation
      Animation
      TV Movie
      Western
      Animation
      Animation
      TV Movie
    
    
      16
      Music
      Music
      History
      War
      Family
      Western
      Animation
      Foreign
      TV Movie
      Music
      Family
      History
      Western
      TV Movie
      TV Movie
      Foreign
      Music
      TV Movie
      TV Movie
      Foreign
    
    
      17
      Documentary
      Documentary
      Horror
      History
      TV Movie
      Thriller
      Foreign
      War
      Foreign
      War
      Foreign
      Western
      Thriller
      Music
      Foreign
      Music
      War
      Foreign
      Foreign
      Music
    
    
      18
      Foreign
      Foreign
      Foreign
      Foreign
      Animation
      War
      Documentary
      History
      History
      Adventure
      Music
      Animation
      History
      Foreign
      Western
      History
      Horror
      Music
      Family
      Horror
    
    
      19
      TV Movie
      TV Movie
      Documentary
      TV Movie
      Documentary
      History
      TV Movie
      TV Movie
      Documentary
      Mystery
      Documentary
      Documentary
      Mystery
      Documentary
      Documentary
      Documentary
      Documentary
      Documentary
      Documentary
      Documentary



In [95]:

    
Rankdf.to_csv('Saved_Datasets/GenreRanking.csv', index=False)

3.3 Determine how many films are successful or are non-successful depending on the genre



In [96]:

    
genreArray[0]









    Out[96]:





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])



In [97]:

    
genreSuccess = np.zeros(shape=(1, len(Diffgenres)), dtype=float)
genreSuccessPc = np.zeros(shape=(1, len(Diffgenres)), dtype=float)

for i in range(0, len(Diffgenres)):
    for j in range(0, len(df)):
        if genreArray[j][i] == 1:
            if df['success'][j] == 1:
                genreSuccess[0][i] += 1
    
    genreSuccessPc[0][i] = (genreSuccess[0][i]/freqGenre[0][i])*100



In [98]:

    
plt.bar(Diffgenres, genreSuccessPc[0], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Genres');
plt.ylabel('Success rate [%]');

Don't forget that the number of successful movies is not equal to the sum of the success rate of the genres since movies often have multiple genres.



In [99]:

    
print(sum(sum(genreSuccess)))
print('The number of films that are succesful: {}'.format(len(df[df['success'] == 1])))
print('The number of films that are unsuccesful: {}'.format(len(df[df['success'] == 0])))









    



1651.0
The number of films that are succesful: 655
The number of films that are unsuccesful: 0

4. Create a similarity graph between films depending on genre

In this case, the similarity is the number of genres that the movie has in common:

$\mathbf{W}(u,v) = sum(u \cdot v) \ \in [0, 20]$



In [100]:

    
weights = np.ndarray(shape=(len(df), len(df)), dtype=int)

weights = genreArray @ genreArray.T

#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(weights, 0)



In [101]:

    
plt.spy(weights)









    Out[101]:





<matplotlib.image.AxesImage at 0x10786b70>



In [102]:

    
plt.hist(weights[weights > 0].reshape(-1), bins=50);



In [103]:

    
print('There are {} weights equal to zero'.format(np.sum(weights == 0)))
print('There are {} weights equal to one'.format(np.sum(weights == 1)))
print('There are {} weights equal to seven'.format(np.sum(weights == 7)))









    



There are 3430935 weights equal to zero
There are 2611932 weights equal to one
There are 0 weights equal to seven



In [104]:

    
meanW = weights.mean()
maxW = weights.max()
minW = weights.min()

print('The mean value of the similarity in terms of genre is: {}'.format(meanW))
print('The max value of the similarity is: {}'.format(maxW))
print('The min value of the similarity is: {}'.format(minW))









    



The mean value of the similarity in terms of genre is: 0.6433494850749842
The max value of the similarity is: 6
The min value of the similarity is: 0

4.1 Normalization of the matrix



In [105]:

    
print(genreArray[1])
print(sum(genreArray[1]))









    



[0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
3



In [106]:

    
weightsNorm = np.ndarray(shape=(len(df), len(df)), dtype=float)
lengths =  np.ndarray(shape=(1, 2), dtype=int)
lenMax = 0;

for i in range(0, len(weights)):
    for j in range(0, len(weights)):
        if i!=j: 
            lengths = [sum(genreArray[i]), sum(genreArray[j])]
            weightsNorm[i][j] = (weights[i][j])/max(lengths) 

np.fill_diagonal(weightsNorm, 0)



In [107]:

    
sigma = np.std(weights)
print(sigma)
mu = np.mean(weights)
print(mu)
#1/(sigma*math.sqrt(2*math.pi))*
Wgauss = np.exp(-((weights-mu)**2)/(2*sigma**2))

#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(Wgauss, 0)









    



0.750927221103
0.643349485075

Maximum normalization



In [108]:

    
plt.spy(weightsNorm)









    Out[108]:





<matplotlib.image.AxesImage at 0x1124e1d0>



In [109]:

    
plt.hist(weightsNorm.reshape(-1), bins=50);



In [110]:

    
print('The mean value is: {}'.format(weightsNorm.mean()))
print('The max value is: {}'.format(weightsNorm.max()))
print('The min value is: {}'.format(weightsNorm.min()))









    



The mean value is: 0.20359825069114826
The max value is: 1.0
The min value is: 0.0

Plot the degree distribution



In [111]:

    
degrees = np.zeros(len(weightsNorm)) 

#reminder: the degrees of a node for a weighted graph are the sum of its weights

for i in range(0, len(weightsNorm)):
    degrees[i] = sum(weightsNorm[i])

plt.hist(degrees, bins=50);



In [112]:

    
print('The mean value is: {}'.format(degrees.mean()))
print('The max value is: {}'.format(degrees.max()))
print('The min value is: {}'.format(degrees.min()))









    



The mean value is: 533.6310150614994
The max value is: 880.2142857142858
The min value is: 7.999999999999999

Gaussian normalization



In [113]:

    
plt.spy(Wgauss)









    Out[113]:





<matplotlib.image.AxesImage at 0x171555c0>

4.3 Save the dataset



In [114]:

    
NormW = pd.DataFrame(weightsNorm)
NormW.head()









    Out[114]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      2611
      2612
      2613
      2614
      2615
      2616
      2617
      2618
      2619
      2620
    
  
  
    
      0
      0.0
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.666667
      0.000000
      0.000000
      0.000000
      ...
      0.0
      0.00
      0.000000
      0.000000
      0.000000
      0.333333
      0.000000
      0.0
      0.4
      0.000000
    
    
      1
      0.0
      0.000000
      0.000000
      0.333333
      0.333333
      0.666667
      0.000000
      0.333333
      0.000000
      0.666667
      ...
      0.0
      0.50
      0.333333
      0.333333
      0.333333
      0.333333
      0.333333
      0.0
      0.0
      0.333333
    
    
      2
      0.0
      0.000000
      0.000000
      0.333333
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      ...
      0.0
      0.25
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.0
      0.6
      0.000000
    
    
      3
      0.0
      0.333333
      0.333333
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.333333
      0.333333
      ...
      0.0
      0.50
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.0
      0.2
      0.000000
    
    
      4
      0.0
      0.333333
      0.000000
      0.000000
      0.000000
      0.333333
      0.000000
      0.333333
      0.000000
      0.333333
      ...
      0.0
      0.25
      0.333333
      0.500000
      0.500000
      0.333333
      0.500000
      0.0
      0.0
      0.500000
    
  

5 rows × 2621 columns



In [115]:

    
NormW.to_csv('Saved_Datasets/NormalizedGenreW.csv', index=False)

5. Graph Laplacian and Embedding for maximum normalization

5.1 Compute the graph Laplacian

With pygsp



In [116]:

    
G = graphs.Graph(weightsNorm)
G.compute_laplacian('normalized')

Normally



In [117]:

    
#reminder: L = D - W for weighted graphs
laplacian = np.diag(degrees) - weightsNorm

#computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(weightsNorm, normed = True)

plt.spy(laplacian_norm);



In [118]:

    
laplacian_norm = sparse.csr_matrix(laplacian_norm)

5.2 Compute the Fourier basis

With pygsp



In [119]:

    
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);

Normally



In [120]:

    
eigenvalues, eigenvectors =  sparse.linalg.eigsh(laplacian_norm, k = 10, which = 'SM')



In [121]:

    
plt.plot(eigenvalues, '.-', markersize=15);
plt.xlabel('')
plt.ylabel('Eigenvalues')
plt.show()

5.3 Graph embedding



In [122]:

    
genres = preprocessing.LabelEncoder().fit_transform(df['success'])

x = eigenvectors[:, 1] 
y = eigenvectors[:, 2] 
plt.scatter(x, y, c=genres, cmap='RdBu', alpha=0.5);



In [123]:

    
G.set_coordinates(G.U[:, 1:3])
G.plot()



In [124]:

    
G.plot_signal(genres, vertex_size=20)

6. Graph Laplacian and Embedding for gaussian normalization

6.1. Sparsification of the graph

Keep only a certain number of the weights



In [125]:

    
NEIGHBORS = 300

#sort the order of the weights
sort_order = np.argsort(Wgauss, axis = 1)

#declaration of a sorted weight matrix
sorted_weights = np.zeros((len(Wgauss), len(Wgauss)))

for i in range (0, len(Wgauss)):  
    for j in range(0, len(Wgauss)):
        if (j >= len(Wgauss) - NEIGHBORS):
            #copy the k strongest edges for each node
            sorted_weights[i, sort_order[i,j]] = Wgauss[i,sort_order[i,j]]
        else:
            #set the other edges to zero
            sorted_weights[i, sort_order[i,j]] = 0

#ensure the matrix is symmetric
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger



In [126]:

    
plt.spy(sorted_weights)









    Out[126]:





<matplotlib.image.AxesImage at 0x1087bef0>



In [127]:

    
plt.hist(sorted_weights.reshape(-1), bins=50);

6.1. Save the sparsed dataset



In [128]:

    
NormW = pd.DataFrame(sorted_weights)
NormW.head()









    Out[128]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      2611
      2612
      2613
      2614
      2615
      2616
      2617
      2618
      2619
      2620
    
  
  
    
      0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.893341
      0.0
      0.0
      0.0
      0.000000
    
    
      1
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.0
      0.0
      0.893341
    
    
      2
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.0
      0.0
      0.000000
    
    
      3
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.0
      0.0
      0.000000
    
    
      4
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.000000
      0.0
      0.0
      0.0
      0.893341
    
  

5 rows × 2621 columns



In [129]:

    
NormW.to_csv('Saved_Datasets/NormalizedGenreWSparse.csv', index=False)

6.2. Laplacian and graph embedding

With pygsp



In [130]:

    
G = graphs.Graph(sorted_weights)
G.compute_laplacian('normalized')

Other



In [131]:

    
#reminder: L = D - W for weighted graphs
laplacian = np.diag(degrees) - sorted_weights

#computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(sorted_weights, normed = True)

plt.spy(laplacian_norm);



In [132]:

    
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);



In [133]:

    
G.set_coordinates(G.U[:, 1:3])
G.plot()



In [134]:

    
G.plot_signal(genres, vertex_size=20)



In [ ]:

	id	budget	genres	imdb_id	overview	production_companies	release_date	revenue	title	director_name	actor_names	Metacritic	ROI	success
0	12	94000000	Animation\|Family	266543	Nemo, an adventurous young clownfish, is unexp...	Pixar Animation Studios	2003-05-30	940335536	Finding Nemo	Andrew Stanton	['Albert Brooks', 'Ellen DeGeneres', 'Alexande...	90	2.639	4
1	16	12800000	Drama\|Crime\|Music	168629	Selma, a Czech immigrant on the verge of blind...	Fine Line Features	2000-05-17	40031879	Dancer in the Dark	Lars von Trier	['Björk', 'Catherine Deneuve', 'David Morse', ...	61	2.127	3
2	22	140000000	Adventure\|Fantasy\|Action	325980	Jack Sparrow, a freewheeling 17th-century pira...	Walt Disney Pictures	2003-09-07	655011224	Pirates of the Caribbean: The Curse of the Bla...	Gore Verbinski	['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo...	63	2.639	4
3	24	30000000	Action\|Crime	266697	An assassin is shot at the altar by her ruthle...	Miramax Films	2003-10-10	180949000	Kill Bill: Vol. 1	Quentin Tarantino	['Uma Thurman', 'Lucy Liu', 'Vivica A. Fox', '...	69	2.639	4
4	25	72000000	Drama\|War	418763	Jarhead is a film about a US Marine Anthony Sw...	Universal Pictures	2005-04-11	96889998	Jarhead	Sam Mendes	['Jamie Foxx', 'Scott MacDonald', 'Lucas Black...	58	0.346	2

	id	budget	genres	imdb_id	overview	production_companies	release_date	revenue	title	director_name	actor_names	Metacritic	ROI	success
0	12	94000000	Animation,Family	266543	Nemo, an adventurous young clownfish, is unexp...	Pixar Animation Studios	2003-05-30	940335536	Finding Nemo	Andrew Stanton	['Albert Brooks', 'Ellen DeGeneres', 'Alexande...	90	2.639	4
1	16	12800000	Drama,Crime,Music	168629	Selma, a Czech immigrant on the verge of blind...	Fine Line Features	2000-05-17	40031879	Dancer in the Dark	Lars von Trier	['Björk', 'Catherine Deneuve', 'David Morse', ...	61	2.127	3
2	22	140000000	Adventure,Fantasy,Action	325980	Jack Sparrow, a freewheeling 17th-century pira...	Walt Disney Pictures	2003-09-07	655011224	Pirates of the Caribbean: The Curse of the Bla...	Gore Verbinski	['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo...	63	2.639	4
3	24	30000000	Action,Crime	266697	An assassin is shot at the altar by her ruthle...	Miramax Films	2003-10-10	180949000	Kill Bill: Vol. 1	Quentin Tarantino	['Uma Thurman', 'Lucy Liu', 'Vivica A. Fox', '...	69	2.639	4
4	25	72000000	Drama,War	418763	Jarhead is a film about a US Marine Anthony Sw...	Universal Pictures	2005-04-11	96889998	Jarhead	Sam Mendes	['Jamie Foxx', 'Scott MacDonald', 'Lucas Black...	58	0.346	2

	War	Music	Drama	Thriller	Comedy	Crime	Adventure	Animation	Action	Romance	Fantasy	Science Fiction	Family
0	0	0	0	0	0	0	0	1	0	0	0	0	1
1	0	1	1	0	0	1	0	0	0	0	0	0	0
2	0	0	0	0	0	0	1	0	1	0	1	0	0
3	0	0	0	0	0	1	0	0	1	0	0	0	0
4	1	0	1	0	0	0	0	0	0	0	0	0	0
5	0	1	1	0	0	0	0	0	0	1	0	0	0
6	0	0	0	0	1	0	0	1	0	0	0	0	1
7	0	0	1	0	0	0	0	0	0	1	0	1	0
8	0	0	0	0	0	0	1	0	1	0	1	0	0
9	0	0	1	1	0	1	0	0	0	0	0	0	0

	TV Movie	War	Music	Drama	Thriller	Western	Comedy	Foreign	Crime	Mystery	Horror	History	Adventure	Animation	Action	Romance	Fantasy	Science Fiction	Family	Documentary
TV Movie	0	0	0	1	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0
War	0	0	1	56	18	1	4	0	2	1	1	26	14	1	31	10	2	2	0	0
Music	0	1	0	48	0	0	36	0	3	0	1	0	3	7	3	33	7	0	14	6
Drama	1	56	48	0	355	13	274	3	230	121	53	81	108	5	177	307	56	58	36	2
Thriller	1	18	0	355	0	7	50	1	268	161	174	18	114	4	330	34	40	134	4	0
Western	0	1	0	13	7	0	4	0	5	2	0	1	12	2	11	0	2	1	2	0
Comedy	0	4	36	274	50	4	0	1	92	18	22	3	142	96	143	262	80	42	181	5
Foreign	0	0	0	3	1	0	1	0	2	0	1	0	0	0	1	0	0	0	1	2
Crime	1	2	3	230	268	5	92	2	0	74	21	7	32	1	181	20	6	9	2	0
Mystery	0	1	0	121	161	2	18	0	74	0	61	2	25	1	44	21	13	29	3	0
Horror	0	1	1	53	174	0	22	1	21	61	0	0	11	0	45	6	24	38	1	0
History	0	26	0	81	18	1	3	0	7	2	0	0	12	0	20	11	0	0	0	0
Adventure	0	14	3	108	114	12	142	0	32	25	11	12	0	87	280	34	140	136	144	1
Animation	0	1	7	5	4	2	96	0	1	1	0	0	87	0	21	4	39	19	141	0
Action	0	31	3	177	330	11	143	1	181	44	45	20	280	21	0	24	103	177	37	1
Romance	0	10	33	307	34	0	262	0	20	21	6	11	34	4	24	0	40	17	27	0
Fantasy	0	2	7	56	40	2	80	0	6	13	24	0	140	39	103	40	0	46	94	0
Science Fiction	0	2	0	58	134	1	42	0	9	29	38	0	136	19	177	17	46	0	31	0
Family	0	0	14	36	4	2	181	1	2	3	1	0	144	141	37	27	94	31	0	3
Documentary	0	0	6	2	0	0	5	2	0	0	0	0	1	0	1	0	0	0	3	0

	1	2	3	4	5	6	7	8	9	...	2612	2613	2614	2615	2616	2617	2619	2620
0	0.000000	0.000000	0.000000	0.000000	0.000000	0.666667	0.000000	0.000000	0.000000	...	0.00	0.000000	0.000000	0.000000	0.333333	0.000000	0.4	0.000000
1	0.000000	0.000000	0.333333	0.333333	0.666667	0.000000	0.333333	0.000000	0.666667	...	0.50	0.333333	0.333333	0.333333	0.333333	0.333333	0.0	0.333333
2	0.000000	0.000000	0.333333	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	...	0.25	0.000000	0.000000	0.000000	0.000000	0.000000	0.6	0.000000
3	0.333333	0.333333	0.000000	0.000000	0.000000	0.000000	0.000000	0.333333	0.333333	...	0.50	0.000000	0.000000	0.000000	0.000000	0.000000	0.2	0.000000
4	0.333333	0.000000	0.000000	0.000000	0.333333	0.000000	0.333333	0.000000	0.333333	...	0.25	0.333333	0.500000	0.500000	0.333333	0.500000	0.0	0.500000

	...	2616	2620
0	...	0.893341	0.000000
1	...	0.000000	0.893341
2	...	0.000000	0.000000
3	...	0.000000	0.000000
4	...	0.000000	0.893341

	War	Music	Drama	Thriller	Comedy	Crime	Adventure	Animation	Action	Romance	Fantasy	Science Fiction	Family
0	0	0	0	0	0	0	0	1	0	0	0	0	1
1	0	1	1	0	0	1	0	0	0	0	0	0	0
2	0	0	0	0	0	0	1	0	1	0	1	0	0
3	0	0	0	0	0	1	0	0	1	0	0	0	0
4	1	0	1	0	0	0	0	0	0	0	0	0	0
5	0	1	1	0	0	0	0	0	0	1	0	0	0
6	0	0	0	0	1	0	0	1	0	0	0	0	1
7	0	0	1	0	0	0	0	0	0	1	0	1	0
8	0	0	0	0	0	0	1	0	1	0	1	0	0
9	0	0	1	1	0	1	0	0	0	0	0	0	0

	War	Music	Drama	Thriller	Comedy	Crime	Adventure	Animation	Action	Romance	Fantasy	Science Fiction	Family
0	0	0	0	0	0	0	0	1	0	0	0	0	1
1	0	1	1	0	0	1	0	0	0	0	0	0	0
2	0	0	0	0	0	0	1	0	1	0	1	0	0
3	0	0	0	0	0	1	0	0	1	0	0	0	0
4	1	0	1	0	0	0	0	0	0	0	0	0	0
5	0	1	1	0	0	0	0	0	0	1	0	0	0
6	0	0	0	0	1	0	0	1	0	0	0	0	1
7	0	0	1	0	0	0	0	0	0	1	0	1	0
8	0	0	0	0	0	0	1	0	1	0	1	0	0
9	0	0	1	1	0	1	0	0	0	0	0	0	0