In [1]:
%matplotlib inline

import configparser
import os

import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
from imdb import IMDb
import tmdbsimple as tmdb
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import WordNetLemmatizer, PorterStemmer 
from collections import OrderedDict
from pygsp import graphs, filters, plotting

plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'


2018-01-22 17:51:39,570:[WARNING](pygsp.graphs.nngraphs.nngraph.<module>): Cannot import pyflann (used for faster kNN computations): Traceback (most recent call last):
  File "C:\Users\Conda\lib\site-packages\pygsp\graphs\nngraphs\nngraph.py", line 14, in <module>
    import pyflann as pfl
  File "C:\Users\Conda\lib\site-packages\pyflann\__init__.py", line 27, in <module>
    from pyflann.index import *
  File "C:\Users\Conda\lib\site-packages\pyflann\index.py", line 27, in <module>
    from pyflann.bindings.flann_ctypes import *
  File "C:\Users\Conda\lib\site-packages\pyflann\bindings\__init__.py", line 30, in <module>
    from pyflann.bindings.flann_ctypes import *
  File "C:\Users\Conda\lib\site-packages\pyflann\bindings\flann_ctypes.py", line 171, in <module>
    raise ImportError('Cannot load dynamic library. Did you compile FLANN?')
ImportError: Cannot load dynamic library. Did you compile FLANN?


In [2]:
df = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')

In [3]:
df.head()


Out[3]:
id budget genres imdb_id overview production_companies release_date revenue title director_name actor_names Metacritic ROI success
0 12 94000000 Animation|Family 266543 Nemo, an adventurous young clownfish, is unexp... Pixar Animation Studios 2003-05-30 940335536 Finding Nemo Andrew Stanton ['Albert Brooks', 'Ellen DeGeneres', 'Alexande... 90 2.639 4
1 16 12800000 Drama|Crime|Music 168629 Selma, a Czech immigrant on the verge of blind... Fine Line Features 2000-05-17 40031879 Dancer in the Dark Lars von Trier ['Björk', 'Catherine Deneuve', 'David Morse', ... 61 2.127 3
2 22 140000000 Adventure|Fantasy|Action 325980 Jack Sparrow, a freewheeling 17th-century pira... Walt Disney Pictures 2003-09-07 655011224 Pirates of the Caribbean: The Curse of the Bla... Gore Verbinski ['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo... 63 2.639 4
3 24 30000000 Action|Crime 266697 An assassin is shot at the altar by her ruthle... Miramax Films 2003-10-10 180949000 Kill Bill: Vol. 1 Quentin Tarantino ['Uma Thurman', 'Lucy Liu', 'Vivica A. Fox', '... 69 2.639 4
4 25 72000000 Drama|War 418763 Jarhead is a film about a US Marine Anthony Sw... Universal Pictures 2005-04-11 96889998 Jarhead Sam Mendes ['Jamie Foxx', 'Scott MacDonald', 'Lucas Black... 58 0.346 2

1. Tokenization example


In [4]:
tokens = word_tokenize(df['overview'][34])

In [5]:
tokens


Out[5]:
['Twenty-eight',
 'days',
 'after',
 'a',
 'killer',
 'virus',
 'was',
 'accidentally',
 'unleashed',
 'from',
 'a',
 'British',
 'research',
 'facility',
 ',',
 'a',
 'small',
 'group',
 'of',
 'London',
 'survivors',
 'are',
 'caught',
 'in',
 'a',
 'desperate',
 'struggle',
 'to',
 'protect',
 'themselves',
 'from',
 'the',
 'infected',
 '.',
 'Carried',
 'by',
 'animals',
 'and',
 'humans',
 ',',
 'the',
 'virus',
 'turns',
 'those',
 'it',
 'infects',
 'into',
 'homicidal',
 'maniacs',
 '--',
 'and',
 'it',
 "'s",
 'absolutely',
 'impossible',
 'to',
 'contain',
 '.']

We can also convert all characters to lower case:


In [6]:
tokens[0].lower()


Out[6]:
'twenty-eight'

In [7]:
print('Initial number of tokens is {}'.format(len(tokens)))


Initial number of tokens is 58

In [8]:
clean_tokens = tokens[:] 
sr = stopwords.words('english') 

unwanted = "!.,--1234567890<>A?():"
sr.extend(unwanted)
sr.extend(['The', "'s"])

for token in tokens:
    if token in sr: 
        clean_tokens.remove(token)
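
Note that list.extend applied to a string appends each character separately, so the two-character token '--' never ends up in sr and survives the cleaning (it still appears in the cleaned list below). A minimal illustration:

demo = ['a']
demo.extend("--!")
print(demo)   # ['a', '-', '-', '!']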

In [9]:
print(sr)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", '!', '.', ',', '-', '-', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<', '>', 'A', '?', '(', ')', ':', 'The', "'s"]

In [10]:
clean_tokens


Out[10]:
['Twenty-eight',
 'days',
 'killer',
 'virus',
 'accidentally',
 'unleashed',
 'British',
 'research',
 'facility',
 'small',
 'group',
 'London',
 'survivors',
 'caught',
 'desperate',
 'struggle',
 'protect',
 'infected',
 'Carried',
 'animals',
 'humans',
 'virus',
 'turns',
 'infects',
 'homicidal',
 'maniacs',
 '--',
 'absolutely',
 'impossible',
 'contain']

In [11]:
print('Number of tokens after cleaning is {}'.format(len(clean_tokens)))


Number of tokens after cleaning is 30

In [12]:
freq = nltk.FreqDist(clean_tokens) 

for key,val in freq.items(): 
    print (str(key) + ':' + str(val))


Twenty-eight:1
days:1
killer:1
virus:2
accidentally:1
unleashed:1
British:1
research:1
facility:1
small:1
group:1
London:1
survivors:1
caught:1
desperate:1
struggle:1
protect:1
infected:1
Carried:1
animals:1
humans:1
turns:1
infects:1
homicidal:1
maniacs:1
--:1
absolutely:1
impossible:1
contain:1

In [13]:
freq.plot(20, cumulative=False)



In [14]:
freq.items()


Out[14]:
dict_items([('Twenty-eight', 1), ('days', 1), ('killer', 1), ('virus', 2), ('accidentally', 1), ('unleashed', 1), ('British', 1), ('research', 1), ('facility', 1), ('small', 1), ('group', 1), ('London', 1), ('survivors', 1), ('caught', 1), ('desperate', 1), ('struggle', 1), ('protect', 1), ('infected', 1), ('Carried', 1), ('animals', 1), ('humans', 1), ('turns', 1), ('infects', 1), ('homicidal', 1), ('maniacs', 1), ('--', 1), ('absolutely', 1), ('impossible', 1), ('contain', 1)])

In [15]:
freq.keys()


Out[15]:
dict_keys(['Twenty-eight', 'days', 'killer', 'virus', 'accidentally', 'unleashed', 'British', 'research', 'facility', 'small', 'group', 'London', 'survivors', 'caught', 'desperate', 'struggle', 'protect', 'infected', 'Carried', 'animals', 'humans', 'turns', 'infects', 'homicidal', 'maniacs', '--', 'absolutely', 'impossible', 'contain'])

In [16]:
val = list(freq.values())
val


Out[16]:
[1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [17]:
#for key,val in freq.items():
#    print('key is {} and val is {}'.format(key, val))

2. Tokenization on the whole dataset

Perform tokenization and stop-word removal on every overview in the dataset


In [18]:
overviewDic = {}

sr = stopwords.words('english') 
unwanted = "!.,--1234567890<>A?()\'':"  
sr.extend(unwanted)
sr.extend(['The', "'s", 'But', 'When', 'In', 'As', 'After', '-', '"', "''", '...', '--', '’', '–', '``', "n't", 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'])
sr.extend(['get', 'set', 'go', 'based', 'meets', 'around', 'named', 'comes', 'even', 'come', 'decides', 'however', 'goes', 'also', 'behind', 'turns', 'must', 'finds', 'put', 'sets', 'takes'])

for i in range(0, len(df)):
    # use a set to avoid repetitions in the token list
    tokens = list(set(word_tokenize(df['overview'][i].lower())))
    
    clean_tokens = tokens[:] 
    
    for token in tokens: 
        if token in sr: 
            clean_tokens.remove(token)

    #for token in tokens: 
    #    if token in stopwords.words('english'): 
    #        clean_tokens.remove(token)
    #    elif token in unwanted:
    #        clean_tokens.remove(token)
    
    overviewDic.setdefault(i, []) 
    overviewDic[i] = clean_tokens

Example on the first film (Finding Nemo)


In [19]:
overviewDic[0]


Out[19]:
['friendly',
 'young',
 'meeting',
 'fish',
 'office',
 'reef',
 'dory',
 'hypnotic',
 'bring',
 'surfer',
 'unexpectedly',
 'along',
 'forgetful',
 'home',
 'marlin',
 'worrisome',
 'jellyfish',
 'vegetarian',
 'seagulls',
 'dentist',
 'barrier',
 'sharks',
 'adventurous',
 'taken',
 'clownfish',
 'aquarium',
 'hungry',
 'turtles',
 'nemo',
 'great',
 'way',
 'dude',
 'father']

3. Data exploration

Exploration of the words appearing in the movie overviews

3.1 Most common words

Exploration of the most common words used in the overviews, independent of genre


In [20]:
ls = []

for i in range(0, len(overviewDic)):
    ls.extend(overviewDic[i])

In [21]:
len(ls)


Out[21]:
68247

In [22]:
CleanedWords = pd.DataFrame(ls)
CleanedWords.head()


Out[22]:
0
0 friendly
1 young
2 meeting
3 fish
4 office

In [23]:
CleanedWords.to_csv('Saved_Datasets/MostCommonWords.csv', index=False)

In [24]:
freq = nltk.FreqDist(ls)

In [25]:
freq.plot(30, cumulative=False)



In [26]:
freq.most_common(50)


Out[26]:
[('life', 370),
 ('new', 357),
 ('world', 350),
 ('young', 299),
 ('family', 259),
 ('man', 247),
 ('find', 241),
 ('story', 220),
 ('love', 207),
 ('years', 180),
 ('father', 179),
 ('home', 166),
 ('help', 163),
 ('back', 163),
 ('friends', 161),
 ('lives', 158),
 ('time', 154),
 ('woman', 154),
 ('together', 132),
 ('city', 132),
 ('son', 130),
 ('way', 128),
 ('become', 126),
 ('make', 125),
 ('true', 121),
 ('save', 120),
 ('wife', 120),
 ('group', 120),
 ('soon', 119),
 ('school', 119),
 ('war', 118),
 ('team', 116),
 ('take', 113),
 ('friend', 111),
 ('first', 110),
 ('becomes', 109),
 ('day', 109),
 ('girl', 108),
 ('town', 107),
 ('mysterious', 106),
 ('york', 105),
 ('film', 103),
 ('best', 103),
 ('past', 102),
 ('mother', 102),
 ('discovers', 100),
 ('death', 98),
 ('daughter', 97),
 ('american', 97),
 ('old', 95)]

In [27]:
mostCommon = freq.most_common()
mostCommon[0][0]


Out[27]:
'life'

3.2 Words commonly associated with success (or not)


In [28]:
NB_WORDS = 100
words = []

for i in range(0, NB_WORDS):
    words.append(mostCommon[i][0])

List of the 100 most common words across all overviews (a success rate is computed below for every word)


In [29]:
print(words)


['life', 'new', 'world', 'young', 'family', 'man', 'find', 'story', 'love', 'years', 'father', 'home', 'help', 'back', 'friends', 'lives', 'time', 'woman', 'together', 'city', 'son', 'way', 'become', 'make', 'true', 'save', 'wife', 'group', 'soon', 'school', 'war', 'team', 'take', 'friend', 'first', 'becomes', 'day', 'girl', 'town', 'mysterious', 'york', 'film', 'best', 'past', 'mother', 'discovers', 'death', 'daughter', 'american', 'old', 'former', 'gets', 'people', 'agent', 'mission', 'discover', 'secret', 'forced', 'work', 'begins', 'evil', 'high', 'forces', 'police', 'men', 'job', 'order', 'relationship', 'living', 'couple', 'house', 'small', 'last', 'night', 'away', 'boy', 'stop', 'left', 'turn', 'brother', 'fight', 'real', 'events', 'earth', 'good', 'journey', 'could', 'falls', 'battle', 'tries', 'john', 'returns', 'dangerous', 'college', 'end', 'year', 'things', 'plan', 'face', 'across']

In [30]:
wordSuccess = np.zeros(len(mostCommon), dtype=float) #words

for i in range(0, len(df)): 
    SuccessTokens = list(set(word_tokenize(df['overview'][i].lower())))
    
    # accumulate, for each word, the share of its occurrences that fall in movies with success == 1
    for j in range(0, len(mostCommon)):
        if mostCommon[j][0] in SuccessTokens:
            if df['success'][i] == 1:
                wordSuccess[j] += (1/mostCommon[j][1])*100
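
Concretely, since each overview was tokenized into a set in section 2, the count mostCommon[j][1] equals the number of overviews containing that word, so the quantity accumulated above is (up to the slightly different tokenization used here, and assuming the label success == 1 marks a successful movie, as the axis labels below suggest):

$$\text{wordSuccess}(w) = 100 \cdot \frac{\#\{\, i : w \in \text{overview}_i \ \wedge\ \text{success}_i = 1 \,\}}{\#\{\, i : w \in \text{overview}_i \,\}}$$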

In [31]:
print(words[1])
print(wordSuccess[1])


new
22.1288515406

In [32]:
plt.bar(words[:15], wordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');
plt.savefig('images/TopSuccessWords.png', dpi =300, bbox_inches='tight')


3.3 Determine top 100 words with the highest success rate


In [33]:
NEIGHBORS = 100

#sort the words by decreasing success rate
sort_order = np.argsort(-wordSuccess, axis = 0)

topWord = []
topWordSuccess = np.zeros(NEIGHBORS)

for i in range (0, NEIGHBORS):  
    topWord.append(mostCommon[sort_order[i]][0])
    topWordSuccess[i] = wordSuccess[sort_order[i]]

In [34]:
print(topWord)
print(topWordSuccess)


['reagan', 'shootout', 'metallica', 'alexandria', 'jerrica', 'i.', 'davus', 'hoax', '1921', 'autobahns', 'backpacker', 'christianity', 'hypatia', 'philosopher', 'hansen', 'recording', 'urgent', 'imelda', 'roadie', 'cathcart', 'frenzy', 'hooligan', 'wish-granting', 'setup', 'karl', 'reveries', 'simpler', 'promoters', 'malby', 'ryden', 'milwaukee', '937', 'dahmer', 'jesuit', 'wisconsin', 'exposer', 'whales', 'anti-semitic', 'jem', 'audrey', 'portray', 'mae', 'oily', 'initiates', 'folksinger', 'suribachi', 'rosenthal', 'coercive', 'lovelace', 'gagnon', 'remembering', 'itches', 'emir', 'amar', 'ptsd', 'clings', 'atone', 'cyber', 'hyper-linked', 'pitcher', 'music-driven', 'atm', 'tenley', 'biel', 'kubic', 'pyke', 'staunton', 'sightings', 'benton', 'stroke', 'reinvented', 'congo', 'personas', 'natasha', 'gilded', 'inauguration', 'barrymore', 'parachute', 'cirque', 'bearded', 'darren', 'brutality', 'sag', 'mum', 'orphanages', 'communism', 'dynamite', 'adulterated', 'kubiš', 'malt', 'convert', 'gabčík', 'jozef', 'propaganda', 'wolf-man', 'ge', 'watched', 'transmission', 'barricades', 'inseparable']
[ 100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.  100.
  100.  100.  100.  100.]

In [35]:
plt.bar(topWord[:15], topWordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');


These words all have a 100% success rate, but they appear very rarely, as the graph below shows.


In [36]:
nbCounts = []

for i in range(0, len(topWord)):
    nbCounts.append(mostCommon[sort_order[i]][1])

In [37]:
plt.bar(topWord[:15], nbCounts[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Counts');


3.4 Determine top 100 words with the highest success rate and that appear at least 100 times in the dataset


In [38]:
wordSuccess = np.zeros(len(mostCommon), dtype=float) 

for i in range(0, len(df)): 
    SuccessTokens = list(set(word_tokenize(df['overview'][i].lower())))
    
    # only keep words that appear in at least 100 overviews
    for j in range(0, len(mostCommon)):
        if mostCommon[j][0] in SuccessTokens:
            if mostCommon[j][1] >= 100:
                if df['success'][i] == 1:
                    wordSuccess[j] += (1/mostCommon[j][1])*100

In [39]:
NEIGHBORS = 100

#sort the words by decreasing success rate
sort_order = np.argsort(-wordSuccess, axis = 0)

topWord = []
topWordSuccess = np.zeros(NEIGHBORS)

for i in range (0, NEIGHBORS):  
    topWord.append(mostCommon[sort_order[i]][0])
    topWordSuccess[i] = wordSuccess[sort_order[i]]

In [40]:
plt.bar(topWord[:15], topWordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');



In [41]:
print(topWord)


['woman', 'film', 'city', 'town', 'mother', 'man', 'past', 'group', 'wife', 'war', 'young', 'york', 'story', 'becomes', 'take', 'love', 'father', 'team', 'become', 'world', 'life', 'lives', 'discovers', 'day', 'first', 'back', 'son', 'mysterious', 'friend', 'save', 'new', 'make', 'soon', 'best', 'girl', 'time', 'together', 'family', 'find', 'friends', 'years', 'help', 'true', 'way', 'home', 'school', 'commoners', 'mage', 'powerless', 'depose', 'role-playing', 'prosperity', 'birch', 'savina', 'mages', 'thora', 'empress', 'good-vs.-evil', 'izmer', 'bynes', 'amacor', 'gobi', 'boxcar', 'mongolia', 'unproductive', 'copilot', 'kaylee', 'big-time', 'stretching', 'giamatti', 'malevolently', 'profion', 'c-119', 'shell', 'pasta', 'neptune', 'connolly', 'behaved', 'delicious', 'plotting', 'misadventure', 'imposter', 'still-green', 'womanizing', 'digger', 'revenge-seeking', 'skinheads', '1983.', 'astro-training', 'canon', 'lots', 'ham', 'luna', 'stuff-style', 'evp', 'electronic', 'neutral', 'courtney', 'club-hopping', 'wimbledon']

3.5 Determine the least common words

Determine the 30 least common words


In [42]:
last = nltk.FreqDist(dict(freq.most_common()[-30:]))
last.plot()



In [43]:
last


Out[43]:
FreqDist({'allegiances': 1,
          'bolland': 1,
          'busker': 1,
          'city-wide': 1,
          'compartments': 1,
          'constraints': 1,
          'denier': 1,
          'dumpty': 1,
          'entries': 1,
          'evidenced': 1,
          'fletcher': 1,
          'g.': 1,
          'gems': 1,
          'gender': 1,
          'glenn': 1,
          'holocaust': 1,
          'humpty': 1,
          'libel': 1,
          'lipstadt': 1,
          'mirroring': 1,
          'observant': 1,
          'pakhtun': 1,
          'pakistan': 1,
          'pittsburgh': 1,
          'puss': 1,
          'scott-': 1,
          'softpaws': 1,
          'submerged': 1,
          'sues': 1,
          'visionary': 1})

3.6 Determine words with the lowest success rate

Note that the previous computation only accumulates a rate for words appearing at least 100 times, so every rarer word keeps a 0% rate and dominates the list below.


In [44]:
NEIGHBORS = 100

#sort the words by increasing success rate
sort_order = np.argsort(wordSuccess, axis = 0)

lowWord = []
lowWordSuccess = np.zeros(NEIGHBORS)

for i in range (0, NEIGHBORS):  
    lowWord.append(mostCommon[sort_order[i]][0])
    lowWordSuccess[i] = wordSuccess[sort_order[i]]

In [45]:
print(lowWord)


['advancing', 'luna', 'stuff-style', 'evp', 'electronic', 'neutral', 'courtney', 'club-hopping', 'wimbledon', 'rank', '119.', 'hume', 'neptune', 'ham', 'krab', 'profion', 'commoners', 'mage', 'powerless', 'depose', 'role-playing', 'prosperity', 'birch', 'savina', 'mages', 'thora', 'empress', 'shell', 'lots', 'canon', 'astro-training', 'pacha', 'llama', 'kuzco', 'yzma', 'pluto', 'lunar', '2087', 'mogan', 'terrence', 'injures', 'addicted', 'mcdonagh', 'hurricane', 'alarming', 'looches', 'connolly', 'behaved', 'delicious', 'plotting', 'misadventure', 'imposter', 'pasta', 'womanizing', 'digger', 'revenge-seeking', 'skinheads', '1983.', 'good-vs.-evil', 'izmer', 'c-119', 'amacor', 'established', 'advisors', 'dramatisation', 'sparked', 'missile', 'f', 'missle', 'prejudicial', 'conspires', 'brashear', 'diving', 'triumph', 'policies', 'akasha', 'arisen', 'lestat', 'dammed', 'lioncourt', 'slumber', 'throwing', 'leezak', 'mcnerney', 'pre-marital', 'pontius', 'compilation', 'bam', 'essentially', 'bases', 'barricaded', 'cuban', 'shark-infested', 'gobi', 'boxcar', 'mongolia', 'unproductive', 'copilot', 'kaylee', 'big-time']

In [46]:
plt.bar(lowWord[:15], lowWordSuccess[:15], align='center');
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right');
plt.xlabel('Words');
plt.ylabel('Success rate [%]');


4. Creation of the similarity matrix

In this section, we assign each pair of movies a similarity score based on how similar their plots (overviews) appear to be.

4.1 Creation of the similarity matrix based on number of words in common

Since no satisfactory normalization was found for this approach, it is not used in the end, but it gives an indication of how many words appear in both overviews.
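
For illustration only (not part of this notebook's pipeline), raw overlap counts like these could be bounded to [0, 1] with the Jaccard index of the two cleaned token sets; a minimal sketch, reusing overviewDic from section 2:

def jaccard(i, j):
    a, b = set(overviewDic[i]), set(overviewDic[j])
    # intersection over union, with empty overviews mapped to 0
    return len(a & b) / len(a | b) if (a | b) else 0.0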


In [47]:
#use np.zeros so the (initially unused) lower triangle does not contain garbage values
TextW = np.zeros(shape=(len(df), len(df)), dtype=int)

for i in range(0, len(df)):    
    for j in range(i, len(df)):
        
        counts = 0
        
        for k in range(0, len(overviewDic[j])):
            if overviewDic[j][k] in overviewDic[i]:
                counts = counts + 1
    
        TextW[i][j] = counts
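
Since the per-film token lists were built from sets and contain no duplicates, the double loop above just counts set intersections; an equivalent (and considerably faster) formulation would be:

overview_sets = [set(overviewDic[i]) for i in range(len(df))]

TextW = np.zeros((len(df), len(df)), dtype=int)
for i in range(0, len(df)):
    for j in range(i, len(df)):
        # number of cleaned tokens shared by overviews i and j
        TextW[i][j] = len(overview_sets[i] & overview_sets[j])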

In [48]:
plt.spy(TextW)


Out[48]:
<matplotlib.image.AxesImage at 0x11041b00>

In [49]:
#ensure the matrix is symmetric
bigger = TextW.transpose() > TextW
TextW = TextW - TextW*bigger + TextW.transpose()*bigger

#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(TextW, 0)

In [50]:
plt.spy(TextW)


Out[50]:
<matplotlib.image.AxesImage at 0x1153ed30>

In [51]:
print('The mean value: {}'.format(TextW.mean()))
print('The max value: {}'.format(TextW.max()))
print('The min value: {}'.format(TextW.min()))


The mean value: 0.35615805833230585
The max value: 41
The min value: 0

4.2 Creation of similarity matrix based on number of appearances of one of the 100 most frequent words

Most frequent $\neq$ highest success rate

$\textbf{Reminder:}$ These are the most common words from the dataset


In [52]:
print(words)


['life', 'new', 'world', 'young', 'family', 'man', 'find', 'story', 'love', 'years', 'father', 'home', 'help', 'back', 'friends', 'lives', 'time', 'woman', 'together', 'city', 'son', 'way', 'become', 'make', 'true', 'save', 'wife', 'group', 'soon', 'school', 'war', 'team', 'take', 'friend', 'first', 'becomes', 'day', 'girl', 'town', 'mysterious', 'york', 'film', 'best', 'past', 'mother', 'discovers', 'death', 'daughter', 'american', 'old', 'former', 'gets', 'people', 'agent', 'mission', 'discover', 'secret', 'forced', 'work', 'begins', 'evil', 'high', 'forces', 'police', 'men', 'job', 'order', 'relationship', 'living', 'couple', 'house', 'small', 'last', 'night', 'away', 'boy', 'stop', 'left', 'turn', 'brother', 'fight', 'real', 'events', 'earth', 'good', 'journey', 'could', 'falls', 'battle', 'tries', 'john', 'returns', 'dangerous', 'college', 'end', 'year', 'things', 'plan', 'face', 'across']

In [53]:
commonWordDic = {}

for i in range(0, len(df)):
    commonWordDic.setdefault(i, [])
    
    for k in range(0, len(overviewDic[i])):
            if overviewDic[i][k] in words:
                commonWordDic[i].append(overviewDic[i][k])

4.2.1 Normalization by greatest number of words

$$W_{ij} = \frac{\text{number of shared common words between } i \text{ and } j}{\max\left(\text{number of common words in } i,\ \text{number of common words in } j\right)} \in [0, 1]$$

In [54]:
TextW = np.zeros(shape=(len(df), len(df)), dtype=float)
WmaxNorm = np.zeros(shape=(len(df), len(df)), dtype=float)
maxLen = 0

for i in range(0, len(df)):
    for j in range(i, len(df)):
        maxLen = max([len(commonWordDic[i]), len(commonWordDic[j])])
        
        if maxLen != 0:
            #compute the similarity
            for k in range(0, len(commonWordDic[i])):
                if commonWordDic[i][k] in commonWordDic[j]:
                    TextW[i][j] += 1
            
            #normalization by division of the maximum length
            WmaxNorm[i][j] = TextW[i][j]/maxLen
        else:
            #assign a similarity of 1 since neither film has one of the most freq plot words
            TextW[i][j] = 1
            WmaxNorm[i][j] = 1

In [55]:
#ensure the matrix is symmetric
bigger = WmaxNorm.transpose() > WmaxNorm
WmaxNorm = WmaxNorm - WmaxNorm*bigger + WmaxNorm.transpose()*bigger

#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(WmaxNorm, 0)

Do this also for TextW


In [56]:
#ensure the matrix is symmetric
bigger = TextW.transpose() > TextW
TextW = TextW - TextW*bigger + TextW.transpose()*bigger

#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(TextW, 0)

In [57]:
plt.spy(WmaxNorm)


Out[57]:
<matplotlib.image.AxesImage at 0x1160b400>

In [58]:
plt.hist(WmaxNorm.reshape(-1), bins=50);



In [59]:
print('The mean value is: {}'.format(WmaxNorm.mean()))
print('The max value is: {}'.format(WmaxNorm.max()))
print('The min value is: {}'.format(WmaxNorm.min()))


The mean value is: 0.03878010310628478
The max value is: 1.0
The min value is: 0.0

Plot the degree distribution


In [60]:
degrees = np.zeros(len(WmaxNorm)) 

#reminder: the degrees of a node for a weighted graph are the sum of its weights

for i in range(0, len(WmaxNorm)):
    degrees[i] = sum(WmaxNorm[i])

plt.hist(degrees, bins=50);
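
The loop above is equivalent to a single row-wise sum:

degrees = WmaxNorm.sum(axis=1)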



In [61]:
print('The mean value is: {}'.format(degrees.mean()))
print('The max value is: {}'.format(degrees.max()))
print('The min value is: {}'.format(degrees.min()))


The mean value is: 101.64265024157257
The max value is: 243.50931152181073
The min value is: 11.552763902763905

4.2.2 Normalization inspired by the 75th percentile

In [122]:
np.percentile(TextW, 75)


Out[122]:
0.0

In [99]:
plt.hist(TextW.reshape(-1), bins=50);



In [102]:
print('The mean value is: {}'.format(TextW.mean()))
print('The max value is: {}'.format(TextW.max()))
print('The min value is: {}'.format(TextW.min()))


The mean value is: 0.24397519462807446
The max value is: 8.0
The min value is: 0.0

In [104]:
# halfTextW: presumably the upper-triangular (pre-symmetrization) version of TextW from an earlier cell state
sum(halfTextW[TextW == 8])


Out[104]:
8.0

Normalization:

$$W_{ij} = \begin{cases} 0, & \text{number of shared common words between } i \text{ and } j = 0 \\ 1, & \text{number of shared common words between } i \text{ and } j \geq 1 \end{cases}$$

In [142]:
WNormPerc = np.zeros(shape=(len(df), len(df)), dtype=float)

for i in range(0, len(df)):
    for j in range(i, len(df)):
        if TextW[i][j] >= 1:
            WNormPerc[i][j] = 1
        #else:
        #    WNormPerc[i][j] = WNormPerc[i][j]/2

In [143]:
#ensure the matrix is symmetric
bigger = WNormPerc.transpose() > WNormPerc
WNormPerc = WNormPerc - WNormPerc*bigger + WNormPerc.transpose()*bigger

#fill the diagonal values to zero, i.e. no self-connections
np.fill_diagonal(WNormPerc, 0)

In [144]:
plt.spy(WNormPerc)


Out[144]:
<matplotlib.image.AxesImage at 0x10c00fd0>

In [145]:
plt.hist(WNormPerc.reshape(-1), bins=50);


4.3 Save the dataset


In [70]:
NormW = pd.DataFrame(WmaxNorm)
NormW.head()


Out[70]:
0 1 2 3 4 5 6 7 8 9 ... 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620
0 0.0 0.000000 0.000000 0.0 0.000000 0.166667 0.0 0.0 0.25 0.25 ... 0.200000 0.000000 0.0 0.25 0.0 0.0 0.0 0.000000 0.000000 0.000000
1 0.0 0.000000 0.166667 0.0 0.166667 0.166667 0.0 0.0 0.00 0.00 ... 0.200000 0.000000 0.0 0.00 0.0 0.2 0.0 0.200000 0.000000 0.000000
2 0.0 0.166667 0.000000 0.0 0.000000 0.166667 0.0 0.0 0.00 0.00 ... 0.166667 0.000000 0.0 0.00 0.0 0.0 0.0 0.333333 0.166667 0.166667
3 0.0 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.00 0.00 ... 0.000000 0.000000 0.0 0.00 0.0 0.0 0.0 0.000000 0.000000 0.000000
4 0.0 0.166667 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.00 0.00 ... 0.166667 0.166667 0.0 0.00 0.0 0.0 0.0 0.000000 0.000000 0.000000

5 rows × 2621 columns


In [71]:
NormW.to_csv('Saved_Datasets/NormalizedTextW.csv', index=False)

5. Graph Laplacian and Embedding for maximum normalization

5.1 Compute the graph Laplacian

With pygsp


In [127]:
G = graphs.Graph(WmaxNorm)
G.compute_laplacian('normalized')

Without pygsp (using numpy/scipy directly)


In [128]:
#reminder: L = D - W for weighted graphs
laplacian = np.diag(degrees) - WmaxNorm

#computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(WmaxNorm, normed = True)

plt.spy(laplacian_norm);
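
As a sanity check, the two normalized Laplacians should coincide; a small sketch, assuming pygsp exposes the Laplacian computed above as the sparse matrix G.L:

# compare the pygsp normalized Laplacian with the scipy one (dense comparison)
print(np.allclose(G.L.toarray(), laplacian_norm))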



In [129]:
laplacian_norm = sparse.csr_matrix(laplacian_norm)

5.2 Compute the Fourier basis

With pygsp


In [130]:
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);



In [131]:
print('The value of the 1st eigenvalue is: {}'.format(G.e[1]))


The value of the 1st eigenvalue is: 1.3791094156358285e-15
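
An eigenvalue of G.e[1] this close to zero means the normalized Laplacian numerically has a zero eigenvalue of multiplicity at least two, which suggests the graph has more than one connected component. A quick check, as a sketch with scipy:

from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

n_components, labels = connected_components(csr_matrix(WmaxNorm), directed=False)
print('Number of connected components: {}'.format(n_components))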

Without pygsp


In [132]:
eigenvalues, eigenvectors =  sparse.linalg.eigsh(laplacian_norm, k = 10, which = 'SM')

In [133]:
plt.plot(eigenvalues, '.-', markersize=15);
plt.xlabel('')
plt.ylabel('Eigenvalues')
plt.show()


5.3 Graph embedding


In [134]:
#encode the success labels as integers (the variable is named `genres` but holds the success classes)
genres = preprocessing.LabelEncoder().fit_transform(df['success'])

x = eigenvectors[:, 1] 
y = eigenvectors[:, 2] 
plt.scatter(x, y, c=genres, cmap='RdBu', alpha=0.5);



In [135]:
#Note: eigenvalues and their respective eigenvectors are already sorted from smallest to biggest

#plot on the eigenvectors 2 and 3 (set_coordinates takes Nx2 or Nx3 array size)
G.set_coordinates(G.U[:, 1:3])
G.plot()



In [136]:
G.plot_signal(genres, vertex_size=20)


6. Graph Laplacian and Embedding for 75-percentile normalization

6.1. Sparsify (Not needed in the end)

NEIGHBORS = 300

#sort the order of the weights
sort_order = np.argsort(weightsNorm, axis = 1)

#declaration of a sorted weight matrix
sorted_weights = np.zeros((len(weightsNorm), len(weightsNorm)))

for i in range(0, len(weightsNorm)):
    for j in range(0, len(weightsNorm)):
        if (j >= len(weightsNorm) - NEIGHBORS):
            #copy the k strongest edges for each node
            sorted_weights[i, sort_order[i,j]] = weightsNorm[i, sort_order[i,j]]
        else:
            #set the other edges to zero
            sorted_weights[i, sort_order[i,j]] = 0

#ensure the matrix is symmetric
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger


In [82]:
#plt.spy(sorted_weights)

In [83]:
#plt.hist(sorted_weights.reshape(-1), bins=50);

6.2. Save the dataset


In [84]:
NormW = pd.DataFrame(WNormPerc)
NormW.head()


Out[84]:
0 1 2 3 4 5 6 7 8 9 ... 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620
0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 2621 columns


In [85]:
NormW.to_csv('Saved_Datasets/TextWSparsePerc.csv', index=False)

6.3. Laplacian and graph embedding

With pygsp


In [146]:
G = graphs.Graph(WNormPerc)
G.compute_laplacian('normalized')

Without pygsp


In [147]:
#reminder: L = D - W for weighted graphs
#recompute the degrees for this graph (the `degrees` array above was computed from WmaxNorm)
laplacian = np.diag(WNormPerc.sum(axis=1)) - WNormPerc

#computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(WNormPerc, normed = True)

plt.spy(laplacian_norm);



In [148]:
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);



In [149]:
G.set_coordinates(G.U[:, 1:3])
G.plot()



In [150]:
G.plot_signal(genres, vertex_size=20)



In [ ]: