notebook.community

Edit and run



In [2]:

    
%matplotlib inline

import configparser
import os

import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import WordNetLemmatizer, PorterStemmer 
from collections import OrderedDict
from pygsp import graphs, filters, plotting
from IPython.display import Image

# Default figure size for every plot in this notebook (wide and short).
plt.rcParams.update({'figure.figsize': (17, 5)})
# Have PyGSP draw its plots through matplotlib.
plotting.BACKEND = 'matplotlib'



In [3]:

    
# Load the movie table; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='Saved_Datasets/NewFeaturesDataset.csv')



In [4]:

    
# Preview the first three rows of the loaded table.
dataset.head(n=3)









    Out[4]:







  
    
      
      id
      budget
      genres
      imdb_id
      overview
      production_companies
      release_date
      revenue
      title
      director_name
      actor_names
      actors_ids
      actors_tenures
      total_tenure
      average_tenure
      Metacritic
      ROI
      success
    
  
  
    
      0
      12
      94000000
      Animation|Family
      266543
      Nemo, an adventurous young clownfish, is unexp...
      Pixar Animation Studios
      2003-05-30
      940335536
      Finding Nemo
      Andrew Stanton
      ['Albert Brooks', 'Ellen DeGeneres', 'Alexande...
      [14, 5293, 12, 13, 18]
      [18, 24, 2, 28, 14]
      86
      17.2
      90
      2.639
      4
    
    
      1
      16
      12800000
      Drama|Crime|Music
      168629
      Selma, a Czech immigrant on the verge of blind...
      Fine Line Features
      2000-05-17
      40031879
      Dancer in the Dark
      Lars von Trier
      ['Björk', 'Catherine Deneuve', 'David Morse', ...
      [6748, 47, 52, 50, 53]
      [49, 19, 21, 44, 15]
      148
      29.6
      61
      2.127
      3
    
    
      2
      22
      140000000
      Adventure|Fantasy|Action
      325980
      Jack Sparrow, a freewheeling 17th-century pira...
      Walt Disney Pictures
      2003-09-07
      655011224
      Pirates of the Caribbean: The Curse of the Bla...
      Gore Verbinski
      ['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo...
      [1709, 116, 114, 118, 85]
      [7, 9, 7, 22, 20]
      65
      13.0
      63
      2.639
      4



In [5]:

    
# Profitability = revenue minus budget, stored as a new column on the frame.
dataset['Profitability'] = dataset['revenue'].sub(dataset['budget'])



In [6]:

    
# Preview the first three rows again, now including the Profitability column.
dataset.head(n=3)









    Out[6]:







  
    
      
      id
      budget
      genres
      imdb_id
      overview
      production_companies
      release_date
      revenue
      title
      director_name
      actor_names
      actors_ids
      actors_tenures
      total_tenure
      average_tenure
      Metacritic
      ROI
      success
      Profitability
    
  
  
    
      0
      12
      94000000
      Animation|Family
      266543
      Nemo, an adventurous young clownfish, is unexp...
      Pixar Animation Studios
      2003-05-30
      940335536
      Finding Nemo
      Andrew Stanton
      ['Albert Brooks', 'Ellen DeGeneres', 'Alexande...
      [14, 5293, 12, 13, 18]
      [18, 24, 2, 28, 14]
      86
      17.2
      90
      2.639
      4
      846335536
    
    
      1
      16
      12800000
      Drama|Crime|Music
      168629
      Selma, a Czech immigrant on the verge of blind...
      Fine Line Features
      2000-05-17
      40031879
      Dancer in the Dark
      Lars von Trier
      ['Björk', 'Catherine Deneuve', 'David Morse', ...
      [6748, 47, 52, 50, 53]
      [49, 19, 21, 44, 15]
      148
      29.6
      61
      2.127
      3
      27231879
    
    
      2
      22
      140000000
      Adventure|Fantasy|Action
      325980
      Jack Sparrow, a freewheeling 17th-century pira...
      Walt Disney Pictures
      2003-09-07
      655011224
      Pirates of the Caribbean: The Curse of the Bla...
      Gore Verbinski
      ['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo...
      [1709, 116, 114, 118, 85]
      [7, 9, 7, 22, 20]
      65
      13.0
      63
      2.639
      4
      515011224

Data exploration



In [23]:

    
# Histogram of movie budgets (100 bins), saved to images/movies_budget.png.
plt.hist(dataset['budget'], bins=100);
# The dollar unit belongs to the budget (x) axis; the y axis is a plain count.
plt.xlabel('Budget of movies [$]')
plt.ylabel('Number of movies')
plt.savefig('images/movies_budget.png', dpi=300, bbox_inches='tight')



In [18]:

    
# One thin bar per movie: x position is its ROI value, bar height is its budget.
plt.bar(x=dataset['ROI'], height=dataset['budget'], width=0.03, align='center');
plt.xlabel('ROI');
plt.ylabel('Budget of movies [$]');
# Keep the tick labels unrotated and right-aligned.
plt.setp(plt.gca().get_xticklabels(), rotation=0, horizontalalignment='right');
plt.savefig('images/budget_ROI.png', dpi=300, bbox_inches='tight')



In [20]:

    
# Stem plot with profitability on the x axis and budget on the y axis,
# saved to images/budget_Profitability.png.
plt.stem(dataset['Profitability'], dataset['budget']);
plt.ylabel('Budget of movies [$]')
plt.xlabel('Profitability of movies[$]')
plt.savefig('images/budget_Profitability.png', dpi=300, bbox_inches='tight')



In [15]:

    
# Stem plot with budget on the x axis and profitability on the y axis
# (the axes are swapped relative to the previous figure).
plt.stem(dataset['budget'], dataset['Profitability']);
# Label both axes so the figure can be read on its own.
plt.xlabel('Budget of movies [$]');
plt.ylabel('Profitability of movies [$]');



In [16]:

    
# Smallest budget in the dataset. Series.min() is vectorised and skips NaN,
# whereas the builtin min() iterates the Series element by element in Python.
dataset['budget'].min()









    Out[16]:





1000



In [12]:

    
# NOTE(review): this cell originally printed min(test) and min(prof), but
# neither `test` nor `prof` is defined anywhere in the notebook, so it raised
# NameError on a fresh kernel. They are presumably the budget and
# profitability columns -- confirm with the author.
print(dataset['budget'].min())
print(dataset['Profitability'].min())



In [ ]:

    
# NOTE(review): `tt` and `vv` are not defined in any cell of this notebook, so
# this cell raises NameError under Restart & Run All. Define them or delete the cell.
plt.bar(tt,vv)



In [ ]:

    
# Display the whole budget column.
dataset['budget'].iloc[:]



In [ ]:

Pairwise budget difference between movies



In [ ]:

    
# Pairwise absolute budget difference between movies, computed with one
# broadcasted subtraction instead of a Python double loop over all (i, j) pairs.
# Rows are taken in positional order, which matches label lookup for the
# default 0..n-1 index that read_csv produces.
budgets = dataset['budget'].to_numpy()
# Only the upper triangle (j >= i) is filled, exactly as the loop this
# replaces did; entries below the diagonal stay 0.
W_diff = np.triu(np.abs(budgets[:, np.newaxis] - budgets[np.newaxis, :])).astype(int)



In [ ]:

    
# Show which entries of W_diff are non-zero. At this point only entries with
# j >= i have been filled in, so anything below the diagonal is empty.
plt.spy(W_diff)



In [ ]:

    
# Symmetrise: every entry takes the larger of W[i, j] and W[j, i], which
# mirrors the filled upper triangle onto the lower one.
bigger = W_diff.T > W_diff
W_diff = np.maximum(W_diff, W_diff.T)
# A movie has no budget difference with itself.
np.fill_diagonal(W_diff, 0)



In [ ]:

    
# Distribution of all pairwise budget differences (matrix flattened to 1-D).
plt.hist(W_diff.ravel(), bins=50);

Weights Normalization



In [ ]:

    
# 75th percentile over every entry of W_diff (both triangles and the zero
# diagonal are included in the sample).
val_75 = np.percentile(W_diff, q=75)
print(val_75)



In [ ]:

    
# Turn budget differences into similarity weights in [0, 1]:
#   difference == 0       -> 1
#   difference <= val_75  -> 1 - difference / val_75
#   difference  > val_75  -> 0
# Vectorised replacement for the Python double loop. Only the upper triangle
# (j >= i) is filled, as in that loop.
# NOTE(review): the next cell reassigns W_diff_norm using max-normalisation, so
# under Run All this result is overwritten -- keep only the intended variant.
if val_75 > 0:
    # Clipping at 0 reproduces the "greater than val_75 -> 0" branch.
    similarity = np.clip(1.0 - W_diff / val_75, 0.0, 1.0)
else:
    # Degenerate threshold: only identical budgets count as similar, and the
    # division by zero is avoided.
    similarity = (W_diff == 0).astype(float)
W_diff_norm = np.triu(similarity)



In [ ]:

    
# Turn budget differences into similarity weights in [0, 1] by scaling with
# the largest difference: weight = 1 - difference / max_difference.
# Vectorised replacement for the Python double loop. Only the upper triangle
# (j >= i) is filled, as in that loop. This rebuilds W_diff_norm from scratch.
max_W_diff = W_diff.max()
if max_W_diff > 0:
    similarity = 1.0 - W_diff / max_W_diff
else:
    # All budgets are identical: every difference is 0, so every weight is 1.
    similarity = np.ones(W_diff.shape, dtype=float)
W_diff_norm = np.triu(similarity)



In [ ]:

    
# Symmetrise: every entry takes the larger of W[i, j] and W[j, i], which
# mirrors the filled upper triangle onto the lower one.
bigger = W_diff_norm.T > W_diff_norm
W_diff_norm = np.maximum(W_diff_norm, W_diff_norm.T)
# Zero the diagonal so no node has an edge to itself.
np.fill_diagonal(W_diff_norm, 0)



In [ ]:

    
# Show which entries of the normalised weight matrix are non-zero.
plt.spy(W_diff_norm)



In [ ]:

    
# Wrap the normalised weight matrix in a DataFrame and preview its first rows.
DiffNormW = pd.DataFrame(data=W_diff_norm)
DiffNormW.head(5)



In [ ]:

    
# Distribution of all normalised edge weights (matrix flattened to 1-D).
plt.hist(W_diff_norm.ravel(), bins=50);



In [ ]:

    
# Compute degree distribution.
# The weighted degree of a node is the sum of its row in the weight matrix;
# a single vectorised row-sum replaces the per-row builtin sum() loop.
degrees = W_diff_norm.sum(axis=1)

plt.hist(degrees, bins=50);



In [ ]:

    
# Save the normalised weight matrix as CSV, without the row-index column.
DiffNormW.to_csv(path_or_buf='Saved_Datasets/DiffNormBudgW.csv', index=False)



In [ ]:

    
# Build a PyGSP graph from the dense normalised weight matrix.
G = graphs.Graph(W_diff_norm)
# Switch to the normalised Laplacian before the spectral decomposition,
# because the Fourier basis is derived from the current Laplacian.
G.compute_laplacian('normalized')
G.compute_fourier_basis(recompute=True)
# Plot the first ten entries of G.e (presumably the smallest eigenvalues of
# the Laplacian -- confirm against the PyGSP docs).
plt.plot(G.e[0:10]);



In [ ]:

    
# Encode the `success` column as integer class labels.
success_encoder = preprocessing.LabelEncoder()
labels = success_encoder.fit_transform(dataset['success'])
# Use columns 1 and 2 of G.U as the 2-D coordinates of each node.
G.set_coordinates(G.U[:, 1:3])



In [ ]:

    
# Plot the encoded `success` labels as a signal on the graph, using the
# coordinates set in the previous cell.
G.plot_signal(labels, vertex_size=20)

Weight matrix sparsification



In [ ]:

    
# Number of strongest edges to keep per node.
NEIGHBORS = 300

n_nodes = len(W_diff_norm)

# For each row, the column indices ordered from weakest to strongest weight.
sort_order = np.argsort(W_diff_norm, axis=1)

# The last NEIGHBORS columns of each sorted row are that node's strongest
# edges. The start index is clamped so that NEIGHBORS >= n_nodes keeps every
# edge and NEIGHBORS == 0 keeps none.
strongest = sort_order[:, max(n_nodes - NEIGHBORS, 0):]

# Copy only the selected edges into an all-zero matrix. Fancy indexing
# replaces the Python double loop over every (i, j) pair.
sorted_weights = np.zeros((n_nodes, n_nodes))
row_idx = np.arange(n_nodes)[:, np.newaxis]
sorted_weights[row_idx, strongest] = W_diff_norm[row_idx, strongest]

# Ensure the matrix is symmetric: an edge kept by either endpoint is kept for
# both, so a node can end up with more than NEIGHBORS edges.
bigger = sorted_weights.T > sorted_weights
sorted_weights = np.maximum(sorted_weights, sorted_weights.T)



In [ ]:

    
# Show which entries of the sparsified weight matrix are non-zero.
plt.spy(sorted_weights)



In [ ]:

    
# Wrap the sparsified weight matrix in a DataFrame and preview its first rows.
DiffSparsW = pd.DataFrame(data=sorted_weights)
DiffSparsW.head(5)



In [ ]:

    
# Save the sparsified weight matrix as CSV, without the row-index column.
DiffSparsW.to_csv(path_or_buf='Saved_Datasets/DiffNormSparsBudgW.csv', index=False)



In [ ]:

    
# Rebuild the PyGSP graph from the sparsified weights; this rebinds `G`.
G = graphs.Graph(sorted_weights)
# Switch to the normalised Laplacian before the spectral decomposition,
# because the Fourier basis is derived from the current Laplacian.
G.compute_laplacian('normalized')
G.compute_fourier_basis(recompute=True)
# Plot the first ten entries of G.e (presumably the smallest eigenvalues of
# the Laplacian -- confirm against the PyGSP docs).
plt.plot(G.e[0:10]);



In [ ]:

    
# Encode the `success` column as integer class labels.
success_encoder = preprocessing.LabelEncoder()
labels = success_encoder.fit_transform(dataset['success'])
# Use columns 1 and 2 of the sparsified graph's G.U as 2-D node coordinates.
G.set_coordinates(G.U[:, 1:3])



In [ ]:

    
# Plot the encoded `success` labels as a signal on the sparsified graph,
# using the coordinates set in the previous cell.
G.plot_signal(labels, vertex_size=20)



In [ ]:

	id	budget	genres	imdb_id	overview	production_companies	release_date	revenue	title	director_name	actor_names	actors_ids	actors_tenures	total_tenure	average_tenure	Metacritic	ROI	success
0	12	94000000	Animation\|Family	266543	Nemo, an adventurous young clownfish, is unexp...	Pixar Animation Studios	2003-05-30	940335536	Finding Nemo	Andrew Stanton	['Albert Brooks', 'Ellen DeGeneres', 'Alexande...	[14, 5293, 12, 13, 18]	[18, 24, 2, 28, 14]	86	17.2	90	2.639	4
1	16	12800000	Drama\|Crime\|Music	168629	Selma, a Czech immigrant on the verge of blind...	Fine Line Features	2000-05-17	40031879	Dancer in the Dark	Lars von Trier	['Björk', 'Catherine Deneuve', 'David Morse', ...	[6748, 47, 52, 50, 53]	[49, 19, 21, 44, 15]	148	29.6	61	2.127	3
2	22	140000000	Adventure\|Fantasy\|Action	325980	Jack Sparrow, a freewheeling 17th-century pira...	Walt Disney Pictures	2003-09-07	655011224	Pirates of the Caribbean: The Curse of the Bla...	Gore Verbinski	['Johnny Depp', 'Geoffrey Rush', 'Orlando Bloo...	[1709, 116, 114, 118, 85]	[7, 9, 7, 22, 20]	65	13.0	63	2.639	4