In [2]:
%matplotlib inline
import configparser
import os
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import OrderedDict
from pygsp import graphs, filters, plotting
from IPython.display import Image
# Default (width, height) in inches for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (17, 5)
# Select matplotlib as the drawing backend of the PyGSP plotting module.
plotting.BACKEND = 'matplotlib'
In [3]:
# Load the movie table from a path relative to the notebook's working directory.
# The cells below read its 'budget', 'revenue', 'ROI' and 'success' columns.
dataset = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')
In [4]:
# Preview the first three rows of the freshly loaded table.
dataset.head(3)
Out[4]:
In [5]:
# Absolute profit of each movie (revenue minus budget), stored as a new column.
dataset['Profitability'] = dataset['revenue'].sub(dataset['budget'])
In [6]:
# Preview the first three rows again to check the new 'Profitability' column.
dataset.head(3)
Out[6]:
In [23]:
# Histogram of movie budgets (100 bins), saved as a PNG for the report.
plt.hist(dataset['budget'],bins=100);
# The dollar unit belongs on the budget axis; the y axis is a plain movie count.
plt.xlabel('Budget of movies [$]')
plt.ylabel('Number of movies')
plt.savefig('images/movies_budget.png', dpi=300, bbox_inches='tight')
In [18]:
# Bar chart with one bar per movie: x position is the ROI, bar height is the budget.
ax = plt.gca()
ax.bar(dataset['ROI'], dataset['budget'], width=0.03, align='center')
# Keep the tick labels horizontal and right-aligned.
plt.setp(ax.get_xticklabels(), horizontalalignment='right', rotation=0)
ax.set_xlabel('ROI')
ax.set_ylabel('Budget of movies [$]')
plt.savefig('images/budget_ROI.png', bbox_inches='tight', dpi=300)
In [20]:
# Stem plot with one stem per movie: x position is the profitability, stem height is the budget.
ax = plt.gca()
ax.stem(dataset['Profitability'], dataset['budget'])
ax.set_xlabel('Profitability of movies[$]')
ax.set_ylabel('Budget of movies [$]')
plt.savefig('images/budget_Profitability.png', bbox_inches='tight', dpi=300)
In [15]:
# Same data with the axes swapped: x position is the budget, stem height is the profitability.
# NOTE(review): exploratory plot without axis labels and not saved to disk.
plt.stem(dataset['budget'],dataset['Profitability']);
In [16]:
# Smallest budget in the dataset. Series.min() skips NaN, whereas the builtin
# min() gives an order-dependent answer as soon as one NaN is present.
dataset['budget'].min()
Out[16]:
In [12]:
# NOTE(review): `test` and `prof` are not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. They are presumably
# scratch copies of the budget / Profitability columns - confirm, or delete the cell.
print(min(test))
print(min(prof))
In [ ]:
# NOTE(review): `tt` and `vv` are not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel - define them or delete the cell.
plt.bar(tt,vv)
In [ ]:
# Display the whole 'budget' column (a full slice of the Series).
dataset['budget'][:]
In [ ]:
In [ ]:
# Pairwise absolute budget difference between movies.
# Only the upper triangle (j >= i) is filled, as in the original double loop;
# the lower triangle stays 0 until the matrix is symmetrised in a later cell.
# Rows are taken in positional order, which matches label lookup for the default
# RangeIndex produced by read_csv.
budgets = dataset['budget'].to_numpy()
# Broadcasting builds the full n x n difference table in one vectorised step
# instead of n^2 / 2 chained pandas lookups.
W_diff = np.triu(np.abs(budgets[:, None] - budgets[None, :])).astype(int)
In [ ]:
# Visualise the non-zero pattern of the budget-difference matrix.
plt.spy(W_diff)
In [ ]:
# Make W_diff symmetric: wherever the mirrored entry is larger, take it instead,
# so every pair ends up with the element-wise maximum of (i, j) and (j, i).
bigger = W_diff.T > W_diff
W_diff = np.where(bigger, W_diff.T, W_diff)
# No self-loops: a movie has no distance to itself.
np.fill_diagonal(W_diff, 0)
In [ ]:
# Histogram (50 bins) of all entries of the flattened budget-difference matrix.
plt.hist(W_diff.reshape(-1),bins=50);
In [ ]:
# 75th percentile over every entry of W_diff (diagonal included).
# A later cell uses it as the cut-off above which two movies get weight 0.
val_75 = np.percentile(W_diff,75)
print(val_75)
In [ ]:
# Turn budget differences into similarity weights in [0, 1], cut off at val_75:
#   difference == 0          -> weight 1
#   0 < difference <= val_75 -> weight 1 - difference / val_75
#   difference  > val_75     -> weight 0
# Only the upper triangle (diagonal included) is filled, as in the original loop.
# NOTE(review): the next cell reassigns W_diff_norm, so on a top-to-bottom run
# this result is discarded - keep only the variant that is actually wanted.
is_zero = W_diff == 0
# Excluding exact zeros here also avoids a 0 / 0 division if val_75 is 0.
in_range = ~is_zero & (W_diff <= val_75)
W_diff_norm = np.zeros(W_diff.shape, dtype=float)
W_diff_norm[is_zero] = 1
W_diff_norm[in_range] = 1 - W_diff[in_range] / val_75
W_diff_norm = np.triu(W_diff_norm)
In [ ]:
# Turn budget differences into similarity weights in [0, 1], scaled by the
# largest difference:
#   difference == 0 -> weight 1
#   otherwise       -> weight 1 - difference / max difference
# Only the upper triangle (diagonal included) is filled, as in the original loop.
# This assignment replaces any W_diff_norm built by an earlier cell.
max_W_diff = W_diff.max()
# Exact zeros keep the initial weight 1 and are never divided, so an all-zero
# W_diff (max_W_diff == 0) cannot trigger a division by zero.
non_zero = W_diff != 0
W_diff_norm = np.ones(W_diff.shape, dtype=float)
W_diff_norm[non_zero] = 1 - W_diff[non_zero] / max_W_diff
W_diff_norm = np.triu(W_diff_norm)
In [ ]:
# Mirror the upper triangle onto the lower one: wherever the transposed entry is
# larger, take it, giving the element-wise maximum of (i, j) and (j, i).
bigger = W_diff_norm.T > W_diff_norm
W_diff_norm = np.where(bigger, W_diff_norm.T, W_diff_norm)
# Remove self-loops (the diagonal was set to 1 by the normalisation step).
np.fill_diagonal(W_diff_norm, 0)
In [ ]:
# Visualise the non-zero pattern of the normalised weight matrix.
plt.spy(W_diff_norm)
In [ ]:
# Wrap the weight matrix in a DataFrame (used by a later cell for the CSV export)
# and preview its first rows.
DiffNormW = pd.DataFrame(W_diff_norm)
DiffNormW.head()
In [ ]:
# Histogram (50 bins) of all entries of the flattened normalised weight matrix.
plt.hist(W_diff_norm.reshape(-1),bins=50);
In [ ]:
# Weighted degree of every node is the sum of its row of edge weights.
# One vectorised reduction replaces a Python-level sum() per row.
degrees = W_diff_norm.sum(axis=1)
# Degree distribution of the dense budget-similarity graph.
plt.hist(degrees, bins=50);
In [ ]:
# Persist the dense normalised weight matrix, without the row index, for reuse elsewhere.
DiffNormW.to_csv('Saved_Datasets/DiffNormBudgW.csv', index=False)
In [ ]:
# Build a PyGSP graph from the dense normalised budget-similarity weights.
G = graphs.Graph(W_diff_norm)
# Switch the graph to its normalised Laplacian.
G.compute_laplacian('normalized')
# Request a fresh Fourier basis rather than reusing a previously computed one.
G.compute_fourier_basis(recompute=True)
# Plot the first 10 entries of G.e (presumably the smallest Laplacian
# eigenvalues - confirm the ordering against the PyGSP documentation).
plt.plot(G.e[0:10]);
In [ ]:
# Encode the 'success' column as integer class labels; they are plotted as a
# graph signal in the next cell.
labels = preprocessing.LabelEncoder().fit_transform(dataset['success'])
# Use columns 1 and 2 of G.U as 2-D vertex coordinates (presumably the 2nd and
# 3rd Fourier modes, skipping the first one - TODO confirm).
G.set_coordinates(G.U[:,1:3])
In [ ]:
# Draw the graph with the encoded success labels as the signal on its vertices.
G.plot_signal(labels, vertex_size=20)
In [ ]:
# Sparsify the graph: every node keeps only its NEIGHBORS strongest edges.
NEIGHBORS = 300

node_count = len(W_diff_norm)
# Column indices of each row, ordered from weakest to strongest edge.
sort_order = np.argsort(W_diff_norm, axis=1)
# The strongest edges sit in the last NEIGHBORS sorted positions. Clamp at 0 so
# that a graph with fewer than NEIGHBORS nodes keeps every edge.
first_kept = max(node_count - NEIGHBORS, 0)
strongest = sort_order[:, first_kept:]
row_ids = np.arange(node_count)[:, None]

# Despite its name this matrix keeps the original column layout: the selected
# weights are copied to their own columns and every other entry stays 0.
sorted_weights = np.zeros((node_count, node_count))
sorted_weights[row_ids, strongest] = W_diff_norm[row_ids, strongest]

# Ensure the matrix is symmetric: an edge kept by either endpoint is kept for
# both, so a node can end up with more than NEIGHBORS neighbours.
bigger = sorted_weights.T > sorted_weights
sorted_weights = np.where(bigger, sorted_weights.T, sorted_weights)
In [ ]:
# Visualise the non-zero pattern of the sparsified weight matrix.
plt.spy(sorted_weights)
In [ ]:
# Wrap the sparsified weights in a DataFrame (used by a later cell for the CSV
# export) and preview its first rows.
DiffSparsW = pd.DataFrame(sorted_weights)
DiffSparsW.head()
In [ ]:
# Persist the sparsified weight matrix, without the row index, for reuse elsewhere.
DiffSparsW.to_csv('Saved_Datasets/DiffNormSparsBudgW.csv', index=False)
In [ ]:
# Rebuild the PyGSP graph, this time from the sparsified weights.
# This rebinds G, replacing the graph built from the dense matrix.
G = graphs.Graph(sorted_weights)
# Switch the graph to its normalised Laplacian.
G.compute_laplacian('normalized')
# Request a fresh Fourier basis rather than reusing a previously computed one.
G.compute_fourier_basis(recompute=True)
# Plot the first 10 entries of G.e (presumably the smallest Laplacian
# eigenvalues - confirm the ordering against the PyGSP documentation).
plt.plot(G.e[0:10]);
In [ ]:
# Encode the 'success' column as integer class labels; they are plotted as a
# graph signal in the next cell.
labels = preprocessing.LabelEncoder().fit_transform(dataset['success'])
# Use columns 1 and 2 of G.U as 2-D vertex coordinates (presumably the 2nd and
# 3rd Fourier modes, skipping the first one - TODO confirm).
G.set_coordinates(G.U[:,1:3])
In [ ]:
# Draw the sparsified graph with the encoded success labels as the signal on its vertices.
G.plot_signal(labels, vertex_size=20)
In [ ]: