In [1]:
%matplotlib inline
import configparser
import os
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
#added by me:
import requests
from pygsp import graphs, filters, plotting
plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'
In [2]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6);
In [2]:
# Load the pre-computed feature dataset (produced by an earlier notebook).
df = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')
In [4]:
# Optional filter kept for reference: drop movies whose Metacritic rating is
# missing (encoded as 0).
#df = df[df['Metacritic'] != 0]
In [4]:
# Quick sanity check of the loaded columns.
df.head()
Out[4]:
In [23]:
unique, counts = np.unique(df['Metacritic'], return_counts=True)
plt.bar(unique,counts,align='center',width=.6);
ratings_nz = np.array(df[df['Metacritic'] != 0]['Metacritic'])
mu = np.mean(ratings_nz)
std = np.std(ratings_nz)
plt.xlabel('Ratings')
plt.ylabel('Counts')
plt.title("Metacritic Ratings ($ \mu=%.2f,$ $\sigma=%.2f $)" %(mu,std));
plt.savefig('images/Metacritic_distribution.png')
In [3]:
# Distribution of ROI (return on investment) values.
plt.hist(df['ROI'],bins='auto');
In [16]:
data = np.array(df['ROI'])
# This is the colormap I'd like to use.
cm = plt.cm.get_cmap('RdYlGn');
# Plot histogram.
n, bins, patches = plt.hist(data, 25, normed=1, color='yellow');
bin_centers = 0.5 * (bins[:-1] + bins[1:]);
# scale values to interval [0,1]
col = bin_centers - min(bin_centers)
col /= max(col)
for c, p in zip(col, patches):
plt.setp(p, 'facecolor', cm(c));
plt.xlabel('ROI');
plt.savefig('images/ROI_regression.png');
plt.show();
In [4]:
# 75th percentile of ROI, used to gauge a "success" cutoff.
np.percentile(df['ROI'], 75)
Out[4]:
In [39]:
# Persist the (possibly updated) dataset back to disk.
df.to_csv('Saved_Datasets/NewFeaturesDataset.csv', encoding='utf-8', index=False)
In [13]:
print("%.2f" % (len(df[df['ROI']>1])/len(df)*100))
print("%.2f" % (len(df[df['Metacritic']>50])/len(df)*100))
We can see that the ROI and the ratings are not correlated, as the ROI does not necessarily increase for well-rated movies:
In [24]:
df_sorted = df.sort_values(by=['Metacritic'])
plt.plot(df_sorted['Metacritic'],df_sorted['ROI'])
plt.xlabel('Metacritic Ratings')
plt.ylabel('ROI')
plt.title('Evolution of ROI according to Metacritic ratings');
plt.savefig('images/roi_vs_metacritic.png')
To determine an optimal success/failure ratio, we look for a ratio that is high enough while still maximizing the mean Metacritic score:
In [15]:
df_roi_sorted = df.sort_values(by=['ROI'],ascending=False)
df_met_sorted = df.sort_values(by=['Metacritic'],ascending=False)
mean_roi, mean_met = [], []
for r in np.arange(0.01, 1.0, 0.01):
limit_roi = df_roi_sorted.iloc[int(len(df)*r)]['ROI']
limit_met = df_met_sorted.iloc[int(len(df)*r)]['Metacritic']
success_roi = df[df['ROI'] > limit_roi]
success_met = df[df['Metacritic'] > limit_met]
mean_roi.append([r,np.mean(success_roi['Metacritic'])])
mean_met.append([r,np.mean(success_met['ROI'])])
mean_roi = np.array(mean_roi)
mean_met = np.array(mean_met)
f, axarr = plt.subplots(2, sharex=True)
axarr[0].plot(mean_roi[:,0],mean_roi[:,1]);
axarr[0].set_ylabel('Metacritic Mean')
axarr[1].plot(mean_met[:,0],mean_met[:,1]);
axarr[1].set_xlabel('Success/Failure Ratio')
axarr[1].set_ylabel('ROI')
f.subplots_adjust(hspace=0);
plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False);
In [16]:
ratio = 0.2
df_sorted = df.sort_values(by=['ROI'],ascending=False)
limit_roi = df_sorted.iloc[int(len(df)*ratio)]['ROI']
success = df[df['ROI'] > limit_roi]
failure = df[df['ROI'] <= limit_roi]
print("The ROI needed to be a successful movie is: "+str(limit_roi)[:4])
print("There are "+str(int(len(df)*ratio))+" successful movies in the dataset.")
In [17]:
# Reload the dataset from disk (resets any filtering applied above).
df = pd.read_csv('Saved_Datasets/NewFeaturesDataset.csv')
In [50]:
# Drop movies with no Metacritic rating (encoded as 0).
df = df.drop(df[df.Metacritic == 0].index)
In [30]:
crit_norm = np.array(df['Metacritic'])
w = np.zeros((len(df),len(df)))
for i in range(0,len(df)):
for j in range(i,len(df)):
if (i == j):
w[i,j] = 0
continue
if (crit_norm[i] == 0 or crit_norm[j] == 0):
w[i,j] = w[j,i] = 0
else:
w[i,j] = w[j,i] = 1.0 - (abs(crit_norm[i]-crit_norm[j])/100)
In [31]:
# Distribution of all pairwise weights (zeros come from missing ratings and
# the diagonal).
plt.hist(w.reshape(-1), bins=50);
plt.title('Metacritic weights matrix histogram')
plt.savefig('images/metacritic_weights_hist.png')
In [32]:
print('The mean value is: {}'.format(w.mean()))
print('The max value is: {}'.format(w.max()))
print('The min value is: {}'.format(w.min()))
In [33]:
# Sparsity pattern of the weight matrix (zeros only on the diagonal and in
# rows/columns of unrated movies, by construction above).
plt.spy(w)
Out[33]:
Save the weight matrix as CSV
In [34]:
# Wrap the weight matrix in a DataFrame for inspection and saving.
W = pd.DataFrame(w)
W.head()
Out[34]:
In [35]:
W.to_csv('Saved_Datasets/NormalizedMetacriticW.csv', encoding='utf-8', index=False)
In [10]:
degrees = np.zeros(len(w))
#reminder: the degrees of a node for a weighted graph are the sum of its weights
for i in range(0, len(w)):
degrees[i] = sum(w[i])
plt.hist(degrees, bins=50);
In [11]:
#reminder: L = D - W for weighted graphs
# NOTE(review): this dense combinatorial Laplacian is assigned but never used
# below — only the normalized version is.
laplacian = np.diag(degrees) - w
#computation of the normalized Laplacian (scipy handles D^{-1/2} W D^{-1/2})
laplacian_norm = scipy.sparse.csgraph.laplacian(w, normed = True)
plt.spy(laplacian_norm);
In [12]:
# Sparsity pattern of the (diagonal) degree matrix D.
plt.spy(np.diag(degrees))
Out[12]:
In [13]:
NEIGHBORS = 300
#sort the order of the weights
sort_order = np.argsort(w, axis = 1)
#declaration of a sorted weight matrix
sorted_weights = np.zeros((len(w), len(w)))
for i in range (0, len(w)):
for j in range(0, len(w)):
if (j >= len(w) - NEIGHBORS):
#copy the k strongest edges for each node
sorted_weights[i, sort_order[i,j]] = w[i,sort_order[i,j]]
else:
#set the other edges to zero
sorted_weights[i, sort_order[i,j]] = 0
#ensure the matrix is symmetric
bigger = sorted_weights.transpose() > sorted_weights
sorted_weights = sorted_weights - sorted_weights*bigger + sorted_weights.transpose()*bigger
np.fill_diagonal(sorted_weights, 0)
In [14]:
# Sparsity pattern after k-NN sparsification.
plt.spy(sorted_weights)
Out[14]:
In [15]:
#reminder: L = D - W for weighted graphs
# NOTE(review): `degrees` was computed from the dense matrix `w`, not from
# `sorted_weights`, so this combinatorial Laplacian is inconsistent with the
# sparsified graph; it is unused below (only `laplacian_norm` is) — confirm.
laplacian = np.diag(degrees) - sorted_weights
#computation of the normalized Laplacian
laplacian_norm = scipy.sparse.csgraph.laplacian(sorted_weights, normed = True)
# Force ones on the diagonal — presumably so isolated (zero-degree) nodes
# match the I - D^{-1/2} W D^{-1/2} convention; verify this is intended.
np.fill_diagonal(laplacian_norm, 1)
plt.spy(laplacian_norm);
In [16]:
# Convert to CSR sparse format for the iterative eigensolver.
laplacian_norm = sparse.csr_matrix(laplacian_norm)
In [17]:
# 10 smallest-magnitude eigenpairs of the normalized Laplacian.
eigenvalues, eigenvectors = sparse.linalg.eigsh(laplacian_norm, k = 10, which = 'SM')
plt.plot(eigenvalues, '.-', markersize=15);
plt.xlabel('')
plt.ylabel('Eigenvalues')
plt.show()
# Encode the 'success' label column as integers for use as plot colors.
success = preprocessing.LabelEncoder().fit_transform(df['success'])
print(success)
# Spectral embedding: eigenvectors 1 and 2 as 2-D coordinates (skipping the
# first, which is constant on each connected component), colored by success.
x = eigenvectors[:, 1]
y = eigenvectors[:, 2]
plt.scatter(x, y, c=success, cmap='RdBu', alpha=0.5);
In [18]:
# Build a PyGSP graph on the sparsified weights, with normalized Laplacian.
G = graphs.Graph(sorted_weights)
G.compute_laplacian('normalized')
In [19]:
# Full eigendecomposition; plot the 10 smallest graph frequencies.
# NOTE(review): the `recompute` keyword was removed in newer PyGSP releases —
# confirm the pinned version still accepts it.
G.compute_fourier_basis(recompute=True)
plt.plot(G.e[0:10]);
In [20]:
# Use eigenvectors 1 and 2 as 2-D node coordinates (Laplacian eigenmaps view).
G.set_coordinates(G.U[:,1:3])
G.plot()
In [21]:
# Color each node by its encoded success label.
G.plot_signal(success, vertex_size=20)
In [ ]: