In [1]:
%matplotlib inline
import configparser
import os
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse, stats, spatial
import scipy.sparse.linalg
import statistics as stat
from sklearn import preprocessing, decomposition
import librosa
import IPython.display as ipd
import json
In [7]:
# Load the raw TMDb dumps. The files are ';'-separated and contain a few
# malformed rows, which we skip instead of aborting the read.
# Fix: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the equivalent replacement.
all_movies_new = pd.read_csv('350000-movies/AllMoviesDetailsCleaned.csv', sep=';', encoding='utf-8', low_memory=False,
                             on_bad_lines='skip')
all_movies_casting = pd.read_csv('350000-movies/AllMoviesCastingRaw.csv', sep=';', encoding='utf-8', low_memory=False,
                                 on_bad_lines='skip')
In [8]:
# Sanity check: both files should describe the same set of movies.
print('There are {} movies'.format(len(all_movies_new)))
print('There are {} casting infos'.format(len(all_movies_casting)))
In [9]:
all_movies_casting.head(10)
Out[9]:
Merge while taking duplicate columns into account
In [10]:
# Columns present in the casting table but not in the details table.
# NOTE(review): `cols_to_use` is computed but never used below — the merge
# keeps all columns of both frames, which is why duplicate 'id' columns
# ('id_x'/'id_y') appear and have to be cleaned up later.
cols_to_use = all_movies_casting.columns.difference(all_movies_new.columns)
# Row-align the two tables on their positional index (both files list the same
# movies in the same order); 'outer' keeps rows present in only one of them.
df = all_movies_new.merge(all_movies_casting, left_index=True, right_index=True, how='outer')
print('We have {} movies after merging'.format(len(df)))
In [11]:
# Drop columns that are not used downstream (metadata, gender counts, vote
# statistics, crew names other than the director, ...).
# Note: 'screeplay_name' is the dataset's own (misspelled) column name.
df.drop(['spoken_languages_number','production_countries_number', 'status', 'runtime', 'actor_number', 'actor5_gender',
'actor4_gender', 'actor3_gender', 'actor2_gender', 'actor1_gender', 'director_number', 'producer_number',
'director_gender', 'original_title', 'popularity', 'production_countries', 'spoken_languages', 'vote_count',
'vote_average', 'producer_name', 'screeplay_name', 'editor_name','production_companies_number'],
axis=1, inplace = True)
# Keep only movies with a meaningful budget and a known revenue.
df = df.drop(df[df.budget < 1000].index)
df = df.drop(df[df.revenue == 0].index)
# Restrict to English-language movies with a known director and production company
# ('none' is the dataset's placeholder for missing values).
df = df.drop(df[df.original_language != 'en'].index)
df = df.drop(df[df.director_name == 'none'].index)
df = df.drop(df[df.production_companies == 'none'].index)
# IMDb id and genres are required by the later merges/features.
df = df.drop(df[df['imdb_id'].isnull()].index)
df = df.drop(df[df['genres'].isnull()].index)
# Keep only the numeric part of the IMDb id ('tt0012345' -> '0012345').
df['imdb_id'] = df['imdb_id'].str.replace('tt', '')
# The positional merge duplicated the 'id' column: drop one copy, rename the other.
df.drop(['id_y'],axis=1, inplace = True)
df.rename(columns={'id_x': 'id'}, inplace=True)
# All remaining movies are English, so the language column is now constant.
df.drop(['original_language'],axis=1, inplace = True)
df['release_date'] = pd.to_datetime(df['release_date'])
print('There are {} movies left in the dataframe'.format(len(df)))
Retain only the movies released between 2000 and 2017 (2017 excluded, since we consider that the revenues of some of those movies are not yet final)
In [12]:
# How many movies fall outside the 2000-2016 window?
print('There are {} movies before 2000'.format(len(df[df.release_date <= '1999-12-31'])))
print('There are {} movies after 2016'.format(len(df[df.release_date >= '2017-01-01'])))
# Bug fix: the total previously used a different cutoff ('2017-06-01') than the
# line above ('2017-01-01'), so the reported total did not equal the sum of the
# two counts just printed.
print('{} movies would be dropped'. format(len(df[df.release_date <= '1999-12-31']) + len(df[df.release_date >= '2017-01-01'])))
In [13]:
# Keep only movies released between 2000-01-01 and 2016-12-31 (inclusive).
df = df.drop(df[df.release_date <= '1999-12-31'].index)
df = df.drop(df[df.release_date >= '2017-01-01'].index)
print('There are {} movies left in the dataset'.format(len(df)))
In [14]:
df.head()
Out[14]:
In [15]:
df.to_csv('Saved_Datasets/CleanDataset.csv', index=False)
In [3]:
# Restart point: reload the cleaned dataset from disk.
# Note: list-valued columns created later will come back as strings after
# a round-trip through CSV.
df = pd.read_csv('Saved_Datasets/CleanDataset.csv')
len(df) #verification
Out[3]:
In [4]:
#generate dictionnaries assigning ids for each actor/director
# Build dictionaries assigning a distinct integer id to every actor/director.
actor_dict = {}
director_dict = {}
# Stack the five actor columns into one series and drop the 'none' placeholder
# before extracting the unique names.
actors = pd.concat([df['actor1_name'], df['actor2_name'], df['actor3_name'], df['actor4_name'], df['actor5_name']])
actors = actors[actors != 'none']
actors = actors.unique()
directors = df['director_name'].unique()
for position, name in enumerate(actors):
    actor_dict[name] = position
for position, name in enumerate(directors):
    director_dict[name] = position
In [5]:
#function to create list of actor for each movie (and ignore 'none')
def merge_actors(actor_list, mode='name', actor_dict=None):
    '''Collect the actors of one movie, skipping the 'none' placeholder.

    Parameters
    ----------
    actor_list : list of str
        Actor names for one movie (may contain 'none' fillers).
    mode : str
        'name' (default): return the names themselves, without 'none'.
        'id': return actor ids; `actor_dict` must then be provided.
    actor_dict : dict, optional
        Mapping from actor name to id, required in 'id' mode. When missing
        or empty, names are returned instead (original fallback behavior).

    Returns
    -------
    list
        Names or ids of the movie's actors, in input order.
    '''
    # Bug fix: the original used a mutable default argument (actor_dict={}),
    # a classic Python pitfall; `None` plus an in-function fallback is safe
    # and backward-compatible.
    if actor_dict is None:
        actor_dict = {}
    id_l = []
    for actor in actor_list:
        if actor != 'none':
            if (mode == 'id' and len(actor_dict) > 0):
                id_l.append(actor_dict[actor])
            else:
                id_l.append(actor)
    return id_l
In [6]:
#add list of actor's id to each movie in dataset
# Attach the director's id and the per-movie actor name/id lists built above.
df['director_id'] = df.apply(lambda row: director_dict[row['director_name']], axis=1)
df['actor_names'] = df.apply(lambda row: merge_actors([row['actor1_name'], row['actor2_name'],row['actor3_name'],
row['actor4_name'], row['actor5_name']],'name'), axis=1)
df['actor_ids'] = df.apply(lambda row: merge_actors([row['actor1_name'], row['actor2_name'],row['actor3_name'],
row['actor4_name'], row['actor5_name']],'id',actor_dict), axis=1)
In [7]:
#Remove actors columns
df.drop(['actor1_name','actor2_name','actor3_name','actor4_name','actor5_name'],axis=1, inplace = True)
In [8]:
#Remove movies without any actor
df = df[df['actor_ids'].str.len() != 0]
In [9]:
#load critics from external file
# Load the Metacritic ratings scraped in a separate step.
metacritic_ratings = pd.read_csv('Saved_Datasets/metacritic_ratings.csv')
len(metacritic_ratings)
Out[9]:
In [10]:
#merge dataframes
df = pd.merge(df,metacritic_ratings,on=['imdb_id','title'])
In [11]:
#Remove Movies without Metacritic rating
df = df.drop(df[df.Metacritic == 'Error'].index)
In [3]:
#load views from external file
# Load the YouTube trailer view counts collected in a separate step.
views = pd.read_csv('Saved_Datasets/YouTube_views.csv')
len(views)
Out[3]:
In [4]:
#merge dataframes
df = pd.merge(df,views,on=['imdb_id','title'])
In [ ]:
df.drop(['tagline','director_id','actor_ids'],axis=1, inplace = True)
In [20]:
# Visual check of the merged dataset.
print(len(df))
df.head(50)
Out[20]:
In [7]:
df.to_csv('Saved_Datasets/CleanDataset.csv', encoding='utf-8', index=False)
In [2]:
# Restart point: reload the enriched dataset from disk.
df = pd.read_csv('Saved_Datasets/CleanDataset.csv')
len(df) #verification
Out[2]:
In [3]:
df['ROI'] = df.apply(lambda row: np.round((row['revenue']-row['budget'])/row['budget'],3), axis=1)
Saturate rows whose ROI is too high: clip the 25% highest ROIs down to the 75th-percentile value
In [4]:
# 75th percentile of ROI: everything above it is saturated to this value.
quartile = np.round(np.percentile(df.ROI,75),3)
print("Quartile = "+str(quartile))
df['ROI'] = df['ROI'].clip(upper=quartile)
In [5]:
plt.hist(df.ROI,bins='auto');
Add "success" column: a movie is successful if its ROI belongs to the top 75%
In [4]:
def classify_success(roi, threshold=None):
    '''Binary success label for a movie.

    A movie is "successful" (label 1) when its ROI is at least the
    threshold; by default the threshold is the 25th percentile of the ROI
    column of the global `df`, i.e. the movie belongs to the top 75%.

    Parameters
    ----------
    roi : float
        Return on investment of one movie.
    threshold : float, optional
        Explicit cutoff. When omitted, the dataset quantile above is used
        (backward-compatible default; the parameter makes the rule usable
        and testable without the global dataframe).

    Returns
    -------
    int
        1 when roi >= threshold, 0 otherwise.
    '''
    # Earlier experiments with the 75th/50th percentiles were left here as
    # commented-out code; removed as dead clutter.
    if threshold is None:
        # Note: recomputed on every call; cheap at this dataset size.
        threshold = np.round(np.percentile(df.ROI, 25), 3)
    return 1 if roi >= threshold else 0
In [5]:
df['success'] = df.apply(lambda row: classify_success(row['ROI']), axis=1)
In [9]:
Actors = pd.read_csv('Saved_Datasets/Actorsv4Dataset.csv')
In [10]:
#Assign actors TMDb ids to a movie in the dataset
def one_movies_actor_id(movies):
    '''Return the TMDb ids of the actors of movie number `movies`.

    Scans the global `Actors` table and collects the `tmdb_id` of every
    entry whose name appears in the movie's `actor_names` field.

    Warning (translated from the original French comment): the returned
    ids are in `Actors`-table order, NOT in the movie's actor order.

    NOTE(review): at the point where this is applied, `actor_names` is
    still the raw CSV string (it is only split into a list a few cells
    later), so `name in df['actor_names'][movies]` is a substring test —
    an actor name that is a prefix of another could match spuriously.
    TODO confirm this does not matter for this dataset.

    Parameters
    ----------
    movies : int
        Positional index of the movie in the global `df`.

    Returns
    -------
    list
        TMDb ids of the matched actors.
    '''
    actor_index = [i for i, name in enumerate(Actors['Name']) if name in df['actor_names'][movies]]
    # Idiom: build the result with a comprehension instead of an append loop.
    return [Actors['tmdb_id'][i] for i in actor_index]
In [11]:
#Apply actors TMDb ids to the whole dataset
movies_actors_ids = list(map(one_movies_actor_id,np.arange(len(df))))
In [12]:
# Insert the new column immediately after 'actor_names'.
idx = df.columns.get_loc('actor_names')
df.insert(idx+1,'actors_ids',movies_actors_ids)
In [13]:
#Forming data in column actor_names
# Normalize the stringified actor lists: strip quote characters and the
# space after each comma, so the string can be split on ',' below.
for old, new in (('"', ''), ("'", ''), (", ", ',')):
    df['actor_names'] = df['actor_names'].str.replace(old, new)
In [14]:
#Convert string in dataset['actor_names'] into list
df['actor_names'] = df.actor_names.apply(lambda x: x[1:-1].split(","))
In [15]:
Actors['date'] = Actors['date'].str.replace("'",'')
In [16]:
Actors['date'] = Actors.date.apply(lambda x: x[1:-1].split(","))
In [17]:
def one_actor_tenure_movie(movies):
    '''Compute the career tenure of each actor of movie number `movies`.

    Tenure = movie release year - actor's first-credit year + 1, clamped
    to [0, 72] (the upper bound caps implausible debut dates in the source
    data — TODO confirm the intent behind 72). Actors whose debut date is
    unknown (empty string) get tenure 0.

    Relies on the globals `df` (whose 'release_date' is a string here, so
    the year is taken with [0:4]) and `Actors` (whose 'date' column was
    split into lists above).

    Returns
    -------
    list of int
        One tenure per matched actor, in `Actors`-table order (the order
        of the movie's own actor list is NOT preserved).
    '''
    actor_index = [i for i, name in enumerate(Actors['Name']) if name in df['actor_names'][movies]]
    tenure = []
    for i in actor_index:
        debut = Actors['date'][i][0]
        if debut != '':
            ten = int(df['release_date'][movies][0:4]) - int(debut) + 1
            # Idiom: clamp with min/max instead of two cascaded ifs.
            tenure.append(min(max(ten, 0), 72))
        else:
            tenure.append(0)
    return tenure
In [18]:
movie_tenures = list(map(one_actor_tenure_movie,np.arange(len(df))))
In [19]:
# Insert the tenure lists immediately after 'actors_ids'.
idx = df.columns.get_loc('actors_ids')
df.insert(idx+1,'actors_tenures',movie_tenures)
In [20]:
#Compute total tenure of each movie
# Total tenure of each movie = sum of its actors' tenures.
movie_tenure_sum = [sum(tenures) for tenures in df['actors_tenures']]
In [21]:
# Insert the total tenure immediately after 'actors_tenures'.
idx = df.columns.get_loc('actors_tenures')
df.insert(idx+1,'total_tenure',movie_tenure_sum)
In [22]:
#Compute average tenure of each movie
# Average tenure of each movie; an empty tenure list (should not occur after
# the earlier filtering) yields 0 instead of a statistics error.
movie_tenure_average = [stat.mean(tenures) if tenures != [] else 0
                        for tenures in df['actors_tenures']]
In [23]:
# Insert the average tenure immediately after 'total_tenure'.
idx = df.columns.get_loc('total_tenure')
df.insert(idx+1,'average_tenure',movie_tenure_average)
In [24]:
prof = df['revenue']-df['budget']
In [25]:
# Insert the profit column immediately after 'Metacritic'.
idx = df.columns.get_loc('Metacritic')
df.insert(idx+1,'Profitability',prof)
In [26]:
### Compute actor profitabtility to one movie of the dataset
def one_movies_actor_prof(movies):
#print(movies_actors_ids[movies])
#print(len(Actors['tmdb_id']))
#print(Act_prof['actors_ids'])
actor_index = [i for i, item in enumerate(Actors['tmdb_id']) if item in movies_actors_ids[movies]]
#print(actor_index)
one_movie_actor_prof = []
for i in actor_index:
#print(i)
one_movie_actor_prof.append(Actors['Profitability'][i])
prof = sum(one_movie_actor_prof)
return prof
In [27]:
# Spot-check the helper on the first movie.
test = one_movies_actor_prof(0)
print(test)
In [28]:
movies_actor_prof = list(map(one_movies_actor_prof,np.arange(len(df))))
In [29]:
# Insert the actor-profitability column immediately after 'average_tenure'.
idx = df.columns.get_loc('average_tenure')
df.insert(idx+1,'Total_profitability_actors',movies_actor_prof)
In [21]:
df.head(40)
Out[21]:
In [22]:
df.to_csv('Saved_Datasets/NewFeaturesDataset.csv', encoding='utf-8', index=False)
In [ ]: