In [1]:
import sys
from multiprocessing import Pool, cpu_count
import pandas as pd
import numpy as np
from fuzzywuzzy import process
sys.path.append('../src')
from my_aws import S3
In [2]:
# Pull the OMDB/torrent data and The Numbers budget data down from S3
s3_connect = S3()
bucket = 'movie-torrents'

key_tor = 'OMDB_Torrents.csv'
df_tor = s3_connect.get_data(key_tor, bucket)

key_num = 'TheNumbers_budgets.csv'
df_num = s3_connect.get_data(key_num, bucket)
In [3]:
numbers_titles = df_num['title'].tolist()
torrent_titles = df_tor['Title'].tolist()

def fuzzy_match(search_title):
    """Return the closest The Numbers title for search_title along with its match score."""
    match_title, match_percent = process.extractOne(search_title, numbers_titles)
    return {'RowTitle': search_title, 'FuzzTitle': match_title, 'FuzzPercent': match_percent}
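As a reference for what fuzzy_match unpacks, process.extractOne returns a (best_match, score) tuple. A minimal illustration with made-up choices (not titles from the dataset; the exact score depends on the fuzzywuzzy scorer):
In [ ]:
# Illustrative only: made-up choices, not the real numbers_titles list
process.extractOne('The Dark Knight', ['Dark Knight, The', 'The Dark Knight Rises', 'Batman Begins'])
# returns a (match, score) tuple such as ('Dark Knight, The', <score>)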
In [4]:
# Fuzzy-match every torrent title against The Numbers titles in parallel
worker_pool = Pool(cpu_count())
%time fuzz_results = worker_pool.map(fuzzy_match, torrent_titles)
worker_pool.close()
worker_pool.join()
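The same step can also be written with the pool as a context manager, which tears the workers down automatically once map() has returned; an equivalent sketch:
In [ ]:
# Alternative form of the cell above: the with-block cleans the pool up for us
with Pool(cpu_count()) as pool:
    fuzz_results = pool.map(fuzzy_match, torrent_titles)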
In [5]:
# Put results into a dataframe and rename the title column to match df_tor
df_fuzz = pd.DataFrame(fuzz_results)
df_fuzz = df_fuzz[['RowTitle', 'FuzzTitle', 'FuzzPercent']]
df_fuzz.columns = ['Title', 'FuzzTitle', 'FuzzPercent']

# Merge the fuzzy-match results onto the torrent dataframe
df = pd.merge(df_tor, df_fuzz, how='inner', on='Title')
df.drop_duplicates(subset='Title', inplace=True)

# Drop rows where the match score was below 95%
df = df[df['FuzzPercent'] >= 95]
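Before settling on the 95% cutoff it can help to glance at how the match scores are distributed; this is a purely diagnostic sketch and changes nothing:
In [ ]:
# Diagnostic only: distribution of the fuzzy-match scores
df_fuzz['FuzzPercent'].describe()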
In [6]:
# Remove rows where no torrent counts were received from any source
df['CheckTup'] = list(zip(df['Kat_Count'].tolist(), df['Pirate_Count'].tolist(), df['Extra_Count'].tolist(),
                          df['Torrentz_Count'].tolist(), df['Torrentz_Ver_Count'].tolist(), df['Zoogle_Ver_Count'].tolist()))
df = df[df['CheckTup'] != ('Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail')].reset_index(drop=True)
del df['CheckTup']
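The same filter can be expressed without the temporary tuple column by testing the count columns directly; a sketch using the same column names (running it after the cell above is a no-op):
In [ ]:
# Equivalent, vectorised form of the filter above
count_cols = ['Kat_Count', 'Pirate_Count', 'Extra_Count',
              'Torrentz_Count', 'Torrentz_Ver_Count', 'Zoogle_Ver_Count']
df = df[~df[count_cols].eq('Fail').all(axis=1)].reset_index(drop=True)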
In [7]:
# Replace 'Fail', 'None', 'N' and 'NaN' with 0; strip '>', '<' and ',' characters
int_cols = ['Metascore', 'Runtime', 'imdbRating', 'imdbVotes', 'Kat_Count', 'Pirate_Count',
            'Extra_Count', 'Torrentz_Count', 'Torrentz_Ver_Count', 'Zoogle_Ver_Count']

for col in int_cols:
    df[col] = df[col].replace(['Fail', 'None', 'N', 'NaN'], '0')
    df[col] = df[col].apply(lambda x: str(x).replace('>', '').replace('<', '').replace(',', ''))
    # Coerce to numeric; anything unparseable becomes NaN and is then zero-filled
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
    df[col] = df[col].fillna(value=0)

# Fill the remaining NaNs in the non-numeric columns with blanks
df.fillna(value='', inplace=True)
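A quick optional check that the coercion behaved as intended; the cleaned columns should now report numeric dtypes:
In [ ]:
# Diagnostic only: confirm the cleaned columns are numeric
df[int_cols].dtypes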
In [8]:
# Rename The Numbers columns; 'FuzzTitle' becomes the merge key
df_num.columns = ['FuzzTitle', 'ReleaseDate', 'ProductionBudget', 'DomesticBudget', 'WorldGross']
In [9]:
# Merge the budget data onto the torrent/OMDB dataframe and drop duplicate movies
data_df = pd.merge(df, df_num, how='left', on='FuzzTitle')
data_df = data_df.drop_duplicates(subset='imdbID')
print(len(data_df))
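To see how many titles actually picked up budget figures from the left merge, a small diagnostic (informational only):
In [ ]:
# Rows that matched a The Numbers budget record vs. rows that did not
data_df['ProductionBudget'].notnull().sum(), data_df['ProductionBudget'].isnull().sum()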
In [10]:
# Parse both date columns and backfill missing OMDB release dates from The Numbers
data_df['Released'] = pd.to_datetime(data_df['Released'])
data_df['ReleaseDate'] = pd.to_datetime(data_df['ReleaseDate'])
data_df.loc[data_df['Released'].isnull(), 'Released'] = data_df['ReleaseDate']

# Drop the helper columns used only for matching and backfilling
del data_df['ReleaseDate']
del data_df['FuzzTitle']
del data_df['FuzzPercent']
In [11]:
# Sum the torrent counts across all sources
data_df['Total_Torrents'] = data_df[['Kat_Count', 'Pirate_Count', 'Extra_Count', 'Torrentz_Count',
                                     'Torrentz_Ver_Count', 'Zoogle_Ver_Count']].sum(axis=1)
In [12]:
data_df.reset_index(drop=True, inplace=True)
In [13]:
data_df.info()
In [ ]: