In [1]:
import sys
from multiprocessing import Pool, cpu_count
import pandas as pd
import numpy as np
from fuzzywuzzy import process

sys.path.append('../src')

from my_aws import S3

In [2]:
s3_connect = S3()

key_tor = 'OMDB_Torrents.csv'
bucket = 'movie-torrents'
df_tor = s3_connect.get_data(key_tor, bucket)

key_num = 'TheNumbers_budgets.csv'
bucket = 'movie-torrents'
df_num = s3_connect.get_data(key_num, bucket)

In [3]:
numbers_titles = df_num['title'].tolist()
torrent_titles = df_tor['Title'].tolist()

def fuzzy_match(search_title):
    match_title, match_percent = process.extractOne(search_title, numbers_titles)
    return {'RowTitle': search_title, 'FuzzTitle': match_title, 'FuzzPercent': match_percent}

Resource heavy - long running


In [4]:
worker_pool = Pool(cpu_count())

%time fuzz_results = worker_pool.map(fuzzy_match, torrent_titles)

worker_pool.close() 
worker_pool.join()


CPU times: user 315 ms, sys: 70.7 ms, total: 385 ms
Wall time: 7min 55s

In [5]:
# Put results into a dataframe and rename columns
df_fuzz = pd.DataFrame(fuzz_results)
df_fuzz = df_fuzz[['RowTitle','FuzzTitle','FuzzPercent']]
df_fuzz.columns = ['Title', 'FuzzTitle', 'FuzzPercent']

# Append to torrent dataframe
df = pd.merge(df_tor, df_fuzz, how='inner', on='Title')
df.drop_duplicates(subset='Title', inplace=True)

# Drop rows where match was below 95%
df = df[df['FuzzPercent'] >= 95]

In [6]:
# remove where no torrent counts were received from any source
df['CheckTup'] = list(zip(df['Kat_Count'].tolist(), df['Pirate_Count'].tolist(), df['Extra_Count'].tolist(), 
                df['Torrentz_Count'].tolist(), df['Torrentz_Ver_Count'].tolist(), df['Zoogle_Ver_Count'].tolist()))
df = df[df['CheckTup'] != ('Fail','Fail','Fail','Fail','Fail','Fail')].reset_index(drop=True)

del df['CheckTup']

In [7]:
# replace Fail, None, N and NaN with 0 - remove >, and <
int_cols = ['Metascore', 'Runtime', 'imdbRating','imdbVotes', 'Kat_Count', 'Pirate_Count', 
            'Extra_Count', 'Torrentz_Count', 'Torrentz_Ver_Count', 'Zoogle_Ver_Count']

for col in int_cols:
    df[col] = df[col].replace(['Fail', 'None', 'N', 'NaN'], '0')
    df[col] = df[col].apply(lambda x: str(x).replace('>','').replace('<','').replace(',',''))
    df[col] = df[col].replace(np.nan, 0)
    df[col] = df[col].fillna(value=0)
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
    df[col] = df[col].fillna(value=0)
    
# fill in remaining NaN's with blanks
df.fillna(value='', inplace=True)

In [8]:
df_num.columns = ['FuzzTitle', 'ReleaseDate', 'ProductionBudget', 'DomesticBudget', 'WorldGross']

In [9]:
# merge data frames
data_df = pd.merge(df, df_num, how='left', on='FuzzTitle')
data_df = data_df.drop_duplicates(subset='imdbID')
print(len(data_df))


3899

In [10]:
data_df['Released'] = pd.to_datetime(data_df['Released'])
data_df['ReleaseDate'] = pd.to_datetime(data_df['ReleaseDate'])
data_df.loc[data_df['Released'].isnull(), 'Released'] = data_df['ReleaseDate']

del data_df['ReleaseDate']
del data_df['FuzzTitle']
del data_df['FuzzPercent']

In [11]:
# sum torrent counts
data_df['Total_Torrents'] = data_df[['Kat_Count','Pirate_Count','Extra_Count','Torrentz_Count',
                                     'Torrentz_Ver_Count','Zoogle_Ver_Count']].sum(axis=1)

In [12]:
data_df.reset_index(drop=True, inplace=True)

In [13]:
data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3899 entries, 0 to 3898
Data columns (total 29 columns):
Actors                3899 non-null object
Awards                3899 non-null object
BoxOffice             3899 non-null object
Country               3899 non-null object
DVD                   3899 non-null object
Director              3899 non-null object
Genre                 3899 non-null object
Language              3899 non-null object
Metascore             3899 non-null float64
Production            3899 non-null object
Rated                 3899 non-null object
Released              3899 non-null datetime64[ns]
Runtime               3899 non-null float64
Title                 3899 non-null object
Type                  3899 non-null object
Writer                3899 non-null object
imdbID                3899 non-null object
imdbRating            3899 non-null float64
imdbVotes             3899 non-null float64
Kat_Count             3899 non-null int16
Pirate_Count          3899 non-null int16
Extra_Count           3899 non-null int16
Torrentz_Count        3899 non-null int32
Torrentz_Ver_Count    3899 non-null int16
Zoogle_Ver_Count      3899 non-null float64
ProductionBudget      3899 non-null int64
DomesticBudget        3899 non-null int64
WorldGross            3899 non-null int64
Total_Torrents        3899 non-null float64
dtypes: datetime64[ns](1), float64(6), int16(4), int32(1), int64(3), object(14)
memory usage: 776.8+ KB

In [ ]: