In [1]:
import sys
import pandas as pd
import difflib

In [2]:
# File locations used by the later cells:
#   filename1 - text file that is read as the list of movie titles
#   filename2 - CSV file that is read as the list of ratings
#   filename3 - CSV file the final table is written to
# When running this as a command-line script, the three names can be taken
# from sys.argv[1], sys.argv[2] and sys.argv[3] instead of the literals below.
filename1, filename2, filename3 = (
    "movie_list.txt",
    "movie_ratings.csv",
    "output.csv",
)

In [3]:
# Load the movie titles: the file has no header row, so the single column is
# named "title" explicitly, and the frame is kept sorted by that column.
# NOTE(review): open() uses the platform's default text encoding here -
# confirm the input files are stored in that encoding.
with open(filename1) as movie_file:
    df_movie_list = pd.read_table(
        movie_file,
        header=None,
        names=['title'],
        lineterminator='\n',
    ).sort_values("title")

# Load the ratings: comma-separated, with column names taken from the
# first line of the file.
with open(filename2) as ratings_file:
    df_rating_list = pd.read_table(
        ratings_file,
        sep=',',
        lineterminator='\n',
    )

In [13]:
def match(x, titles=None):
    """Return the candidate title that most closely resembles ``x``.

    Parameters
    ----------
    x : str
        The (possibly misspelled) title to look up. Any non-string value,
        such as the NaN pandas uses for a missing cell, is treated as
        "no match" instead of raising inside difflib.
    titles : iterable of str, optional
        Candidate titles to compare against. When omitted, the "title"
        column of the notebook-level ``df_movie_list`` is used, looked up
        at call time.

    Returns
    -------
    str or float
        The best-scoring candidate, or ``float('NaN')`` when ``x`` is not a
        string or no candidate reaches difflib's default similarity cutoff.
    """
    # difflib calls len() on the word, so a float NaN would raise TypeError.
    if not isinstance(x, str):
        return float('NaN')
    if titles is None:
        titles = df_movie_list["title"]
    # Only the single best candidate is used, so ask difflib for just one.
    matches = difflib.get_close_matches(x, titles, n=1)
    return matches[0] if matches else float('NaN')
    
# Map every rated title onto its closest entry in the movie list (NaN when
# nothing is close enough) and make sure the ratings are numeric.
df_rating_list["match"] = df_rating_list["title"].apply(match)
df_rating_list["rating"] = df_rating_list["rating"].astype(float)

# Drop ratings that could not be matched and order the rest by matched title.
df_rating_list = (
    df_rating_list[df_rating_list["match"].notnull()]
    .sort_values("match")
)

# Average rating per matched title, rounded to two decimals.
# Only the "rating" column is aggregated: summing the whole frame would also
# aggregate the free-text "title" column, which on pandas >= 2.0 leaves an
# extra column behind and makes the two-name column assignment below fail.
joined_df = (
    df_rating_list.groupby("match")["rating"]
    .mean()
    .round(2)
    .reset_index()
)
joined_df.columns = ["title", "rating"]

# Inner-join against the movie list on the shared "title" column, write the
# result to disk, and show it as the cell output.
final_df = pd.merge(joined_df, df_movie_list, on="title")
final_df.to_csv(filename3, index=False)
final_df


Out[13]:
title rating
0 13th 9.00
1 42 7.67
2 Abraham Lincoln: Vampire Hunter 5.86
3 Alice in Wonderland 6.50
4 Arthur Christmas 8.50
5 Before Sunset 7.75
6 Bridget Jones's Baby 7.00
7 Cape Fear 7.25
8 Cars 8.50
9 Cobain: Montage of Heck 8.50
10 Contraband 5.25
11 Dead Poets Society 8.75
12 Defiance 8.00
13 Delivery Man 6.21
14 Drive 7.89
15 Eat Pray Love 6.00
16 Flipped 8.00
17 Forgetting Sarah Marshall 8.00
18 Hercules 6.42
19 If I Stay 7.36
20 Iron Man 3 7.83
21 Just Go with It 6.43
22 Kari-gurashi no Arietti 7.86
23 Kung Fu Panda 7.75
24 Les Misérables 7.78
25 Me, Myself & Irene 6.33
26 Moulin Rouge! 8.14
27 Mr. Turner 5.60
28 Night at the Museum: Secret of the Tomb 6.56
29 Reservoir Dogs 8.50
30 Rush 8.31
31 Solace 7.67
32 Spy 7.24
33 Ted 2 6.46
34 Terminator 3: Rise of the Machines 6.33
35 The Big Short 7.63
36 The Bourne Identity 7.43
37 The Day After Tomorrow 7.50
38 The Fast and the Furious 6.78
39 The Fate of the Furious 7.55
40 The Others 8.00
41 The Rewrite 5.50
42 The Thin Red Line 3.54
43 The Ugly Truth 8.00
44 The Wolf of Wall Street 8.39
45 Thor 7.24
46 Total Recall 6.00
47 Tropic Thunder 7.80
48 Who Am I - Kein System ist sicher 9.00
49 Wreck-It Ralph 8.22