In [ ]:
import pandas as pd
import numpy as np
import graphlab
In [ ]:
df_actor = pd.read_csv('../data/movie_actor_long_example')
df_actor.shape
In [ ]:
df_movie = pd.read_csv('Data/movies.csv')
with open('Data/keywords_clean.list', 'r') as f:
    data = f.read().split('\n')
data_np = np.array(data)
movie_title = df_movie.title
In [ ]:
movie_keyword = [s for s in data_np if any(xs in s for xs in movie_title[0:20])]
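In [ ]:
# Aside: the substring scan above tests every keyword line against 20 titles.
# If the keyword lines are tab-separated with the full title first (as the
# split('\t') cells below assume), an exact-match set lookup is much faster.
# A sketch under that assumption, not the notebook's original method:
title_set = set(movie_title[0:20])
movie_keyword_fast = [s for s in data_np if s.split('\t')[0] in title_set]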
In [ ]:
import csv
with open('data/new_movie_keywords_example', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(movie_keyword)
In [ ]:
df_key = pd.DataFrame(movie_keyword)
In [ ]:
# Add columns for name, year, and keyword.
df_key['name'] = [x.split('\t')[0] for x in df_key[0]]
df_key['keyword'] = [x.split('\t')[-1] for x in df_key[0]]
df_key['year'] = [x.split('(')[1][:-1] for x in df_key['name']]
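In [ ]:
# Quick check of the parsing above on a made-up keyword line (hypothetical,
# not from the real file): title and keyword are tab-separated, and the year
# sits in the trailing parentheses of the name.
line = 'Heat (1995)\taction'
print line.split('\t')[0]                      # name: 'Heat (1995)'
print line.split('\t')[-1]                     # keyword: 'action'
print line.split('\t')[0].split('(')[1][:-1]   # year: '1995'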
In [ ]:
# movie_actors = [s for s in actors_100 if any(xs in s for xs in movie_title[0:20])]
In [ ]:
from load_imdb_data import load_imdb_data
In [ ]:
cd Code
In [ ]:
filename = '../data/imdb_edges.tsv'
actors, movies = load_imdb_data(filename)
In [ ]:
movie_actors = [(k, v) for k, v in movies.iteritems() if any(k in xs for xs in df_key['name'])]
In [ ]:
# df_key['actors'] = [s.value for s in movies if any(s in x for x in )]
movie_actors = [v for k,v in movies.iteritems() if any(k in xs for xs in movie_title[0:20])]
In [ ]:
%%timeit
m_col = []
a_col = []
for movie_t in movie_title[0:1000]:
    for k, v in movies.iteritems():
        if k in movie_t:
            for names in v:
                m_col.append(movie_t)
                a_col.append(names)
movie_actor_long = pd.concat([pd.Series(m_col), pd.Series(a_col)], axis=1)
movie_actor_long.columns = ['title', 'actor']
movie_actor_long.to_csv('../Data/movie_actor_long_example', index=False)
In [ ]:
%%timeit
# Build a pandas DataFrame with movie and actor in long form.
m_col = []
a_col = []
for movie_t in movie_title:
    for k, v in movies.iteritems():
        if k in movie_t:
            for names in v:
                m_col.append(movie_t)
                a_col.append(names)
movie_actor_long = pd.concat([pd.Series(m_col), pd.Series(a_col)], axis=1)
movie_actor_long.columns = ['title', 'actor']
movie_actor_long.to_csv('../Data/movie_actor_long_example', index=False)
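In [ ]:
# Hedged alternative to the nested scans above: if the keys of `movies` match
# the titles exactly, a direct dict lookup avoids the titles-by-movies
# substring test entirely. A sketch under that assumption:
rows = [(t, a) for t in movie_title if t in movies for a in movies[t]]
movie_actor_fast = pd.DataFrame(rows, columns=['title', 'actor'])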
In [ ]:
for x in xrange(len(movie_title) / 100):
    first = x * 100
    last = (x + 1) * 100
    movie_keyword = [s for s in data_np if any(xs in s for xs in movie_title[first:last])]
    with open('../Data/new_movie_keywords_example%s' % str(x + 10), 'wb') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(movie_keyword)
    print x * 100, 'out of', len(movie_title), x * 100 / float(len(movie_title))
In [ ]:
df_actor['count'] = 1
df_actor_matrix = df_actor.reset_index().pivot_table(values='count', index='title', columns='actor', aggfunc='mean')
In [ ]:
df_actor_matrix
In [ ]:
df_key = pd.read_csv('df_key_long')
df_key_matrix = df_key.reset_index().pivot_table(values='count', index='title', columns='keyword', aggfunc='mean')
In [ ]:
df_key_matrix.to_csv('df_key_matrix')
In [ ]:
df_actor_matrix.to_csv('df_actor_matrix')
In [ ]:
df_keyword = df_key[['title','keyword','count']]
keyword_sf = graphlab.SFrame(df_keyword)
m_als = graphlab.recommender.factorization_recommender.create(keyword_sf, num_factors=20,user_id = 'keyword', item_id = 'title', target = 'count', solver='als')
m_als.coefficients
In [ ]:
m_als.coefficients
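In [ ]:
# What the rank-20 factorization above is doing, as a toy numpy sketch
# (illustration only, not GraphLab internals): every observed count is
# approximated by the dot product of a keyword factor and a title factor.
U = np.random.rand(5, 20)   # 5 toy "keywords", 20 latent factors
V = np.random.rand(3, 20)   # 3 toy "titles"
print U.dot(V.T)            # predicted keyword-by-title count matrix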
In [ ]:
from sklearn.decomposition import TruncatedSVD
svd_key = TruncatedSVD(n_components=100, random_state=42)
svd_key.fit(df_key_matrix.fillna(0))  # SVD can't handle NaN, so fill missing counts with 0
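In [ ]:
# Both attributes below are standard scikit-learn TruncatedSVD fields; this
# shows how much variance the 100 components capture and which keywords
# dominate the first component (assuming the fit above succeeded).
print svd_key.explained_variance_ratio_.sum()
comp0 = svd_key.components_[0]
print np.array(df_key_matrix.columns)[comp0.argsort()[::-1][:10]]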
In [ ]:
from sklearn.ensemble import RandomForestClassifier
In [ ]:
rf = RandomForestClassifier()
In [ ]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
In [ ]:
df_key_mat0 = df_key_matrix.fillna(0)  # the clustering cell below needs this
knn = KNeighborsRegressor(n_neighbors=2)
kmeans = KMeans(n_clusters=2)
# knn.fit(df_key_mat0.as_matrix())
In [ ]:
kmeans.fit_predict(df_key_mat0.as_matrix())
array_features = np.array(df_key_mat0.columns.values)
for i, cluster in enumerate(kmeans.cluster_centers_):
    idx = cluster.argsort()[::-1][:10]  # ten highest-weighted keywords per cluster
    print i, array_features[idx]
In [ ]:
import numpy as np
import pandas as pd
from scipy import sparse
import csv
# import graphlab
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
df_key = pd.read_csv('../data/df_key_long')
df_actor = pd.read_csv('../data/movie_actor_long_example')
df_actor['count'] = 1
df_key_matrix = pd.read_csv('../data/df_key_matrix')
df_actor_matrix = pd.read_csv('../data/df_actor_matrix')
# Keep only the keyword columns.
df_keyword = df_key[['title','keyword','count']]
# keyword_sf = graphlab.SFrame(df_keyword)
# m_als = graphlab.recommender.factorization_recommender.create(keyword_sf, user_id = 'keyword', item_id = 'title', target = 'count', solver='als')
# Use KMeans to get clusters for 2 groups, and try random forest to get the highest feature importances.
# knn = KNeighborsRegressor(n_neighbors=2)
# kmeans = KMeans(n_clusters=2)
# df_key_mat0 = df_key_matrix.fillna(0)
# kmeans.fit_predict(df_key_mat0.as_matrix())
# array_features = np.array(df_key_mat0.columns.values)
# for i, cluster in enumerate(kmeans.cluster_centers_):
#     idx = cluster.argsort()[::-1][:10]
#     print i, array_features[idx]
In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import csv
In [4]:
df_key = pd.read_csv('../data/df_key_long')
In [ ]:
df_key_matrix = pd.read_csv('../data/df_key_matrix')
In [5]:
df_key_matrix = df_key.reset_index().pivot_table(values='count', index='title', columns='keyword', aggfunc='mean')
In [7]:
df_key_mat_ex = df_key_matrix[df_key_matrix.columns[df_key_matrix.sum()>5]]
In [24]:
df_key_mat_ex.fillna(0,inplace = True)
In [9]:
df_key_mat_ex.shape
Out[9]:
In [10]:
# df_actor = pd.read_csv('../data/movie_actor_long_example')
# df_actor['count'] = 1
# df_actor_matrix = df_actor.reset_index().pivot_table(values='count', index='title', columns='actor', aggfunc='mean')
df_key_matrix['year'] = [x.split('(')[1][:-1] for x in df_key_matrix.index.values]
df_year = pd.get_dummies(df_key_matrix['year'])
In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
In [25]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth = 20)
clf.fit(df_key_mat_ex.values,np.array(df_key_mat_ex.index))
Out[25]:
In [29]:
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn import tree
with open("../data/movie.dot", 'w') as f:
f = tree.export_graphviz(clf, out_file=f)
# dot_data = StringIO()
# DecisionTreeClassifier.tree.export_graphviz(clf, out_file=dot_data,
# feature_names=iris.feature_names,
# class_names=iris.target_names,
# filled=True, rounded=True,
# special_characters=True)
# >>> graph = pydot.graph_from_dot_data(dot_data.getvalue())
# >>> Image(graph.create_png())
In [28]:
import os
os.unlink("../data/movie.dot")
In [30]:
clf.score(df_key_mat_ex.values,np.array(df_key_mat_ex.index))
Out[30]:
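In [ ]:
# Sanity check of what the tree memorizes: predict a movie back from its own
# keyword row. This is a training-set prediction, so the score above is
# optimistic.
print clf.predict(df_key_mat_ex.values[:1]), df_key_mat_ex.index[0]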
In [32]:
clf40 = DecisionTreeClassifier(max_depth = 140)
clf40.fit(df_key_mat_ex.values,np.array(df_key_mat_ex.index))
clf40.score(df_key_mat_ex.values,np.array(df_key_mat_ex.index))
Out[32]:
In [57]:
df = df_key_mat_ex[df_key_mat_ex.sum(axis = 1) > 5]
In [60]:
clf40 = DecisionTreeClassifier(max_depth = 20)
clf40.fit(df.values,np.array(df.index))
clf40.score(df.values,np.array(df.index))
Out[60]:
In [67]:
from collections import Counter
yr_cnt = Counter(df_key_matrix['year']).most_common()
In [93]:
year_list = [i[0] for i in yr_cnt if i[1]>5]
In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
import csv
# import graphlab
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
import os
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn import tree
from collections import Counter
In [ ]:
df_key = pd.read_csv('../data/df_key_long',dtype={'title': 'str','keyword': 'S20','count':'int'})
In [21]:
df_key_matrix = df_key.reset_index().pivot_table(values='count', index='title', columns='keyword', aggfunc='mean')
df_key_matrix['year'] = [x.split('(')[1][:-1] for x in df_key_matrix.index.values]
#Better to use this one!
yr_cnt = Counter(df_key_matrix['year']).most_common()
year_list = [i[0] for i in yr_cnt if i[1]>5]
df_key_matrix = df_key_matrix[df_key_matrix['year'].isin(year_list)]
df_key_matrix['year'] = df_key_matrix['year'].astype(int)
df_key_mat_ex = df_key_matrix[df_key_matrix.columns[df_key_matrix.sum()>5]].fillna(0)
# list_of_values = [3,6]
# y = df[df['A'] in list_of_values]
In [53]:
test = df.pop('year')
In [59]:
df = df[df.sum(axis = 1) > 5]
In [65]:
df = df.join(test)
In [99]:
# df['after2000'] = (df.year>2000).astype(int)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=20)
clf.fit(df.values,np.array(df.index))
clf.score(df.values,np.array(df.index))
Out[99]:
In [101]:
with open("../data/movie4.dot", 'w') as f:
f = tree.export_graphviz(clf, out_file=f)
In [105]:
from sklearn.externals import joblib
joblib.dump(clf, 'my_movie_model.pkl')
# Reload later with:
# model_clone = joblib.load('my_movie_model.pkl')
Out[105]:
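In [ ]:
# Round-trip check of the pickle above, using the matching joblib call.
model_clone = joblib.load('my_movie_model.pkl')
print model_clone.max_depth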
In [1]:
import pandas as pd
import numpy as np
import csv
# import graphlab
import os
from scipy import sparse  # needed by save_model_value below
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib

# class MovieCleanData(object):

# The keyword list needs cleanup before data preprocessing.
def movie_keyword(keyword_file='../data/keywords_clean.list', pickle_path='../data/df_key'):
    df_key = pd.read_csv(keyword_file, delimiter='\n', header=None)
    df_key['Movie'] = [df_key[0][i].split('\t')[0] for i in xrange(df_key.shape[0])]
    df_key['Keyword'] = [df_key[0][i].split('\t')[-1] for i in xrange(df_key.shape[0])]
    # df_key.to_pickle(pickle_path)
    return df_key

# If the DataFrame has errors, the list file needs to be cleaned up manually.
# Returns a pivoted DataFrame with 27 movie genres.
def movie_genres(genres_file='../data/genres.list', pickle_path='../data/df_genres_pivot'):
    df_genres = pd.read_csv(genres_file, delimiter='\t', dtype='str', header=None)
    df_genres['Genres'] = pd.concat([df_genres[6].dropna(), df_genres[5].dropna(),
                                     df_genres[4].dropna(), df_genres[3].dropna(),
                                     df_genres[2].dropna(),
                                     df_genres[1].dropna()]).reindex_like(df_genres)
    df_genres.drop([1, 2, 3, 4, 5, 6], axis=1, inplace=True)
    df_genres.columns = ['idx', 'Genres']
    df_genres['cnt'] = 1
    df_genres_pivot = df_genres.pivot_table(values='cnt', index='idx', columns='Genres')
    # df_genres_pivot.to_pickle(pickle_path)
    return df_genres_pivot

# The list file includes votes and ratings for each movie; the data needs manual cleanup.
# Returns a DataFrame of movies with more than min_vote_cnt votes.
def movie_rating(rating_file='../data/ratings.list', min_vote_cnt=10000):
    df_rating = pd.read_csv(rating_file, delimiter='\t', dtype='unicode')
    df_rating['Votes'] = [int([i for i in df_rating.loc[j][0].split(' ') if i != ''][1])
                          for j in xrange(df_rating.shape[0])]
    df_rating['Movie'] = [df_rating['New Distribution Votes Rank Title'][i][32:]
                          for i in xrange(len(df_rating))]
    # Exclude the TV dramas (quoted titles) from the DataFrame.
    df_tv = df_rating[df_rating.Movie.str.startswith('"')]
    df_movie_rating = df_rating.drop(df_tv.index)
    df_popular_movie = df_movie_rating[df_movie_rating.Votes > min_vote_cnt]
    l = df_popular_movie.Movie
    df_popular_movie.Movie = [i if i[0] != ' ' else i[1:] for i in l]
    df_popular_movie = df_popular_movie.set_index(df_popular_movie.Movie)
    # df_popular_movie.to_pickle('../data/df_popular_movie_10000')
    return df_popular_movie

def movie_genres_rating(df_movie_genres, df_movie_rating):
    df_gen_vote = df_movie_rating.join(df_movie_genres)
    df_gen_vote.drop(['New Distribution Votes Rank Title', 'Movie', '_ Drama<>Mystery<>crime_'],
                     axis=1, inplace=True)
    # df_gen_vote.to_pickle('../data/df_gen_vote')
    return df_gen_vote

def movie_force_data(df_key, df_gen_vote, number_keywords=100):
    df_clean_long = df_key.set_index(df_key.Movie).ix[df_gen_vote.index]
    df = df_clean_long.ix[df_gen_vote.index[0]][:number_keywords]
    for movie in df_gen_vote.index[1:]:
        df = pd.concat([df, df_clean_long.ix[movie][:number_keywords]], axis=0)
    df['count'] = 1
    df100 = df[['Movie', 'Keyword', 'count']]
    df100 = df100.pivot_table(values='count', index='Movie', columns='Keyword')
    df_100key_genres = df100.join(df_gen_vote)
    df_100key_genres.pop('Votes')
    df_100key_genres = df_100key_genres.fillna(0)
    # df_100key_genres.to_pickle('../data/df_100key_genres')
    return df_100key_genres

def movie_model(df_100key_genres, max_depth=20, pickle_path='../data/my_movie_model20_v10.pkl'):
    X = df_100key_genres.values
    y = df_100key_genres.index
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    clf.fit(X, y)
    # clf.score(X, y)
    # joblib.dump(clf, pickle_path)
    return clf

def save_feature_movie_name(df_100key_genres, save_path='../data/feature_movie_name'):
    movie_list = df_100key_genres.index
    feature_list = df_100key_genres.columns.values
    np.savez(save_path, movie_list, feature_list)

def save_model_features(clf, save_path='../data/my_movie_model20_10_array'):
    np.savez(save_path, clf.tree_.children_left, clf.tree_.children_right,
             clf.tree_.feature, clf.tree_.threshold)

def save_model_value(clf, save_path='../data/my_movie_model20_10_value'):
    valuefiles = clf.tree_.value
    # tree_.value is 3-D (nodes x outputs x classes); flatten the singleton
    # output axis so it fits in a 2-D sparse matrix.
    sparse_values = sparse.lil_matrix(valuefiles.astype(int).reshape(valuefiles.shape[0], -1))
    np.save(save_path, sparse_values)
In [5]:
df_popular_movie = movie_rating(rating_file = 'data/ratings.list', min_vote_cnt = 10000)
df_genres_pivot = movie_genres(genres_file = 'data/genres.list',pickle_path = 'data/df_genres_pivot')
df_gen_vote = movie_genres_rating(df_movie_genres = df_genres_pivot, df_movie_rating = df_popular_movie)
df_key = movie_keyword(keyword_file = 'data/keywords_clean.list', pickle_path = 'data/df_key')
df_100key_genres = movie_force_data(df_key = df_key, df_gen_vote = df_gen_vote, number_keywords = 100)
clf = movie_model(df_100key_genres = df_100key_genres, max_depth = 20, pickle_path = 'data/my_movie_model20_v10.pkl')
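In [ ]:
# Hedged sketch of reading back the arrays that save_model_features writes:
# np.savez with positional arguments stores them as arr_0 ... arr_3 in the
# .npz file (paths are the function defaults; assumes the save was run).
npz = np.load('../data/my_movie_model20_10_array.npz')
children_left, children_right = npz['arr_0'], npz['arr_1']
feature, threshold = npz['arr_2'], npz['arr_3']
print feature[:5], threshold[:5]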
In [3]:
ls
In [6]:
df_popular_movie
Out[6]:
In [7]:
df_genres_pivot
Out[7]:
In [8]:
df_gen_vote
Out[8]:
In [9]:
df_key
Out[9]:
In [10]:
df_100key_genres
Out[10]:
In [13]:
from nltk.corpus import wordnet
from itertools import product
list1 = ['childhood']
list2 = ['childhood','memory']
allsyns1 = set(ss for word in list1 for ss in wordnet.synsets(word))
allsyns2 = set(ss for word in list2 for ss in wordnet.synsets(word))
best = [(wordnet.wup_similarity(s1, s2) or 0, s1, s2)
        for s1, s2 in product(allsyns1, allsyns2)]
In [14]:
best
Out[14]:
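In [ ]:
# The highest-scoring synset pair from `best`; the tuples compare on the
# Wu-Palmer similarity first, so max() returns the closest pair.
print max(best)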
In [15]:
df_100key_genres.index
Out[15]:
In [16]:
df_100key_genres.columns.values
Out[16]:
In [39]:
df_100key_genres.to_csv('data/df_pickle')
In [38]:
df_100key_genres.to_csv?
In [40]:
df = pd.read_csv('data/df_pickle')
In [70]:
from collections import defaultdict
movie_dict = defaultdict(list)
for movie in df_100key_genres.index.values:
    for keyword, value in df_100key_genres.loc[movie].iteritems():
        if value == 1:
            movie_dict[movie].append(keyword.decode('latin-1').encode("utf-8"))
In [73]:
import json
with open('data/movie_dict.json', 'w') as f:
    json.dump(movie_dict, f, encoding='ISO-8859-1')
In [76]:
with open('data/test.json', 'r') as f:
    a = json.load(f)
In [90]:
set(movie_dict[u"'71 (2014)"])
Out[90]:
In [98]:
set(['1970s','on-the-run']).issubset(set(movie_dict[u"'71 (2014)"]))
Out[98]:
In [100]:
next_movie_list = []
true_list = ['1970s']
false_list = ['riot', 'rogue-soldier']
for movie, features in movie_dict.iteritems():
    # Keep movies that have every "true" feature and none of the "false" ones.
    if set(true_list).issubset(set(features)) and set(false_list).isdisjoint(set(features)):
        next_movie_list.append(movie)
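In [ ]:
# Quick look at how many movies survive the yes/no filter above.
print len(next_movie_list), sorted(next_movie_list)[:5]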
In [102]:
newdict = defaultdict(list)
for i in next_movie_list:
    newdict[i] = movie_dict[i]
In [120]:
sample={'user1': {'item1': 2.5, 'item2': 3.5, 'item3': 3.0, 'item4': 3.5, 'item5': 2.5, 'item6': 3.0},
'user2': {'item1': 2.5, 'item2': 3.0, 'item3': 3.5, 'item4': 4.0},
'user3': {'item2':4.5,'item5':1.0,'item6':4.0}}
df = pd.DataFrame([
    [col1, d] for col1, d in sample.items()
])
In [138]:
df = pd.DataFrame([
    [col1, v] for col1, v in newdict.iteritems()])
In [145]:
new_feature = []
for k, v in newdict.iteritems():
    new_feature.extend(v)
In [147]:
len(new_feature)
Out[147]:
In [148]:
print len(set(new_feature))
In [174]:
df = pd.DataFrame(columns=['name', 'features', 'count'])
ix = 0
for k, v in newdict.items():
    for i in v:
        df.loc[ix] = [k, i, 1]
        ix += 1
In [175]:
df_new = df.pivot_table(values='count', index='name', columns='features', aggfunc='mean')
In [176]:
df_new
Out[176]:
In [180]:
from sklearn.tree import DecisionTreeClassifier
df_new.fillna(0,inplace = True)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=20)
clf.fit(df_new.values,np.array(df_new.index))
Out[180]:
In [181]:
clf.score(df_new.values,np.array(df_new.index))
Out[181]:
In [182]:
df_new.values
Out[182]:
In [ ]: