In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
In [2]:
# Read ign.csv, using its 'Unnamed: 0' column as the row index.
dataframe = pd.read_csv(
    'ign.csv',
    index_col='Unnamed: 0',
)
# Show the first rows as the cell output.
dataframe.head()
Out[2]:
In [3]:
# Keep only the columns that the plots and the classifier below rely on.
selected_columns = [
    'score_phrase',
    'platform',
    'score',
    'genre',
    'editors_choice',
]
games = dataframe[selected_columns]
In [4]:
# Lookup tables used to replace string labels with numeric codes.
score_phrase_dict = {
    'Masterpiece': 0,
    'Amazing': 1,
    'Great': 2,
    'Good': 3,
    'Okay': 4,
    'Mediocre': 5,
    'Bad': 6,
    'Awful': 7,
    'Painful': 8,
    'Unbearable': 9,
    'Disaster': 10,
}
# Inverse lookup (numeric code -> score phrase), used for plot labels.
reverse_score_dict = {code: phrase for phrase, code in score_phrase_dict.items()}
# For the remaining categorical columns, number the distinct values
# 0..k-1 in the order in which unique() returns them.
platform_dict = {label: code for code, label in enumerate(games['platform'].unique())}
genre_dict = {label: code for code, label in enumerate(games['genre'].unique())}
editors_choice_dict = {label: code for code, label in enumerate(games['editors_choice'].unique())}
In [5]:
# Per-column mapping: each column name points at the {label: code} table
# that should be applied to that column.
column_encodings = {
    'score_phrase': score_phrase_dict,
    'platform': platform_dict,
    'genre': genre_dict,
    'editors_choice': editors_choice_dict,
}
games = games.replace(to_replace=column_encodings)
In [6]:
# Visualizing the distribution of the numeric score for each score phrase.
# dat[k] holds the 'score' values of the games whose encoded score_phrase
# is k + 1 (codes 1..10; code 0 is not collected here).
dat = [games[games['score_phrase'] == i]['score'] for i in range(1, 11)]
# Top half of the score_phrase codes.
# Floor division keeps the loop bound an int under true division (Python 3)
# while giving the same value as the Python 2 integer '/'.
for i in range(len(dat) // 2):
    # dat[i] belongs to code i + 1, so that is the label to look up.
    sns.kdeplot(dat[i], shade=True, label=reverse_score_dict[i + 1])
In [7]:
# Bottom half of the score_phrase codes.
# dat[k] holds the scores for score_phrase code k + 1, so the label for
# dat[idx] is reverse_score_dict[idx + 1]. (Using the same offset for the
# data index and the label shifted every label by one phrase.)
half = len(dat) // 2  # floor division: stays an int under Python 3 as well
for idx in range(half, 2 * half):
    sns.kdeplot(dat[idx], shade=True, label=reverse_score_dict[idx + 1])
In [8]:
# Scores predicted labels against the actual labels by counting exact
# class matches and returning the percentage of correct predictions.
def score(val_predict, test_y):
    """Return the percentage (0.0-100.0) of predictions equal to the true labels.

    Parameters
    ----------
    val_predict : sequence
        Predicted class labels (numpy array, list, ...).
    test_y : sequence
        True class labels, same length as ``val_predict``.

    Returns
    -------
    float
        ``100.0 * matches / len(test_y)``.

    Raises
    ------
    ValueError
        If the two sequences differ in length; pairing them with ``zip``
        would otherwise silently ignore the surplus elements.
    ZeroDivisionError
        If ``test_y`` is empty.
    """
    # len() works for numpy arrays (first dimension) as well as plain
    # sequences, so callers are not forced to pass arrays.
    total = len(test_y)
    if len(val_predict) != total:
        raise ValueError(
            'val_predict and test_y must have the same length '
            '({0} != {1})'.format(len(val_predict), total))
    matches = sum(1 for predicted, actual in zip(val_predict, test_y)
                  if predicted == actual)
    return (100.0 * matches) / total
In [9]:
# Hold out 33% of the rows as a test set. random_state is pinned so that a
# fresh "Restart & Run All" reproduces the same split (and the same scores).
# NOTE(review): 'score' is one of the features; if score_phrase is derived
# from score, the label is recoverable from that single column -- confirm
# that this is intended.
feature_columns = ['platform', 'score', 'genre', 'editors_choice']
train_X, test_X, train_y, test_y = train_test_split(
    games[feature_columns].values,
    games['score_phrase'].values,
    test_size=0.33,
    random_state=42,
)
# 5-fold index generator over the training rows only (legacy
# sklearn.cross_validation signature: KFold(n, n_folds)).
kf = KFold(n=train_X.shape[0], n_folds=5)
RandomForest Classifier
In [10]:
# 100-tree random forest; random_state is pinned so the fold scores are
# reproducible from one run of the notebook to the next.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
# Iterate over the (train indices, validation indices) pairs produced by kf;
# enumerate supplies the fold number instead of a hand-maintained counter.
for fold, (train_index, test_index) in enumerate(kf):
    rfc.fit(train_X[train_index], train_y[train_index])
    val_predict = rfc.predict(train_X[test_index])
    # A parenthesised single argument prints identically as a Python 2
    # print statement and as a Python 3 print() call.
    print('Fold {0}, Score = {1}'.format(fold, score(val_predict, train_y[test_index])))
# NOTE: rfc is refit on every iteration, so after the loop it holds the model
# trained on the last fold's training indices only.
In [11]:
# Evaluate the fitted classifier on the held-out test set.
val_predict = rfc.predict(test_X)
# Only the score is formatted; the printed text is unchanged. The single
# parenthesised argument keeps the line valid on both Python 2 and Python 3.
print('Prediction Score = {0}'.format(score(val_predict, test_y)))