Analysis of games over the past 20 years, based on IGN game reviews



In [1]:

    
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline



In [2]:

    
# Load the IGN reviews; the CSV's 'Unnamed: 0' column becomes the row index.
dataframe = pd.read_csv(
    'ign.csv',
    index_col='Unnamed: 0',
)
# Preview the first rows of the frame.
dataframe.head()









    Out[2]:






  
    
      
      score_phrase
      title
      url
      platform
      score
      genre
      editors_choice
      release_year
      release_month
      release_day
    
  
  
    
      0
      Amazing
      LittleBigPlanet PS Vita
      /games/littlebigplanet-vita/vita-98907
      PlayStation Vita
      9.0
      Platformer
      Y
      2012
      9
      12
    
    
      1
      Amazing
      LittleBigPlanet PS Vita -- Marvel Super Hero E...
      /games/littlebigplanet-ps-vita-marvel-super-he...
      PlayStation Vita
      9.0
      Platformer
      Y
      2012
      9
      12
    
    
      2
      Great
      Splice: Tree of Life
      /games/splice/ipad-141070
      iPad
      8.5
      Puzzle
      N
      2012
      9
      12
    
    
      3
      Great
      NHL 13
      /games/nhl-13/xbox-360-128182
      Xbox 360
      8.5
      Sports
      N
      2012
      9
      11
    
    
      4
      Great
      NHL 13
      /games/nhl-13/ps3-128181
      PlayStation 3
      8.5
      Sports
      N
      2012
      9
      11



In [3]:

    
# Keep only the columns that the plots and the classifier below work with.
selected_columns = ['score_phrase','platform','score','genre','editors_choice']
games = dataframe[selected_columns]



In [4]:

    
# Lookup tables used to replace string labels with numeric codes.
# score_phrase codes are fixed by hand: 0 ('Masterpiece') through 10 ('Disaster').
score_phrase_dict = {
    'Masterpiece': 0,
    'Amazing': 1,
    'Great': 2,
    'Good': 3,
    'Okay': 4,
    'Mediocre': 5,
    'Bad': 6,
    'Awful': 7,
    'Painful': 8,
    'Unbearable': 9,
    'Disaster': 10,
}
# Inverse lookup (code -> phrase), used for plot legends.
reverse_score_dict = {code: phrase for phrase, code in score_phrase_dict.items()}

# The remaining columns get codes in order of first appearance in the data.
platform_dict = {label: code for code, label in enumerate(games['platform'].unique())}
genre_dict = {label: code for code, label in enumerate(games['genre'].unique())}
editors_choice_dict = {label: code for code, label in enumerate(games['editors_choice'].unique())}



In [5]:

    
# Swap every string label for its numeric code, using one mapping per column.
column_encoders = {
    'score_phrase': score_phrase_dict,
    'platform': platform_dict,
    'genre': genre_dict,
    'editors_choice': editors_choice_dict,
}
games = games.replace(column_encoders)



In [6]:

    
# Visualizing the distribution based on score of the games
# dat[k] holds the scores of the games whose encoded score_phrase is k+1
# (codes 1..10; code 0, 'Masterpiece', is not included in dat).
dat = [games[games['score_phrase'] == i]['score'] for i in range(1,11)]
# Top half of the score_phrase scale (codes 1..5).
# Floor division keeps the loop bound an int on Python 3 as well as Python 2.
for i in range(len(dat)//2):
    sns.kdeplot(dat[i],shade=True,label=reverse_score_dict[i+1])









    



/usr/lib/pymodules/python2.7/matplotlib/collections.py:548: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':



In [7]:

    
# Bottom half of the score_phrase
# dat was built from codes 1..10, so dat[i+5] holds encoded score_phrase i+6;
# the legend label therefore has to be looked up with i+6 as well
# (using i+5 labels every curve with the neighbouring, better phrase).
# Floor division keeps the loop bound an int on Python 3 as well as Python 2.
for i in range(len(dat)//2):
    sns.kdeplot(dat[i+5],shade=True,label=reverse_score_dict[i+6])



In [8]:

    
def score(val_predict,test_y):
    """Return the percentage of predictions that match the actual labels.

    Parameters
    ----------
    val_predict : sequence
        Predicted class labels.
    test_y : sequence
        Actual class labels; its length is the denominator. Any sized
        sequence works (list, tuple, 1-D NumPy array).

    Returns
    -------
    float
        100 * (number of positions where prediction == actual) / len(test_y).

    Raises
    ------
    ZeroDivisionError
        If test_y is empty.

    Notes
    -----
    The two inputs are compared pairwise with zip, so if val_predict is
    shorter than test_y the missing positions count as mismatches.
    """
    # len() instead of .shape[0] so plain Python sequences are accepted too.
    total = len(test_y)
    matches = sum(1 for predicted, actual in zip(val_predict, test_y) if predicted == actual)
    return (100.0 * matches) / total



In [9]:

    
# Hold out 33% of the rows as a final test set, then build 5-fold
# cross-validation indices over the remaining training rows.
# random_state pins the shuffle so the split (and every score derived from it)
# is the same on each run.
# NOTE(review): 'score' is used as a feature while the target is score_phrase,
# which looks like a direct binning of score (9.0 -> Amazing, 8.5 -> Great in
# the preview) -- possible target leakage; confirm before trusting the accuracy.
feature_columns = ['platform','score','genre','editors_choice']
train_X,test_X,train_y,test_y = train_test_split(games[feature_columns].values,
                                                 games['score_phrase'].values,
                                                 test_size=0.33,random_state=42)
kf = KFold(n =train_X.shape[0], n_folds=5)

Random Forest Classifier



In [10]:

    
# 5-fold cross-validation of a random forest on the training split.
# random_state pins the forest's randomness so the fold scores are reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
# Each iteration refits the same estimator on that fold's training rows and
# scores it on the fold's held-out rows.
for fold,(train_index,test_index) in enumerate(kf):
    rfc.fit(train_X[train_index],train_y[train_index])
    val_predict = rfc.predict(train_X[test_index])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Fold {0}, Score = {1}'.format(fold,score(val_predict,train_y[test_index])))









    



Fold 0, Score = 99.8397435897
Fold 1, Score = 99.7996794872
Fold 2, Score = 100.0
Fold 3, Score = 99.879759519
Fold 4, Score = 99.9198396794



In [11]:

    
# Evaluate on the held-out test split.
# NOTE(review): rfc is the estimator as left by the last cross-validation
# iteration, i.e. fitted on that fold's training rows only, not on all of train_X.
val_predict = rfc.predict(test_X)
# Only the score is formatted; single-argument print(...) behaves the same on
# Python 2 and Python 3.
print('Prediction Score = {0}'.format(score(val_predict,test_y)))









    



Prediction Score = 99.8373190174

	score_phrase	title	url	platform	score	genre	editors_choice	release_year	release_month	release_day
0	Amazing	LittleBigPlanet PS Vita	/games/littlebigplanet-vita/vita-98907	PlayStation Vita	9.0	Platformer	Y	2012	9	12
1	Amazing	LittleBigPlanet PS Vita -- Marvel Super Hero E...	/games/littlebigplanet-ps-vita-marvel-super-he...	PlayStation Vita	9.0	Platformer	Y	2012	9	12
2	Great	Splice: Tree of Life	/games/splice/ipad-141070	iPad	8.5	Puzzle	N	2012	9	12
3	Great	NHL 13	/games/nhl-13/xbox-360-128182	Xbox 360	8.5	Sports	N	2012	9	11
4	Great	NHL 13	/games/nhl-13/ps3-128181	PlayStation 3	8.5	Sports	N	2012	9	11