In [1]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve
In [2]:
filepath = '/Users/mac/Desktop/Kaggle_datasets/Video_Game_Sale/'
filename01 = 'Video_Games_Sales_22122016.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
Out[2]:
In [3]:
df_full.info()  # note: User_Score is stored as object dtype and must be converted to numeric before use
In [3]:
df = df_full.dropna().copy()  # .copy() avoids SettingWithCopyWarning when columns are assigned later
df.head()
Out[3]:
In [24]:
df.columns
Out[24]:
In [13]:
df.Genre.value_counts()
Out[13]:
In [83]:
df['User_Score'] = pd.to_numeric(df['User_Score'])
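The raw file uses the placeholder string 'tbd' for unscored titles, which is why User_Score loads as object dtype; dropna() appears to have removed those rows here (their User_Count is missing), so the plain conversion succeeds. A minimal sketch of a conversion that also tolerates the placeholder on the full frame:
In [ ]:
# Hedged sketch: errors='coerce' turns non-numeric entries such as 'tbd'
# into NaN instead of raising, so this also works on df_full.
pd.to_numeric(df_full['User_Score'], errors='coerce').isna().sum()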
In [74]:
sns.jointplot(x='Critic_Score', y='Critic_Count', data=df, kind='hex', cmap='cubehelix', height=8)  # height= replaces the deprecated size= argument
plt.show()
In [82]:
sns.jointplot(x='Critic_Score', y='User_Score', data=df, kind='hex', cmap='cubehelix', height=8)
plt.show()
In [84]:
columns_dum = ['Platform','Genre', 'Publisher','Developer', 'Rating']
df_dum = pd.get_dummies(df, columns=columns_dum)
df_dum.head()
Out[84]:
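Publisher and Developer each have hundreds of distinct values, so get_dummies yields a very wide frame. A minimal sketch of capping the cardinality before encoding (the top_n cutoff of 30 is an arbitrary choice for illustration):
In [ ]:
# Hedged sketch: pool infrequent publishers into an 'Other' bucket
# before one-hot encoding, to limit the number of dummy columns.
top_n = 30  # arbitrary cutoff
top_publishers = df['Publisher'].value_counts().nlargest(top_n).index
df_capped = df.copy()
df_capped['Publisher'] = df_capped['Publisher'].where(
    df_capped['Publisher'].isin(top_publishers), other='Other')
pd.get_dummies(df_capped, columns=['Publisher']).shape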
In [30]:
k = 15 #number of variables for heatmap
corrmat = df_dum.corr()
cols = corrmat.nlargest(k, 'Critic_Score')['Critic_Score'].index
cm = np.corrcoef(df_dum[cols].values.T)
plt.figure(figsize=(15,15))  # adjust the figure size as needed
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
yticklabels = cols.values, xticklabels = cols.values, cmap='rainbow')
plt.show()
In [85]:
k = 15 #number of variables for heatmap
corrmat = df_dum.corr()
cols = corrmat.nlargest(k, 'User_Score')['User_Score'].index
cm = np.corrcoef(df_dum[cols].values.T)
plt.figure(figsize=(15,15))  # adjust the figure size as needed
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
yticklabels = cols.values, xticklabels = cols.values, cmap='rainbow')
plt.show()
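To read the ranking off directly rather than from the heatmap, the correlation matrix computed above can be queried in one line:
In [ ]:
# Features most correlated (in absolute value) with User_Score,
# excluding the trivial self-correlation.
corrmat['User_Score'].drop('User_Score').abs().nlargest(15)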
In [86]:
# DataFrame containing only the 7th-gen consoles
video7th = df[df['Platform'].isin(['Wii', 'PS3', 'X360'])]
video7th.shape
Out[86]:
In [106]:
yearlySales = video7th.groupby(['Year_of_Release','Platform']).Global_Sales.sum()
yearlySales.unstack().plot(kind='bar',stacked=True, colormap= 'Blues', grid=False)
plt.title('Stacked Barplot of Global Yearly Sales of the 7th Gen Consoles')
plt.ylabel('Global Sales')
plt.show()
In [107]:
yearlySales = video7th.groupby(['Year_of_Release','Platform']).Global_Sales.sum()
yearlySales.unstack().plot(kind='bar',stacked=False, colormap= 'Blues', grid=False)
plt.title('Grouped Barplot of Global Yearly Sales of the 7th Gen Consoles')
plt.ylabel('Global Sales')
plt.show()
In [90]:
yearlySales.head()  # group by the first two variables, then aggregate the third
Out[90]:
In [108]:
ratingSales = video7th.groupby(['Rating','Platform']).Global_Sales.sum()
ratingSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greens', grid=False)
plt.title('Stacked Barplot of Sales per Rating type of the 7th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [97]:
ratingSales
Out[97]:
In [113]:
GenreSales = video7th.groupby(['Genre','Platform']).Global_Sales.sum()
GenreSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greys', grid=False)
plt.title('Stacked Barplot of Sales per Genre of the 7th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [100]:
GenreSales
Out[100]:
In [114]:
colors = ['#008DB8','#00AAAA','#00C69C']
plt.subplot(121)
plt.pie( video7th.groupby('Platform').Global_Sales.sum(),
# with the labels being platform
labels=video7th.groupby('Platform').Global_Sales.sum().index,
# with no shadows
shadow=False,
# stating our colors
colors=colors,
explode=(0.05, 0.05, 0.05),
# with the start angle at 90 degrees
startangle=90,
# with percentages shown to one decimal place
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of Global Sales')
plt.subplot(122)
plt.pie( video7th.groupby('Platform').User_Count.sum(),
labels=video7th.groupby('Platform').User_Count.sum().index,
shadow=False,
colors=colors,
explode=(0.05, 0.05, 0.05),
startangle=90,
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of User Base')
plt.tight_layout()
plt.show()
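The same pair of pie charts is drawn again below for the 8th-gen consoles, so a small helper avoids the duplication. A minimal sketch (plot_platform_pies is a hypothetical name, not from the original):
In [ ]:
# Hedged sketch: factor the duplicated pie-chart code into one function.
def plot_platform_pies(frame, colors=('#008DB8', '#00AAAA', '#00C69C')):
    for i, (col, title) in enumerate([('Global_Sales', 'Pie Chart of Global Sales'),
                                      ('User_Count', 'Pie Chart of User Base')]):
        totals = frame.groupby('Platform')[col].sum()
        plt.subplot(1, 2, i + 1)
        plt.pie(totals, labels=totals.index, colors=list(colors),
                explode=[0.05] * len(totals), startangle=90, autopct='%1.1f%%')
        plt.axis('equal')
        plt.title(title)
    plt.tight_layout()
    plt.show()

plot_platform_pies(video7th)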
In [116]:
video8th = df[df['Platform'].isin(['WiiU', 'PS4', 'XOne'])]
video8th.shape
Out[116]:
In [117]:
yearlySales = video8th.groupby(['Year_of_Release','Platform']).Global_Sales.sum()
yearlySales.unstack().plot(kind='bar',stacked=True, colormap= 'Blues', grid=False)
plt.title('Stacked Barplot of Global Yearly Sales of the 8th Gen Consoles')
plt.ylabel('Global Sales')
plt.show()
In [118]:
ratingSales = video8th.groupby(['Rating','Platform']).Global_Sales.sum()
ratingSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greens', grid=False)
plt.title('Stacked Barplot of Sales per Rating type of the 8th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [119]:
GenreSales = video8th.groupby(['Genre','Platform']).Global_Sales.sum()
GenreSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greys', grid=False)
plt.title('Stacked Barplot of Sales per Genre of the 8th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [120]:
colors = ['#008DB8','#00AAAA','#00C69C']
plt.subplot(121)
plt.pie( video8th.groupby('Platform').Global_Sales.sum(),
# with the labels being platform
labels=video8th.groupby('Platform').Global_Sales.sum().index,
# with no shadows
shadow=False,
# stating our colors
colors=colors,
explode=(0.05, 0.05, 0.05),
# with the start angle at 90 degrees
startangle=90,
# with percentages shown to one decimal place
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of Global Sales')
plt.subplot(122)
plt.pie( video8th.groupby('Platform').User_Count.sum(),
labels=video8th.groupby('Platform').User_Count.sum().index,
shadow=False,
colors=colors,
explode=(0.05, 0.05, 0.05),
startangle=90,
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of User Base')
plt.tight_layout()
plt.show()
In [40]:
from sklearn.utils import shuffle
shuffle_df = shuffle(df_dum, random_state=42)
df_label = shuffle_df['User_Score']
df_feature = shuffle_df.drop('User_Score', axis=1)
cut_point = round(len(df_dum)*0.6)
train_feature = np.array(df_feature.values[:cut_point,1:])
train_label = np.array(df_label.values[:cut_point]).astype(float)
test_feature = np.array(df_feature.values[cut_point:,1:])
test_label = np.array(df_label.values[cut_point:]).astype(float)
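The manual shuffle-and-cut above can also be written with scikit-learn's built-in splitter; a minimal sketch of the same 60/40 split (the exact rows assigned to each side will differ from the hand-rolled version):
In [ ]:
# Hedged sketch: the same 60/40 split via train_test_split.
from sklearn.model_selection import train_test_split
train_feature, test_feature, train_label, test_label = train_test_split(
    df_feature.values[:, 1:], df_label.values.astype(float),
    test_size=0.4, random_state=42)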
In [41]:
from sklearn import svm
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in modern scikit-learn
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=svm.LinearSVR()
regr.fit(X_train,y_train)
print('Coefficients:%s, intercept %s'%(regr.coef_,regr.intercept_))
print('Score: %.2f' % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[41]:
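The same MAPE expression recurs after every model below; a tiny helper keeps it in one place (mape is a hypothetical name, not part of scikit-learn):
In [ ]:
# Hedged sketch: mean absolute percentage error as a reusable helper.
def mape(y_true, y_pred):
    return np.mean(np.abs((y_pred - y_true) / y_true))

mape(test_label, regr.predict(test_feature))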
In [42]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=DecisionTreeRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[42]:
In [43]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=ensemble.RandomForestRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[43]:
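The fitted forest also exposes which inputs drive its predictions; the feature names are recovered from the frame the arrays were sliced from:
In [ ]:
# Hedged sketch: rank features by random-forest importance.
feature_names = df_feature.columns[1:]  # matches the values[:, 1:] slice used above
pd.Series(regr.feature_importances_, index=feature_names).nlargest(10)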
In [65]:
plt.figure(figsize=(8,8))
plt.scatter(regr.predict(test_feature), test_label, alpha=0.3)
x=np.linspace(2, 10, 40)
y=x
plt.plot(x,y,c='red')  # reference line where prediction equals the true value
plt.show()
In [21]:
# Scale features to [0, 1] (min-max scaling, fitted on the training set only)
scaler = MinMaxScaler()
scaler.fit(train_feature)
train_feature_trans = scaler.transform(train_feature)
test_feature_trans = scaler.transform(test_feature)
In [18]:
# Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
model = Sequential()
model.add(Dense(units=500,
input_dim=1596,
kernel_initializer='uniform',
))
model.add(Dropout(0.5))
model.add(Dense(units=200,
kernel_initializer='uniform',
))
model.add(Dropout(0.5))
model.add(Dense(units=200,
kernel_initializer='uniform',
))
model.add(Dropout(0.5))
model.add(Dense(units=1,  # a single output unit for regression
kernel_initializer='uniform',
))
model.summary()  # shows the layers and parameter counts (summary() prints directly)
model.compile(loss='mean_squared_error',
optimizer='adam', metrics=['accuracy'])  # note: accuracy is not a meaningful metric for regression
train_history = model.fit(x=train_feature_trans, y=train_label,  # Keras performs the validation split internally
validation_split=0.8, epochs=40,  # note: validation_split=0.8 holds out 80% for validation, training on only 20%
batch_size=2000, verbose=1)  # verbose=1 prints a progress bar for each epoch
show_train_history(train_history,'acc','val_acc')  # on newer Keras the history keys are 'accuracy'/'val_accuracy'
show_train_history(train_history,'loss','val_loss')
scores = model.evaluate(test_feature_trans, test_label)
print('\n')
print('accuracy=',scores[1])
prediction = model.predict(test_feature_trans).flatten()  # flatten the (n, 1) output so it broadcasts correctly against test_label
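Training for a fixed 40 epochs can over- or under-shoot; an early-stopping callback is the usual guard. A minimal sketch, assuming a more conventional 20% validation holdout (the patience value is arbitrary):
In [ ]:
# Hedged sketch: stop when validation loss stops improving and keep the best weights.
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
train_history = model.fit(x=train_feature_trans, y=train_label,
                          validation_split=0.2, epochs=100,
                          batch_size=2000, verbose=1, callbacks=[early_stop])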
In [39]:
np.mean(np.abs( (prediction-test_label)/test_label ))  # mean absolute percentage error (MAPE)
Out[39]:
In [23]:
cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
df_sales_pred = df_dum.drop(cols, axis=1)  # drop the regional sales columns, which sum to Global_Sales (target leakage)
df_sales_pred.head()
Out[23]:
In [73]:
k = 15 #number of variables for heatmap
corrmat = df_sales_pred.corr()
cols = corrmat.nlargest(k, 'Global_Sales')['Global_Sales'].index
cm = np.corrcoef(df_sales_pred[cols].values.T)
plt.figure(figsize=(15,15))  # adjust the figure size as needed
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
yticklabels = cols.values, xticklabels = cols.values, cmap='rainbow')
plt.show()
In [66]:
from sklearn.utils import shuffle
shuffle_df = shuffle(df_sales_pred, random_state=42)
df_label = shuffle_df['Global_Sales']
df_feature = shuffle_df.drop('Global_Sales', axis=1)
cut_point = round(len(df_sales_pred)*0.6)  # use the current frame's length (same row count as df_dum, but consistent)
train_feature = np.array(df_feature.values[:cut_point,1:])
train_label = np.array(df_label.values[:cut_point]).astype(float)
test_feature = np.array(df_feature.values[cut_point:,1:])
test_label = np.array(df_label.values[cut_point:]).astype(float)
In [38]:
from sklearn import svm
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=svm.LinearSVR()
regr.fit(X_train,y_train)
print('Coefficients:%s, intercept %s'%(regr.coef_,regr.intercept_))
print('Score: %.2f' % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[38]:
In [37]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=DecisionTreeRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[37]:
In [67]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=ensemble.RandomForestRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[67]:
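Global sales are heavily right-skewed (most titles sell well under 2M units, as the scatter plot below shows), which strains a squared-error fit on the raw scale. A sketch of one common variant, training on a log-transformed target:
In [ ]:
# Hedged sketch: fit on log1p(sales), invert with expm1, score on the original scale.
regr_log = ensemble.RandomForestRegressor()
regr_log.fit(X_train, np.log1p(y_train))
pred = np.expm1(regr_log.predict(test_feature))
np.mean(np.abs((pred - test_label) / test_label))  # MAPE on the original scale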
In [72]:
plt.figure(figsize=(8,8))
plt.scatter(regr.predict(test_feature), test_label, alpha=0.3)
x=np.linspace(0, 40, 100)
y=x
plt.plot(x,y,c='red')  # reference line where prediction equals the true value
plt.xlim(0,2)
plt.ylim(0,2)
plt.show()  # in the 0-2 sales range the predictions are all over the place
In [ ]: