In [1]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve
In [2]:
filepath = '/Users/mac/Desktop/Kaggle_datasets/Video_Game_Sale/'
filename01 = 'Video_Games_Sales_22122016.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
Out[2]:
In [3]:
df_full.info()  # note: User_Score is stored as object dtype and must be converted to numeric before use
In [3]:
df = df_full.dropna().copy()  # .copy() avoids SettingWithCopyWarning when columns are assigned later
df.head()
Out[3]:
In [24]:
df.columns
Out[24]:
In [13]:
df.Genre.value_counts()
Out[13]:
In [83]:
df['User_Score'] = pd.to_numeric(df['User_Score'])
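The raw file uses the placeholder string 'tbd' for unscored titles, which is why User_Score loads as object dtype; dropna() appears to have removed those rows here (their User_Count is missing), so the plain conversion succeeds. A minimal sketch of a conversion that also tolerates the placeholder on the full frame:
In [ ]:
# Hedged sketch: errors='coerce' turns non-numeric entries such as 'tbd'
# into NaN instead of raising, so this also works on df_full.
pd.to_numeric(df_full['User_Score'], errors='coerce').isna().sum()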
In [74]:
sns.jointplot(x='Critic_Score', y='Critic_Count', data=df, kind='hex', cmap='cubehelix', height=8)  # height= replaces the deprecated size= argument
plt.show()
In [82]:
sns.jointplot(x='Critic_Score', y='User_Score', data=df, kind='hex', cmap='cubehelix', height=8)
plt.show()
In [84]:
columns_dum = ['Platform','Genre', 'Publisher','Developer', 'Rating']
df_dum = pd.get_dummies(df, columns=columns_dum)
df_dum.head()
Out[84]:
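Publisher and Developer each have hundreds of distinct values, so get_dummies yields a very wide frame. A minimal sketch of capping the cardinality before encoding (the top_n cutoff of 30 is an arbitrary choice for illustration):
In [ ]:
# Hedged sketch: pool infrequent publishers into an 'Other' bucket
# before one-hot encoding, to limit the number of dummy columns.
top_n = 30  # arbitrary cutoff
top_publishers = df['Publisher'].value_counts().nlargest(top_n).index
df_capped = df.copy()
df_capped['Publisher'] = df_capped['Publisher'].where(
    df_capped['Publisher'].isin(top_publishers), other='Other')
pd.get_dummies(df_capped, columns=['Publisher']).shape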
In [30]:
k = 15 #number of variables for heatmap
corrmat = df_dum.corr()
cols = corrmat.nlargest(k, 'Critic_Score')['Critic_Score'].index
cm = np.corrcoef(df_dum[cols].values.T)
plt.figure(figsize=(15,15))  # adjust the figure size as needed
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
yticklabels = cols.values, xticklabels = cols.values, cmap='rainbow')
plt.show()
In [85]:
k = 15 #number of variables for heatmap
corrmat = df_dum.corr()
cols = corrmat.nlargest(k, 'User_Score')['User_Score'].index
cm = np.corrcoef(df_dum[cols].values.T)
plt.figure(figsize=(15,15))  # adjust the figure size as needed
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
yticklabels = cols.values, xticklabels = cols.values, cmap='rainbow')
plt.show()
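To read the ranking off directly rather than from the heatmap, the correlation matrix computed above can be queried in one line:
In [ ]:
# Features most correlated (in absolute value) with User_Score,
# excluding the trivial self-correlation.
corrmat['User_Score'].drop('User_Score').abs().nlargest(15)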
In [86]:
# DataFrame containing only the 7th-gen consoles
video7th = df[df['Platform'].isin(['Wii', 'PS3', 'X360'])]
video7th.shape
Out[86]:
In [106]:
yearlySales = video7th.groupby(['Year_of_Release','Platform']).Global_Sales.sum()
yearlySales.unstack().plot(kind='bar',stacked=True, colormap= 'Blues', grid=False)
plt.title('Stacked Barplot of Global Yearly Sales of the 7th Gen Consoles')
plt.ylabel('Global Sales')
plt.show()
In [107]:
yearlySales = video7th.groupby(['Year_of_Release','Platform']).Global_Sales.sum()
yearlySales.unstack().plot(kind='bar',stacked=False, colormap= 'Blues', grid=False)
plt.title('Grouped Barplot of Global Yearly Sales of the 7th Gen Consoles')
plt.ylabel('Global Sales')
plt.show()
In [90]:
yearlySales.head()  # group by the first two variables, then aggregate the third
Out[90]:
In [108]:
ratingSales = video7th.groupby(['Rating','Platform']).Global_Sales.sum()
ratingSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greens', grid=False)
plt.title('Stacked Barplot of Sales per Rating type of the 7th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [97]:
ratingSales
Out[97]:
In [113]:
GenreSales = video7th.groupby(['Genre','Platform']).Global_Sales.sum()
GenreSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greys', grid=False)
plt.title('Stacked Barplot of Sales per Genre of the 7th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [100]:
GenreSales
Out[100]:
In [114]:
colors = ['#008DB8','#00AAAA','#00C69C']
plt.subplot(121)
plt.pie( video7th.groupby('Platform').Global_Sales.sum(),
# with the labels being platform
labels=video7th.groupby('Platform').Global_Sales.sum().index,
# with no shadows
shadow=False,
# stating our colors
colors=colors,
explode=(0.05, 0.05, 0.05),
# with the start angle at 90 degrees
startangle=90,
# with percentages shown to one decimal place
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of Global Sales')
plt.subplot(122)
plt.pie( video7th.groupby('Platform').User_Count.sum(),
labels=video7th.groupby('Platform').User_Count.sum().index,
shadow=False,
colors=colors,
explode=(0.05, 0.05, 0.05),
startangle=90,
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of User Base')
plt.tight_layout()
plt.show()
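The same pair of pie charts is drawn again below for the 8th-gen consoles, so a small helper avoids the duplication. A minimal sketch (plot_platform_pies is a hypothetical name, not from the original):
In [ ]:
# Hedged sketch: factor the duplicated pie-chart code into one function.
def plot_platform_pies(frame, colors=('#008DB8', '#00AAAA', '#00C69C')):
    for i, (col, title) in enumerate([('Global_Sales', 'Pie Chart of Global Sales'),
                                      ('User_Count', 'Pie Chart of User Base')]):
        totals = frame.groupby('Platform')[col].sum()
        plt.subplot(1, 2, i + 1)
        plt.pie(totals, labels=totals.index, colors=list(colors),
                explode=[0.05] * len(totals), startangle=90, autopct='%1.1f%%')
        plt.axis('equal')
        plt.title(title)
    plt.tight_layout()
    plt.show()

plot_platform_pies(video7th)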
In [116]:
video8th = df[df['Platform'].isin(['WiiU', 'PS4', 'XOne'])]
video8th.shape
Out[116]:
In [117]:
yearlySales = video8th.groupby(['Year_of_Release','Platform']).Global_Sales.sum()
yearlySales.unstack().plot(kind='bar',stacked=True, colormap= 'Blues', grid=False)
plt.title('Stacked Barplot of Global Yearly Sales of the 8th Gen Consoles')
plt.ylabel('Global Sales')
plt.show()
In [118]:
ratingSales = video8th.groupby(['Rating','Platform']).Global_Sales.sum()
ratingSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greens', grid=False)
plt.title('Stacked Barplot of Sales per Rating type of the 8th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [119]:
GenreSales = video8th.groupby(['Genre','Platform']).Global_Sales.sum()
GenreSales.unstack().plot(kind='bar',stacked=True, colormap= 'Greys', grid=False)
plt.title('Stacked Barplot of Sales per Genre of the 8th Gen Consoles')
plt.ylabel('Sales')
plt.show()
In [120]:
colors = ['#008DB8','#00AAAA','#00C69C']
plt.subplot(121)
plt.pie( video8th.groupby('Platform').Global_Sales.sum(),
# with the labels being platform
labels=video8th.groupby('Platform').Global_Sales.sum().index,
# with no shadows
shadow=False,
# stating our colors
colors=colors,
explode=(0.05, 0.05, 0.05),
# with the start angle at 90 degrees
startangle=90,
# with percentages shown to one decimal place
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of Global Sales')
plt.subplot(122)
plt.pie( video8th.groupby('Platform').User_Count.sum(),
labels=video8th.groupby('Platform').User_Count.sum().index,
shadow=False,
colors=colors,
explode=(0.05, 0.05, 0.05),
startangle=90,
autopct='%1.1f%%'
)
plt.axis('equal')
plt.title('Pie Chart of User Base')
plt.tight_layout()
plt.show()
In [40]:
from sklearn.utils import shuffle
shuffle_df = shuffle(df_dum, random_state=42)
df_label = shuffle_df['User_Score']
df_feature = shuffle_df.drop('User_Score', axis=1)
cut_point = round(len(df_dum)*0.6)
train_feature = np.array(df_feature.values[:cut_point,1:])
train_label = np.array(df_label.values[:cut_point]).astype(float)
test_feature = np.array(df_feature.values[cut_point:,1:])
test_label = np.array(df_label.values[cut_point:]).astype(float)
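The manual shuffle-and-cut above can also be written with scikit-learn's built-in splitter; a minimal sketch of the same 60/40 split (the exact rows assigned to each side will differ from the hand-rolled version):
In [ ]:
# Hedged sketch: the same 60/40 split via train_test_split.
from sklearn.model_selection import train_test_split
train_feature, test_feature, train_label, test_label = train_test_split(
    df_feature.values[:, 1:], df_label.values.astype(float),
    test_size=0.4, random_state=42)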
In [41]:
from sklearn import svm
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in modern scikit-learn
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=svm.LinearSVR()
regr.fit(X_train,y_train)
print('Coefficients:%s, intercept %s'%(regr.coef_,regr.intercept_))
print('Score: %.2f' % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[41]:
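The same MAPE expression recurs after every model below; a tiny helper keeps it in one place (mape is a hypothetical name, not part of scikit-learn):
In [ ]:
# Hedged sketch: mean absolute percentage error as a reusable helper.
def mape(y_true, y_pred):
    return np.mean(np.abs((y_pred - y_true) / y_true))

mape(test_label, regr.predict(test_feature))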
In [42]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=DecisionTreeRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[42]:
In [43]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=ensemble.RandomForestRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[43]:
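The fitted forest also exposes which inputs drive its predictions; the feature names are recovered from the frame the arrays were sliced from:
In [ ]:
# Hedged sketch: rank features by random-forest importance.
feature_names = df_feature.columns[1:]  # matches the values[:, 1:] slice used above
pd.Series(regr.feature_importances_, index=feature_names).nlargest(10)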
In [65]:
plt.figure(figsize=(8,8))
plt.scatter(regr.predict(test_feature), test_label, alpha=0.3)
x=np.linspace(2, 10, 40)
y=x
plt.plot(x,y,c='red')  # reference line where prediction equals the true value
plt.show()
In [21]:
# Scale features to [0, 1] (min-max scaling, fitted on the training set only)
scaler = MinMaxScaler()
scaler.fit(train_feature)
train_feature_trans = scaler.transform(train_feature)
test_feature_trans = scaler.transform(test_feature)
In [18]:
# Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
model = Sequential()
model.add(Dense(units=500,
input_dim=1596,
kernel_initializer='uniform',
))
model.add(Dropout(0.5))
model.add(Dense(units=200,
kernel_initializer='uniform',
))
model.add(Dropout(0.5))
model.add(Dense(units=200,
kernel_initializer='uniform',
))
model.add(Dropout(0.5))
model.add(Dense(units=1,  # a single output unit for regression
kernel_initializer='uniform',
))
model.summary()  # shows the layers and parameter counts (summary() prints directly)
model.compile(loss='mean_squared_error',
optimizer='adam', metrics=['accuracy'])  # note: accuracy is not a meaningful metric for regression
train_history = model.fit(x=train_feature_trans, y=train_label,  # Keras performs the validation split internally
validation_split=0.8, epochs=40,  # note: validation_split=0.8 holds out 80% for validation, training on only 20%
batch_size=2000, verbose=1)  # verbose=1 prints a progress bar for each epoch
show_train_history(train_history,'acc','val_acc')  # on newer Keras the history keys are 'accuracy'/'val_accuracy'
show_train_history(train_history,'loss','val_loss')
scores = model.evaluate(test_feature_trans, test_label)
print('\n')
print('accuracy=',scores[1])
prediction = model.predict(test_feature_trans).flatten()  # flatten the (n, 1) output so it broadcasts correctly against test_label
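Training for a fixed 40 epochs can over- or under-shoot; an early-stopping callback is the usual guard. A minimal sketch, assuming a more conventional 20% validation holdout (the patience value is arbitrary):
In [ ]:
# Hedged sketch: stop when validation loss stops improving and keep the best weights.
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
train_history = model.fit(x=train_feature_trans, y=train_label,
                          validation_split=0.2, epochs=100,
                          batch_size=2000, verbose=1, callbacks=[early_stop])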
In [39]:
np.mean(np.abs( (prediction-test_label)/test_label ))  # mean absolute percentage error (MAPE)
Out[39]:
In [23]:
cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
df_sales_pred = df_dum.drop(cols, axis=1)  # drop the regional sales columns, which sum to Global_Sales (target leakage)
df_sales_pred.head()
Out[23]:
In [73]:
k = 15 #number of variables for heatmap
corrmat = df_sales_pred.corr()
cols = corrmat.nlargest(k, 'Global_Sales')['Global_Sales'].index
cm = np.corrcoef(df_sales_pred[cols].values.T)
plt.figure(figsize=(15,15))  # adjust the figure size as needed
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
yticklabels = cols.values, xticklabels = cols.values, cmap='rainbow')
plt.show()
In [66]:
from sklearn.utils import shuffle
shuffle_df = shuffle(df_sales_pred, random_state=42)
df_label = shuffle_df['Global_Sales']
df_feature = shuffle_df.drop('Global_Sales', axis=1)
cut_point = round(len(df_sales_pred)*0.6)  # use the current frame's length (same row count as df_dum, but consistent)
train_feature = np.array(df_feature.values[:cut_point,1:])
train_label = np.array(df_label.values[:cut_point]).astype(float)
test_feature = np.array(df_feature.values[cut_point:,1:])
test_label = np.array(df_label.values[cut_point:]).astype(float)
In [38]:
from sklearn import svm
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=svm.LinearSVR()
regr.fit(X_train,y_train)
print('Coefficients:%s, intercept %s'%(regr.coef_,regr.intercept_))
print('Score: %.2f' % regr.score(X_test, y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[38]:
In [37]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=DecisionTreeRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[37]:
In [67]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_label,
test_size=0.25, random_state=0)
regr=ensemble.RandomForestRegressor()
regr.fit(X_train,y_train)
print("Traing Score:%f"%regr.score(X_train,y_train))
print("Testing Score:%f"%regr.score(X_test,y_test))
np.mean(np.abs((regr.predict(test_feature)-test_label)/test_label))  # mean absolute percentage error (MAPE)
Out[67]:
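Global sales are heavily right-skewed (most titles sell well under 2M units, as the scatter plot below shows), which strains a squared-error fit on the raw scale. A sketch of one common variant, training on a log-transformed target:
In [ ]:
# Hedged sketch: fit on log1p(sales), invert with expm1, score on the original scale.
regr_log = ensemble.RandomForestRegressor()
regr_log.fit(X_train, np.log1p(y_train))
pred = np.expm1(regr_log.predict(test_feature))
np.mean(np.abs((pred - test_label) / test_label))  # MAPE on the original scale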
In [72]:
plt.figure(figsize=(8,8))
plt.scatter(regr.predict(test_feature), test_label, alpha=0.3)
x=np.linspace(0, 40, 100)
y=x
plt.plot(x,y,c='red')  # reference line where prediction equals the true value
plt.xlim(0,2)
plt.ylim(0,2)
plt.show()  # in the 0-2 sales range the predictions are all over the place
In [ ]: