In [3]:
import sys
import os

# Directory added to sys.path before the `lib.*` imports below (presumably
# where the project's `lib` package lives). Override it on other machines with
# the BLACKOPS_SCRIPTS environment variable; the default is the path this
# notebook was originally written against, so existing runs are unchanged.
SCRIPTS_DIR = os.environ.get(
    "BLACKOPS_SCRIPTS", "/Users/ajeetjha/sandbox/sherlock/blackops/scripts"
)
if SCRIPTS_DIR not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(SCRIPTS_DIR)

import lib.dbUtil as dbUtil
import lib.genericUtil as gUtil
import lib.mongoUtil as mUtil
import pandas as pd
import numpy as np
from datetime import datetime, date, time, timedelta
import pprint
In [4]:
# Load the training extract (2017-02-16, "d1" per the file name) through the
# project's CSV helper. Presumably a pandas DataFrame, given how `df` is used
# in the cells that follow.
df = gUtil.readCsvToPD('d1_w1_data/first5_day_2017_2_16_d1.csv')
In [5]:
# Preview the first rows to sanity-check the load.
df.head()
Out[5]:
In [4]:
def normaliseDataFrame(dataFrame):
    """Min-max scale every column of ``dataFrame``.

    Columns are rescaled independently with scikit-learn's ``MinMaxScaler``
    (default settings). The scaler returns a bare array, so the input's
    column labels and index are re-attached to the result; this keeps the
    scaled frame aligned with any label Series taken from the same source.

    Parameters
    ----------
    dataFrame : pandas.DataFrame or array-like
        Numeric data to scale. Inputs without ``columns``/``index``
        attributes get default integer labels.

    Returns
    -------
    pandas.DataFrame
        The scaled values, labelled like the input.
    """
    # Imported locally, as in the original, so defining the function does not
    # require scikit-learn.
    from sklearn import preprocessing

    min_max_scaler = preprocessing.MinMaxScaler()
    scaled = min_max_scaler.fit_transform(dataFrame)
    return pd.DataFrame(
        scaled,
        columns=getattr(dataFrame, "columns", None),
        index=getattr(dataFrame, "index", None),
    )
In [5]:
#df1 = df[['chaals','wins','blinds','max_chips','times_loaded']]
# Working frame: three ratio features plus the 'times_loaded' column.
# .copy() gives df2 its own data, so any later in-place edit of df2 neither
# touches `df` nor triggers pandas' SettingWithCopyWarning.
df2 = df[['chaal_by_blind','avg_win_by_boot','avg_loss_by_boot','times_loaded']].copy()
In [6]:
# `df3` is only ever assigned in a commented-out normalisation line, so this
# cell raised NameError on a fresh kernel. Preview the frame that actually
# exists at this point.
df2.head()
In [6]:
# Pairwise correlations between the columns of df2 (this includes
# 'times_loaded' as long as the cell runs before that column is removed).
df2.corr()
Out[6]:
In [7]:
%matplotlib inline
In [8]:
import matplotlib.pyplot as plt
In [9]:
# `df1` is only assigned in a commented-out line, so this cell raised
# NameError. That line built df1 as a column subset of `df`, so read the same
# columns straight from `df` (assumes df has 'wins', 'max_chips' and
# 'times_loaded' — TODO confirm against the CSV).
plt.xlabel('wins')
plt.ylabel('max_chips')
plt.scatter(df['wins'], df['max_chips'], c=df['times_loaded'])
In [6]:
import numpy as np
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the notebook working on the old install it was written against.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [7]:
# Label column, selected by name instead of popped: df2 stays intact, so this
# cell can be re-run without a KeyError.
y = df2['times_loaded']
features = df2.drop('times_loaded', axis=1)
#df3 = normaliseDataFrame(features) # to normalize the data
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray.
X = features.values
# Hold out 20% for testing. The fixed seed makes the split, and every score
# derived from it, reproducible across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
In [8]:
# from sklearn import svm
# clf = svm.SVC() # 0.666021227768 accuracy on test set
# # Features: 'chaals','wins','blinds','max_chips' Labels:'times_loaded'
# clf.fit(X_train, y_train)
In [9]:
from sklearn import svm
# Support-vector classifier with scikit-learn's default hyper-parameters.
clf = svm.SVC()
# Scores recorded by hand from earlier runs:
# 0.703098106713 accuracy on test set, all install data, D1 return
# 0.708183010823 accuracy on test data, gplay install data, D1 return
# 0.657415375789 accuracy on test data, all install data, W1 Return
# Features: 'chaals','wins','blinds' Labels:'times_loaded'
# NOTE(review): the feature list above does not match the columns currently
# selected into df2 ('chaal_by_blind', 'avg_win_by_boot', 'avg_loss_by_boot');
# confirm which feature set the recorded scores belong to.
clf.fit(X_train, y_train)
Out[9]:
In [13]:
# from sklearn import svm
# clf = svm.SVC() # 0.69714572576 accuracy on test set
# # Features: 'chaals','blinds' Labels:'times_loaded'
# clf.fit(X_train, y_train)
In [9]:
# from sklearn.naive_bayes import GaussianNB
# clf = GaussianNB()
# # Features: 'chaals','blinds', 'wins' Labels:'times_loaded' , accuracy 0.65
# clf.fit(X_train, y_train)
Out[9]:
In [9]:
# from sklearn import neighbors
# clf = neighbors.KNeighborsClassifier(15, weights='distance')
# # Features: 'chaals','blinds','wins' Labels:'times_loaded' , accuracy 0.68
# clf.fit(X_train, y_train)
Out[9]:
In [ ]:
# from sklearn import tree
# clf = tree.DecisionTreeClassifier()
# # Features: 'chaals','blinds','wins' Labels:'times_loaded' , accuracy 0.649129505192
# clf.fit(X_train, y_train)
In [19]:
# # Trying boosting
# from sklearn import svm
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# #dt = svm.SVC()
# dt = DecisionTreeClassifier()
# clf = AdaBoostClassifier(n_estimators=10, base_estimator=dt,learning_rate=1)
# # 0.703098106713 accuracy on test set, first5 games data, D1 return
# # Features: 'chaals','wins','blinds' Labels:'times_loaded'
# clf.fit(X_train, y_train)
Out[19]:
In [10]:
# Predict labels for the held-out test rows.
y_pred = clf.predict(X_test)
In [11]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
In [12]:
# Show the accuracy computed in the previous cell.
print(accuracy)
In [13]:
from sklearn.metrics import confusion_matrix

# Error breakdown on the test split: rows are true labels, columns are
# predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)
Out[13]:
In [14]:
# Number of test rows whose label is 0. The vectorised .sum() counts the True
# values without iterating the Series element by element in Python; int()
# keeps the displayed value a plain integer.
int((y_test == 0).sum())
Out[14]:
In [17]:
# Load a second extract from a different day (2017-01-27 per the file name)
# to score the fitted classifier on data it was not trained on.
df_validate = gUtil.readCsvToPD('d1_w1_data/first5_day_2017_1_27_d1.csv')
In [18]:
# Same three feature columns plus the label as the training frame.
df_validate2 = df_validate[['chaal_by_blind','avg_win_by_boot','avg_loss_by_boot','times_loaded']]
# Select the label by name instead of popping it, so the cell is re-runnable
# and never mutates a slice of df_validate.
y_val = df_validate2['times_loaded']
# Plain ndarray, matching the array form the classifier was fitted on.
X_val = df_validate2.drop('times_loaded', axis=1).values
In [19]:
# Predict labels for the validation-day rows with the fitted classifier.
y_val_pred = clf.predict(X_val)
In [20]:
from sklearn.metrics import accuracy_score

# Accuracy on the validation day; note this overwrites the `accuracy` value
# computed for the test split.
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(accuracy)
In [36]:
# Number of validation rows.
len(y_val)
Out[36]:
In [37]:
# float() forces true division: under Python 2 the bare int/int form floors to 0.
# NOTE(review): 77813 looks like len(y_val) and 37534 like a row count copied
# from an earlier output — confirm, and compute both from the data rather than
# hard-coding them.
37534 / float(77813)
Out[37]:
In [17]:
def writeClassifierToFile(clf, filename):
    """Persist a fitted estimator to disk with joblib.

    Parameters
    ----------
    clf : object
        The estimator (or any joblib-serialisable object) to save.
    filename : str
        Path of the target file, joined onto ``gUtil.DPATH``.

    Returns
    -------
    None
    """
    # sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
    # in 0.23. Prefer the standalone package; fall back for old installs.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    pickle_path = os.path.join(gUtil.DPATH, filename)
    joblib.dump(clf, pickle_path)
def readClassifierFromFile(filename):
    """Load an estimator previously saved with joblib.

    Parameters
    ----------
    filename : str
        Path of the saved file, joined onto ``gUtil.DPATH``.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
    # in 0.23. Prefer the standalone package; fall back for old installs.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    pickle_path = os.path.join(gUtil.DPATH, filename)
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code.
    # Only load files this project wrote itself.
    return joblib.load(pickle_path)
In [18]:
# Persist the fitted classifier.
# NOTE(review): the file is named "W1_classifier" while the CSVs loaded in
# this notebook are the "_d1" extracts — confirm the name matches the model.
writeClassifierToFile(clf,'d1_w1_data/W1_classifier.pkl')
In [19]:
# Reload the classifier from the file written in the previous cell.
clf2 = readClassifierFromFile('d1_w1_data/W1_classifier.pkl')
In [20]:
# Predict with the *reloaded* model (clf2). The original called clf.predict,
# so the save/load round-trip was never actually exercised.
y_val_pred2 = clf2.predict(X_val)
In [21]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the y_val_pred2 predictions; this overwrites the
# earlier `accuracy` value.
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred2)
print(accuracy)
In [23]:
# Number of validation rows predicted as class 0. The vectorised .sum()
# avoids a Python-level loop over the prediction array; int() keeps the
# displayed value a plain integer.
int((y_val_pred2 == 0).sum())
Out[23]:
In [46]:
# float() forces true division: under Python 2 the bare int/int form floors to 0.
# NOTE(review): 36194 is presumably the class-0 prediction count from the
# previous cell and 77813 the validation row count — confirm, and compute the
# share from the data instead of hard-coding it.
36194 / float(77813)
Out[46]:
In [ ]: