In [3]:
import sys
import os
sys.path.append("/Users/ajeetjha/sandbox/sherlock/blackops/scripts")
import lib.dbUtil as dbUtil
import lib.genericUtil as gUtil
import lib.mongoUtil as mUtil
import pandas as pd
import numpy as np
from datetime import datetime, date, time, timedelta
import pprint

In [4]:
df = gUtil.readCsvToPD('d1_w1_data/first5_day_2017_2_16_d1.csv')

In [5]:
df.head()


Out[5]:
Unnamed: 0 pid chaal_by_blind avg_win_by_boot avg_loss_by_boot times_loaded
0 0 1006585138905 3.0 1 11 1
1 1 100642117074 0.0 0 0 0
2 2 100488274105 0.0 0 0 0
3 3 1005883985003 0.0 0 0 1
4 4 100624859958 1.0 0 4 1

In [4]:
def normaliseDataFrame(dataFrame):
    from sklearn import preprocessing
    min_max_scaler = preprocessing.MinMaxScaler()
    return pd.DataFrame(min_max_scaler.fit_transform(dataFrame))

In [5]:
#df1 = df[['chaals','wins','blinds','max_chips','times_loaded']]
df2 = df[['chaal_by_blind','avg_win_by_boot','avg_loss_by_boot','times_loaded']]

In [6]:
df3.head()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-334cab173724> in <module>()
----> 1 df3.head()

NameError: name 'df3' is not defined

In [6]:
df2.corr()


Out[6]:
chaal_by_blind avg_win_by_boot avg_loss_by_boot times_loaded
chaal_by_blind 1.000000 0.290618 0.021134 0.120445
avg_win_by_boot 0.290618 1.000000 0.359030 0.198996
avg_loss_by_boot 0.021134 0.359030 1.000000 0.143982
times_loaded 0.120445 0.198996 0.143982 1.000000

In [7]:
%matplotlib inline

In [8]:
import matplotlib.pyplot as plt

In [9]:
plt.scatter(df1['wins'], df1['max_chips'], c= df1['times_loaded'] )


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-56d4632c9a5c> in <module>()
----> 1 plt.scatter(df1['wins'], df1['max_chips'], c= df1['times_loaded'] )

NameError: name 'df1' is not defined

In [6]:
import numpy as np
from sklearn.cross_validation import train_test_split

In [7]:
y = df2.pop('times_loaded')
#df3 =  normaliseDataFrame(df2) # to normalize the data
X = df2.as_matrix()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [8]:
# from sklearn import svm
# clf = svm.SVC() # 0.666021227768 accuracy on test set
# # Features: 'chaals','wins','blinds','max_chips' Labels:'times_loaded'
# clf.fit(X_train, y_train)

In [9]:
from sklearn import svm
clf = svm.SVC() 
# 0.703098106713 accuracy on test set, all install data, D1 return
# 0.708183010823 accuracy on test data, gplay install data, D1 return
# 0.657415375789 accuracy on test data, all install data, W1 Return
# Features: 'chaals','wins','blinds' Labels:'times_loaded'
clf.fit(X_train, y_train)


Out[9]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
# from sklearn import svm
# clf = svm.SVC() # 0.69714572576 accuracy on test set
# # Features: 'chaals','blinds' Labels:'times_loaded'
# clf.fit(X_train, y_train)

In [9]:
# from sklearn.naive_bayes import GaussianNB
# clf = GaussianNB()
# # Features: 'chaals','blinds', 'wins' Labels:'times_loaded' , accuracy 0.65
# clf.fit(X_train, y_train)


Out[9]:
GaussianNB()

In [9]:
# from sklearn import neighbors
# clf = neighbors.KNeighborsClassifier(15, weights='distance')
# # Features: 'chaals','blinds','wins' Labels:'times_loaded' , accuracy 0.68
# clf.fit(X_train, y_train)


Out[9]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='distance')

In [ ]:
# from sklearn import tree
# clf = tree.DecisionTreeClassifier()
# # Features: 'chaals','blinds','wins' Labels:'times_loaded' , accuracy 0.649129505192
# clf.fit(X_train, y_train)

In [19]:
# # Trying boosting
# from sklearn import svm
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# #dt = svm.SVC()
# dt = DecisionTreeClassifier()
# clf = AdaBoostClassifier(n_estimators=10, base_estimator=dt,learning_rate=1)
# # 0.703098106713 accuracy on test set, first5 games  data, D1 return
# # Features: 'chaals','wins','blinds' Labels:'times_loaded'
# clf.fit(X_train, y_train)


Out[19]:
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=10, random_state=None)

In [10]:
y_pred = clf.predict(X_test)

In [11]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [12]:
print(accuracy)


0.617864780392

In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)


Out[13]:
array([[4878, 3702],
       [2470, 4935]])

In [14]:
sum(y_test == 0)


Out[14]:
8580

Getting Re-validation accuracy on a whole day's different dataset


In [17]:
df_validate = gUtil.readCsvToPD('d1_w1_data/first5_day_2017_1_27_d1.csv')

In [18]:
df_validate2 = df_validate[['chaal_by_blind','avg_win_by_boot','avg_loss_by_boot','times_loaded']]
y_val = df_validate2.pop('times_loaded')
X_val = df_validate2#.as_matrix()

In [19]:
y_val_pred = clf.predict(X_val)

In [20]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_val, y_val_pred)
print(accuracy)


0.609063950651

In [36]:
len(y_val)


Out[36]:
77813

In [37]:
37534/77813


Out[37]:
0.48236155912251166

Dumping D1 Classifier pickle


In [17]:
def writeClassifierToFile(clf, filename):
    from sklearn.externals import joblib
    pickle_path = os.path.join(gUtil.DPATH,filename)
    joblib.dump(clf, pickle_path)

def readClassifierFromFile(filename):
    from sklearn.externals import joblib
    pickle_path = os.path.join(gUtil.DPATH,filename)
    clf = joblib.load(pickle_path)
    return clf

In [18]:
writeClassifierToFile(clf,'d1_w1_data/W1_classifier.pkl')

In [19]:
clf2 = readClassifierFromFile('d1_w1_data/W1_classifier.pkl')

In [20]:
y_val_pred2 = clf.predict(X_val)

In [21]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_val, y_val_pred2)
print(accuracy)


0.770287660371

In [23]:
sum(y_val_pred2 == 0)


Out[23]:
53629

In [46]:
36194/77813


Out[46]:
0.46514078624394384

In [ ]: