In [1]:
import lib.dbUtil as dbUtil
import lib.queries as qUtil
import pandas as pd
import numpy as np
from datetime import datetime, date, time, timedelta
import pprint
import pickle
from tqdm import tqdm, tqdm_notebook
import sys
import os
from sklearn import metrics
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
In [2]:
startDt = date(2017, 10, 1)
DATA_DIR = "../data/ml"
In [3]:
def scaleRange(X, R=(0, 1)):
    from sklearn.preprocessing import MinMaxScaler
    oldRange = [np.amin(X), np.amax(X)]
    # MinMaxScaler expects a 2D array: reshape to a single column, then flatten back
    scaler = MinMaxScaler(feature_range=R)
    scaled = scaler.fit_transform(np.asarray(X, dtype=float).reshape(-1, 1))
    return scaled.ravel(), oldRange
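In [ ]:
# A quick sanity check of scaleRange on made-up values (not from the dataset);
# the returned oldRange lets us map a scaled value back to original units.
vals = np.array([2.0, 5.0, 10.0])
scaled, oldRange = scaleRange(vals)
print(scaled)  # [0.    0.375 1.   ]
print(scaled * (oldRange[1] - oldRange[0]) + oldRange[0])  # recovers [2. 5. 10.]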
In [4]:
# dataset-1: queries.getInstallDayGameDataW0ReturnQ
def getInstallDataset(startDate=startDt, save=True, loadExisting=False):
    if loadExisting:
        return pd.read_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"))
    data = []
    cols = ['pid', 'chaal_by_blind', 'avg_win_by_boot', 'avg_loss_by_boot', 'times_loaded', 'isReturnW0']
    q = qUtil.getInstallDayGameDataW0ReturnQ.replace('${date}$', str(startDate))
    result = dbUtil.runQuery(q)
    total = result.rowcount
    count = 0
    row = result.fetchone()
    while row is not None:
        rowData = (row[0], float(row[1]), float(row[2]), float(row[3]), int(row[4]), int(row[5]))
        data.append(rowData)
        count += 1
        print("Done:", count, "of", total)
        row = result.fetchone()
    df = pd.DataFrame(data, columns=cols)
    if save:
        # index=False keeps the CSV round-trippable via the loadExisting path
        df.to_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"), index=False)
    return df
In [5]:
# dataset-2: queries.getInstallDayGameDataW0Return_2_Q
def getInstallDataset_2(startDate=startDt, save=True, loadExisting=False):
    if loadExisting:
        return pd.read_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"))
    data = []
    cols = ['pid', 'chaal_by_blind', 'avg_win_by_boot', 'avg_loss_by_boot', 'games_played', 'times_loaded', 'isReturnW0']
    q = qUtil.getInstallDayGameDataW0Return_2_Q.replace('${date}$', str(startDate))
    result = dbUtil.runQuery(q)
    total = result.rowcount
    count = 0
    row = result.fetchone()
    while row is not None:
        rowData = (row[0], float(row[1]), float(row[2]), float(row[3]), int(row[4]), int(row[5]), int(row[6]))
        data.append(rowData)
        count += 1
        print("Done:", count, "of", total)
        row = result.fetchone()
    df = pd.DataFrame(data, columns=cols)
    if save:
        # index=False keeps the CSV round-trippable via the loadExisting path
        df.to_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"), index=False)
    return df
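In [ ]:
# The queries above are string templates with a ${date}$ placeholder that is
# substituted before execution. A minimal illustration with a made-up template
# (the real SQL lives in lib/queries.py and is not reproduced here):
exampleQ = "SELECT pid FROM installs WHERE install_date = '${date}$'"
print(exampleQ.replace('${date}$', str(startDt)))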
In [6]:
#getInstallDataset()
In [7]:
#getInstallDataset_2()
In [8]:
# add visualization
In [9]:
# adding model creation
def trainModel(X_train, y_train, clf):
    return clf.fit(X_train, y_train)

def evalModel(X_test, y_test, clf):
    y_pred = clf.predict(X_test)
    print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
    print("F1 score: ", metrics.f1_score(y_test, y_pred, average='binary'))
In [22]:
def splitTrainTestData(df):
    y = df.pop('isReturnW0')
    X = df[['chaal_by_blind', 'avg_win_by_boot', 'avg_loss_by_boot', 'games_played', 'times_loaded']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test

def splitTrainTestDataNorm(df):
    y = df.pop('isReturnW0')
    # Normalizing each feature to [0, 1]; minMax keeps the original [min, max]
    # of every column so scaled values can be mapped back later
    minMax = {}
    X = pd.DataFrame()
    X['cbb'], minMax['cbb'] = scaleRange(df['chaal_by_blind'])
    X['awb'], minMax['awb'] = scaleRange(df['avg_win_by_boot'])
    X['alb'], minMax['alb'] = scaleRange(df['avg_loss_by_boot'])
    X['gp'], minMax['gp'] = scaleRange(df['games_played'])
    X['tl'], minMax['tl'] = scaleRange(df['times_loaded'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test, minMax
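In [ ]:
# Note: both split functions pop 'isReturnW0' from df in place, so the same
# DataFrame cannot be passed to a second split without reloading. A sketch of
# mapping a scaled column back to raw units with the returned minMax dict:
#   orig = scaled * (minMax['gp'][1] - minMax['gp'][0]) + minMax['gp'][0]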
In [23]:
df = getInstallDataset_2(loadExisting=True)
In [24]:
df.head()
Out[24]:
In [25]:
X_train,X_test,y_train,y_test = splitTrainTestData(df)
In [26]:
from sklearn import svm
clf = svm.SVC()
clf = trainModel(X_train, y_train, clf)
evalModel(X_test, y_test, clf)
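In [ ]:
# SVC is sensitive to feature scale, so a run on the normalized features may
# behave differently; a sketch (reload the dataset first, since the earlier
# split popped its label column in place):
dfN = getInstallDataset_2(loadExisting=True)
XN_train, XN_test, yN_train, yN_test, minMax = splitTrainTestDataNorm(dfN)
clfN = trainModel(XN_train, yN_train, svm.SVC())
evalModel(XN_test, yN_test, clfN)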
In [ ]:
# # Trying boosting
# from sklearn import svm
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# #dt = svm.SVC()
# dt = DecisionTreeClassifier()
# clf = AdaBoostClassifier(n_estimators=500, base_estimator=dt,learning_rate=1)
# clf = trainModel(X_train, y_train, clf)
# evalModel(X_test, y_test, clf)
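In [ ]:
# A runnable variant of the boosting experiment sketched above, assuming a
# recent scikit-learn (base_estimator was renamed to estimator in 1.2; use
# base_estimator on older releases). max_depth=1 gives the shallow stumps
# AdaBoost usually pairs with; a full-depth tree, as in the commented cell,
# tends to overfit.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                         n_estimators=500, learning_rate=1)
ada = trainModel(X_train, y_train, ada)
evalModel(X_test, y_test, ada)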
In [41]:
def writeClassifierToFile(clf, filename):
    import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
    pickle_path = os.path.join("../data/ml_model", filename)
    joblib.dump(clf, pickle_path)

def readClassifierFromFile(filename):
    import joblib
    pickle_path = os.path.join('../data/ml_model', filename)
    clf = joblib.load(pickle_path)
    return clf
In [43]:
writeClassifierToFile(clf, "W1_return.pkl")
In [33]:
import pickle
def writeClf(clf, filename):
    with open(os.path.join('../data/ml_model', filename), "wb") as cf:
        pickle.dump(clf, cf, 1)
In [34]:
writeClf(clf, "W1_return.pkl")
In [37]:
def readClf(filename):
    pickle_path = os.path.join('../data/ml_model', filename)
    # pickle.load expects an open file object, not a path string
    with open(pickle_path, "rb") as cf:
        clf = pickle.load(cf)
    return clf
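In [ ]:
# Quick round-trip check for the pickle-based loader:
clf3 = readClf("W1_return.pkl")
print(type(clf3))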
In [44]:
clf2 = readClassifierFromFile("W1_return.pkl")
In [45]:
y_pred_1 = clf2.predict(X_test)
In [51]:
y_pred_1
Out[51]:
In [52]:
len(y_pred_1)
Out[52]:
In [55]:
len(X_train)
Out[55]:
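In [ ]:
# Sanity check: the reloaded classifier should reproduce the in-memory model's
# predictions exactly.
print((clf2.predict(X_test) == clf.predict(X_test)).all())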
In [ ]: