In [1]:
# Imports: project DB helpers (lib.*), data stack, and scikit-learn utilities.
import lib.dbUtil as dbUtil
import lib.queries as qUtil
import pandas as pd
import numpy as np
from datetime import datetime, date, time, timedelta
import pprint
import pickle
from tqdm import tqdm, tqdm_notebook
import sys
import os
from sklearn import metrics
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection (same signature).
from sklearn.model_selection import train_test_split

In [2]:
# Install cohort date substituted into the SQL queries below.
startDt = date(2017, 10, 1)
# Directory where per-date CSV snapshots of query results are cached.
DATA_DIR = "../data/ml"

In [3]:
def scaleRange(X, R = (0,1)):
    """Min-max scale X into the target range R.

    Parameters
    ----------
    X : 1-D array-like (the callers pass a single pandas Series / feature).
    R : (lo, hi) target range; defaults to (0, 1).

    Returns
    -------
    (scaled, oldRange) where `scaled` is a float ndarray with X's shape and
    `oldRange` is [min(X), max(X)], kept so the scaling can be inverted later.

    Notes
    -----
    Implemented directly with numpy instead of sklearn's MinMaxScaler:
    the callers pass 1-D Series, which modern MinMaxScaler rejects (it
    requires 2-D input), and a constant input would otherwise trigger a
    divide-by-zero. A constant X maps every value to R[0].
    """
    arr = np.asarray(X, dtype=float)
    oldRange = [np.amin(arr), np.amax(arr)]
    span = oldRange[1] - oldRange[0]
    if span == 0:
        # Degenerate case: all values identical -> pin to the low end of R.
        return np.full_like(arr, R[0]), oldRange
    scaled = (arr - oldRange[0]) / span * (R[1] - R[0]) + R[0]
    return scaled, oldRange

In [4]:
# dataset-1: queries.getInstallDayGameDataW0ReturnQ
def getInstallDataset(startDate = startDt, save = True, loadExisting = False):
    """Fetch the W0-return dataset (dataset-1) for one install date.

    Runs queries.getInstallDayGameDataW0ReturnQ against the DB, or loads a
    previously cached CSV, and returns a DataFrame with one row per player.

    Parameters
    ----------
    startDate : install date substituted into the SQL ('${date}$' placeholder).
    save : if True, cache the result as DATA_DIR/<date>.csv.
    loadExisting : if True, skip the DB entirely and read the cached CSV.
    """
    if loadExisting == True:
        return pd.read_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"))

    cols = ['pid', 'chaal_by_blind', 'avg_win_by_boot', 'avg_loss_by_boot', 'times_loaded', 'isReturnW0']
    q = qUtil.getInstallDayGameDataW0ReturnQ.replace('${date}$', str(startDate))
    result = dbUtil.runQuery(q)

    # Stream rows one at a time into a plain list, then build the frame once.
    # (The previous version printed a line per row, flooding the output.)
    data = []
    row = result.fetchone()
    while row is not None:
        data.append((row[0], float(row[1]), float(row[2]), float(row[3]), int(row[4]), int(row[5])))
        row = result.fetchone()

    df = pd.DataFrame(data, columns = cols)

    if save == True:
        # index=False keeps the cached CSV free of the spurious
        # "Unnamed: 0" column that otherwise appears when reading it back.
        df.to_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"), index=False)
    return df

In [5]:
# dataset-2: queries.getInstallDayGameDataW0Return_2_Q
def getInstallDataset_2(startDate = startDt, save = True, loadExisting = False):
    """Fetch the W0-return dataset (dataset-2, adds games_played) for one date.

    Runs queries.getInstallDayGameDataW0Return_2_Q against the DB, or loads a
    previously cached CSV, and returns a DataFrame with one row per player.

    Parameters
    ----------
    startDate : install date substituted into the SQL ('${date}$' placeholder).
    save : if True, cache the result as DATA_DIR/<date>.csv.
    loadExisting : if True, skip the DB entirely and read the cached CSV.
    """
    if loadExisting == True:
        return pd.read_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"))

    cols = ['pid', 'chaal_by_blind', 'avg_win_by_boot', 'avg_loss_by_boot', 'games_played','times_loaded', 'isReturnW0']
    q = qUtil.getInstallDayGameDataW0Return_2_Q.replace('${date}$', str(startDate))
    result = dbUtil.runQuery(q)

    # Stream rows one at a time into a plain list, then build the frame once.
    # (The previous version printed a line per row, flooding the output.)
    data = []
    row = result.fetchone()
    while row is not None:
        data.append((row[0], float(row[1]), float(row[2]), float(row[3]), int(row[4]), int(row[5]), int(row[6])))
        row = result.fetchone()

    df = pd.DataFrame(data, columns = cols)

    if save == True:
        # index=False keeps the cached CSV free of the spurious
        # "Unnamed: 0" column that otherwise appears when reading it back.
        df.to_csv(os.path.join(DATA_DIR, str(startDate) + ".csv"), index=False)
    return df

In [6]:
#getInstallDataset()

In [7]:
#getInstallDataset_2()

In [8]:
# add visualization

In [9]:
# adding model creation
def trainModel(X_train, y_train, clf):
    """Fit classifier `clf` on the training split and return the result of
    fit() (for sklearn estimators, fit() returns the fitted estimator itself)."""
    return clf.fit(X_train, y_train)

def evalModel(X_test, y_test, clf):
    """Predict on the held-out split and print accuracy and binary F1."""
    predictions = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    f1 = metrics.f1_score(y_test, predictions, average='binary')
    print("Accuracy Score: ", accuracy)
    print("F1 score: ", f1)

In [22]:
def splitTrainTestData(df, test_size = 0.2, random_state = None):
    """Split the dataset-2 frame into train/test feature and label sets.

    Fixes vs. the original:
    * df.pop('isReturnW0') mutated the caller's DataFrame, so re-running the
      calling cell raised KeyError; the label column is now read without
      mutating df.
    * test_size / random_state are exposed (with the old defaults) so the
      split can be made reproducible.

    Returns X_train, X_test, y_train, y_test.
    """
    y = df['isReturnW0']
    X = df[['chaal_by_blind', 'avg_win_by_boot', 'avg_loss_by_boot', 'games_played','times_loaded']]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def splitTrainTestDataNorm(df, test_size = 0.2, random_state = None):
    """Like splitTrainTestData, but min-max scales each feature first.

    Returns X_train, X_test, y_train, y_test, minMax, where minMax maps each
    abbreviated feature name to its original [min, max] so the scaling can be
    inverted at prediction time.

    Fixes vs. the original:
    * df.pop('isReturnW0') mutated the caller's DataFrame (re-run -> KeyError);
      the label is now read without mutation.
    * The avg_win_by_boot entries used mismatched keys ('awd' column vs 'awb'
      minMax key — an apparent typo); both now use 'awb'.
    * The five copy-pasted scaleRange lines are collapsed into one loop.
    """
    y = df['isReturnW0']

    # abbreviated name -> source column; scale each to [0, 1] and remember
    # the original ranges for later inversion.
    features = {
        'cbb': 'chaal_by_blind',
        'awb': 'avg_win_by_boot',
        'alb': 'avg_loss_by_boot',
        'gp': 'games_played',
        'tl': 'times_loaded',
    }
    minMax = {}
    X = pd.DataFrame()
    for short, col in features.items():
        X[short], minMax[short] = scaleRange(df[col])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test, minMax

In [23]:
# Load the cached dataset-2 CSV for startDt (no DB round-trip).
df = getInstallDataset_2(loadExisting=True)

In [24]:
# Preview the loaded frame. Note the stray "Unnamed: 0" column in the output:
# the CSV was written with the DataFrame index included (to_csv without index=False).
df.head()


Out[24]:
Unnamed: 0 pid chaal_by_blind avg_win_by_boot avg_loss_by_boot games_played times_loaded isReturnW0
0 0 1003175132727 31.00000 56.74074 112.14815 27 3 0
1 1 100810475312 0.25000 55.05660 65.62264 53 2 1
2 2 1002462115267 0.91753 21.07692 18.30769 78 4 1
3 3 1003892365546 122.50000 17.41026 22.41667 156 7 1
4 4 1006451838263 5.00000 0.00000 107.25000 4 1 0

In [25]:
# 80/20 split. NOTE(review): splitTrainTestData pops 'isReturnW0' out of df
# in place, so re-running this cell on the same df raises KeyError.
X_train,X_test,y_train,y_test = splitTrainTestData(df)

In [26]:
from sklearn import svm
# Baseline: SVC with default hyperparameters on the raw (unscaled) features.
# NOTE(review): a normalized split (splitTrainTestDataNorm) exists but is not
# used here — confirm whether scaling was intentionally skipped.
clf = svm.SVC()
clf = trainModel(X_train, y_train, clf)
evalModel(X_test, y_test, clf)


Accuracy Score:  0.695484070523
F1 score:  0.764445507836

In [ ]:
# # Trying boosting
# from sklearn import svm
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# #dt = svm.SVC()
# dt = DecisionTreeClassifier()
# clf = AdaBoostClassifier(n_estimators=500, base_estimator=dt,learning_rate=1)
# clf = trainModel(X_train, y_train, clf)
# evalModel(X_test, y_test, clf)

In [41]:
def writeClassifierToFile(clf, filename, model_dir = "../data/ml_model"):
    """Persist a fitted classifier to model_dir/filename via joblib.

    Fix: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
    removed in 0.23 — prefer the standalone joblib package, falling back to
    the vendored copy for legacy environments. The previously hard-coded
    output directory is now a parameter with the old value as default.
    """
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib  # legacy scikit-learn (< 0.23)
    pickle_path = os.path.join(model_dir, filename)
    joblib.dump(clf, pickle_path)

def readClassifierFromFile(filename, model_dir = '../data/ml_model'):
    """Load a classifier previously saved by writeClassifierToFile.

    Fix: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
    removed in 0.23 — prefer the standalone joblib package, falling back to
    the vendored copy for legacy environments. The previously hard-coded
    model directory is now a parameter with the old value as default.
    """
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib  # legacy scikit-learn (< 0.23)
    pickle_path = os.path.join(model_dir, filename)
    clf = joblib.load(pickle_path)
    return clf

In [43]:
writeClassifierToFile(clf, "W1_return.pkl")

In [33]:
import pickle
def writeClf(clf, filename, model_dir = '../data/ml_model'):
    """Pickle an object (e.g. a fitted classifier) to model_dir/filename.

    Improvements vs. the original:
    * model_dir is parameterized (old hard-coded path kept as the default).
    * Uses pickle.HIGHEST_PROTOCOL instead of the legacy protocol 1 for
      smaller, faster dumps; pickle.load reads any protocol, so readers
      are unaffected.
    """
    with open(os.path.join(model_dir, filename), "wb") as cf:
        pickle.dump(clf, cf, pickle.HIGHEST_PROTOCOL)

In [34]:
writeClf(clf, "W1_return.pkl")

In [37]:
def readClf(filename, model_dir = '../data/ml_model'):
    """Load a pickled classifier from model_dir/filename.

    Fixes two bugs in the original:
    * `model_path` was an undefined global, so every call raised NameError;
      the directory is now a parameter defaulting to the path used elsewhere
      in this notebook.
    * pickle.load was handed a path string; it requires an open binary file
      object, so the original would raise TypeError even with a valid path.
    """
    with open(os.path.join(model_dir, filename), "rb") as cf:
        return pickle.load(cf)

In [44]:
# Round-trip check: reload the model persisted above.
clf2 = readClassifierFromFile("W1_return.pkl")

In [45]:
# Sanity check: the reloaded model still predicts on the held-out split.
y_pred_1 = clf2.predict(X_test)

In [51]:
y_pred_1


Out[51]:
array([1, 0, 0, ..., 1, 1, 1])

In [52]:
len(y_pred_1)


Out[52]:
12932

In [55]:
len(X_train)


Out[55]:
51728

In [ ]: