In [1]:
import numpy as np
import pickle as pk
import pandas as pd
import timeit as tm
import csv
import sys

Loading data

Loading training data


In [2]:
# Open training data to pandas
train_dat_pandas = pd.read_csv('../data/clean_data/train_vectors.csv', index_col=0, encoding='utf-8')
del train_dat_pandas['TYPE']

# Open training labels to pandas
train_lbl_pandas = pd.read_csv('../data/clean_data/train_labels.csv', index_col=0, encoding='utf-8')
del train_lbl_pandas['YEAR']

# Save the feature column names
headers = list(train_dat_pandas)

# Convert pandas to numpy matrix
train_dat = train_dat_pandas.as_matrix()
print 'training data dimensions:', train_dat.shape

# Convert pandas to numpy matrix
train_lbl = train_lbl_pandas.as_matrix()
print 'training label dimensions:', train_lbl.shape


training data dimensions: (295169, 63)
training label dimensions: (295169, 6)

Loading test data


In [3]:
# Open test data
test_dat_pandas = pd.read_csv('../data/clean_data/test_vectors.csv', index_col=0, encoding='utf-8')
del test_dat_pandas['TYPE']

# Open test labels
test_lbl_pandas = pd.read_csv('../data/clean_data/test_labels.csv', index_col=0, encoding='utf-8')
del test_lbl_pandas['YEAR']

# Convert pandas to numpy matrix
test_dat = test_dat_pandas.as_matrix()
print 'testing data dimensions:', test_dat.shape

# Convert pandas to numpy matrix
test_lbl = test_lbl_pandas.as_matrix()
print 'testing label dimensions:', test_lbl.shape


testing data dimensions: (34142, 63)
testing label dimensions: (34142, 6)

Concatenating test and train for the final model


In [4]:
full_dat_pandas = pd.concat([train_dat_pandas, test_dat_pandas])
full_dat = full_dat_pandas.as_matrix()

full_lbl_pandas = pd.concat([train_lbl_pandas, test_lbl_pandas])
full_lbl = full_lbl_pandas.as_matrix()

Converting one-hot labels to numeric


In [5]:
# Convert a one-hot encoded label matrix into a 1-D array of class indices
def onehot_2_numeric(onehot):
    numeric = []
    for elem in onehot:
        result = 0
        for i, k in enumerate(elem):
            result += i * k
        numeric.append(result)
    return np.asarray(numeric)


train_lbl_txt = onehot_2_numeric(train_lbl)
test_lbl_txt = onehot_2_numeric(test_lbl)
full_lbl_txt = onehot_2_numeric(full_lbl)
print train_lbl_txt, test_lbl_txt, full_lbl


[4 4 4 ..., 4 2 3] [4 2 2 ..., 4 2 2] [[0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 ..., 
 [0 0 0 0 1 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]]
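
Since every row of the label matrix contains exactly one 1, the loop above is equivalent to a row-wise argmax; a minimal vectorized sketch (an addition, not in the original notebook):


In [ ]:
# Vectorized equivalent of onehot_2_numeric: index of the single 1 in each row
assert np.array_equal(train_lbl_txt, np.argmax(train_lbl, axis=1))
assert np.array_equal(test_lbl_txt, np.argmax(test_lbl, axis=1))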

Scaling data


In [6]:
# Feature vector scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(train_dat)
train_dat = scaler.transform(train_dat)
test_dat = scaler.transform(test_dat)

# Fit and scale the combined train+test data for the final model
full_dat = scaler.fit_transform(full_dat)
full_dat


Out[6]:
array([[-1.43707687, -0.46160515,  0.88683494, ...,  0.25718323,
         0.46704734, -0.77932334],
       [-1.43707687, -0.46160515,  0.88683494, ...,  0.25718323,
         0.46704734, -0.77932334],
       [-1.43707687, -0.16683183,  0.88683494, ...,  1.03117731,
         1.0494166 , -0.51868547],
       ..., 
       [ 1.57106923, -1.05115178,  0.07267337, ..., -0.45880567,
        -0.35490798,  0.78072654],
       [ 1.57106923,  0.42271481, -1.80602055, ...,  1.46893604,
         1.48036877, -0.50420559],
       [ 1.57106923, -1.05115178,  0.79614277, ..., -0.45880567,
        -0.35490798,  0.78072654]])
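
As a quick sanity check (an addition, not part of the original pipeline), the scaled training columns should have roughly zero mean and unit variance:


In [ ]:
# StandardScaler output should be ~N(0, 1) per column
print 'max |column mean| (expect ~0):', np.abs(train_dat.mean(axis=0)).max()
print 'mean column std (expect ~1):', train_dat.std(axis=0).mean()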

Dimensionality Reduction


In [ ]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components='mle')
# print pca

In [ ]:
# pca.fit(train_dat)
# print train_dat.shape
# train_dat = pca.transform(train_dat)
# print train_dat.shape

# print test_dat.shape
# test_dat = pca.transform(test_dat)
# print test_dat.shape

Linear Regression


In [ ]:
from sklearn.linear_model import LinearRegression

# Fit Linear Regression
lin_reg = LinearRegression(n_jobs=-1, normalize=True)
lin_reg.fit(train_dat, train_lbl)

In [ ]:
# Generate predictions
predictions = lin_reg.predict(test_dat)
print predictions.shape

In [ ]:
# Compute RMSE

import math

errors = []

# compute squared errors
for i in xrange(predictions.shape[0]):
    p = predictions[i]
    t = test_lbl[i]
    
    # compute distance
    squared_distance = 0.0
    for j in xrange(predictions.shape[1]):
        squared_distance += (p[j] - t[j])**2
    
    errors.append(squared_distance)

rmse = math.sqrt(sum(errors)/len(errors))
print 'Root mean squared error:', rmse
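
The same per-row Euclidean RMSE can be computed in a single NumPy expression; a minimal vectorized sketch equivalent to the loop above:


In [ ]:
# Vectorized RMSE: squared Euclidean distance per row, averaged, then sqrt
rmse_vec = math.sqrt(np.mean(np.sum((predictions - test_lbl)**2, axis=1)))
print 'Root mean squared error (vectorized):', rmse_vec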

In [ ]:
# Save model
from sklearn.externals import joblib
joblib.dump(lin_reg, '../models/linear_regression_model.p')
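
The persisted model can be restored later with joblib.load; a minimal sketch:


In [ ]:
# Reload the saved model and sanity-check that it still predicts
lin_reg_loaded = joblib.load('../models/linear_regression_model.p')
print 'loaded model predictions shape:', lin_reg_loaded.predict(test_dat).shape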

Logistic Regression


In [ ]:
# from sklearn.linear_model import LogisticRegression

# clf = LogisticRegression(n_jobs=-1)
# clf.fit(train_dat, train_lbl_txt)

In [ ]:
# predictions = clf.predict(test_dat)
# p_predictions = clf.predict_proba(test_dat)

# print 'predictions dimensions:', predictions.shape
# print 'probabilities per class:', p_predictions.shape

In [ ]:
# # Table of probabilities for each class
# for i in range(6):
#     print str(i)+'\t',

# print ''

# for i in xrange(len(p_predictions)):
    
#     for j in xrange(len(p_predictions[i])):
#         print("%.2f" % (p_predictions[i][j]*100))+'%\t',
    
#     print ''

In [ ]:
# from sklearn.metrics import accuracy_score
# score = accuracy_score(test_lbl_txt, predictions)
# print score

Logistic Regression Cross Validation


In [ ]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import StratifiedKFold

folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)

clf = LogisticRegressionCV(n_jobs=-1, solver='liblinear', cv=folder, verbose=5)
print clf

In [ ]:
clf = clf.fit(train_dat, train_lbl_txt)
print clf.score(test_dat, test_lbl_txt)
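
LogisticRegressionCV also records the regularization strength it selected for each class; a small sketch to inspect it:


In [ ]:
# C value chosen by cross-validation (one entry per class)
print 'best C per class:', clf.C_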

In [ ]:
# Save model
from sklearn.externals import joblib
joblib.dump(clf, '../models/logistic_regression_model.p')

Decision Tree Classifier


In [ ]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(train_dat, train_lbl_txt)
predictions = clf.predict(test_dat)

from sklearn.metrics import accuracy_score
score = accuracy_score(test_lbl_txt, predictions)
print score
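
Accuracy alone hides which classes get confused with which; a minimal sketch using sklearn's confusion matrix:


In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes
print confusion_matrix(test_lbl_txt, predictions)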

Decision Tree Cross Validation


In [ ]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold

folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)
parameters = {'max_depth':[None, 2, 4, 8, 16, 32, 64]}
dtc_clf = DecisionTreeClassifier()

clf = GridSearchCV(dtc_clf, parameters, n_jobs=-1, pre_dispatch='n_jobs', cv=folder, refit=True, verbose=5)
clf.fit(train_dat, train_lbl_txt)

print 'Score on test data:', clf.score(test_dat, test_lbl_txt)

print 'best params:', clf.best_params_

In [ ]:
# Save model
from sklearn.externals import joblib
joblib.dump(clf, '../models/decision_tree_model.p')

In [ ]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score

# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)

# Do multiple runs and save them to the runs list
runs = []
depths = [None, 2, 4, 8, 16, 32, 64]

print 'this will take a while...',
for d in depths:
    clf = DecisionTreeClassifier(max_depth=d)
    for t,v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        predictions = clf.predict(val)
        #score = accuracy_score(val_lbl, predictions)
        score = clf.score(val, val_lbl)
        runs.append(tuple([d, score]))
        print d, score
print 'done!'
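
Averaging the five fold scores per depth gives a more stable estimate than any single run; a minimal aggregation sketch over the runs list (the next cell instead keeps the single best run):


In [ ]:
from collections import defaultdict

# Mean validation score per depth across the 5 folds
fold_scores = defaultdict(list)
for d, score in runs:
    fold_scores[d].append(score)
for d in depths:
    print d, sum(fold_scores[d]) / len(fold_scores[d])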

In [ ]:
best_result = max(runs, key=lambda run: run[1])
print 'Best result:', best_result
best_d = best_result[0]

clf = DecisionTreeClassifier(max_depth=best_d)
clf.fit(train_dat, train_lbl_txt)
print 'Score on test data:', clf.score(test_dat, test_lbl_txt)
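
The refitted tree also exposes feature_importances_, which pairs naturally with the column names saved in headers at load time; a minimal sketch:


In [ ]:
# Top 10 features by importance in the refitted tree
ranked = sorted(zip(headers, clf.feature_importances_), key=lambda p: p[1], reverse=True)
for name, imp in ranked[:10]:
    print name, imp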

Random Forests


In [7]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clf.fit(train_dat, train_lbl_txt)
clf.score(test_dat, test_lbl_txt)


Out[7]:
0.49753968718879971
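
Beyond the single accuracy figure, per-class precision and recall are worth a look; a minimal sketch with sklearn's classification report:


In [ ]:
from sklearn.metrics import classification_report

# Precision, recall and F1 per class on the held-out test set
print classification_report(test_lbl_txt, clf.predict(test_dat))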

Cross Validation on Random Forests


In [8]:
from sklearn.cross_validation import StratifiedKFold

# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)

# Do multiple runs and save them to the runs list
runs = []
params = []

depths = [None, 2, 4, 8, 16, 32, 64]
max_features = ['auto', 'log2', None]
criterions = ['gini', 'entropy']
for d in depths:
    for mf in max_features:
        for c in criterions:
            params.append([d, mf, c])


print 'this will take a while...'
for d, mf, c in params:
    clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
    print 'run:', d, mf, c,
    for t,v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        score = clf.score(val, val_lbl)
        runs.append([score, d, mf, c])
    print 'done!'
print 'All done!'


this will take a while...
run: None auto gini done!
run: None auto entropy done!
run: None log2 gini done!
run: None log2 entropy done!
run: None None gini done!
run: None None entropy done!
run: 2 auto gini done!
run: 2 auto entropy done!
run: 2 log2 gini done!
run: 2 log2 entropy done!
run: 2 None gini done!
run: 2 None entropy done!
run: 4 auto gini done!
run: 4 auto entropy done!
run: 4 log2 gini done!
run: 4 log2 entropy done!
run: 4 None gini done!
run: 4 None entropy done!
run: 8 auto gini done!
run: 8 auto entropy done!
run: 8 log2 gini done!
run: 8 log2 entropy done!
run: 8 None gini done!
run: 8 None entropy done!
run: 16 auto gini done!
run: 16 auto entropy done!
run: 16 log2 gini done!
run: 16 log2 entropy done!
run: 16 None gini done!
run: 16 None entropy done!
run: 32 auto gini done!
run: 32 auto entropy done!
run: 32 log2 gini done!
run: 32 log2 entropy done!
run: 32 None gini done!
run: 32 None entropy done!
run: 64 auto gini done!
run: 64 auto entropy done!
run: 64 log2 gini done!
run: 64 log2 entropy done!
run: 64 None gini done!
run: 64 None entropy done!
All done!

In [9]:
# Pick the hyper-parameters from the single best-scoring fold run
champion = max(runs, key=lambda run: run[0])
score, d, mf, c = champion

# Refit on the combined train+test data for the final model
clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
clf.fit(full_dat, full_lbl_txt)

from sklearn.externals import joblib
joblib.dump(clf, '../models/random_forest_model.p')


Out[9]:
['../models/random_forest_model.p',
 '../models/random_forest_model.p_01.npy',
 '../models/random_forest_model.p_02.npy',
 '../models/random_forest_model.p_03.npy',
 '../models/random_forest_model.p_04.npy',
 '../models/random_forest_model.p_05.npy',
 '../models/random_forest_model.p_06.npy',
 '../models/random_forest_model.p_07.npy',
 '../models/random_forest_model.p_08.npy',
 '../models/random_forest_model.p_09.npy',
 '../models/random_forest_model.p_10.npy',
 '../models/random_forest_model.p_11.npy',
 '../models/random_forest_model.p_12.npy',
 '../models/random_forest_model.p_13.npy',
 '../models/random_forest_model.p_14.npy',
 '../models/random_forest_model.p_15.npy',
 '../models/random_forest_model.p_16.npy',
 '../models/random_forest_model.p_17.npy',
 '../models/random_forest_model.p_18.npy',
 '../models/random_forest_model.p_19.npy',
 '../models/random_forest_model.p_20.npy',
 '../models/random_forest_model.p_21.npy',
 '../models/random_forest_model.p_22.npy',
 '../models/random_forest_model.p_23.npy',
 '../models/random_forest_model.p_24.npy',
 '../models/random_forest_model.p_25.npy',
 '../models/random_forest_model.p_26.npy',
 '../models/random_forest_model.p_27.npy',
 '../models/random_forest_model.p_28.npy',
 '../models/random_forest_model.p_29.npy',
 '../models/random_forest_model.p_30.npy',
 '../models/random_forest_model.p_31.npy',
 '../models/random_forest_model.p_32.npy',
 '../models/random_forest_model.p_33.npy',
 '../models/random_forest_model.p_34.npy',
 '../models/random_forest_model.p_35.npy',
 '../models/random_forest_model.p_36.npy',
 '../models/random_forest_model.p_37.npy',
 '../models/random_forest_model.p_38.npy',
 '../models/random_forest_model.p_39.npy',
 '../models/random_forest_model.p_40.npy',
 '../models/random_forest_model.p_41.npy']

Support Vector Machine Cross Validation


In [ ]:
# from sklearn.svm import SVC
# from sklearn.grid_search import GridSearchCV
# from sklearn.cross_validation import StratifiedKFold

# folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)

# parameters = {'kernel':['linear', 'poly', 'rbf'], 'C':[64, 32, 16, 8], 'probability':[False], 'max_iter':[1000]}
# svm_clf = SVC()

# clf = GridSearchCV(svm_clf, parameters, n_jobs=-1, pre_dispatch='n_jobs', cv=folder, refit=True, verbose=5)
# clf.fit(train_dat, train_lbl_txt)
# clf.score(test_dat, test_lbl_txt)

In [ ]:
# print 'best score:', clf.best_score_
# print 'best params:', clf.best_params_
