In [1]:
import numpy as np
import pickle as pk
import pandas as pd
import timeit as tm
import csv
import sys
In [2]:
# Open training data to pandas
train_dat_pandas = pd.read_csv('../data/clean_data/train_vectors.csv', index_col=0, encoding='utf-8')
del train_dat_pandas['TYPE']
# Open training labels to pandas
train_lbl_pandas = pd.read_csv('../data/clean_data/train_labels.csv', index_col=0, encoding='utf-8')
del train_lbl_pandas['YEAR']
# Save headers
headers = list(train_dat_pandas)
# Convert pandas to numpy matrix
train_dat = train_dat_pandas.as_matrix()
print 'training data dimensions:', train_dat.shape
# Convert pandas to numpy matrix
train_lbl = train_lbl_pandas.as_matrix()
print 'training label dimensions:', train_lbl.shape
In [3]:
# Open test data
test_dat_pandas = pd.read_csv('../data/clean_data/test_vectors.csv', index_col=0, encoding='utf-8')
del test_dat_pandas['TYPE']
# Open test labels
test_lbl_pandas = pd.read_csv('../data/clean_data/test_labels.csv', index_col=0, encoding='utf-8')
del test_lbl_pandas['YEAR']
# Convert pandas to numpy matrix
test_dat = test_dat_pandas.as_matrix()
print 'testing data dimensions:', test_dat.shape
# Convert pandas to numpy matrix
test_lbl = test_lbl_pandas.as_matrix()
print 'testing label dimensions:', test_lbl.shape
In [4]:
full_dat_pandas = pd.concat([train_dat_pandas, test_dat_pandas])
full_dat = full_dat_pandas.as_matrix()
full_lbl_pandas = pd.concat([train_lbl_pandas, test_lbl_pandas])
full_lbl = full_lbl_pandas.as_matrix()
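In [ ]:
# Sanity check (sketch): the concatenation stacks train and test row-wise, so
# the row counts of data and labels must still line up.
print 'full data dimensions:', full_dat.shape
print 'full label dimensions:', full_lbl.shape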
In [5]:
# Convert a one-hot encoded array into an array of numeric class labels
def onehot_2_numeric(onehot):
    numeric = []
    for elem in onehot:
        result = 0
        for i, k in enumerate(elem):
            result += i * k
        numeric.append(result)
    return np.asarray(numeric)
train_lbl_txt = onehot_2_numeric(train_lbl)
test_lbl_txt = onehot_2_numeric(test_lbl)
full_lbl_txt = onehot_2_numeric(full_lbl)
print train_lbl_txt, test_lbl_txt, full_lbl_txt
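In [ ]:
# A vectorized alternative (sketch): for strict one-hot rows, the index of the
# maximum entry equals the sum of i * k computed by onehot_2_numeric.
assert np.array_equal(train_lbl_txt, np.argmax(train_lbl, axis=1))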
In [6]:
# Feature vector scaling (zero mean, unit variance per column)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_dat)
train_dat = scaler.transform(train_dat)
test_dat = scaler.transform(test_dat)
full_dat = scaler.fit_transform(full_dat)
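In [ ]:
# Quick check (sketch): after StandardScaler each training feature should have
# mean close to 0 and standard deviation close to 1.
print 'feature means (first 5):', train_dat.mean(axis=0)[:5]
print 'feature stds (first 5):', train_dat.std(axis=0)[:5]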
In [ ]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components='mle')
# print pca
In [ ]:
# pca.fit(train_dat)
# print train_dat.shape
# train_dat = pca.transform(train_dat)
# print train_dat.shape
# print test_dat.shape
# test_dat = pca.transform(test_dat)
# print test_dat.shape
In [ ]:
from sklearn.linear_model import LinearRegression
# Fit Linear Regression
lin_reg = LinearRegression(n_jobs=-1, normalize=True)
lin_reg.fit(train_dat, train_lbl)
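In [ ]:
# The regression is multi-output (sketch): scikit-learn fits one coefficient
# row per label column, so coef_ has shape (n_labels, n_features).
print 'coefficient matrix shape:', lin_reg.coef_.shape
print 'intercept vector shape:', lin_reg.intercept_.shape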
In [ ]:
# Generate predictions
predictions = lin_reg.predict(test_dat)
print predictions.shape
In [ ]:
# Compute RMSE
import math
errors = []
# compute the squared error for each test example
for i in xrange(predictions.shape[0]):
    p = predictions[i]
    t = test_lbl[i]
    # squared Euclidean distance between predicted and true label vectors
    squared_distance = 0.0
    for j in xrange(predictions.shape[1]):
        squared_distance += (p[j] - t[j])**2
    errors.append(squared_distance)
rmse = math.sqrt(sum(errors)/len(errors))
print 'Root mean squared error:', rmse
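In [ ]:
# The same RMSE computed vectorized with numpy (sketch): mean of the squared
# row distances, then the square root.
print 'Vectorized RMSE:', np.sqrt(((predictions - test_lbl)**2).sum(axis=1).mean())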
In [ ]:
# Save model
from sklearn.externals import joblib
joblib.dump(lin_reg, '../models/linear_regression_model.p')
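In [ ]:
# Round-trip check (sketch): joblib.load restores the pickled model so it can
# be reused without retraining.
lin_reg_loaded = joblib.load('../models/linear_regression_model.p')
print lin_reg_loaded.predict(test_dat).shape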
In [ ]:
# from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(n_jobs=-1)
# clf.fit(train_dat, train_lbl_txt)
In [ ]:
# predictions = clf.predict(test_dat)
# p_predictions = clf.predict_proba(test_dat)
# print 'predictions dimensions:', predictions.shape
# print 'probabilities per class:', p_predictions.shape
In [ ]:
# # Table of probabilities for each class
# for i in range(6):
#     print str(i)+'\t',
# print ''
# for i in xrange(len(p_predictions)):
#     for j in xrange(len(p_predictions[i])):
#         print("%.2f" % (p_predictions[i][j]*100))+'%\t',
#     print ''
In [ ]:
# from sklearn.metrics import accuracy_score
# score = accuracy_score(test_lbl_txt, predictions)
# print score
In [ ]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import StratifiedKFold
folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)
clf = LogisticRegressionCV(n_jobs=-1, solver='liblinear', cv=folder, verbose=5)
print clf
In [ ]:
clf = clf.fit(train_dat, train_lbl_txt)
print clf.score(test_dat, test_lbl_txt)
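In [ ]:
# LogisticRegressionCV keeps the regularization strength it selected during
# cross-validation in the C_ attribute (sketch of inspecting it).
print 'selected C per class:', clf.C_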
In [ ]:
# Save model
from sklearn.externals import joblib
joblib.dump(clf, '../models/logistic_regression_model.p')
In [ ]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(train_dat, train_lbl_txt)
predictions = clf.predict(test_dat)
from sklearn.metrics import accuracy_score
score = accuracy_score(test_lbl_txt, predictions)
print score
In [ ]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold
folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)
parameters = {'max_depth':[None, 2, 4, 8, 16, 32, 64]}
dtc_clf = DecisionTreeClassifier()
clf = GridSearchCV(dtc_clf, parameters, n_jobs=-1, pre_dispatch='n_jobs', cv=folder, refit=True, verbose=5)
clf.fit(train_dat, train_lbl_txt)
print 'Score on test data:', clf.score(test_dat, test_lbl_txt)
print 'best params:', clf.best_params_
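In [ ]:
# Per-candidate CV results (sketch): the old sklearn.grid_search GridSearchCV
# exposes them as grid_scores_.
for grid_score in clf.grid_scores_:
    print grid_score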
In [ ]:
# Save model
from sklearn.externals import joblib
joblib.dump(clf, '../models/decision_tree_model.p')
In [ ]:
from sklearn.cross_validation import StratifiedKFold
# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)
# Do multiple runs and save them to the runs list
runs = []
depths = [None, 2, 4, 8, 16, 32, 64]
print 'this will take a while...',
for d in depths:
    clf = DecisionTreeClassifier(max_depth=d)
    for t, v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        score = clf.score(val, val_lbl)
        runs.append((d, score))
        print d, score
print 'done!'
In [ ]:
best_result = max(runs, key=lambda run: run[1])
print 'Best result:', best_result
best_d = best_result[0]
clf = DecisionTreeClassifier(max_depth=best_d)
clf.fit(train_dat, train_lbl_txt)
print 'Score on test data:', clf.score(test_dat, test_lbl_txt)
In [7]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clf.fit(train_dat, train_lbl_txt)
clf.score(test_dat, test_lbl_txt)
Out[7]:
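In [ ]:
# Impurity-based feature importances (sketch): rank the column names saved in
# `headers` by the fitted forest's feature_importances_ scores.
for imp, name in sorted(zip(clf.feature_importances_, headers), reverse=True)[:10]:
    print '%.4f  %s' % (imp, name)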
In [8]:
from sklearn.cross_validation import StratifiedKFold
# Generate k-fold split in the training data
kf = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=True)
# Do multiple runs and save them to the runs list
runs = []
params = []
depths = [None, 2, 4, 8, 16, 32, 64]
max_features = ['auto', 'log2', None]
criterions = ['gini', 'entropy']
for d in depths:
    for mf in max_features:
        for c in criterions:
            params.append([d, mf, c])
print 'this will take a while...'
for d, mf, c in params:
    clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
    print 'run:', d, mf, c,
    for t, v in kf:
        trn = train_dat[t]
        val = train_dat[v]
        trn_lbl = train_lbl_txt[t]
        val_lbl = train_lbl_txt[v]
        clf.fit(trn, trn_lbl)
        score = clf.score(val, val_lbl)
        runs.append([score, d, mf, c])
    print 'done!'
print 'All done!'
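In [ ]:
# Top validation runs across the parameter grid (sketch).
for score, d, mf, c in sorted(runs, reverse=True)[:3]:
    print '%.4f  max_depth=%s  max_features=%s  criterion=%s' % (score, d, mf, c)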
In [9]:
champion = max(runs, key=lambda run: run[0])
score, d, mf, c = champion
clf = RandomForestClassifier(n_jobs=-1, max_depth=d, max_features=mf, criterion=c)
clf.fit(full_dat, full_lbl_txt)
from sklearn.externals import joblib
joblib.dump(clf, '../models/random_forest_model.p')
Out[9]:
In [ ]:
# from sklearn.svm import SVC
# from sklearn.grid_search import GridSearchCV
# from sklearn.cross_validation import StratifiedKFold
# folder = StratifiedKFold(train_lbl_txt, n_folds=5, shuffle=False)
# parameters = {'kernel':['linear', 'poly', 'rbf'], 'C':[64, 32, 16, 8], 'probability':[False], 'max_iter':[1000]}
# svm_clf = SVC()
# clf = GridSearchCV(svm_clf, parameters, n_jobs=-1, pre_dispatch='n_jobs', cv=folder, refit=True, verbose=5)
# clf.fit(train_dat, train_lbl_txt)
# clf.score(test_dat, test_lbl_txt)
In [ ]:
# print 'best score:', clf.best_score_
# print 'best params:', clf.best_params_