In [4]:
# Ignore scikit-learn deprecation warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
In [5]:
import os
print(os.getcwd())
#os.chdir('../blocking/')
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
In [6]:
from numpy import genfromtxt
# train = pd.read_csv('train_reduced.csv', encoding="ISO-8859-1", index_col='_id')
# test = pd.read_csv('test_reduced.csv', encoding="ISO-8859-1", index_col='_id')
#myData = genfromtxt('train.csv',delimiter=',')
#train = genfromtxt('train_reduced_CME_test.csv',delimiter=',')
#test = genfromtxt('test_reduced_CME_test.csv',delimiter=',')
train = genfromtxt('train_reduced.csv',delimiter=',')
test = genfromtxt('test_reduced.csv',delimiter=',')
#print(train.shape)
# test = test.dropna()
# test.target
train = train[~np.isnan(train).any(axis=1)]
y = train[:,4] # label
X = train[:,0:3] # data
#print(X)
#print(y)
#print(X.shape)
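Before settling on dropping rows, it is worth knowing how many rows the NaN filter removes. A small sketch (reloading the raw file, since train has already been filtered above):
# Sketch only: count how many rows the NaN filter above discards.
train_raw = genfromtxt('train_reduced.csv', delimiter=',')
n_bad = np.isnan(train_raw).any(axis=1).sum()
print(n_bad, "of", train_raw.shape[0], "rows contain a NaN")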
In [7]:
# Awesome sauce. Now handle the missing values: the fill-in option is left commented out below, and rows with NaNs are dropped instead
# reload train
train = genfromtxt('train_reduced.csv',delimiter=',')
# get rid of first row (I think this is just column labels)
train = train[1:,:]
# Option: replace missing featureVals with 0.5 (halfway in between) -- see the sketch after this cell
# where_are_NaNs = np.isnan(train)
# train[where_are_NaNs] = 0
##Drop all nans:
train = train[~np.isnan(train).any(axis=1)]
print(train.shape)
print(train[6,:])
print()
print(train[6,train.shape[1]-1])
len(train)
y = train[:,train.shape[1]-1] # label
X = train[:,1:train.shape[1]-1] # data
print('X!')
print(X.shape)
print('y')
print(y.shape)
print(y[6])
print('X')
print(X.shape)
print(X[6])
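The commented-out block above describes filling missing featureVals with 0.5; here is a standalone sketch of that alternative, kept separate so the dropped-NaN X and y used below are unchanged:
# Sketch only: fill-with-0.5 alternative to dropping rows with NaNs.
train_filled = genfromtxt('train_reduced.csv', delimiter=',')[1:, :]
where_are_NaNs = np.isnan(train_filled)
train_filled[where_are_NaNs] = 0.5          # halfway in between, per the comment above
y_filled = train_filled[:, train_filled.shape[1]-1]
X_filled = train_filled[:, 1:train_filled.shape[1]-1]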
In [8]:
len(X)
Out[8]:
In [9]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import *
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import *
models = [LogisticRegression(),
          DecisionTreeClassifier(),
          RandomForestClassifier(n_estimators=15, max_depth=15),
          LinearSVC(),
          SVC(),
          GaussianNB(),
          KNeighborsClassifier(n_neighbors=20),
          MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(8, 6))
         ]
names = ['Logistic Regression',
         'Decision Tree',
         'Random Forest',
         'LinearSVC',
         'SVC',
         'Naive Bayes',
         'KNN',
         'MLP'
        ]
metrics = ['accuracy', 'precision', 'recall', 'f1']

def compute_metrics(name, model):
    model.fit(X, y)
    print("{}:".format(name))
    for metric in metrics:
        print(metric + ':', cross_val_score(model, X, y, cv=10, scoring=metric).mean())
    print('')

for i in range(len(names)):
    compute_metrics(names[i], models[i])
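The mean scores above hide per-class behaviour. As a rough sketch (reusing X, y, and the same deprecated sklearn.cross_validation API as above), out-of-fold predictions can be turned into a confusion matrix and a per-class report:
# Sketch only: per-class breakdown for the random forest configuration above.
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix

rf = RandomForestClassifier(n_estimators=15, max_depth=15)
y_pred_cv = cross_val_predict(rf, X, y, cv=10)   # out-of-fold predictions
print(confusion_matrix(y, y_pred_cv))            # rows = true label, cols = predicted
print(classification_report(y, y_pred_cv))       # precision/recall/f1 per class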
In [ ]:
# CME's grid search stuff
from sklearn.grid_search import GridSearchCV

# random forest
max_features = np.arange(1, 19, 4)
min_samples_split = np.arange(2, 11, 4)   # must be >= 2
min_samples_leaf = np.arange(1, 11, 4)
n_estimators = np.arange(15, 26, 4)
clf = RandomForestClassifier()
param_grid = {"max_depth": [3, 6, 9, None],
              "n_estimators": n_estimators,
              "max_features": max_features,
              "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf,
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, scoring='f1', cv=10)
grid_search.fit(X, y)
print("randForest: " + str(grid_search.best_score_))
print("randForest: " + str(grid_search.best_estimator_))

# Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
param_grid = {"max_depth": list(range(1, 20)) + [None]}
DTC = DecisionTreeClassifier()
# run grid search
grid_search_ABC = GridSearchCV(DTC, param_grid=param_grid, cv=10, scoring='f1')
grid_search_ABC.fit(X, y)
print("DT: " + str(grid_search_ABC.best_score_))
print("Best estimator: " + str(grid_search_ABC.best_estimator_))

# KNN tuning
knn = KNeighborsClassifier()
k_range = np.arange(1, 50)
param_grid = dict(n_neighbors=k_range)
gridKNN = GridSearchCV(knn, param_grid, cv=10, scoring='f1')
gridKNN.fit(X, y)
print("KNN: " + str(gridKNN.best_score_))
print("KNN: " + str(gridKNN.best_estimator_))

# SVC tuning
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=10, scoring='f1')
clf.fit(X, y)
print("SVC: " + str(clf.best_score_))
print("SVC: " + str(clf.best_estimator_))
In [ ]:
# MLP
from itertools import product
tuned_parameters = {'solver': ['lbfgs'],
                    'alpha': [1e-3, 1e-4],
                    'hidden_layer_sizes': list(product(range(2, 15), repeat=1)) +
                                          list(product(range(3, 9), repeat=2))}
clf = GridSearchCV(MLPClassifier(), tuned_parameters, cv=5, scoring='f1')
clf.fit(X,y)
print(clf.best_score_)
compute_metrics('MLP', clf.best_estimator_)
print(clf.best_estimator_)
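Once a winner emerges from the grid searches, it can be persisted so the matching pipeline does not have to re-run the tuning. A minimal sketch using the standard-library pickle module (the filename is illustrative):
# Sketch only: save the tuned estimator; 'best_matcher.pkl' is an arbitrary name.
import pickle
with open('best_matcher.pkl', 'wb') as f:
    pickle.dump(clf.best_estimator_, f)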
In [ ]: