In [4]:
#Ignore scikit learn deprication warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [5]:
import os
# Print the working directory so the relative CSV paths used below can be
# sanity-checked against it.
print(os.getcwd())
# NOTE(review): left here from when the notebook was run from a different
# directory; only uncomment if the CSVs live under ../blocking/.
#os.chdir('../blocking/')
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re


/Users/tammi/Desktop/CS 638 Stuffs/cs638project/learning

In [6]:
# Load the reduced train/test feature-vector CSVs as numeric arrays.
# genfromtxt parses any non-numeric field (e.g. the header row) as NaN.
from numpy import genfromtxt

train = genfromtxt('train_reduced.csv', delimiter=',')
test = genfromtxt('test_reduced.csv', delimiter=',')

# Drop every row containing a NaN — this also removes the header row,
# since its text fields parse as NaN.
train = train[~np.isnan(train).any(axis=1)]

y = train[:, 4]    # label column
# NOTE(review): 0:3 selects columns 0-2 only, so column 3 is never used as a
# feature even though column 4 is the label — this looks like an off-by-one
# (0:4 was probably intended).  Left unchanged; the next cell rebuilds X/y
# from the full column layout anyway.  TODO confirm against the CSV schema.
X = train[:, 0:3]  # feature columns

In [7]:
# Reload the training matrix and build (X, y) using the real column layout:
# column 0 = pair id, columns 1..n-2 = features, last column = label.
train = genfromtxt('train_reduced.csv', delimiter=',')

# Drop the first row (column headers, parsed as NaN by genfromtxt).
train = train[1:, :]

# Drop any remaining rows with missing feature values.
# (Imputing with 0 or 0.5 was tried earlier; dropping kept results cleaner.)
train = train[~np.isnan(train).any(axis=1)]

print(train.shape)
print(train[6, :])
# BUG FIX: this was a bare `print` (a no-op expression in Python 3);
# print() emits the intended blank line.
print()
print(train[6, -1])  # label of sample row 6

y = train[:, -1]     # last column = label
X = train[:, 1:-1]   # skip the id column (0) and the label column (last)
print('X!')
print(X.shape)

print('y')
print(y.shape)
print(y[6])
print('X')
print(X.shape)
print(X[6])


(124, 19)
[  8.16009400e+06   1.96078431e-01   2.85714286e-01   0.00000000e+00
   0.00000000e+00   0.00000000e+00   4.63296703e-01   2.10000000e+01
   1.60000000e-01  -8.00000000e+00   3.00000000e+00   1.00000000e+00
   1.00000000e+00   0.00000000e+00   1.00000000e+00   1.00000000e+00
   0.00000000e+00   1.00000000e+00   0.00000000e+00]
0.0
X!
(124, 17)
y
(124,)
0.0
X
(124, 17)
[  0.19607843   0.28571429   0.           0.           0.           0.4632967
  21.           0.16        -8.           3.           1.           1.           0.
   1.           1.           0.           1.        ]

In [8]:
# Number of candidate pairs remaining after NaN filtering (uses the X built
# in the previous cell; displayed via the cell's last-expression output).
len(X)


Out[8]:
124

In [9]:
# Compare several off-the-shelf classifiers via 10-fold cross-validation.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 —
# on >= 0.18, import cross_val_score from sklearn.model_selection instead.
# MLPClassifier also requires scikit-learn >= 0.18; the ImportError recorded
# below this cell indicates an older install.
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC   # explicit imports instead of wildcard
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import *

models = [LogisticRegression(),
          DecisionTreeClassifier(),
          RandomForestClassifier(n_estimators=15, max_depth=15),
          LinearSVC(),
          SVC(),
          GaussianNB(),
          KNeighborsClassifier(n_neighbors=20),
          MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(8, 6))
         ]
names = ['Logistic Regression',
         'Decision Tree',
         'Random Forest',
         'LinearSVC',
         'SVC',
         'Naive Bayes',
         'KNN',
         'MLP'
        ]

metrics = ['accuracy', 'precision', 'recall', 'f1']

def compute_metrics(name, model):
    """Fit `model` on the global X/y and print its mean 10-fold CV scores.

    cross_val_score refits internally; the initial fit just leaves the model
    trained on the full data for later inspection.
    """
    model.fit(X, y)
    print("{}:".format(name))
    for metric in metrics:
        print(metric + ':', cross_val_score(model, X, y, cv=10, scoring=metric).mean())
    print('')

# zip instead of index-based iteration over two parallel lists
for name, model in zip(names, models):
    compute_metrics(name, model)


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-9-6fbe7fb14699> in <module>()
      6 from sklearn.naive_bayes import GaussianNB
      7 from sklearn.neighbors import KNeighborsClassifier
----> 8 from sklearn.neural_network import MLPClassifier
      9 from sklearn.metrics import *
     10 

ImportError: cannot import name 'MLPClassifier'

In [ ]:
# Hyper-parameter tuning via exhaustive grid search (10-fold CV, F1 score).
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20 — on
# >= 0.18, import GridSearchCV from sklearn.model_selection instead.
from sklearn.grid_search import GridSearchCV

# --- Random forest ---
max_features = np.arange(1, 19, 4)
# NOTE(review): min_samples_split=1 is rejected by newer scikit-learn
# (must be >= 2) — confirm against the installed version.
min_samples_split = np.arange(1, 11, 4)
min_samples_leaf = np.arange(1, 11, 4)
n_estimators = np.arange(15, 26, 4)
clf = RandomForestClassifier()
param_grid = {"max_depth": [3, 6, 9, None],
              "n_estimators": n_estimators,
              "max_features": max_features,
              "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf,
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

grid_search = GridSearchCV(clf, param_grid=param_grid, scoring='f1', cv=10)
grid_search.fit(X, y)
print("randForest:" + str(grid_search.best_score_))
print("randForest:" + str(grid_search.best_estimator_))


# --- Decision tree ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV

param_grid = {"max_depth": list(range(1, 20)) + [None]}
DTC = DecisionTreeClassifier()

grid_search_ABC = GridSearchCV(DTC, param_grid=param_grid, cv=10, scoring='f1')
grid_search_ABC.fit(X, y)
print("DT: " + str(grid_search_ABC.best_score_))
print("Best estimator: " + str(grid_search_ABC.best_estimator_))


# --- KNN ---
knn = KNeighborsClassifier()
k_range = np.arange(1, 50)
param_grid = dict(n_neighbors=k_range)
gridKNN = GridSearchCV(knn, param_grid, cv=10, scoring='f1')
gridKNN.fit(X, y)
# BUG FIX: these two prints were missing their closing parentheses
# (SyntaxError — the cell could never have executed as written).
print("KNN:" + str(gridKNN.best_score_))
print("KNN:" + str(gridKNN.best_estimator_))


# --- SVC ---
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=10,
                   scoring='f1')
clf.fit(X, y)
# BUG FIX: missing closing parentheses here as well.
print("SVC:" + str(clf.best_score_))
print("SVC:" + str(clf.best_estimator_))

In [ ]:
# Tune the MLP over two alpha values and a set of hidden-layer shapes:
# single layers of width 2-14, plus all two-layer combinations with
# 3-8 units per layer.
from itertools import product

one_layer = [(width,) for width in range(2, 15)]
two_layer = list(product(range(3, 9), repeat=2))
tuned_parameters = {'solver': ['lbfgs'],
                    'alpha': [1e-3, 1e-4],
                    'hidden_layer_sizes': one_layer + two_layer}

clf = GridSearchCV(MLPClassifier(), tuned_parameters, cv=5,
                   scoring='f1')
clf.fit(X, y)
print(clf.best_score_)
compute_metrics('MLP', clf.best_estimator_)
print(clf.best_estimator_)

In [ ]: