In [4]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
import random
# This enables inline Plots
%matplotlib inline
In [5]:
data = pd.read_csv('crx.data', header=None, na_values='?')
#print(data.describe())
#print(data.info())
In [6]:
#Missing values
# A1: 12 --> category... will not impute
# A2: 12
# A4: 6 --> category... will not impute
# A5: 6 --> category... will not impute
# A6: 9 --> category... will not impute
# A7: 9 --> category... will not impute
# A14: 13
# Impute function: draw n values from a normal distribution fitted to
# the column's observed mean and standard deviation
def get_data_impute_values(column, n):
    return np.random.normal(column.mean(), column.std(), n)
# Column 1 (A2): impute its 12 missing values
data1_missing_mask = data[1].isnull()
data.loc[data1_missing_mask, 1] = get_data_impute_values(data[1], n=data1_missing_mask.sum())
# Column 13 (A14): impute its 13 missing values the same way
data13_missing_mask = data[13].isnull()
data.loc[data13_missing_mask, 13] = get_data_impute_values(data[13], n=data13_missing_mask.sum())
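A quick sanity check (a sketch, assuming columns 1 and 13 are the only numeric columns that were imputed): after imputation neither column should contain any remaining NaNs.
In [ ]:
# The imputed numeric columns should report zero missing values
print(data[[1, 13]].isnull().sum())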
In [7]:
# Change the target column (15) from '+'/'-' to 1/0
data.loc[data[15] == '+',15] = 1
data.loc[data[15] == '-',15] = 0
data[15] = data[15].astype(int)
data.info()
#Drop NAs
data = data.dropna()
data.info()
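As a small sketch, it helps to confirm how many rows survive the dropna and how the target classes are balanced (the exact counts depend on the data file).
In [ ]:
# Rows remaining after dropping categorical NaNs, and the +/- class balance
print(len(data))
print(data[15].value_counts())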
In [8]:
# Create dummy variables for the categorical columns and join them
# onto the numeric columns, building the full feature matrix
cat_data = [0, 3, 4, 5, 6, 8, 9, 11, 12]
num_data = [1, 2, 7, 10, 13, 14]
dataX = data[num_data]
for i in cat_data:
    dataX = dataX.merge(pd.get_dummies(data[i], prefix=str(i)), right_index=True, left_index=True)
    print(i, len(dataX))
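A short follow-up sketch to confirm the design matrix is fully numeric and to see how many dummy columns were created:
In [ ]:
# Final feature-matrix shape and column dtypes (all columns should be numeric)
print(dataX.shape)
print(dataX.dtypes.value_counts())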
In [9]:
#Split Data
X_train, X_test, y_train, y_test = train_test_split(dataX, data[15], random_state=12, test_size=0.3)
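A quick look at the split sizes and class balance (a sketch; stratification is not used above, so the proportions can differ slightly between train and test):
In [ ]:
print(X_train.shape, X_test.shape)
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))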
In [10]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[10]:
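The single held-out score above depends on the particular split; a cross-validated estimate is a useful complement. This is a sketch, and the 5-fold choice is an assumption.
In [ ]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(), dataX, data[15], cv=5)
print(cv_scores.mean(), cv_scores.std())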
In [11]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = clf.predict(X_test)
In [12]:
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)
Out[12]:
In [13]:
import seaborn as sns
# Examine precision and recall
print(classification_report(y_test, y_pred))
# Take a look at predicted vs. true values (jitter spreads the overlapping 0/1 points)
sns.regplot(x=y_pred, y=y_test, x_jitter=0.1, y_jitter=0.1)
Out[13]:
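The jittered regplot works, but for a binary target an annotated confusion-matrix heatmap is often easier to read; a minimal sketch:
In [ ]:
# Heatmap of the confusion matrix for the logistic regression predictions
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.xlabel('predicted')
plt.ylabel('true')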
In [14]:
# SVM: linear support vector classifier, heavily regularized (C=1e-3)
C = 1e-3
est = LinearSVC(C=C, random_state=1)  # seed the estimator directly for reproducibility
est.fit(X_train, y_train)
est.score(X_test, y_test)
Out[14]:
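With C=1e-3 the model is heavily regularized; sweeping C over a log grid shows how sensitive LinearSVC is to that choice (a sketch; the grid values are an assumption, and convergence warnings may appear for some settings).
In [ ]:
for C in np.logspace(-3, 3, 7):
    est = LinearSVC(C=C, random_state=1)
    est.fit(X_train, y_train)
    print('C=%g  test accuracy=%.3f' % (C, est.score(X_test, y_test)))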
In [17]:
from sklearn import svm
parameters = {'kernel': ['rbf'], 'C': np.logspace(-3., 3., 10), 'gamma': np.logspace(-3., 3., 10)}
svr = svm.SVC()
clf = GridSearchCV(svr, parameters, n_jobs=3)
clf.fit(X_train, y_train)
# Re-predict with the tuned model so the report reflects it, not the earlier logistic regression
y_pred = clf.predict(X_test)
# Examine precision and recall
print(classification_report(y_test, y_pred))
# Take a look at predicted vs. true values
sns.regplot(x=y_pred, y=y_test, x_jitter=0.1, y_jitter=0.1)
Out[17]:
In [18]:
clf.score(X_test, y_test)
Out[18]:
In [20]:
# Best hyper-parameters found by the grid search
print(clf.best_params_)
# Confusion matrix for Type 1 and Type 2 error (tuned model's predictions)
confusion_matrix(y_test, y_pred)
Out[20]:
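The grid search also records the mean cross-validated score of the best parameter setting, which is usually a less optimistic number than the single test-set score above; a quick sketch:
In [ ]:
# Mean cross-validated accuracy of the best (C, gamma) combination found by GridSearchCV
print(clf.best_score_)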
In [ ]: