In [4]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
import random


# This enables inline Plots
%matplotlib inline

In [5]:
#bookings = pd.read_csv('../data/bookings.csv')
data = pd.read_table('crx.data',sep=',', header = None, na_values='?')
#print(data.describe())
#print(data.info())

In [6]:
#Missing values

#    A1:  12 --> category... will not impute
#    A2:  12
#    A4:   6 --> category... will not impute
#    A5:   6 --> category... will not impute
#    A6:   9 --> category... will not impute
#    A7:   9 --> category... will not impute
#    A14: 13 


#Impute Function
def get_data_impute_values(column, n):
    return np.random.normal(column.mean(), column.std(), n)

#col 1
#Impute!
data1_missing_mask = data[1].isnull()
#print('Mean col 1:', data[1].mean())
#print('Std col 1:', data[1].std())
data.loc[data[1].isnull(),1] = get_data_impute_values(data[1],n=12)

#Col 13
data[data[13].isnull()]
data.loc[data[13].isnull(),13] = get_data_impute_values(data[13],n=len(data[13].isnull()))

In [7]:
#change column 15 to 0's and 1s
data.loc[data[15] == '+',15] = 1
data.loc[data[15] == '-',15] = 0
data[15] = data[15].astype(int)
data.info()

#Drop NAs
data = data.dropna()
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
0     678 non-null object
1     690 non-null float64
2     690 non-null float64
3     684 non-null object
4     684 non-null object
5     681 non-null object
6     681 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null float64
14    690 non-null int64
15    690 non-null int32
dtypes: float64(4), int32(1), int64(2), object(9)<class 'pandas.core.frame.DataFrame'>
Int64Index: 671 entries, 0 to 689
Data columns (total 16 columns):
0     671 non-null object
1     671 non-null float64
2     671 non-null float64
3     671 non-null object
4     671 non-null object
5     671 non-null object
6     671 non-null object
7     671 non-null float64
8     671 non-null object
9     671 non-null object
10    671 non-null int64
11    671 non-null object
12    671 non-null object
13    671 non-null float64
14    671 non-null int64
15    671 non-null int32
dtypes: float64(4), int32(1), int64(2), object(9)

In [8]:
#Create Dummy Variables
cat_data = [0,3,4,5,6,8,9,11,12]
num_data = [1,2,7,10,13,14]
dataX = data[num_data]
#new_data = data[cat_data]

#new_data = pd.get_dummies(new_data)
for i in cat_data:
    dataX = dataX.merge(pd.get_dummies(data[i],i), right_index=True, left_index=True)
    print(i, len(dataX))


0 671
3 671
4 671
5 671
6 671
8 671
9 671
11 671
12 671

In [9]:
#Split Data
X_train, X_test, y_train, y_test = train_test_split(dataX, data[15], random_state=12, test_size=0.3)

In [10]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


Out[10]:
0.86138613861386137

In [11]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = clf.predict(X_test)

In [12]:
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)


Out[12]:
array([[95, 14],
       [14, 79]])

In [13]:
import seaborn as sns
# Examine Precision and Recall
print(classification_report(y_test, y_pred))
# Take a look at Predicted vs True Values
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)


             precision    recall  f1-score   support

          0       0.87      0.87      0.87       109
          1       0.85      0.85      0.85        93

avg / total       0.86      0.86      0.86       202

Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1751af60>

In [13]:


In [13]:


In [14]:
#SVM
#initialize C=1e-3
random.seed(1)
C=1e-3
est = LinearSVC(C=C)
est.fit(X_train, y_train)
est.score(X_test, y_test)


Out[14]:
0.82673267326732669

In [17]:
from sklearn import svm, grid_search, datasets
parameters = {'kernel':['rbf'], 'C':np.logspace(-3.,3.,10),'gamma':np.logspace(-3.,3.,10)}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters, n_jobs=3)
clf.fit(X_train, y_train)

# Examine Precision and Recall
print(classification_report(y_test, y_pred))

# Take a look at Predicted vs True Values
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x190cbb70>

In [18]:
clf.score(X_test, y_test)


Out[18]:
0.67821782178217827

In [20]:
clf.best_params_

# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)


Out[20]:
array([[95, 14],
       [14, 79]])

In [ ]: