notebook.community

Edit and run



In [4]:

    
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
import random


# This enables inline Plots
%matplotlib inline



In [5]:

    
#bookings = pd.read_csv('../data/bookings.csv')
data = pd.read_table('crx.data',sep=',', header = None, na_values='?')
#print(data.describe())
#print(data.info())



In [6]:

    
#Missing values

#    A1:  12 --> category... will not impute
#    A2:  12
#    A4:   6 --> category... will not impute
#    A5:   6 --> category... will not impute
#    A6:   9 --> category... will not impute
#    A7:   9 --> category... will not impute
#    A14: 13 


#Impute Function
def get_data_impute_values(column, n):
    return np.random.normal(column.mean(), column.std(), n)

#col 1
#Impute!
data1_missing_mask = data[1].isnull()
#print('Mean col 1:', data[1].mean())
#print('Std col 1:', data[1].std())
data.loc[data[1].isnull(),1] = get_data_impute_values(data[1],n=12)

#Col 13
data[data[13].isnull()]
data.loc[data[13].isnull(),13] = get_data_impute_values(data[13],n=len(data[13].isnull()))



In [7]:

    
#change column 15 to 0's and 1s
data.loc[data[15] == '+',15] = 1
data.loc[data[15] == '-',15] = 0
data[15] = data[15].astype(int)
data.info()

#Drop NAs
data = data.dropna()
data.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
0     678 non-null object
1     690 non-null float64
2     690 non-null float64
3     684 non-null object
4     684 non-null object
5     681 non-null object
6     681 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null float64
14    690 non-null int64
15    690 non-null int32
dtypes: float64(4), int32(1), int64(2), object(9)<class 'pandas.core.frame.DataFrame'>
Int64Index: 671 entries, 0 to 689
Data columns (total 16 columns):
0     671 non-null object
1     671 non-null float64
2     671 non-null float64
3     671 non-null object
4     671 non-null object
5     671 non-null object
6     671 non-null object
7     671 non-null float64
8     671 non-null object
9     671 non-null object
10    671 non-null int64
11    671 non-null object
12    671 non-null object
13    671 non-null float64
14    671 non-null int64
15    671 non-null int32
dtypes: float64(4), int32(1), int64(2), object(9)



In [8]:

    
#Create Dummy Variables
cat_data = [0,3,4,5,6,8,9,11,12]
num_data = [1,2,7,10,13,14]
dataX = data[num_data]
#new_data = data[cat_data]

#new_data = pd.get_dummies(new_data)
for i in cat_data:
    dataX = dataX.merge(pd.get_dummies(data[i],i), right_index=True, left_index=True)
    print(i, len(dataX))



In [9]:

    
#Split Data
X_train, X_test, y_train, y_test = train_test_split(dataX, data[15], random_state=12, test_size=0.3)



In [10]:

    
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)









    Out[10]:





0.86138613861386137



In [11]:

    
from sklearn.metrics import confusion_matrix, classification_report
y_pred = clf.predict(X_test)



In [12]:

    
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)









    Out[12]:





array([[95, 14],
       [14, 79]])



In [13]:

    
import seaborn as sns
# Examine Precision and Recall
print(classification_report(y_test, y_pred))
# Take a look at Predicted vs True Values
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)









    



             precision    recall  f1-score   support

          0       0.87      0.87      0.87       109
          1       0.85      0.85      0.85        93

avg / total       0.86      0.86      0.86       202







    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x1751af60>



In [13]:



In [13]:



In [14]:

    
#SVM
#initialize C=1e-3
random.seed(1)
C=1e-3
est = LinearSVC(C=C)
est.fit(X_train, y_train)
est.score(X_test, y_test)









    Out[14]:





0.82673267326732669



In [17]:

    
from sklearn import svm, grid_search, datasets
parameters = {'kernel':['rbf'], 'C':np.logspace(-3.,3.,10),'gamma':np.logspace(-3.,3.,10)}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters, n_jobs=3)
clf.fit(X_train, y_train)

# Examine Precision and Recall
print(classification_report(y_test, y_pred))

# Take a look at Predicted vs True Values
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x190cbb70>



In [18]:

    
clf.score(X_test, y_test)









    Out[18]:





0.67821782178217827



In [20]:

    
clf.best_params_

# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)









    Out[20]:





array([[95, 14],
       [14, 79]])



In [ ]: