Impute missing data
Plot and visualize data to see any patterns
For the actual model, the submission Notebook should have the following -
Build models using Logistic Regression and SVM (you will learn these tonight - Wed).
Use Grid Search to evaluate model parameters (Wed Lab) and select a model.
Build a Confusion Matrix (Mon Lab) to show how well your predictions did.
The homework is due by Monday, Dec 15th, at midnight. Upload your submission the same way as Homework 1.
In [25]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_rows', 10)
In [26]:
df = pd.read_csv('/Users/ChristopherRuiz/Documents/Education/GA-Data_Science/DAT_SF_11/homeworks/data/crx.data', header=None)
df.info()
In [27]:
df.describe()
Out[27]:
In [28]:
# df[4]
# columnX = df[2]
# columnX[pd.isnull(columnX)]
# df[1].unique()
# There are no NaN values -- every column has some sort of entry.
# The missing-value marker turned out to be '?'. Replace '?' with NaN so we can compute the column mean.
# pandas skips NaN when computing summary statistics like the mean.
# Only columns 1 and 13 have '?' entries among the numeric columns; all other float/int columns have values.
df[1] = df[1].replace('?', np.nan)
df[13] = df[13].replace('?', np.nan)
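A quick way to confirm which columns still contain the '?' marker is to count it per column (a sketch; it only assumes '?' is the missing-value marker used throughout crx.data):
In [ ]:
# cast to str so numeric columns compare cleanly, then count '?' per column
(df.astype(str) == '?').sum()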
In [29]:
# converting columns 1 and 13 to float.
df[1] = df[1].astype(float)
df[13] = df[13].astype(float)
In [30]:
# checking for null values
df[df[1].isnull()]
Out[30]:
In [31]:
print 'Mean Col1:', df[1].mean()
print 'Std Col1:', df[1].std()
print ' '
print 'Mean Col13:', df[13].mean()
print 'Std Col13:', df[13].std()
In [32]:
# Draw impute values from a normal distribution centered on each column's mean and standard deviation
# (column 1: mean 31.57, std 11.96; column 13: mean 184.01, std 173.81).
# n is the number of missing entries to fill (12 for column 1, 13 for column 13).
def get_ColOne_impute_values(n):
    return np.random.normal(31.5681710914, 11.9578624983, n)

def get_ColThirteen_impute_values(n):
    return np.random.normal(184.014771049, 173.806768225, n)
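The same idea can be written once and reused for any numeric column (a sketch; impute_from_normal is a hypothetical helper, not something used elsewhere in this notebook):
In [ ]:
# draw one sample per missing entry, using the column's own mean and std
def impute_from_normal(series):
    n_missing = series.isnull().sum()
    return np.random.normal(series.mean(), series.std(), n_missing)

# example usage, equivalent to the column-specific helpers above:
# df.loc[df[1].isnull(), 1] = impute_from_normal(df[1])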
In [33]:
dfNullValuesColOne = df[1].isnull()
df[dfNullValuesColOne]
Out[33]:
In [34]:
df.loc[df[1].isnull(), 1] = get_ColOne_impute_values(n=12)
In [35]:
df[dfNullValuesColOne]
Out[35]:
In [36]:
dfNullValuesColThirteen = df[13].isnull()
df[dfNullValuesColThirteen]
Out[36]:
In [37]:
df.loc[df[13].isnull(), 13] = get_ColThirteen_impute_values(n=13)
df[dfNullValuesColThirteen]
Out[37]:
In [38]:
letters=['a','b']
np.random.choice(letters, 1)
Out[38]:
In [39]:
def get_ColZero_impute_values(n):
    # np.random.choice needs a list of candidate values plus a sample size
    return np.random.choice(['a', 'b'], n)
In [40]:
def A1_map(val):
    if val == 'b':
        return 1
    elif val == '?':
        return '?'
    else:
        return 0

df['A1_map'] = df[0].map(A1_map)
df.head(5)
df['A1_map'] = df['A1_map'].replace('?', np.nan)
df['A1_map'] = df['A1_map'].astype(float)
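A more compact alternative (a sketch): Series.map with a dict leaves any value not listed -- including '?' -- as NaN automatically, so the separate replace and astype steps aren't needed:
In [ ]:
# 'b' -> 1.0, 'a' -> 0.0, anything else (e.g. '?') -> NaN
df[0].map({'b': 1.0, 'a': 0.0}).head()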
In [41]:
print 'Mean A1_map:', df['A1_map'].mean()
print 'Std A1_map:', df['A1_map'].std()
In [42]:
# (re-defines the same column-13 impute helper as above; kept for reference)
def get_ColThirteen_impute_values(n):
    return np.random.normal(184.014771049, 173.806768225, n)
In [43]:
# df[df[0]=='?']
In [44]:
# dfNullValuesColZero = df[df[0]=='?']
# dfNullValuesColZero
# df.loc[df[df[0]=='?'],12 ] = get_ColZero_impute_values(n=12)
# df.loc[df[df[0]=='?'], 'Age'] = get_age_impute_values(n=177)
# df[dfNullValuesColThirteen]
In [45]:
# Remove rows with '?' in column 6 (df[6]); this takes care of the remaining missing entries in that column.
df = df[df[6] != '?']
df
Out[45]:
In [46]:
df[0].value_counts() / df[0].size
# impute the remaining '?' entries in column 0, sampling 'a'/'b' with the hard-coded proportions
# taken from the value_counts output above; observed values are left untouched
def impute_a1(val):
    if val == '?':
        return np.random.choice(['a', 'b'], p=[0.7, 0.3])
    return val

df[0] = df[0].map(impute_a1)
df[0].unique()
Out[46]:
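The sampling probabilities could also be read from the data instead of being hard-coded (a sketch; it assumes column 0 held only 'a', 'b', and the '?' marker before imputation):
In [ ]:
# observed proportions among the non-missing entries; the index gives the labels and
# the values give the weights to pass to np.random.choice
observed = df[0][df[0] != '?'].value_counts()
observed / observed.sum()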
In [47]:
df
Out[47]:
In [48]:
df[1].hist(by=df[5],bins=20, sharey=True)
Out[48]:
In [49]:
df[1].hist(by=df[0],bins=20, sharey=True)
Out[49]:
In [50]:
df[1].hist(bins=20)
Out[50]:
In [51]:
# Rechecking data for reference
df.describe()
Out[51]:
In [52]:
# Rechecking data for reference
df.head(5)
Out[52]:
In [53]:
# columnZeroCount = df.groupby([0])[15].agg(['count'])
# print columnZeroCount
# columnZeroCount.hist()
# df[0].values
df.info()
In [54]:
plt.scatter(df[1],df[7],s=70,alpha=.5)
plt.xlim(0)
plt.ylim(0)
Out[54]:
In [55]:
plt.scatter(df[1],df[14],s=70,alpha=.5)
plt.axis('tight')
# plt.xlim(0)
# plt.ylim(90)
Out[55]:
In [56]:
criteria = df[df[14]<950]
# criteria
plt.scatter(criteria[1],criteria[14],s=70,alpha=.5)
plt.axis('tight')
Out[56]:
In [57]:
criteria_Two = df[df[14]<150]
# criteria
plt.scatter(criteria_Two[1],criteria_Two[14],s=70,alpha=.5)
plt.axis('tight')
Out[57]:
In [58]:
plt.scatter(df[1],df[2],s=70,alpha=.5)
plt.xlim(0)
plt.ylim(0)
Out[58]:
In [59]:
plt.scatter(df[7],df[10],s=70,alpha=.5)
plt.xlim(0)
plt.ylim(0)
Out[59]:
In [60]:
plt.scatter(df[10],df[14],s=70,alpha=.5)
# plt.xlim(0, 50)
# plt.ylim(0,50)
plt.xlim(0)
plt.ylim(0)
Out[60]:
In [61]:
column_Zero = df.groupby(df[0]).agg('count')
column_Zero
# df.hist(by=df[0], sharey=True)
Out[61]:
In [62]:
# for i in list(df.columns.values):
# print i
pd.scatter_matrix(df[[0, 1, 2, 3,4,5,6,7,8,9,10,11,12,13,14,15]]);
In [63]:
from mpl_toolkits.mplot3d import Axes3D
ThreeD_Graph = plt.figure().gca(projection='3d')
ThreeD_Graph.scatter(df[1], df[2], df[13], s = 5)
ThreeD_Graph.set_xlabel(1)
ThreeD_Graph.set_ylabel(2)
ThreeD_Graph.set_zlabel(13)
plt.show()
In [64]:
box_Color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Gray')
# df[1].plot(kind='box', color=box_Color, sym='r+')
# df[2].plot(kind='box', color=box_Color, sym='r+')
# df[7].plot(kind='box', color=box_Color, sym='r+')
df[13].plot(kind='box', color=box_Color, sym='r+')
# df[14].plot(kind='box', color=box_Color, sym='r+')
Out[64]:
In [65]:
plt.scatter(df[1],df[2],c=df[13], s=70,alpha=.5)
Out[65]:
In [66]:
X_data = df[[1,2,13]]
In [67]:
X_data.info()
In [68]:
X_data = pd.get_dummies(X_data)
In [69]:
y_data = df[14]
In [70]:
X_data
Out[70]:
In [71]:
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=12, test_size=0.2)
In [73]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
Out[73]:
In [74]:
clf.score(X_test, y_test)
Out[74]:
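A single train/test split can give a noisy estimate, so a cross-validated score is a useful check (a sketch using the same older sklearn.cross_validation API as the rest of the notebook):
In [ ]:
from sklearn.cross_validation import cross_val_score
# mean 5-fold cross-validated accuracy for the same logistic regression model
cross_val_score(LogisticRegression(), X_data, y_data, cv=5).mean()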
In [75]:
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)
Out[75]:
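The raw counts are easier to read with the class labels attached (a sketch; clf.classes_ holds the label order sklearn uses for both the rows and the columns):
In [ ]:
# rows = true labels, columns = predicted labels
pd.DataFrame(confusion_matrix(y_test, y_pred), index=clf.classes_, columns=clf.classes_)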
In [76]:
print classification_report(y_test, y_pred)
# A warning appeared:
# /Library/Python/2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
#   'precision', 'predicted', average, warn_for)
# This means at least one class never appears in y_pred, so its precision cannot be computed and is reported as 0.0.
In [77]:
pd.DataFrame(zip(X_data.columns, np.transpose(clf.coef_)))
Out[77]:
In [78]:
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test
y_pred_df.tail()
Out[78]:
In [79]:
sns.regplot(y_pred, y_test, x_jitter=0.1, y_jitter=0.1)
Out[79]:
Build an SVM classifier using sklearn.svm.LinearSVC. What is the default value of the regularization parameter C?
In [80]:
#initialize C=1e-3
# the documentation tells us the default value is 1
est = LinearSVC(C=1e-3)
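The default can also be confirmed directly from an unconfigured estimator rather than the docs (a sketch using get_params, which every sklearn estimator provides):
In [ ]:
# LinearSVC() with no arguments carries the library defaults; C comes back as 1.0
LinearSVC().get_params()['C']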
Fit the model with the training data
In [81]:
est.fit(X_train, y_train)
Out[81]:
Score our model using test data
In [82]:
est.score(X_test, y_test)
Out[82]:
In [83]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# We generate a grid in the square [-3, 3]^2.
xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
                     np.linspace(-3, 3, 500))

# This function takes a fitted SVM estimator as input, plus the points and labels to overlay.
# def plot_decision_function(fitted_est):
def plot_decision_function(fitted_est, X, y):
    # We evaluate the decision function on the grid.
    Z = fitted_est.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cmap = plt.cm.coolwarm
    # We display the decision function on the grid.
    plt.figure(figsize=(5, 5))
    plt.imshow(Z,
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               aspect='auto', origin='lower', cmap=cmap)
    # We display the decision boundary.
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='k')
    # We display the points with their true labels.
    plt.scatter(X[:, 0], X[:, 1], s=30, c=.5 + .5 * y, lw=1,
                cmap=cmap, vmin=0, vmax=1)
    plt.axhline(0, color='k', ls='--')
    plt.axvline(0, color='k', ls='--')
    plt.xticks(())
    plt.yticks(())
    plt.axis([-3, 3, -3, 3])
In [84]:
est.fit(X_train, y_train)
Out[84]:
In [85]:
# NOTE: the grid inside plot_decision_function is 2-D, so the estimator must have been fit on
# exactly two features (e.g. refit on df[[1, 2]]); otherwise decision_function raises a shape error.
plot_decision_function(est, df[[1, 2]].values, df[13].values)
# plot_decision_function(est)
Use Grid Search to tune C over the values in np.logspace(-3., 3., 10). Read the example for grid search in sklearn.grid_search.GridSearchCV.
In [ ]:
from sklearn.grid_search import GridSearchCV
# d = {'C':[0,2,1]}
# d['C'] = np.logspace(-3., 3., 10)
# clf = GridSearchCV(svr, parameters)
gs = GridSearchCV(LinearSVC(),{'C':np.logspace(-3., 3., 10)})
# GridSearch will look through all of your parameters
# iris = datasets.load_iris()
# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# svr = svm.SVC()
# clf = grid_search.GridSearchCV(svr, parameters)
# clf.fit(iris.data, iris.target)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
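Beyond best_params_, the fitted grid search also records the best cross-validated score and the mean score for every C tried (a sketch; grid_scores_ is the results attribute in this older sklearn.grid_search API):
In [ ]:
print 'best CV score:', gs.best_score_
# each entry holds the parameter setting, its mean CV score, and the per-fold scores
for params, mean_score, fold_scores in gs.grid_scores_:
    print params, mean_score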
For this you will use sklearn.svm.SVC
Let's first make sure we understand how to read the documentation:
Implement an SVM classifier using the defaults and fit to our data:
In [ ]:
from sklearn.svm import SVC
svcL = SVC()
svcL.fit(X_train, y_train)
Plot the decision function
In [ ]:
plot_decision_function(svcL,df[[1,2]].values,df[13].values)
In [ ]:
param = {'C':np.logspace(-3., 3., 10),'gamma':np.logspace(-3., 3., 10)}
# np.logspace(-3., 3., 10) is a commonly used search range for C and gamma -- it's just a sensible default, not a strict rule
gsL = GridSearchCV(SVC(),param)
gsL.fit(X_train, y_train)
In [ ]:
plot_decision_function(gsL.best_estimator_, df[[1,2]].values,df[13].values)
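Before reading too much into the plotted boundary, it's worth checking which C/gamma combination the grid search settled on (a sketch; best_params_ and best_score_ are standard GridSearchCV attributes):
In [ ]:
print 'best params:', gsL.best_params_
print 'best CV score:', gsL.best_score_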