In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
# enable inline plots
%matplotlib inline
In [3]:
# Show all columns in notebook output
pd.set_option('display.max_columns', None)
In [4]:
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 100)
In [5]:
pd.set_option('display.precision', 3)
In [6]:
# note: 'display.mpl_style' was deprecated and later removed from pandas;
# newer code would use plt.style.use('ggplot') or similar instead
pd.set_option('display.mpl_style', 'default')
In [7]:
# anonymized attributes A1-A16 (A2 renamed to Age); '?' marks missing values
columns = ['A1', 'Age', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
df = pd.read_csv('../data/hw2_data.data', names = columns)
In [8]:
df.replace('?', np.nan, inplace = True)
In [9]:
df.A16.replace({'+' : 1, '-' : 0}, inplace = True)
In [10]:
df.A1.describe()
Out[10]:
In [11]:
from scipy import stats
# Fill missing A1 values by sampling in proportion to the observed category
# frequencies: build a discrete distribution over the categories, draw one
# sample per missing row, then map the drawn indices back to labels
counts = df.A1.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                 counts / counts.sum()))
fill_idxs = dist.rvs(size=df.shape[0] - df.A1.count())
df.loc[df.A1.isnull(), "A1"] = counts.iloc[fill_idxs].index.values
In [12]:
# sanity check: should now return an empty frame
df[df.A1.isnull()]
Out[12]:
In [13]:
#a_count = df.A1[df.A1 == 'a'].count()
#b_count = df.A1[df.A1 == 'b'].count()
#float(b_count)/(a_count + b_count)
In [14]:
# How to do it manually: fill categorical values in proportion to the
# observed percentages (~31% 'a', ~69% 'b' per the ratio computed above);
# a runnable version follows below
# A1_fill = np.random.choice(('a', 'b'), size=12, p=(.31, .69))
# df.loc[df.A1.isnull(), "A1"] = A1_fill
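A runnable version of that manual approach, sketched with the proportions derived from the data rather than hard-coded (n_missing is 0 at this point, since A1 was already filled above):
# illustrative sketch, not part of the original run: proportional fill via np.random.choice
probs = df.A1.value_counts(normalize=True)
n_missing = df.A1.isnull().sum()
df.loc[df.A1.isnull(), 'A1'] = np.random.choice(probs.index, size=n_missing, p=probs.values)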
In [15]:
df.A1.describe()
Out[15]:
In [16]:
# Age was parsed as object because of the '?' placeholders; convert to float
df.Age = df.Age.astype(float)
In [17]:
print 'Mean Column:', df.Age.mean()
print 'Std Column:', df.Age.std()
In [18]:
# draw n fill values from a normal fitted to the observed Age distribution
def get_age_impute_values(n):
    return np.random.normal(df.Age.mean(), df.Age.std(), n)
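An added caveat, not from the original: normal draws can be negative, which is impossible for Age; a clipped variant if that matters:
# added sketch: floor the draws at zero so no negative ages get imputed
def get_age_impute_values_clipped(n):
    return np.clip(np.random.normal(df.Age.mean(), df.Age.std(), n), 0, None)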
In [19]:
# fill however many ages are missing, rather than hard-coding the count
df.loc[df.Age.isnull(), 'Age'] = get_age_impute_values(df.Age.isnull().sum())
In [20]:
# A4: drop the rows with missing values instead of imputing
df.dropna(subset=['A4'], how='all', inplace = True)
In [21]:
# A6, A7, A14: same proportional-sampling fill as A1, factored into a helper
def impute_categorical(col):
    counts = df[col].value_counts()
    dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                     counts / counts.sum()))
    fill_idxs = dist.rvs(size=df.shape[0] - df[col].count())
    df.loc[df[col].isnull(), col] = counts.iloc[fill_idxs].index.values

for col in ['A6', 'A7', 'A14']:
    impute_categorical(col)
In [22]:
# all gaps now filled by sampling in proportion to observed frequencies
df.info()
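A quick added check that the imputation left no missing values:
df.isnull().sum().sum()  # expected to be 0 after the fills above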
In [23]:
df.Age.hist(bins = 50)
Out[23]:
In [24]:
df.A1.value_counts().plot(kind = 'bar') #gender?
Out[24]:
In [25]:
df.A4.value_counts().plot(kind = 'bar')
Out[25]:
In [26]:
df.A5.value_counts().plot(kind = 'bar')
Out[26]:
In [27]:
df.A6.value_counts().plot(kind = 'bar')
Out[27]:
In [28]:
df.A7.value_counts().plot(kind = 'bar')
Out[28]:
In [29]:
df.A9.value_counts().plot(kind = 'bar')
Out[29]:
In [30]:
df.A10.value_counts().plot(kind = 'bar')
Out[30]:
In [31]:
df.A12.value_counts().plot(kind = 'bar')
Out[31]:
In [32]:
df.A13.value_counts().plot(kind = 'bar')
Out[32]:
In [33]:
df.A14.value_counts().plot(kind = 'bar')
Out[33]:
In [34]:
# approval rate (mean of the 0/1 target A16) by A4 category
df.groupby(['A4'])['A16'].mean()
Out[34]:
In [35]:
df.info()
In [36]:
df.hist(bins = 50, figsize = (15, 15));
In [37]:
df.corr()
Out[37]:
In [38]:
# one-hot encode the object columns; drop the numeric target A16 from the features
dummy = pd.get_dummies(df)
dummy.drop('A16', axis = 1, inplace = True)
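For reference, a toy illustration (not part of the analysis) of how get_dummies expands object columns into 0/1 indicators:
# added toy example: each category becomes its own indicator column
toy = pd.DataFrame({'A1': ['a', 'b', 'a'], 'A9': ['t', 'f', 't']})
pd.get_dummies(toy)  # -> indicator columns A1_a, A1_b, A9_f, A9_t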
In [39]:
dummy.info()
In [40]:
Target = df.A16.values
In [41]:
Features = dummy
In [42]:
# note: sklearn.cross_validation was deprecated in scikit-learn 0.18;
# newer versions provide train_test_split in sklearn.model_selection
from sklearn.cross_validation import train_test_split
In [43]:
X_train, X_test, y_train, y_test = train_test_split(Features, Target, random_state=3, test_size=0.2)
In [44]:
from sklearn.linear_model import LogisticRegression
In [45]:
clf = LogisticRegression()
In [46]:
clf.fit(X_train, y_train)
Out[46]:
In [47]:
clf.score(X_test, y_test)
Out[47]:
In [48]:
from sklearn.metrics import confusion_matrix, classification_report
In [49]:
y_pred = clf.predict(X_test)
In [50]:
confusion_matrix(y_test, y_pred)
Out[50]:
In [51]:
print classification_report(y_test, y_pred)
In [52]:
# coefficients of features in dummy; ravel() flattens the (1, n_features)
# coef_ array so each feature name pairs with a scalar coefficient
coef = pd.DataFrame(list(zip(dummy.columns, clf.coef_.ravel())))
coef.head()
Out[52]:
In [53]:
# renaming columns
coef.rename(columns={0 : 'features', 1 : 'coef'}, inplace = True)
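An added convenience view, ranking the features by absolute coefficient size:
# added sketch: reorder rows by |coefficient|, largest first
coef.reindex(coef['coef'].abs().sort_values(ascending=False).index).head(10)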
In [54]:
coef[coef['coef'].abs() > 1.66]  # features with the largest |coefficient|
Out[54]:
In [55]:
# separating the low-value coefficients (|coef| < .5)
features_0 = coef.features[coef['coef'].abs() < .5].values
features_0
Out[55]:
In [56]:
# dropping the low-value coefficient features
dummy_0 = dummy.drop(features_0, axis = 1)
In [57]:
dummy_0.info()
In [58]:
Features_0 = dummy_0
In [59]:
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(Features_0, Target, random_state=3, test_size=0.2)
In [60]:
clf.fit(X_train_0, y_train_0)
Out[60]:
In [61]:
clf.score(X_test_0, y_test_0)
Out[61]:
In [62]:
# the score drops without them, so even the low-coefficient features seem to help
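A sketch of a sturdier comparison than a single holdout split, using cross_val_score from the same (old) cross_validation module; added here, not run in the original:
# added sketch: 5-fold CV accuracy for the full vs. reduced feature sets
from sklearn.cross_validation import cross_val_score
print cross_val_score(LogisticRegression(), Features, Target, cv=5).mean()
print cross_val_score(LogisticRegression(), Features_0, Target, cv=5).mean()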
In [63]:
# re-imports (likely a restarted session) plus the SVM and grid-search tooling;
# note: cross_validation and grid_search moved to sklearn.model_selection
# in scikit-learn 0.18+
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
In [64]:
# a 70/30 split; note the cells below continue to use the original X_train/X_test
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(Features_0, Target, random_state=3, test_size = 0.3)
In [65]:
est = LinearSVC(C=1e+1)
In [66]:
est.fit(X_train, y_train)
Out[66]:
In [67]:
est.score(X_test, y_test)
Out[67]:
In [68]:
# candidate regularization strengths: 10 values log-spaced from 1e-3 to 1e3
d = {}
d['C'] = np.logspace(-3, 3, 10)
d['C']
Out[68]:
In [69]:
%%time
gs = GridSearchCV(LinearSVC(), d)
gs.fit(X_train, y_train)
In [70]:
gs.best_params_,gs.best_score_
Out[70]:
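To see how the score varies across the C grid (an added inspection; grid_scores_ is the attribute name in the old sklearn.grid_search module used here, renamed cv_results_ in newer versions):
# added sketch: per-parameter mean CV scores from the fitted grid search
for params, mean_score, scores in gs.grid_scores_:
    print params, round(mean_score, 3)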
In [71]:
gs.score(X_train, y_train)  # note: training-set accuracy, not the held-out score
Out[71]:
In [72]:
y_pred = gs.predict(X_test)
In [73]:
# hand check of the positive-class precision, TP/(TP + FP), from the confusion matrix below
66/(9+66.)
Out[73]:
In [74]:
confusion_matrix(y_test, y_pred)
Out[74]:
In [75]:
print classification_report(y_test, y_pred)
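An added sketch deriving the same metrics from the matrix instead of hand-typing cell values:
# compute precision/recall for the positive class directly from the matrix
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()  # scikit-learn layout: [[TN, FP], [FN, TP]]
print 'precision:', tp / float(tp + fp)
print 'recall:   ', tp / float(tp + fn)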
In [76]:
svc = SVC()
In [77]:
svc.fit(X_train, y_train)
Out[77]:
In [78]:
svc.score(X_train, y_train)
Out[78]:
In [79]:
y_pred = svc.predict(X_test)
In [80]:
confusion_matrix(y_test, y_pred)
Out[80]:
In [81]:
print classification_report(y_test, y_pred)
In [82]:
# same C grid, now for the (RBF-kernel) SVC
d = {}
d['C'] = np.logspace(-3, 3, 10)
In [83]:
%%time
gs = GridSearchCV(SVC(), d)
gs.fit(X_train, y_train)
In [84]:
gs.best_params_,gs.best_score_
Out[84]:
In [85]:
gs.score(X_train, y_train)
Out[85]:
In [86]:
y_pred = gs.predict(X_test)
In [87]:
confusion_matrix(y_test, y_pred)
Out[87]:
In [88]:
print classification_report(y_test, y_pred)
In [89]:
# joint grid over C and gamma for the RBF kernel (5 x 5 = 25 combinations)
param = {'C' : np.logspace(-3, 3, 5), 'gamma' : np.logspace(-3, 3, 5)}
gs = GridSearchCV(SVC(), param)
In [90]:
%%time
gs.fit(X_train, y_train)
Out[90]:
In [91]:
gs.score(X_train, y_train)
Out[91]:
In [92]:
y_pred = gs.predict(X_test)
In [93]:
confusion_matrix(y_test, y_pred)
Out[93]:
In [94]:
print classification_report(y_test, y_pred)
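One step this notebook skips that often matters for RBF SVMs is feature scaling; a hypothetical follow-up with StandardScaler, not run in the original:
# added sketch: standardize features before the RBF SVM, which is usually
# important for distance-based kernels
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train.astype(float))
X_test_s = scaler.transform(X_test.astype(float))
svc_scaled = SVC().fit(X_train_s, y_train)
print svc_scaled.score(X_test_s, y_test)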