Chapter 04
In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [36]:
weekly_file_path = '../data/Weekly.csv'
weekly = pd.read_csv(weekly_file_path, index_col=0)
weekly.head()
Out[36]:
In [37]:
weekly.describe()
Out[37]:
In [16]:
from pandas.plotting import scatter_matrix
weekly_refine = weekly[['Lag1','Lag2','Lag3','Lag4','Lag5','Volume','Today','Direction']]
# code the response numerically: Up = 1, Down = 0
weekly_refine = weekly_refine.replace({'Up': 1, 'Down': 0})
scatter_matrix(weekly_refine, alpha=0.5, diagonal='kde', figsize=(15, 15));
In [17]:
weekly_refine.corr()
Out[17]:
In [29]:
from sklearn.linear_model import LogisticRegression
X = weekly_refine[['Lag1','Lag2','Lag3','Lag4','Lag5','Volume']].values
y = weekly_refine['Direction'].values  # keep y 1-D, as scikit-learn expects
lg = LogisticRegression()
lg.fit(X, y)
Out[29]:
In [31]:
print(lg.coef_,lg.intercept_)
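scikit-learn's LogisticRegression does not report standard errors or p-values, so to judge which lags appear statistically significant one could refit the same model with statsmodels. A minimal sketch, assuming statsmodels is installed (its unregularized MLE fit may differ slightly from scikit-learn's default L2-regularized one):
In [ ]:
# Hedged sketch: refit the full logistic regression with statsmodels to get p-values.
import statsmodels.api as sm
X_sm = sm.add_constant(X)            # add an intercept column
logit_res = sm.Logit(y, X_sm).fit()  # y must be coded 0/1, as above
print(logit_res.summary())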
In [33]:
from sklearn.metrics import confusion_matrix,accuracy_score
pred = lg.predict(X)
confusion_matrix(y,pred)
Out[33]:
In [35]:
print(accuracy_score(y,pred))
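For context, this in-sample accuracy can be compared with the naive baseline of always predicting "Up"; a quick check:
In [ ]:
# Fraction of weeks coded 1 ("Up"): the accuracy of a classifier that always predicts Up.
print(np.mean(y == 1))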
In [42]:
df_train = weekly[weekly['Year'].isin(range(1990,2009))]
df_test = weekly[weekly['Year'].isin(range(2009,2011))]
# training data
X_train = df_train['Lag2'].values
X_train = X_train.reshape((len(X_train),1))
y_train = df_train['Direction'].values
# test data
X_test = df_test['Lag2'].values
X_test = X_test.reshape((len(X_test),1))
y_test = df_test['Direction'].values
# lg
lg = LogisticRegression()
lg.fit(X_train, y_train)
Out[42]:
In [43]:
pred_test = lg.predict(X_test)
print(confusion_matrix(y_test, pred_test))
In [44]:
print(accuracy_score(y_test,pred_test))
In [45]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train,y_train)
Out[45]:
In [46]:
pred_test = lda.predict(X_test)
print(confusion_matrix(y_test, pred_test))
In [47]:
print(accuracy_score(y_test,pred_test))
In [48]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
Out[48]:
In [49]:
pred_test = qda.predict(X_test)
print(confusion_matrix(y_test, pred_test))
In [50]:
print(accuracy_score(y_test, pred_test))
In [54]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
Out[54]:
In [55]:
pred_test = knn.predict(X_test)
print(confusion_matrix(y_test, pred_test))
In [56]:
print(accuracy_score(y_test, pred_test))
In [57]:
auto_file_path = '../data/Auto'
autos = pd.read_table(auto_file_path, sep=r'\s+')
# '?' marks missing horsepower values; drop those rows and cast to float
autos = autos.replace('?', np.nan).dropna()
autos['horsepower'] = autos['horsepower'].astype('float')
autos.head()
Out[57]:
In [59]:
mpgs = autos['mpg'].values
mpg_med = np.median(mpgs)
mpg0 = [1 if mpg > mpg_med else 0 for mpg in mpgs]
autos['mpg0'] = mpg0
autos.head()
Out[59]:
In [62]:
scatter_matrix(autos[['cylinders','displacement','horsepower','weight','acceleration','mpg0']],
               figsize=(15, 15));
In [64]:
autos[['cylinders','displacement','horsepower','weight','acceleration','mpg0']].boxplot(by='mpg0');
In [81]:
# split by model year: even years for training, odd years for testing
autos_train = autos[autos['year'] % 2 == 0]
autos_test = autos[autos['year'] % 2 != 0]
variables = ['cylinders','weight','displacement','horsepower']
X_train = autos_train[variables].values
y_train = autos_train['mpg0'].values
X_test = autos_test[variables].values
y_test = autos_test['mpg0'].values
In [82]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
Out[82]:
In [83]:
pred = lda.predict(X_test)
print(accuracy_score(y_test, pred))
In [84]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
Out[84]:
In [85]:
pred = qda.predict(X_test)
print(accuracy_score(y_test, pred))
In [86]:
lg = LogisticRegression()
lg.fit(X_train, y_train)
Out[86]:
In [87]:
pred = lg.predict(X_test)
print(accuracy_score(y_test, pred))
In [90]:
ks = [1, 3, 5, 7, 9, 11, 13, 15]
accur = {}
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    accur[k] = accuracy_score(y_test, pred)
for k, v in accur.items():
    print('k is %d :' % k, 'accuracy is %f' % v)
When $k$ equals $3$, the KNN classifier achieves the highest test accuracy, $0.8625$.
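Because the best $k$ above is chosen on the test set, a less biased alternative is to pick $k$ by cross-validation on the training data alone; a minimal sketch using scikit-learn's cross_val_score:
In [ ]:
# Hedged sketch: choose k by 5-fold cross-validation on the training set only.
from sklearn.model_selection import cross_val_score
cv_scores = {}
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    cv_scores[k] = cross_val_score(knn, X_train, y_train, cv=5).mean()
best_k = max(cv_scores, key=cv_scores.get)
print('best k by CV:', best_k)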
In [91]:
def Power():
    return 2 * 2 * 2

Power()
Out[91]:
In [95]:
def Power2(x, a):
    res = 1
    while a >= 1:
        res *= x
        a -= 1
    return res

Power2(3, 8)
Out[95]:
In [96]:
Power2(10,3)
Out[96]:
In [97]:
Power2(8,17)
Out[97]:
In [98]:
Power2(131,3)
Out[98]:
In [99]:
x = range(1,11,1)
y = [Power2(item,2) for item in x]
plt.plot(x,y)
plt.show()
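The plot above can be wrapped in a reusable helper; a sketch along those lines (the name PlotPower and the log-scale y-axis are assumptions, in the spirit of the exercise):
In [ ]:
# Hedged sketch: a reusable wrapper around the plot above.
def PlotPower(xs, a):
    ys = [Power2(x, a) for x in xs]
    plt.plot(list(xs), ys)
    plt.xlabel('x')
    plt.ylabel('x^%d' % a)
    plt.yscale('log')  # assumption: log scale makes the growth easier to read
    plt.show()

PlotPower(range(1, 11), 3)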
In [100]:
boston_file_name = '../data/Boston.csv'
bostons = pd.read_csv(boston_file_name, index_col=0)
bostons.head()
Out[100]:
In [107]:
crims = bostons['crim'].values
crims_med = np.median(crims)
# 1 if the crime rate is above the median, 0 otherwise
crim_status = [1 if crim > crims_med else 0 for crim in crims]
bostons['crim_status'] = crim_status
In [112]:
X = bostons[['dis']].values
y = bostons['crim_status'].values
lg = LogisticRegression()
lg.fit(X,y)
Out[112]:
In [113]:
print(lg.coef_,lg.intercept_)
The logistic regression coefficient on dis is $-0.95466145$, which is negative: greater distance is associated with a lower probability of high crime. For each one-unit increase in distance, the log-odds of high crime, $\log\left(\frac{P(\text{high crime})}{P(\text{low crime})}\right)$, decreases by about $0.95$.
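On the odds scale this corresponds to multiplying the odds of high crime by roughly $e^{-0.95} \approx 0.39$ per unit of distance; a quick check:
In [ ]:
# Multiplicative change in the odds of high crime per one-unit increase in dis.
print(np.exp(lg.coef_))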