In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline
In [5]:
# Load the Smarket data set from the local data directory into a DataFrame.
SMARKET_CSV = 'data/Smarket.csv'
smarket_df = pd.read_csv(SMARKET_CSV)
In [3]:
smarket_df.head()
# Market data drawn from the S&P 500 stocks.
# Each daily row records the percentage returns of the five previous trading days, the trading volume, and today's percentage return.
Out[3]:
In [12]:
# Pairwise scatter-plot matrix of the columns in smarket_df.
# `pd.tools.plotting` no longer exists in current pandas; the function is
# exposed as `pd.plotting.scatter_matrix`.
axes = pd.plotting.scatter_matrix(smarket_df, color="brown")
In [3]:
import statsmodels.api as sm
In [36]:
# Design matrix for the full model: all five lags plus volume, with a
# constant column added so that the statsmodels fit includes an intercept.
feature_cols = ['Lag1','Lag2','Lag3','Lag4','Lag5','Volume']
X = sm.add_constant(smarket_df[feature_cols].values)
In [37]:
# Encode the Direction labels as integer codes.
# pd.factorize numbers the labels in order of first appearance, so which
# label becomes 0 depends on the first row of the data.
y, _ = pd.factorize(smarket_df['Direction'])
In [33]:
# Interactive documentation lookup for pd.factorize (reference only; no effect on the analysis).
help(pd.factorize)
In [41]:
# Logistic regression through statsmodels' Logit, which fits the data by
# maximum likelihood estimation.
logit_model = sm.Logit(y, X)
lgt = logit_model.fit()
lgt.summary()
Out[41]:
In [45]:
p=lgt.predict(X) # predict: fitted probabilities for the same rows the model was fitted on
In [48]:
# `p` is the fitted probability of the class encoded as 1. pd.factorize
# numbers labels in order of first appearance, so code 1 is not necessarily
# "Up"; recover the label order instead of hard-coding it.
_, direction_labels = pd.factorize(smarket_df['Direction'])
ypreds = [direction_labels[1] if prob > 0.5 else direction_labels[0] for prob in p]
# Actual labels as plain strings, for comparison with the predictions.
yact = [str(i) for i in smarket_df['Direction'].values]
# Preview the first few predictions (must be the last expression to be displayed).
ypreds[0:5]
In [52]:
# Fraction of rows whose predicted label matches the actual label (in-sample).
accuracy_score(yact,ypreds)
Out[52]:
In [58]:
# Revise the feature set: drop the predictors whose p-values were too large
# and refit the logistic regression on the remaining ones.
reduced_cols = ['Lag1','Lag2','Volume']
X1 = sm.add_constant(smarket_df[reduced_cols].values)
lgt2 = sm.Logit(y, X1).fit()
lgt2.summary()
Out[58]:
In [69]:
p2 = lgt2.predict(X1)  # predict: fitted probability of the class encoded as 1
# pd.factorize numbers labels in order of first appearance, so code 1 is not
# necessarily "Up"; map probabilities to labels through the actual label order.
_, direction_labels = pd.factorize(smarket_df['Direction'])
ypreds2 = [direction_labels[1] if prob > 0.5 else direction_labels[0] for prob in p2]
In [70]:
# In-sample accuracy of the reduced statsmodels model.
accuracy_score(yact,ypreds2)
Out[70]:
In [72]:
# Predict with scikit-learn; its fitting algorithm differs from the
# statsmodels implementation.
# Predictors are selected by position: every column except the first one and
# the last two (presumably Lag1..Lag5 and Volume -- confirm against the CSV header).
predictor_cols = smarket_df.columns[1:-2]
X3 = smarket_df[predictor_cols]
clf = LogisticRegression()
In [76]:
# Fit the scikit-learn model on the whole data set, then display the fitted
# intercept and coefficient array.
clf.fit(X3, y)
(clf.intercept_, clf.coef_)
Out[76]:
In [78]:
# predict_proba returns one column per class, ordered by class code, so
# column 0 is the probability of code 0. Which label code 0 stands for
# depends on pd.factorize's first-appearance ordering, so look the labels up
# rather than assuming code 0 is "Up".
_, direction_labels = pd.factorize(smarket_df['Direction'])
probs3 = clf.predict_proba(X3)
ypreds3 = [direction_labels[0] if prob[0] > 0.5 else direction_labels[1] for prob in probs3]
accuracy_score(yact, ypreds3)
Out[78]:
In [6]:
# Split into a training set and a test set by calendar year:
# rows before 2005 are used for training, rows from 2005 onwards for testing.
year = smarket_df['Year']
train_df = smarket_df.loc[year < 2005]
test_df = smarket_df.loc[year >= 2005]
In [26]:
# Fit a logistic regression on the training rows only.
clf2 = LogisticRegression()
# Same positional predictor selection as for the full data set
# (every column except the first one and the last two).
Xtrain = train_df[smarket_df.columns[1:-2]]
# Integer-encode the training labels (codes follow order of first appearance).
ytrain = pd.factorize(train_df['Direction'])[0]
# The original call passed the misspelled name `ytranin`, which raises NameError.
clf2.fit(Xtrain, ytrain)
Out[26]:
In [17]:
# Build the held-out predictors and labels, then predict with the model
# fitted on the training rows.
Xtest = test_df[smarket_df.columns[1:-2]]
# Encode the test labels with the SAME label -> code mapping as the training
# labels. Factorizing the test subset on its own numbers labels by their
# order of first appearance in that subset, which can swap the meaning of
# 0 and 1 relative to ytrain and corrupt every metric computed on ytest.
_, train_labels = pd.factorize(train_df['Direction'])
ytest = train_labels.get_indexer(test_df['Direction'])  # -1 would mark a label unseen in training
ypred = clf2.predict(Xtest)
In [34]:
from sklearn.metrics import confusion_matrix
# Interactive documentation lookup (reference only; no effect on the analysis).
help(confusion_matrix)
In [22]:
# Confusion matrix of actual (first argument) versus predicted (second argument) test labels.
confusion_matrix(ytest,ypred)
Out[22]:
In [23]:
# Accuracy of the logistic regression on the held-out test rows.
accuracy_score(ytest,ypred)
Out[23]:
In [24]:
# `sklearn.lda` was removed from scikit-learn; the estimator now lives in
# `sklearn.discriminant_analysis`. Keep the short alias `LDA` so later cells
# work unchanged, and fall back to the old module on very old installations.
try:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
except ImportError:  # scikit-learn releases that predate sklearn.discriminant_analysis
    from sklearn.lda import LDA
In [27]:
# Linear discriminant analysis: fit on the training rows, then predict the
# held-out rows (this overwrites the earlier `ypred`).
clf3 = LDA().fit(Xtrain, ytrain)
ypred = clf3.predict(Xtest)
In [28]:
# Confusion matrix for the LDA predictions on the test rows.
confusion_matrix(ytest,ypred)
Out[28]:
In [29]:
# Accuracy of the LDA predictions on the test rows.
accuracy_score(ytest,ypred)
Out[29]:
In [44]:
yprob=clf3.predict_proba(Xtest)
# Returns the probability of each class
In [29]:
# Two-predictor data set built from ALL rows (no train/test split here).
# NOTE: this rebinds `X`, `y` and `ytrain`, replacing the arrays used by the
# earlier cells.
X = smarket_df[['Lag1', 'Lag2']]
y = smarket_df['Direction']
ytrain, _ = pd.factorize(y)
In [19]:
# `sklearn.qda` was removed from scikit-learn; the estimator now lives in
# `sklearn.discriminant_analysis`. Keep the short alias `QDA` so later cells
# work unchanged, and fall back to the old module on very old installations.
try:
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
except ImportError:  # scikit-learn releases that predate sklearn.discriminant_analysis
    from sklearn.qda import QDA
In [31]:
# Quadratic discriminant analysis classifier with default settings.
clf4 = QDA()
In [36]:
# Fit QDA on the two-predictor data and evaluate it on those same rows,
# so the confusion matrix below is an in-sample result.
clf4.fit(X, ytrain)
ypred = clf4.predict(X)
confusion_matrix(y_true=ytrain, y_pred=ypred)
Out[36]:
In [37]:
# In-sample accuracy of QDA (predictions were made on the fitting data).
accuracy_score(ytrain,ypred)
Out[37]:
In [38]:
# Per-class means of the predictors as estimated by the fitted QDA model.
clf4.means_
Out[38]:
In [43]:
from sklearn.neighbors import KNeighborsClassifier
In [44]:
# k-nearest-neighbours classifier with default settings, fitted on the
# two-predictor data built from all rows.
clf5 = KNeighborsClassifier()
clf5.fit(X,ytrain)
Out[44]:
In [46]:
# Predict on the same rows the KNN model was fitted on (overwrites the earlier `ypred`).
ypred = clf5.predict(X)
In [47]:
# NOTE: in-sample accuracy -- the model is scored on the rows it was fitted on,
# so this is not an estimate of out-of-sample performance.
accuracy_score(ytrain,ypred)
Out[47]:
In [ ]: