In [26]:
%matplotlib inline
from sklearn import datasets
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale
import sklearn
print(sklearn.__version__)
In [27]:
boston = datasets.load_boston()
y = boston.target
X = boston.data
In [28]:
' '.join(dir(boston))
Out[28]:
In [29]:
boston['feature_names']
Out[29]:
In [30]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# The formula references the arrays directly, so data=boston is effectively unused.
results = smf.ols('boston.target ~ boston.data', data=boston).fit()
print(results.summary())
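A cleaner variant is to build a DataFrame with named columns, so the summary shows feature names instead of anonymous array slices; a minimal sketch (the column name MEDV for the target is an assumption):
In [ ]:
import pandas as pd
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df['MEDV'] = boston.target
formula = 'MEDV ~ ' + ' + '.join(boston.feature_names)
print(smf.ols(formula, data=boston_df).fit().summary())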
In [31]:
regr = linear_model.LinearRegression()
lm = regr.fit(boston.data, y)
In [32]:
lm.intercept_, lm.coef_, lm.score(boston.data, y)
Out[32]:
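For readability, the coefficients can be paired with the feature names (a quick sketch using the fitted lm from above):
In [ ]:
for name, coef in zip(boston.feature_names, lm.coef_):
    print('%10s %8.3f' % (name, coef))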
In [33]:
predicted = regr.predict(boston.data)
In [34]:
fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()
In [35]:
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in versions < 0.18
Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2, random_state=42)
In [36]:
regr = linear_model.LinearRegression()
lm = regr.fit(Xs_train, y_train)
In [37]:
lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)
Out[37]:
In [38]:
predicted = regr.predict(Xs_test)
In [39]:
fig, ax = plt.subplots()
ax.scatter(y_test, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()
In [43]:
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in versions < 0.18
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, boston.data , boston.target, cv = 3)
scores.mean()
Out[43]:
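An integer cv uses un-shuffled K-fold splits, which can hurt on ordered data like Boston; a sketch with an explicit shuffled KFold (module path assumes sklearn >= 0.18):
In [ ]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=0)
print(cross_val_score(linear_model.LinearRegression(), boston.data, boston.target, cv=kf).mean())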
In [47]:
data_X_scale = scale(boston.data)
# Use the scaled features (the original cell passed boston.data by mistake).
scores = cross_val_score(regr, data_X_scale, boston.target, cv = 7)
scores.mean()
Out[47]:
In [48]:
# Mean CV score as a function of the number of folds.
scores = [cross_val_score(regr, data_X_scale, boston.target, cv=i).mean() for i in range(3, 50)]
plt.plot(range(3, 50), scores, 'r-o')
plt.show()
In [52]:
import pandas as pd
df = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df = df.rename(columns={0: 'title', 1: 'link', 2: 'author', 3: 'author_page', 4: 'click', 5: 'reply', 6: 'time'})
df[:2]
Out[52]:
In [53]:
def randomSplit(dataX, dataY, num):
    """Randomly hold out `num` observations as a test set."""
    import random
    dataX_train, dataX_test = [], []
    dataY_train, dataY_test = [], []
    # Sample from the data itself rather than the global df.
    test_index = random.sample(range(len(dataX)), num)
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append([dataX[k]])  # wrap in a list: sklearn expects 2-D X
            dataY_test.append(dataY[k])
        else:
            dataX_train.append([dataX[k]])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test
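For reference, sklearn's train_test_split can do the same job as this helper; a sketch holding out an absolute number of rows (test_size accepts an int):
In [ ]:
from sklearn.model_selection import train_test_split
X = [[v] for v in np.log(df.click + 1)]  # reshape to 2-D, as randomSplit does
X_tr, X_te, y_tr, y_te = train_test_split(X, np.log(df.reply + 1), test_size=20, random_state=0)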
In [54]:
import numpy as np
# Log-transform clicks (X) and replies (y); +1 avoids log(0).
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1), np.log(df.reply+1), 20)
regr = linear_model.LinearRegression()
regr.fit(data_X_train, data_y_train)
print('Variance score: %.2f' % regr.score(data_X_test, data_y_test))
In [55]:
y_true, y_pred = data_y_test, regr.predict(data_X_test)
In [56]:
plt.scatter(y_pred, y_true, color='black')
plt.show()
In [57]:
plt.scatter(data_X_test, data_y_test, color='black')
plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)
plt.show()
In [58]:
print('Coefficients: \n', regr.coef_)
In [59]:
print "Residual sum of squares: %.2f" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2)
In [60]:
# Wrap each value in a list so sklearn sees a 2-D X; use proper column assignment.
# Note: despite the names, these are raw counts, not log-transformed values.
df['click_log'] = [[df.click[i]] for i in range(len(df))]
df['reply_log'] = [[df.reply[i]] for i in range(len(df))]
In [61]:
from sklearn.model_selection import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log, test_size=0.2, random_state=0)
regr = linear_model.LinearRegression()
regr.fit(Xs_train, y_train)
print('Variance score: %.2f' % regr.score(Xs_test, y_test))
In [62]:
plt.scatter(Xs_test, y_test, color='black')
plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)
plt.show()
In [63]:
from sklearn.model_selection import cross_val_score
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 3)
scores.mean()
Out[63]:
In [64]:
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 4)
scores.mean()
Out[64]:
In [65]:
repost = []
# Flag titles that contain "转载" (repost); Python 2 needed i.decode('utf8') here.
for i in df.title:
    if u'转载' in i:
        repost.append(1)
    else:
        repost.append(0)
In [66]:
data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]
data_X[:3]
Out[66]:
In [67]:
from sklearn.linear_model import LogisticRegression
df['repost'] = repost
model = LogisticRegression()
model.fit(data_X, df.repost)
model.score(data_X, df.repost)
Out[67]:
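Beyond accuracy, LogisticRegression exposes class probabilities; a quick sketch on the first three threads:
In [ ]:
print(model.predict_proba(data_X[:3]))  # columns follow model.classes_
print(model.classes_)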
In [68]:
def randomSplitLogistic(dataX, dataY, num):
    """Like randomSplit, but dataX rows are already lists (2-D)."""
    import random
    dataX_train, dataX_test = [], []
    dataY_train, dataY_test = [], []
    test_index = random.sample(range(len(dataX)), num)
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append(dataX[k])
            dataY_test.append(dataY[k])
        else:
            dataX_train.append(dataX[k])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test
In [69]:
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
log_regr = LogisticRegression()
log_regr.fit(data_X_train, data_y_train)
# score() returns mean accuracy for classifiers, not explained variance.
print('Accuracy: %.2f' % log_regr.score(data_X_test, data_y_test))
In [70]:
y_true, y_pred = data_y_test, log_regr.predict(data_X_test)
In [71]:
y_true, y_pred
Out[71]:
In [72]:
print(classification_report(y_true, y_pred))
In [73]:
from sklearn.model_selection import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, random_state=42)
In [74]:
log_regr = LogisticRegression()
log_regr.fit(Xs_train, y_train)
print('Accuracy: %.2f' % log_regr.score(Xs_test, y_test))
In [75]:
print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))
print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))
y_true, y_pred = y_test, log_regr.predict(Xs_test)
print(classification_report(y_true, y_pred))
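A confusion matrix complements the report above; a minimal sketch with sklearn.metrics:
In [ ]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true, y_pred))  # rows: true class, columns: predicted class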
In [76]:
logre = LogisticRegression()
scores = cross_val_score(logre, data_X, df.repost, cv = 3)
scores.mean()
Out[76]:
In [78]:
logre = LogisticRegression()
data_X_scale = scale(data_X)
scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)
scores.mean()
Out[78]:
In [81]:
from sklearn import naive_bayes
' '.join(dir(naive_bayes))
Out[81]:
In [82]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
In [83]:
x = np.array([[-3,7], [1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])
In [87]:
model = GaussianNB()
model.fit(x[:8], Y[:8])  # train on the first 8 points, hold out the last 4
predicted = model.predict([[1,2], [3,4]])
print(predicted)
In [88]:
model.score(x[8:], Y[8:])
Out[88]:
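GaussianNB also provides per-class probabilities; a sketch on the four held-out points:
In [ ]:
print(model.predict_proba(x[8:]))  # columns follow model.classes_
print(model.classes_)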
In [89]:
# Here the reply counts are treated as discrete class labels for the classifier.
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)
model.fit(data_X_train, data_y_train)
predicted = model.predict(data_X_test)
print(predicted)
In [90]:
model.score(data_X_test, data_y_test)
Out[90]:
In [91]:
from sklearn.model_selection import cross_val_score
model = GaussianNB()
scores = cross_val_score(model, [[c] for c in df.click], df.reply, cv = 5)
scores.mean()
Out[91]:
In [92]:
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='gini')
In [93]:
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train, data_y_train)
model.score(data_X_train, data_y_train)
Out[93]:
In [94]:
model.predict(data_X_test)
Out[94]:
In [95]:
scores = cross_val_score(model, data_X, df.repost, cv = 3)
scores.mean()
Out[95]:
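To inspect the learned rules, sklearn.tree.export_text (sklearn >= 0.21) prints the tree; a sketch, refitting on all rows and naming our two columns by hand:
In [ ]:
from sklearn.tree import export_text
model.fit(data_X, df.repost)
print(export_text(model, feature_names=['click', 'reply'], max_depth=2))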
In [96]:
from sklearn import svm
model = svm.SVC()
In [97]:
' '.join(dir(svm))
Out[97]:
In [98]:
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train, data_y_train)
model.score(data_X_train, data_y_train)
Out[98]:
In [99]:
model.predict(data_X_test)
Out[99]:
In [100]:
scores = []
cvs = [3, 5, 10, 25, 50, 75, 100]
for i in cvs:
    score = cross_val_score(model, data_X, df.repost, cv=i)
    scores.append(score.mean())  # try tuning cv
In [101]:
plt.plot(cvs, scores, 'b-o')
plt.xlabel('$cv$', fontsize = 20)
plt.ylabel('$Score$', fontsize = 20)
plt.show()
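Rather than varying cv, the SVC's own hyperparameters are usually what need tuning; a sketch with GridSearchCV (module path assumes sklearn >= 0.18; the grid values are illustrative):
In [ ]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
grid = GridSearchCV(svm.SVC(), param_grid, cv=3)
grid.fit(data_X, df.repost)
print(grid.best_params_, grid.best_score_)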
In [102]:
import numpy as np
from sklearn import tree
In [103]:
import pandas as pd
train = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_train.csv', sep = ",")
In [104]:
train.head()
Out[104]:
In [105]:
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
train["Embarked"] = train["Embarked"].fillna('S')
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
In [106]:
target = train['Survived'].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))
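Pairing the importances with the column names makes them easier to read (a sketch):
In [ ]:
for name, imp in zip(["Pclass", "Sex", "Age", "Fare"], my_tree_one.feature_importances_):
    print('%8s %.3f' % (name, imp))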
In [108]:
test = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_test.csv', sep = ",")
test.loc[152, "Fare"] = test["Fare"].median()  # the lone missing fare
test["Age"] = test["Age"].fillna(test["Age"].median())
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1
test["Embarked"] = test["Embarked"].fillna('S')
test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2
test_features = test[["Pclass","Sex", "Age", "Fare"]].values
my_prediction = my_tree_one.predict(test_features)
PassengerId =np.array(test['PassengerId']).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
In [109]:
print(my_solution[:3])
In [110]:
print(my_solution.shape)
In [112]:
my_solution.to_csv("/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_solution_one.csv", index_label = ["PassengerId"])
In [113]:
features_two = train[["Pclass","Age","Sex","Fare", "SibSp", "Parch", "Embarked"]].values
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=1)
my_tree_two = my_tree_two.fit(features_two, target)
print(my_tree_two.score(features_two, target))
In [114]:
train_two = train  # note: an alias, not a copy, so the new column also appears in train
train_two['family_size'] = train.SibSp + train.Parch + 1
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)
print(my_tree_three.score(features_three, target))
In [115]:
from sklearn.ensemble import RandomForestClassifier
features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
n_estimators = 100
forest = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=n_estimators, random_state=1)
my_forest = forest.fit(features_forest, target)
print(my_forest.score(features_forest, target))
test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)
print(len(test_features))
print(pred_forest[:3])
In [116]:
print(my_tree_two.feature_importances_)
print(my_forest.feature_importances_)
print(my_tree_two.score(features_two, target))
print(my_forest.score(features_two, target))
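The in-sample scores above flatter both models; a sketch comparing them with cross-validation instead (reusing cross_val_score from earlier):
In [ ]:
print(cross_val_score(my_tree_two, features_two, target, cv=5).mean())
print(cross_val_score(my_forest, features_two, target, cv=5).mean())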
In [ ]: