工作机制:
Chess. Here, the agent decides upon a series of moves depending on the state of the board (the environment), and the reward can be defined as win or lose at the end of the game:
In [37]:
%matplotlib inline
from sklearn import datasets
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale
import sklearn
print sklearn.__version__
In [116]:
# Load the Boston housing dataset bundled with scikit-learn
boston = datasets.load_boston()
# Keep the regression target and the feature matrix under short names
y = boston.target
X = boston.data
In [117]:
' '.join(dir(boston))
Out[117]:
In [118]:
boston['feature_names']
Out[118]:
In [238]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Fit an ordinary least squares model through the statsmodels formula API,
# regressing boston.target on boston.data (the formula applies no
# transformation to either side)
results = smf.ols('boston.target ~ boston.data', data=boston).fit()
# Print the summary of the fitted model
print results.summary()
In [119]:
# Fit a scikit-learn linear regression on the complete Boston dataset
regr = linear_model.LinearRegression()
lm = regr.fit(boston.data, y)
In [120]:
lm.intercept_, lm.coef_, lm.score(boston.data, y)
Out[120]:
In [121]:
predicted = regr.predict(boston.data)
In [122]:
# Scatter the predicted values against the measured targets
fig, ax = plt.subplots()
ax.scatter(y, predicted)
# Dashed diagonal spanning the range of y: a point on this line is
# predicted exactly
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()
In [190]:
boston.data
Out[190]:
In [188]:
# NOTE(review): sklearn.cross_validation is the legacy home of this helper;
# newer scikit-learn releases expose it from sklearn.model_selection --
# confirm against the installed version before upgrading.
from sklearn.cross_validation import train_test_split
# Hold out 20% of the rows as a test set, with a fixed random_state
Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data,
boston.target,
test_size=0.2,
random_state=42)
In [124]:
regr = linear_model.LinearRegression()
lm = regr.fit(Xs_train, y_train)
In [126]:
lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)
Out[126]:
In [127]:
predicted = regr.predict(Xs_test)
In [128]:
# Scatter the test-set predictions against the held-out targets
fig, ax = plt.subplots()
ax.scatter(y_test, predicted)
# The reference diagonal spans the range of the full target y, not only y_test
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()
In [175]:
from sklearn.cross_validation import cross_val_score
# 3-fold cross-validation of a fresh linear regression on the Boston data
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, boston.data , boston.target, cv = 3)
# Average the per-fold scores
scores.mean()
Out[175]:
In [178]:
# Mean cross-validation score for every fold count from 3 to 49.
# NOTE: data_X_scale is not assigned in this cell; it must already exist in
# the session (the cell numbered In [176] assigns it).
scores = [cross_val_score(regr, data_X_scale, boston.target, cv = int(i)).mean() for i in range(3, 50)]
# Plot the mean score against the number of folds
plt.plot(range(3, 50), scores,'r-o')
plt.show()
In [176]:
# Scale the Boston features with sklearn.preprocessing.scale
data_X_scale = scale(boston.data)
# NOTE(review): the cross-validation below runs on the unscaled boston.data,
# not on data_X_scale -- confirm whether that is intended.
scores = cross_val_score(regr, boston.data, boston.target, cv = 7)
scores.mean()
Out[176]:
In [207]:
import pandas as pd
# Read the tab-separated thread list; header=None means the file has no header row
df = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
# Give the seven positional columns descriptive names
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
# Preview the first two rows
df[:2]
Out[207]:
In [45]:
# The purpose of this function is to let the reader see that
# drawing a different random sample gives a completely different result.
def randomSplit(dataX, dataY, num):
    """Randomly split paired data into training and test parts.

    ``num`` distinct positions are drawn at random and become the test set;
    every other position goes to the training set. Each X value is wrapped
    in a one-element list, so the X outputs are lists of single-feature rows.

    Args:
        dataX: indexable feature values, read as ``dataX[k]`` for ``k`` in
            ``range(len(dataX))``.
        dataY: indexable targets aligned with ``dataX``.
        num: number of observations to place in the test set.

    Returns:
        A tuple ``(dataX_train, dataX_test, dataY_train, dataY_test)`` of lists.

    Raises:
        ValueError: from ``random.sample`` when ``num`` is negative or larger
            than ``len(dataX)``.
    """
    import random
    # Sample positions of dataX itself (not of the global df) so the split is
    # correct for any input; a set keeps the membership test below O(1).
    test_index = set(random.sample(range(len(dataX)), num))
    dataX_train, dataX_test = [], []
    dataY_train, dataY_test = [], []
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append([dataX[k]])
            dataY_test.append(dataY[k])
        else:
            dataX_train.append([dataX[k]])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test
In [46]:
import numpy as np
# Use only one feature
# NOTE(review): data_X is assigned here but not referenced again in this cell.
data_X = df.reply
# Split the data into training/testing sets: log(click + 1) is the predictor,
# log(reply + 1) is the response, and 20 rows are held out for testing
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1),
np.log(df.reply+1), 20)
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(data_X_train, data_y_train)
# Explained variance score: 1 is perfect prediction
print'Variance score: %.2f' % regr.score(data_X_test, data_y_test)
In [48]:
y_true, y_pred = data_y_test, regr.predict(data_X_test)
In [51]:
plt.scatter(y_pred, y_true, color='black')
plt.show()
In [23]:
# Plot outputs: held-out observations in black, fitted line in blue
plt.scatter(data_X_test, data_y_test, color='black')
plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)
plt.show()
In [24]:
# The coefficients
print 'Coefficients: \n', regr.coef_
In [25]:
# The mean squared error of the predictions on the test set. The value is
# np.mean of the squared residuals, so it is labelled as an MSE rather than
# as a residual sum of squares.
print("Mean squared error: %.2f" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2))
In [217]:
# Wrap every click / reply value in a one-element list (one row per thread).
# NOTE(review): despite the *_log names no logarithm is applied here, and the
# lists are stored by attribute assignment on df -- confirm both are intended.
df.click_log = [[df.click[i]] for i in range(len(df))]
df.reply_log = [[df.reply[i]] for i in range(len(df))]
In [236]:
from sklearn.cross_validation import train_test_split
# Hold out 20% of the click/reply rows for testing, with a fixed random_state
Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log,test_size=0.2, random_state=0)
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(Xs_train, y_train)
# Explained variance score: 1 is perfect prediction
print'Variance score: %.2f' % regr.score(Xs_test, y_test)
In [237]:
# Plot outputs: held-out observations in black, fitted line in blue
plt.scatter(Xs_test, y_test, color='black')
plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)
plt.show()
In [226]:
from sklearn.cross_validation import cross_val_score
# 3-fold cross-validation of a linear regression of reply_log on click_log
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 3)
# Average the per-fold scores
scores.mean()
Out[226]:
In [228]:
# Same cross-validation as the 3-fold run, but with 4 folds
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 4)
scores.mean()
Out[228]:
In [81]:
# Flag every thread whose title contains the marker u'转载' ("repost"):
# 1 when present, 0 otherwise. Each title is decoded from UTF-8 before the
# substring test, exactly as in the original loop (Python 2 byte strings).
repost = [1 if u'转载' in title.decode('utf8') else 0 for title in df.title]
In [82]:
# Two-feature rows [click, reply], one per thread
data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]
# Preview the first three rows
data_X[:3]
Out[82]:
In [83]:
from sklearn.linear_model import LogisticRegression
# Store the repost flags as a column of df
df['repost'] = repost
# Fit a logistic regression on all rows and score it on those same rows
model = LogisticRegression()
model.fit(data_X,df.repost)
model.score(data_X,df.repost)
Out[83]:
In [84]:
def randomSplitLogistic(dataX, dataY, num):
    """Randomly split paired data into training and test parts.

    ``num`` distinct positions are drawn at random and become the test set;
    every other position goes to the training set. Unlike ``randomSplit``,
    the X rows are appended unchanged (they are not wrapped in a list).

    Args:
        dataX: indexable feature rows, read as ``dataX[k]`` for ``k`` in
            ``range(len(dataX))``.
        dataY: indexable labels aligned with ``dataX``.
        num: number of observations to place in the test set.

    Returns:
        A tuple ``(dataX_train, dataX_test, dataY_train, dataY_test)`` of lists.

    Raises:
        ValueError: from ``random.sample`` when ``num`` is negative or larger
            than ``len(dataX)``.
    """
    import random
    # Sample positions of dataX itself (not of the global df) so the split is
    # correct for any input; a set keeps the membership test below O(1).
    test_index = set(random.sample(range(len(dataX)), num))
    dataX_train, dataX_test = [], []
    dataY_train, dataY_test = [], []
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append(dataX[k])
            dataY_test.append(dataY[k])
        else:
            dataX_train.append(dataX[k])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test
In [102]:
# Split the data into training/testing sets (20 rows are held out for testing)
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
# Create logistic regression object
log_regr = LogisticRegression()
# Train the model using the training sets
log_regr.fit(data_X_train, data_y_train)
# For a classifier, score() is the mean accuracy on the given data (1 means
# every held-out row is classified correctly), not an explained variance,
# so the printed label says so.
print('Accuracy: %.2f' % log_regr.score(data_X_test, data_y_test))
In [86]:
y_true, y_pred = data_y_test, log_regr.predict(data_X_test)
In [87]:
y_true, y_pred
Out[87]:
In [88]:
print(classification_report(y_true, y_pred))
In [94]:
from sklearn.cross_validation import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, random_state=42)
In [95]:
# Create logistic regression object
log_regr = LogisticRegression()
# Train the model using the training sets
log_regr.fit(Xs_train, y_train)
# For a classifier, score() is the mean accuracy on the given data (1 means
# every held-out row is classified correctly), not an explained variance,
# so the printed label says so.
print('Accuracy: %.2f' % log_regr.score(Xs_test, y_test))
In [96]:
# Compare the score on the held-out rows with the score on the training rows
print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))
print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))
# Per-class report for the test-set predictions
y_true, y_pred = y_test, log_regr.predict(Xs_test)
print(classification_report(y_true, y_pred))
In [97]:
# 3-fold cross-validation of a logistic regression on the raw features
logre = LogisticRegression()
scores = cross_val_score(logre, data_X, df.repost, cv = 3)
scores.mean()
Out[97]:
In [98]:
logre = LogisticRegression()
# Scale the features before cross-validating
data_X_scale = scale(data_X)
# The importance of preprocessing in data science and the machine learning pipeline I:
# 3-fold cross-validation, this time on the scaled features
scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)
scores.mean()
Out[98]:
It is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors.
In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature.
Why is it known as ‘Naive’? For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. Even if these features depend on each other or upon the existence of the other features, the classifier treats all of these properties as contributing independently to the probability that this fruit is an apple.
贝叶斯定理为使用$p(c)$, $p(x)$, $p(x|c)$ 计算后验概率$P(c|x)$提供了方法:
Step 1: Convert the data set into a frequency table
Step 2: Create Likelihood table by finding the probabilities like:
Step 3: Now, use Naive Bayesian equation to calculate the posterior probability for each class. The class with the highest posterior probability is the outcome of prediction.
We can solve it using above discussed method of posterior probability.
$P(Yes | Sunny) = \frac{P( Sunny | Yes) * P(Yes) } {P (Sunny)}$
Here we have P (Sunny |Yes) = 3/9 = 0.33, P(Sunny) = 5/14 = 0.36, P( Yes)= 9/14 = 0.64
Now, $P (Yes | Sunny) = \frac{0.33 * 0.64}{0.36} = 0.60$, which is higher than $P (No | Sunny) = 1 - 0.60 = 0.40$, so the predicted class is Yes.
In [17]:
from sklearn import naive_bayes
' '.join(dir(naive_bayes))
Out[17]:
In [29]:
#Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import numpy as np
#assigning predictor and target variables:
# 12 two-feature samples and their class labels (3 or 4)
x= np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])
In [30]:
#Create a Gaussian Classifier
model = GaussianNB()
# Train the model using the first 8 samples only
model.fit(x[:8], Y[:8])
#Predict the class of two new points
predicted= model.predict([[1,2],[3,4]])
print predicted
In [31]:
model.score(x[8:], Y[8:])
Out[31]:
In k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:
In [48]:
# Hold out 20 random rows, with click as the single feature and reply as the target
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)
# Train the model using the training sets
# NOTE(review): `model` is whatever estimator the session currently holds; if
# it is the GaussianNB classifier created earlier, every distinct reply value
# is treated as a class label -- confirm this is intended.
model.fit(data_X_train, data_y_train)
#Predict Output
predicted= model.predict(data_X_test)
print predicted
In [49]:
model.score(data_X_test, data_y_test)
Out[49]:
In [51]:
from sklearn.cross_validation import cross_val_score
# 5-fold cross-validation of a Gaussian naive Bayes model, with click
# (wrapped as one-element rows) as the feature and reply as the target
model = GaussianNB()
scores = cross_val_score(model, [[c] for c in df.click], df.reply, cv = 5)
scores.mean()
Out[51]:
In [92]:
from sklearn import tree
# Decision tree classifier using the Gini criterion
model = tree.DecisionTreeClassifier(criterion='gini')
In [100]:
# Hold out 20 random rows, then fit the current model on the remaining rows
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train,data_y_train)
# NOTE: this score is computed on the training rows, not on the held-out rows
model.score(data_X_train,data_y_train)
Out[100]:
In [95]:
# Predict
model.predict(data_X_test)
Out[95]:
In [102]:
# 3-fold cross-validation of the current model on the full data
scores = cross_val_score(model, data_X, df.repost, cv = 3)
scores.mean()
Out[102]:
In [108]:
from sklearn import svm
# Create SVM classification object (constructor defaults are used)
model=svm.SVC()
In [107]:
' '.join(dir(svm))
Out[107]:
In [109]:
# Hold out 20 random rows, then fit the current model on the remaining rows
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train,data_y_train)
# NOTE: this score is computed on the training rows, not on the held-out rows
model.score(data_X_train,data_y_train)
Out[109]:
In [110]:
# Predict
model.predict(data_X_test)
Out[110]:
In [117]:
# Cross-validation: record the mean score of the current model for several
# fold counts, so the effect of the cv setting can be compared
scores = []
cvs = [3, 5, 10, 25, 50, 75, 100]
for i in cvs:
    score = cross_val_score(model, data_X, df.repost, cv = i)
    scores.append(score.mean())  # Try to tune cv
In [119]:
# Plot the mean cross-validation score against the number of folds
plt.plot(cvs, scores, 'b-o')
plt.xlabel('$cv$', fontsize = 20)
plt.ylabel('$Score$', fontsize = 20)
plt.show()
In [1]:
#Import the Numpy library
import numpy as np
#Import 'tree' from scikit-learn library
from sklearn import tree
In [19]:
import pandas as pd
# Read the comma-separated Titanic training data
train = pd.read_csv('/Users/chengjun/github/cjc2016/data/tatanic_train.csv', sep = ",")
In [20]:
train.head()
Out[20]:
In [21]:
# Impute missing ages with the median age
train["Age"] = train["Age"].fillna(train["Age"].median())
#Convert the male and female groups to integer form.
# .loc assigns into `train` itself; chained indexing
# (train["Sex"][mask] = ...) may write to a temporary copy instead.
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
#Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')
#Convert the Embarked classes to integer form
train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2
In [22]:
#Create the target and features numpy arrays: target, features_one
target = train['Survived'].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
#Fit your first decision tree: my_tree_one
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
#Look at the importance of the included features and print the score
# NOTE: the score is computed on the same data the tree was fitted on
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))
In [34]:
test = pd.read_csv('/Users/chengjun/github/cjc2016/data/tatanic_test.csv', sep = ",")
# Impute every missing Fare with the median fare, instead of patching the
# single hard-coded row 152, so the cell does not depend on where the gap is
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test["Age"] = test["Age"].fillna(test["Age"].median())
#Convert the male and female groups to integer form.
# .loc assigns into `test` itself; chained indexing
# (test["Sex"][mask] = ...) may write to a temporary copy instead.
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1
#Impute the Embarked variable
test["Embarked"] = test["Embarked"].fillna('S')
#Convert the Embarked classes to integer form
test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2
# Extract the features from the test set: Pclass, Sex, Age, and Fare.
test_features = test[["Pclass","Sex", "Age", "Fare"]].values
# Make your prediction using the test set
my_prediction = my_tree_one.predict(test_features)
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
# (PassengerId is passed as the index, the predictions as the single column)
PassengerId =np.array(test['PassengerId']).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
In [27]:
print my_solution[:3]
In [28]:
# Check that your data frame has 418 entries
print my_solution.shape
In [30]:
# Write the solution to a csv file, labelling the index column "PassengerId"
my_solution.to_csv("/Users/chengjun/github/cjc2016/data/tatanic_solution_one.csv", index_label = ["PassengerId"])
In [31]:
# Create a new array with the added features: features_two
features_two = train[["Pclass","Age","Sex","Fare", "SibSp", "Parch", "Embarked"]].values
#Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
# random_state is fixed so repeated runs build the same tree
my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)
#Print the score of the new decision tree (computed on the training data)
print(my_tree_two.score(features_two, target))
In [32]:
# create a new train set with the new variable
# NOTE: train_two is bound to the same DataFrame object as train (no copy is
# made), so the family_size column added below is also visible through train,
# which is what features_three reads from.
train_two = train
# family_size = siblings/spouses + parents/children + 1
train_two['family_size'] = train.SibSp + train.Parch + 1
# Create a new decision tree my_tree_three
features_three = train[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)
# Print the score of this decision tree (computed on the training data)
print(my_tree_three.score(features_three, target))
In [35]:
#Import the `RandomForestClassifier`
from sklearn.ensemble import RandomForestClassifier
#We want the Pclass, Age, Sex, Fare,SibSp, Parch, and Embarked variables
features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
#Building the Forest: my_forest
n_estimators = 100
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = n_estimators, random_state = 1)
my_forest = forest.fit(features_forest, target)
#Print the score of the random forest (computed on the training data)
print(my_forest.score(features_forest, target))
#Compute predictions and print the length of the prediction vector:test_features, pred_forest
# NOTE: test_features is rebound here to the seven-column feature set, and the
# length printed below is len(test_features) rather than len(pred_forest)
test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)
print(len(test_features))
print(pred_forest[:3])
In [36]:
#Request and print the `.feature_importances_` attribute
print(my_tree_two.feature_importances_)
print(my_forest.feature_importances_)
#Compute and print the mean accuracy score for both models
# NOTE: the forest is scored on features_two; it is built from the same
# column list, in the same order, as the features_forest array the forest
# was fitted on.
print(my_tree_two.score(features_two, target))
print(my_forest.score(features_two, target))
机器学习算法的要点(附 Python 和 R 代码)http://blog.csdn.net/a6225301/article/details/50479672
The "Python Machine Learning" book code repository and info resource https://github.com/rasbt/python-machine-learning-book
An Introduction to Statistical Learning (James, Witten, Hastie, Tibshirani, 2013) : Python code https://github.com/JWarmenhoven/ISLR-python
BuildingMachineLearningSystemsWithPython https://github.com/luispedro/BuildingMachineLearningSystemsWithPython
In [ ]: