张艺馨

15210130100


In [26]:
%matplotlib inline

from sklearn import datasets
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

import sklearn
print sklearn.__version__


0.17.1

In [27]:
boston = datasets.load_boston()
y = boston.target
X = boston.data

In [28]:
' '.join(dir(boston))


Out[28]:
'__class__ __cmp__ __contains__ __delattr__ __delitem__ __dict__ __doc__ __eq__ __format__ __ge__ __getattr__ __getattribute__ __getitem__ __gt__ __hash__ __init__ __iter__ __le__ __len__ __lt__ __module__ __ne__ __new__ __reduce__ __reduce_ex__ __repr__ __setattr__ __setitem__ __setstate__ __sizeof__ __str__ __subclasshook__ __weakref__ clear copy fromkeys get has_key items iteritems iterkeys itervalues keys pop popitem setdefault update values viewitems viewkeys viewvalues'

In [29]:
boston['feature_names']


Out[29]:
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], 
      dtype='|S7')

In [30]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
results = smf.ols('boston.target ~ boston.data', data=boston).fit()
print results.summary()


                            OLS Regression Results                            
==============================================================================
Dep. Variable:          boston.target   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sun, 15 May 2016   Prob (F-statistic):          6.95e-135
Time:                        18:50:15   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
Intercept          36.4911      5.104      7.149      0.000        26.462    46.520
boston.data[0]     -0.1072      0.033     -3.276      0.001        -0.171    -0.043
boston.data[1]      0.0464      0.014      3.380      0.001         0.019     0.073
boston.data[2]      0.0209      0.061      0.339      0.735        -0.100     0.142
boston.data[3]      2.6886      0.862      3.120      0.002         0.996     4.381
boston.data[4]    -17.7958      3.821     -4.658      0.000       -25.302   -10.289
boston.data[5]      3.8048      0.418      9.102      0.000         2.983     4.626
boston.data[6]      0.0008      0.013      0.057      0.955        -0.025     0.027
boston.data[7]     -1.4758      0.199     -7.398      0.000        -1.868    -1.084
boston.data[8]      0.3057      0.066      4.608      0.000         0.175     0.436
boston.data[9]     -0.0123      0.004     -3.278      0.001        -0.020    -0.005
boston.data[10]    -0.9535      0.131     -7.287      0.000        -1.211    -0.696
boston.data[11]     0.0094      0.003      3.500      0.001         0.004     0.015
boston.data[12]    -0.5255      0.051    -10.366      0.000        -0.625    -0.426
==============================================================================
Omnibus:                      178.029   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              782.015
Skew:                           1.521   Prob(JB):                    1.54e-170
Kurtosis:                       8.276   Cond. No.                     1.51e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
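
A clearer way to get the same fit (a sketch): put the features in a pandas DataFrame first, so the summary reports the real column names (CRIM, ZN, ...) instead of boston.data[0]..boston.data[12]. The names df_boston and named below are illustrative.

In [ ]:
import pandas as pd
df_boston = pd.DataFrame(boston.data, columns=boston.feature_names)
df_boston['MEDV'] = boston.target
named = smf.ols('MEDV ~ ' + ' + '.join(boston.feature_names), data=df_boston).fit()
print named.rsquared  # should match the 0.741 reported above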

In [31]:
regr = linear_model.LinearRegression()
lm = regr.fit(boston.data, y)

In [32]:
lm.intercept_, lm.coef_, lm.score(boston.data, y)


Out[32]:
(36.491103280361237,
 array([ -1.07170557e-01,   4.63952195e-02,   2.08602395e-02,
          2.68856140e+00,  -1.77957587e+01,   3.80475246e+00,
          7.51061703e-04,  -1.47575880e+00,   3.05655038e-01,
         -1.23293463e-02,  -9.53463555e-01,   9.39251272e-03,
         -5.25466633e-01]),
 0.74060774286494269)

In [33]:
predicted = regr.predict(boston.data)

In [34]:
fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()



In [35]:
from sklearn.cross_validation import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2, random_state=42)

In [36]:
regr = linear_model.LinearRegression()
lm = regr.fit(Xs_train, y_train)

In [37]:
lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)


Out[37]:
(30.288948339368815,
 array([ -1.12463481e-01,   3.00810168e-02,   4.07309919e-02,
          2.78676719e+00,  -1.72406347e+01,   4.43248784e+00,
         -6.23998173e-03,  -1.44848504e+00,   2.62113793e-01,
         -1.06390978e-02,  -9.16398679e-01,   1.24516469e-02,
         -5.09349120e-01]),
 0.75088377867329137)

In [38]:
predicted = regr.predict(Xs_test)

In [39]:
fig, ax = plt.subplots()
ax.scatter(y_test, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()
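
A hedged sketch: the same held-out performance computed explicitly with sklearn.metrics, which reports the error alongside the R^2 that .score() returns.

In [ ]:
from sklearn.metrics import mean_squared_error, r2_score
print 'Test MSE: %.2f' % mean_squared_error(y_test, predicted)
print 'Test R^2: %.2f' % r2_score(y_test, predicted)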



In [43]:
from sklearn.cross_validation import cross_val_score

regr = linear_model.LinearRegression()
scores = cross_val_score(regr, boston.data , boston.target, cv = 3)
scores.mean()


Out[43]:
-1.5787701857181775
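
The Boston rows are ordered, so cv=3 splits on contiguous blocks that generalize badly to each other, and the mean R^2 goes negative. A sketch of the usual remedy, shuffled folds, using the same old-style cross_validation API:

In [ ]:
from sklearn.cross_validation import KFold
kf = KFold(len(boston.target), n_folds=3, shuffle=True, random_state=42)
print cross_val_score(regr, boston.data, boston.target, cv=kf).mean()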

In [47]:
data_X_scale = scale(boston.data)
# note: this call still scores the raw data (only cv changed to 7); the scaled copy is used in the next cell
scores = cross_val_score(regr, boston.data, boston.target, cv = 7)
scores.mean()


Out[47]:
0.45384871359695766

In [48]:
scores = [cross_val_score(regr, data_X_scale, boston.target, cv=i).mean() for i in range(3, 50)]
plt.plot(range(3, 50), scores,'r-o')
plt.show()



In [52]:
import pandas as pd
df = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df = df.rename(columns={0:'title', 1:'link', 2:'author', 3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:2]


Out[52]:
title link author author_page click reply time
0 【民间语文第161期】宁波px启示:船进港湾人应上岸 /post-free-2849477-1.shtml 贾也 http://www.tianya.cn/50499450 194675 2703 2012-10-29 07:59
1 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) /post-free-2839539-1.shtml 无上卫士ABC http://www.tianya.cn/74341835 88244 1041 2012-10-24 12:41

In [53]:
def randomSplit(dataX, dataY, num):
    """Randomly hold out `num` rows as a test set.
    Each x is wrapped in a one-element list so sklearn receives 2-D input."""
    import random
    dataX_train = []
    dataX_test = []
    dataY_train = []
    dataY_test = []
    test_index = random.sample(range(len(dataX)), num)
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append([dataX[k]])
            dataY_test.append(dataY[k])
        else:
            dataX_train.append([dataX[k]])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test

In [54]:
import numpy as np
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1), np.log(df.reply+1), 20)
regr = linear_model.LinearRegression()
regr.fit(data_X_train, data_y_train)
print 'Variance score: %.2f' % regr.score(data_X_test, data_y_test)


Variance score: 0.13

In [55]:
y_true, y_pred = data_y_test, regr.predict(data_X_test)

In [56]:
plt.scatter(y_pred, y_true,  color='black')
plt.show()



In [57]:
plt.scatter(data_X_test, data_y_test,  color='black')
plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)
plt.show()



In [58]:
print 'Coefficients: \n', regr.coef_


Coefficients: 
[ 0.69299365]

In [59]:
print "Residual sum of squares: %.2f" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2)


Residual sum of squares: 0.90

In [60]:
# note: despite the _log suffix these are the raw counts, merely wrapped one value per
# row so sklearn receives 2-D input; attribute assignment attaches them as plain lists
df.click_log = [[df.click[i]] for i in range(len(df))]
df.reply_log = [[df.reply[i]] for i in range(len(df))]

In [61]:
from sklearn.cross_validation import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log,test_size=0.2, random_state=0)
regr = linear_model.LinearRegression()
regr.fit(Xs_train, y_train)
print 'Variance score: %.2f' % regr.score(Xs_test, y_test)


Variance score: 0.95
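
The high score is not surprising: clicks and replies are strongly correlated in the raw data to begin with. A quick check (sketch):

In [ ]:
print np.corrcoef(df.click, df.reply)[0, 1]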

In [62]:
plt.scatter(Xs_test, y_test,  color='black')
plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)
plt.show()



In [63]:
from sklearn.cross_validation import cross_val_score

regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 3)
scores.mean()


Out[63]:
0.21630869764168115

In [64]:
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 4)
scores.mean()


Out[64]:
0.17723080221577134

In [65]:
repost = []
# u'转载' means "repost/forwarded"; threads whose titles carry the tag are labeled 1
for i in df.title:
    if u'转载' in i.decode('utf8'):
        repost.append(1)
    else:
        repost.append(0)

In [66]:
data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]
data_X[:3]


Out[66]:
[[194675, 2703], [88244, 1041], [82779, 625]]

In [67]:
from sklearn.linear_model import LogisticRegression
df['repost'] = repost
model = LogisticRegression()
model.fit(data_X,df.repost)
model.score(data_X,df.repost)


Out[67]:
0.61241970021413272
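
Beyond accuracy, LogisticRegression also exposes class probabilities; a sketch inspecting the first three threads:

In [ ]:
print model.predict_proba(data_X[:3])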

In [68]:
def randomSplitLogistic(dataX, dataY, num):
    """Randomly hold out `num` rows; unlike randomSplit, each x is already a feature list."""
    import random
    dataX_train = []
    dataX_test = []
    dataY_train = []
    dataY_test = []
    test_index = random.sample(range(len(dataX)), num)
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append(dataX[k])
            dataY_test.append(dataY[k])
        else:
            dataX_train.append(dataX[k])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test

In [69]:
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
log_regr = LogisticRegression()
log_regr.fit(data_X_train, data_y_train)
# for a classifier, .score() is mean accuracy, not a variance score
print 'Accuracy: %.2f' % log_regr.score(data_X_test, data_y_test)


Accuracy: 0.75

In [70]:
y_true, y_pred = data_y_test, log_regr.predict(data_X_test)

In [71]:
y_true, y_pred


Out[71]:
([0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1],
 array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))

In [72]:
print(classification_report(y_true, y_pred))


             precision    recall  f1-score   support

          0       1.00      0.17      0.29         6
          1       0.74      1.00      0.85        14

avg / total       0.82      0.75      0.68        20
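
The same comparison as a confusion matrix (rows are true classes, columns are predicted classes; a sketch):

In [ ]:
from sklearn.metrics import confusion_matrix
print confusion_matrix(y_true, y_pred)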


In [73]:
from sklearn.cross_validation import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, random_state=42)

In [74]:
log_regr = LogisticRegression()
log_regr.fit(Xs_train, y_train)
print 'Accuracy: %.2f' % log_regr.score(Xs_test, y_test)


Accuracy: 0.60

In [75]:
print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))
print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))
y_true, y_pred = y_test, log_regr.predict(Xs_test)
print(classification_report(y_true, y_pred))


Logistic score for test set: 0.595745
Logistic score for training set: 0.613941
             precision    recall  f1-score   support

          0       1.00      0.03      0.05        39
          1       0.59      1.00      0.74        55

avg / total       0.76      0.60      0.46        94


In [76]:
logre = LogisticRegression()
scores = cross_val_score(logre, data_X, df.repost, cv = 3)
scores.mean()


Out[76]:
0.53333333333333333

In [78]:
logre = LogisticRegression()
data_X_scale = scale(data_X)
scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)
scores.mean()


Out[78]:
0.62948717948717947
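
Standardizing likely helps because the raw click counts (tens of thousands) dwarf the reply counts. A sketch confirming what scale() produced, zero mean and unit variance per column:

In [ ]:
print data_X_scale.mean(axis=0), data_X_scale.std(axis=0)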

In [81]:
from sklearn import naive_bayes
'  '.join(dir(naive_bayes))


Out[81]:
'ABCMeta  BaseDiscreteNB  BaseEstimator  BaseNB  BernoulliNB  ClassifierMixin  GaussianNB  LabelBinarizer  MultinomialNB  __all__  __builtins__  __doc__  __file__  __name__  __package__  _check_partial_fit_first_call  abstractmethod  binarize  check_X_y  check_array  check_is_fitted  in1d  issparse  label_binarize  logsumexp  np  safe_sparse_dot  six'

In [82]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

In [83]:
x = np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])

In [87]:
model = GaussianNB()
model.fit(x[:8], Y[:8])
predicted = model.predict([[1,2],[3,4]])
print predicted


[4 3]

In [88]:
model.score(x[8:], Y[8:])


Out[88]:
0.25
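
A sketch of what the model actually learned: the per-class feature means and variances that Gaussian naive Bayes estimates (theta_ and sigma_ on the fitted estimator):

In [ ]:
print model.theta_
print model.sigma_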

In [89]:
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)
model.fit(data_X_train, data_y_train)

predicted = model.predict(data_X_test)
print predicted


[41 70  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]

In [90]:
model.score(data_X_test, data_y_test)


Out[90]:
0.55000000000000004

In [91]:
from sklearn.cross_validation import cross_val_score

model = GaussianNB()
scores = cross_val_score(model, [[c] for c in df.click], df.reply, cv = 5)
scores.mean()


//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
Out[91]:
0.49403904714780522

In [92]:
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='gini')

In [93]:
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train,data_y_train)
model.score(data_X_train,data_y_train)


Out[93]:
0.91722595078299773

In [94]:
model.predict(data_X_test)


Out[94]:
array([0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1])

In [95]:
scores = cross_val_score(model, data_X, df.repost, cv = 3)
scores.mean()


Out[95]:
0.33461538461538459
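
For inspection, the fitted tree can be dumped to Graphviz .dot format; a sketch (repost_tree.dot is a hypothetical output path):

In [ ]:
with open('repost_tree.dot', 'w') as f:
    tree.export_graphviz(model, out_file=f, feature_names=['click', 'reply'])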

In [96]:
from sklearn import svm
model = svm.SVC()

In [97]:
' '.join(dir(svm))


Out[97]:
'LinearSVC LinearSVR NuSVC NuSVR OneClassSVM SVC SVR __all__ __builtins__ __doc__ __file__ __name__ __package__ __path__ base bounds classes l1_min_c liblinear libsvm libsvm_sparse'

In [98]:
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train,data_y_train)
model.score(data_X_train,data_y_train)


Out[98]:
0.91051454138702459

In [99]:
model.predict(data_X_test)


Out[99]:
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [100]:
scores = []
cvs = [3, 5, 10, 25, 50, 75, 100]
for i in cvs:
    score = cross_val_score(model, data_X, df.repost, cv = i)
    scores.append(score.mean() ) # Try to tune cv

In [101]:
plt.plot(cvs, scores, 'b-o')
plt.xlabel('$cv$', fontsize = 20)
plt.ylabel('$Score$', fontsize = 20)
plt.show()



In [102]:
import numpy as np
from sklearn import tree

In [103]:
import pandas as pd
train = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_train.csv', sep = ",")

In [104]:
train.head()


Out[104]:
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [105]:
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1

train["Embarked"] = train["Embarked"].fillna('S')

train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
(the same SettingWithCopyWarning is emitted for lines 3, 7, 8 and 9)

In [106]:
target = train['Survived'].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

my_tree_one = tree.DecisionTreeClassifier()

my_tree_one = my_tree_one.fit(features_one, target)

print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))


[ 0.13555314  0.31274009  0.23810102  0.31360575]
0.977553310887
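
0.98 on the training rows mostly reflects overfitting by an unpruned tree; a cross-validated estimate is more honest (a sketch, reusing cross_val_score imported earlier):

In [ ]:
print cross_val_score(my_tree_one, features_one, target, cv=5).mean()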

In [108]:
test = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_test.csv', sep = ",")
test.Fare[152] = test.Fare.median()  # row 152 holds the lone missing Fare in the test set
test["Age"] = test["Age"].fillna(test["Age"].median())

test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1

test["Embarked"] = test["Embarked"].fillna('S')

test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

test_features = test[["Pclass","Sex", "Age", "Fare"]].values
my_prediction = my_tree_one.predict(test_features)

PassengerId =np.array(test['PassengerId']).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
(the same SettingWithCopyWarning is emitted for lines 5, 6, 10, 11 and 12)

In [109]:
print my_solution[:3]


     Survived
892         0
893         0
894         1

In [110]:
print my_solution.shape


(418, 1)

In [112]:
my_solution.to_csv("/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_solution_one.csv", index_label = ["PassengerId"])

In [113]:
features_two = train[["Pclass","Age","Sex","Fare", "SibSp", "Parch", "Embarked"]].values
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)

print(my_tree_two.score(features_two, target))


0.905723905724

In [114]:
train_two = train  # a reference, not a copy: train gains the new column as well
train_two['family_size'] = train.SibSp + train.Parch + 1

features_three = train[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values

my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)

print(my_tree_three.score(features_three, target))


0.979797979798

In [115]:
from sklearn.ensemble import RandomForestClassifier

features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values

n_estimators = 100
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = n_estimators, random_state = 1)
my_forest = forest.fit(features_forest, target)

print(my_forest.score(features_forest, target))

test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)
print(len(test_features))
print(pred_forest[:3])


0.939393939394
418
[0 0 0]

In [116]:
print(my_tree_two.feature_importances_)
print(my_forest.feature_importances_)

print(my_tree_two.score(features_two, target))
print(my_forest.score(features_two, target))


[ 0.14130255  0.17906027  0.41616727  0.17938711  0.05039699  0.01923751
  0.0144483 ]
[ 0.10384741  0.20139027  0.31989322  0.24602858  0.05272693  0.04159232
  0.03452128]
0.905723905724
0.939393939394
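
Pairing each importance with its column name makes the comparison readable (a sketch; cols restates the feature order used above):

In [ ]:
cols = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]
for name, imp in zip(cols, my_forest.feature_importances_):
    print '%-10s %.3f' % (name, imp)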

In [ ]: