张艺馨

15210130100



In [26]:

    
%matplotlib inline

from sklearn import datasets
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

import sklearn
print sklearn.__version__



In [27]:

    
boston = datasets.load_boston()
y = boston.target
X = boston.data



In [28]:

    
' '.join(dir(boston))









    Out[28]:





'__class__ __cmp__ __contains__ __delattr__ __delitem__ __dict__ __doc__ __eq__ __format__ __ge__ __getattr__ __getattribute__ __getitem__ __gt__ __hash__ __init__ __iter__ __le__ __len__ __lt__ __module__ __ne__ __new__ __reduce__ __reduce_ex__ __repr__ __setattr__ __setitem__ __setstate__ __sizeof__ __str__ __subclasshook__ __weakref__ clear copy fromkeys get has_key items iteritems iterkeys itervalues keys pop popitem setdefault update values viewitems viewkeys viewvalues'



In [29]:

    
boston['feature_names']









    Out[29]:





array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], 
      dtype='|S7')



In [30]:

    
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
results = smf.ols('boston.target ~ boston.data', data=boston).fit()
print results.summary()









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:          boston.target   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sun, 15 May 2016   Prob (F-statistic):          6.95e-135
Time:                        18:50:15   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
Intercept          36.4911      5.104      7.149      0.000        26.462    46.520
boston.data[0]     -0.1072      0.033     -3.276      0.001        -0.171    -0.043
boston.data[1]      0.0464      0.014      3.380      0.001         0.019     0.073
boston.data[2]      0.0209      0.061      0.339      0.735        -0.100     0.142
boston.data[3]      2.6886      0.862      3.120      0.002         0.996     4.381
boston.data[4]    -17.7958      3.821     -4.658      0.000       -25.302   -10.289
boston.data[5]      3.8048      0.418      9.102      0.000         2.983     4.626
boston.data[6]      0.0008      0.013      0.057      0.955        -0.025     0.027
boston.data[7]     -1.4758      0.199     -7.398      0.000        -1.868    -1.084
boston.data[8]      0.3057      0.066      4.608      0.000         0.175     0.436
boston.data[9]     -0.0123      0.004     -3.278      0.001        -0.020    -0.005
boston.data[10]    -0.9535      0.131     -7.287      0.000        -1.211    -0.696
boston.data[11]     0.0094      0.003      3.500      0.001         0.004     0.015
boston.data[12]    -0.5255      0.051    -10.366      0.000        -0.625    -0.426
==============================================================================
Omnibus:                      178.029   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              782.015
Skew:                           1.521   Prob(JB):                    1.54e-170
Kurtosis:                       8.276   Cond. No.                     1.51e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+04. This might indicate that there are
strong multicollinearity or other numerical problems.



In [31]:

    
regr = linear_model.LinearRegression()
lm = regr.fit(boston.data, y)



In [32]:

    
lm.intercept_, lm.coef_, lm.score(boston.data, y)









    Out[32]:





(36.491103280361237,
 array([ -1.07170557e-01,   4.63952195e-02,   2.08602395e-02,
          2.68856140e+00,  -1.77957587e+01,   3.80475246e+00,
          7.51061703e-04,  -1.47575880e+00,   3.05655038e-01,
         -1.23293463e-02,  -9.53463555e-01,   9.39251272e-03,
         -5.25466633e-01]),
 0.74060774286494269)



In [33]:

    
predicted = regr.predict(boston.data)



In [34]:

    
fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()



In [35]:

    
boston.data
from sklearn.cross_validation import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2, random_state=42)



In [36]:

    
regr = linear_model.LinearRegression()
lm = regr.fit(Xs_train, y_train)



In [37]:

    
lm.intercept_, lm.coef_, lm.score(Xs_train, y_train)









    Out[37]:





(30.288948339368815,
 array([ -1.12463481e-01,   3.00810168e-02,   4.07309919e-02,
          2.78676719e+00,  -1.72406347e+01,   4.43248784e+00,
         -6.23998173e-03,  -1.44848504e+00,   2.62113793e-01,
         -1.06390978e-02,  -9.16398679e-01,   1.24516469e-02,
         -5.09349120e-01]),
 0.75088377867329137)



In [38]:

    
predicted = regr.predict(Xs_test)



In [39]:

    
fig, ax = plt.subplots()
ax.scatter(y_test, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('$Measured$', fontsize = 20)
ax.set_ylabel('$Predicted$', fontsize = 20)
plt.show()



In [43]:

    
from sklearn.cross_validation import cross_val_score

regr = linear_model.LinearRegression()
scores = cross_val_score(regr, boston.data , boston.target, cv = 3)
scores.mean()









    Out[43]:





-1.5787701857181775



In [47]:

    
data_X_scale = scale(boston.data)
scores = cross_val_score(regr, boston.data, boston.target, cv = 7)
scores.mean()









    Out[47]:





0.45384871359695766



In [48]:

    
scores=[cross_val_score(regr, data_X_scale, boston.target, cv = int(i)).mean() for i in range(3, 50)]
plt.plot(range(3, 50), scores,'r-o')
plt.show()



In [52]:

    
import pandas as pd
df = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:2]









    Out[52]:






  
    
      
      title
      link
      author
      author_page
      click
      reply
      time
    
  
  
    
      0
      【民间语文第161期】宁波px启示:船进港湾人应上岸
      /post-free-2849477-1.shtml
      贾也
      http://www.tianya.cn/50499450
      194675
      2703
      2012-10-29 07:59
    
    
      1
      宁波镇海PX项目引发群体上访 当地政府发布说明(转载)
      /post-free-2839539-1.shtml
      无上卫士ABC
      http://www.tianya.cn/74341835
      88244
      1041
      2012-10-24 12:41



In [53]:

    
def randomSplit(dataX, dataY, num):
    dataX_train = []
    dataX_test = []
    dataY_train = []
    dataY_test = []
    import random
    test_index = random.sample(range(len(df)), num)
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append([dataX[k]])
            dataY_test.append(dataY[k])
        else:
            dataX_train.append([dataX[k]])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test,



In [54]:

    
import numpy as np
data_X = df.reply
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(np.log(df.click+1), np.log(df.reply+1), 20)
regr = linear_model.LinearRegression()
regr.fit(data_X_train, data_y_train)
print'Variance score: %.2f' % regr.score(data_X_test, data_y_test)









    



Variance score: 0.13



In [55]:

    
y_true, y_pred = data_y_test, regr.predict(data_X_test)



In [56]:

    
plt.scatter(y_pred, y_true,  color='black')
plt.show()



In [57]:

    
plt.scatter(data_X_test, data_y_test,  color='black')
plt.plot(data_X_test, regr.predict(data_X_test), color='blue', linewidth=3)
plt.show()



In [58]:

    
print 'Coefficients: \n', regr.coef_









    



Coefficients: 
[ 0.69299365]



In [59]:

    
print "Residual sum of squares: %.2f" % np.mean((regr.predict(data_X_test) - data_y_test) ** 2)









    



Residual sum of squares: 0.90



In [60]:

    
df.click_log = [[df.click[i]] for i in range(len(df))]
df.reply_log = [[df.reply[i]] for i in range(len(df))]



In [61]:

    
from sklearn.cross_validation import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(df.click_log, df.reply_log,test_size=0.2, random_state=0)
regr = linear_model.LinearRegression()
regr.fit(Xs_train, y_train)
print'Variance score: %.2f' % regr.score(Xs_test, y_test)









    



Variance score: 0.95



In [62]:

    
plt.scatter(Xs_test, y_test,  color='black')
plt.plot(Xs_test, regr.predict(Xs_test), color='blue', linewidth=3)
plt.show()



In [63]:

    
from sklearn.cross_validation import cross_val_score

regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 3)
scores.mean()









    Out[63]:





0.21630869764168115



In [64]:

    
regr = linear_model.LinearRegression()
scores = cross_val_score(regr, df.click_log, df.reply_log, cv = 4)
scores.mean()









    Out[64]:





0.17723080221577134



In [65]:

    
repost = []
for i in df.title:
    if u'转载' in i.decode('utf8'):
        repost.append(1)
    else:
        repost.append(0)



In [66]:

    
data_X = [[df.click[i], df.reply[i]] for i in range(len(df))]
data_X[:3]









    Out[66]:





[[194675, 2703], [88244, 1041], [82779, 625]]



In [67]:

    
from sklearn.linear_model import LogisticRegression
df['repost'] = repost
model = LogisticRegression()
model.fit(data_X,df.repost)
model.score(data_X,df.repost)









    Out[67]:





0.61241970021413272



In [68]:

    
def randomSplitLogistic(dataX, dataY, num):
    dataX_train = []
    dataX_test = []
    dataY_train = []
    dataY_test = []
    import random
    test_index = random.sample(range(len(df)), num)
    for k in range(len(dataX)):
        if k in test_index:
            dataX_test.append(dataX[k])
            dataY_test.append(dataY[k])
        else:
            dataX_train.append(dataX[k])
            dataY_train.append(dataY[k])
    return dataX_train, dataX_test, dataY_train, dataY_test,



In [69]:

    
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
log_regr = LogisticRegression()
log_regr.fit(data_X_train, data_y_train)
print'Variance score: %.2f' % log_regr.score(data_X_test, data_y_test)









    



Variance score: 0.75



In [70]:

    
y_true, y_pred = data_y_test, log_regr.predict(data_X_test)



In [71]:

    
y_true, y_pred









    Out[71]:





([0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1],
 array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))



In [72]:

    
print(classification_report(y_true, y_pred))









    



             precision    recall  f1-score   support

          0       1.00      0.17      0.29         6
          1       0.74      1.00      0.85        14

avg / total       0.82      0.75      0.68        20



In [73]:

    
from sklearn.cross_validation import train_test_split
Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, df.repost, test_size=0.2, random_state=42)



In [74]:

    
log_regr = LogisticRegression()
log_regr.fit(Xs_train, y_train)
print'Variance score: %.2f' % log_regr.score(Xs_test, y_test)









    



Variance score: 0.60



In [75]:

    
print('Logistic score for test set: %f' % log_regr.score(Xs_test, y_test))
print('Logistic score for training set: %f' % log_regr.score(Xs_train, y_train))
y_true, y_pred = y_test, log_regr.predict(Xs_test)
print(classification_report(y_true, y_pred))









    



Logistic score for test set: 0.595745
Logistic score for training set: 0.613941
             precision    recall  f1-score   support

          0       1.00      0.03      0.05        39
          1       0.59      1.00      0.74        55

avg / total       0.76      0.60      0.46        94



In [76]:

    
logre = LogisticRegression()
scores = cross_val_score(logre, data_X, df.repost, cv = 3)
scores.mean()









    Out[76]:





0.53333333333333333



In [78]:

    
logre = LogisticRegression()
data_X_scale = scale(data_X)
scores = cross_val_score(logre, data_X_scale, df.repost, cv = 3)
scores.mean()









    Out[78]:





0.62948717948717947



In [81]:

    
from sklearn import naive_bayes
'  '.join(dir(naive_bayes))









    Out[81]:





'ABCMeta  BaseDiscreteNB  BaseEstimator  BaseNB  BernoulliNB  ClassifierMixin  GaussianNB  LabelBinarizer  MultinomialNB  __all__  __builtins__  __doc__  __file__  __name__  __package__  _check_partial_fit_first_call  abstractmethod  binarize  check_X_y  check_array  check_is_fitted  in1d  issparse  label_binarize  logsumexp  np  safe_sparse_dot  six'



In [82]:

    
from sklearn.naive_bayes import GaussianNB
import numpy as np



In [83]:

    
x= np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])



In [87]:

    
model = GaussianNB()
model.fit(x[:8], Y[:8])
predicted= model.predict([[1,2],[3,4]])
print predicted



In [88]:

    
model.score(x[8:], Y[8:])









    Out[88]:





0.25



In [89]:

    
data_X_train, data_X_test, data_y_train, data_y_test = randomSplit(df.click, df.reply, 20)
model.fit(data_X_train, data_y_train)

predicted= model.predict(data_X_test)
print predicted









    



[41 70  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]



In [90]:

    
model.score(data_X_test, data_y_test)









    Out[90]:





0.55000000000000004



In [91]:

    
from sklearn.cross_validation import cross_val_score

model = GaussianNB()
scores = cross_val_score(model, [[c] for c in df.click], df.reply, cv = 5)
scores.mean()









    



//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)






    Out[91]:





0.49403904714780522



In [92]:

    
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='gini')



In [93]:

    
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train,data_y_train)
model.score(data_X_train,data_y_train)









    Out[93]:





0.91722595078299773



In [94]:

    
model.predict(data_X_test)









    Out[94]:





array([0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1])



In [95]:

    
scores = cross_val_score(model, data_X, df.repost, cv = 3)
scores.mean()









    Out[95]:





0.33461538461538459



In [96]:

    
from sklearn import svm
model=svm.SVC()



In [97]:

    
' '.join(dir(svm))









    Out[97]:





'LinearSVC LinearSVR NuSVC NuSVR OneClassSVM SVC SVR __all__ __builtins__ __doc__ __file__ __name__ __package__ __path__ base bounds classes l1_min_c liblinear libsvm libsvm_sparse'



In [98]:

    
data_X_train, data_X_test, data_y_train, data_y_test = randomSplitLogistic(data_X, df.repost, 20)
model.fit(data_X_train,data_y_train)
model.score(data_X_train,data_y_train)









    Out[98]:





0.91051454138702459



In [99]:

    
model.predict(data_X_test)









    Out[99]:





array([0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])



In [100]:

    
scores = []
cvs = [3, 5, 10, 25, 50, 75, 100]
for i in cvs:
    score = cross_val_score(model, data_X, df.repost, cv = i)
    scores.append(score.mean() ) # Try to tune cv



In [101]:

    
plt.plot(cvs, scores, 'b-o')
plt.xlabel('$cv$', fontsize = 20)
plt.ylabel('$Score$', fontsize = 20)
plt.show()



In [102]:

    
import numpy as np
from sklearn import tree



In [103]:

    
import pandas as pd
train = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_train.csv', sep = ",")



In [104]:

    
train.head()









    Out[104]:






  
    
      
      Unnamed: 0
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [105]:

    
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1

train["Embarked"] = train["Embarked"].fillna('S')

train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2









    



/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [106]:

    
target = train['Survived'].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

my_tree_one = tree.DecisionTreeClassifier()

my_tree_one = my_tree_one.fit(features_one, target)

print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))









    



[ 0.13555314  0.31274009  0.23810102  0.31360575]
0.977553310887



In [108]:

    
test = pd.read_csv('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_test.csv', sep = ",")
test.Fare[152] = test.Fare.median()
test["Age"] = test["Age"].fillna(test["Age"].median())

test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1

test["Embarked"] = test["Embarked"].fillna('S')

test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

test_features = test[["Pclass","Sex", "Age", "Fare"]].values
my_prediction = my_tree_one.predict(test_features)

PassengerId =np.array(test['PassengerId']).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])









    



/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [109]:

    
print my_solution[:3]









    



     Survived
892         0
893         0
894         1



In [110]:

    
print my_solution.shape



In [112]:

    
my_solution.to_csv("/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/tatanic_solution_one.csv", index_label = ["PassengerId"])



In [113]:

    
features_two = train[["Pclass","Age","Sex","Fare", "SibSp", "Parch", "Embarked"]].values
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)

print(my_tree_two.score(features_two, target))









    



0.905723905724



In [114]:

    
train_two = train
train_two['family_size'] = train.SibSp + train.Parch + 1

features_three = train[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values

my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)

print(my_tree_three.score(features_three, target))









    



0.979797979798



In [115]:

    
from sklearn.ensemble import RandomForestClassifier

features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values

n_estimators = 100
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = n_estimators, random_state = 1)
my_forest = forest.fit(features_forest, target)

print(my_forest.score(features_forest, target))

test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)
print(len(test_features))
print(pred_forest[:3])









    



0.939393939394
418
[0 0 0]



In [116]:

    
print(my_tree_two.feature_importances_)
print(my_forest.feature_importances_)

print(my_tree_two.score(features_two, target))
print(my_forest.score(features_two, target))









    



[ 0.14130255  0.17906027  0.41616727  0.17938711  0.05039699  0.01923751
  0.0144483 ]
[ 0.10384741  0.20139027  0.31989322  0.24602858  0.05272693  0.04159232
  0.03452128]
0.905723905724
0.939393939394



In [ ]:

	title	link	author	author_page	click	reply	time
0	【民间语文第161期】宁波px启示:船进港湾人应上岸	/post-free-2849477-1.shtml	贾也	http://www.tianya.cn/50499450	194675	2703	2012-10-29 07:59
1	宁波镇海PX项目引发群体上访当地政府发布说明(转载)	/post-free-2839539-1.shtml	无上卫士ABC	http://www.tianya.cn/74341835	88244	1041	2012-10-24 12:41

	Unnamed: 0	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S