In [1]:
%matplotlib inline
#Titanic survival prediction example
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

In [3]:
titanic.head()


Out[3]:
   row.names pclass  survived                                             name      age     embarked                        home.dest room      ticket   boat     sex
0          1    1st         1                     Allen, Miss Elisabeth Walton  29.0000  Southampton                     St Louis, MO  B-5  24160 L221      2  female
1          2    1st         0                      Allison, Miss Helen Loraine   2.0000  Southampton  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female
2          3    1st         0              Allison, Mr Hudson Joshua Creighton  30.0000  Southampton  Montreal, PQ / Chesterville, ON  C26         NaN  (135)    male
3          4    1st         0  Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female
4          5    1st         1                    Allison, Master Hudson Trevor   0.9167  Southampton  Montreal, PQ / Chesterville, ON  C22         NaN     11    male

In [4]:
#Inspect the dataset summary; data loaded with pandas is stored as a DataFrame
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 112.9+ KB

In [5]:
#The samples contain missing values and some features are not numeric, so preprocessing is required
#Feature selection is a crucial step in machine learning, yet beginners often overlook it;
#it usually takes some domain knowledge to choose features well
#Here we select sex, age and pclass as the features for analysis
X = titanic[['pclass', 'age', 'sex']]
Y = titanic['survived']

In [6]:
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       633 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.8+ KB

In [7]:
#1. The info above shows that age has only 633 non-null values, so it must be filled in
#2. sex and pclass are categorical and need to be converted to numeric 0/1 features
#First fill in the missing ages, using the mean (the median is another common choice)
X['age'].fillna(X['age'].mean(), inplace=True)


/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/pandas/core/generic.py:3660: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
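
The SettingWithCopyWarning above is raised because X is a slice of the titanic frame, so the inplace fill may operate on a copy. A minimal way to avoid it (assuming a reasonably recent pandas) is to take an explicit copy and assign the result instead of using inplace=True:

#Take an explicit copy so later assignments do not touch a view of the original frame
X = titanic[['pclass', 'age', 'sex']].copy()
#Assign the filled column back rather than filling in place
X['age'] = X['age'].fillna(X['age'].mean())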

In [8]:
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       1313 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.8+ KB

In [9]:
#As shown above, the age feature in X is now complete
#Next, split the data into a training set and a test set
from sklearn.cross_validation import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,
                                             Y,
                                            test_size=0.25,
                                             random_state=33)


/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [10]:
#Use the feature transformer from the feature extraction module to convert the features
from sklearn.feature_extraction import DictVectorizer

In [11]:
dvec = DictVectorizer()

In [12]:
#orient='records' yields one dict per row, which DictVectorizer expects
X_train = dvec.fit_transform(X_train.to_dict('records'))
X_test = dvec.transform(X_test.to_dict('records'))

In [13]:
#Note that every categorical value has been expanded into its own binary feature
print dvec.feature_names_


['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
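
To make the one-hot expansion concrete, here is a tiny standalone sketch with two toy rows (not taken from the Titanic data): numeric values pass through unchanged, while every distinct string value gets its own 0/1 column.

from sklearn.feature_extraction import DictVectorizer

demo = DictVectorizer(sparse=False)
toy_rows = [{'pclass': '1st', 'sex': 'female', 'age': 29.0},
            {'pclass': '3rd', 'sex': 'male', 'age': 30.0}]
#Prints a 2x5 dense matrix and the expanded feature names
print demo.fit_transform(toy_rows)
print demo.feature_names_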

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
dtc = DecisionTreeClassifier()

In [16]:
dtc.fit(X_train, Y_train)


Out[16]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [17]:
y_predict = dtc.predict(X_test)

In [18]:
print y_predict


[0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0
 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0
 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0]

In [19]:
from sklearn.metrics import classification_report

In [20]:
print 'The Accuracy of DecisionTreeClassifier is',dtc.score(X_test, Y_test)


The Accuracy of DecisionTreeClassifier is 0.781155015198

In [21]:
#Training results with a single algorithm, the decision tree
print classification_report(Y_test, y_predict, target_names=['died', 'survived'])


             precision    recall  f1-score   support

       died       0.78      0.91      0.84       202
   survived       0.80      0.58      0.67       127

avg / total       0.78      0.78      0.77       329
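
The report's per-class numbers come straight from the confusion matrix; for example, recall for 'died' is the fraction of actual deaths the model predicted correctly. A quick sketch to verify, reusing Y_test and y_predict from above:

from sklearn.metrics import confusion_matrix

#Rows are the true classes (died, survived); columns are the predictions
cm = confusion_matrix(Y_test, y_predict)
print cm
#Recall for 'died': correctly predicted deaths / all actual deaths, ~0.91
print float(cm[0, 0]) / cm[0].sum()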


In [22]:
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
gbc = GradientBoostingClassifier()

In [24]:
gbc.fit(X_train, Y_train)


Out[24]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [25]:
gbc_y_pred = gbc.predict(X_test)

In [26]:
from sklearn.metrics import classification_report

In [27]:
print 'The Accuracy of GradientBoostingClassifier is',gbc.score(X_test, Y_test)


The Accuracy of GradientBoostingClassifier is 0.790273556231

In [28]:
print classification_report(Y_test, gbc_y_pred, target_names=['died', 'survived'])


             precision    recall  f1-score   support

       died       0.78      0.92      0.84       202
   survived       0.82      0.58      0.68       127

avg / total       0.80      0.79      0.78       329


In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rfc = RandomForestClassifier()

In [31]:
rfc.fit(X_train, Y_train)


Out[31]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [32]:
rfc_y_pred = rfc.predict(X_test)

In [33]:
print 'The Accuracy of RandomForestClassifier is',rfc.score(X_test, Y_test)


The Accuracy of RandomForestClassifier is 0.787234042553

In [34]:
print classification_report(Y_test, rfc_y_pred, target_names=['died', 'survived'])


             precision    recall  f1-score   support

       died       0.78      0.92      0.84       202
   survived       0.81      0.58      0.68       127

avg / total       0.79      0.79      0.78       329


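On this single 25% split the three models land close together: the decision tree at ~0.781, gradient boosting at ~0.790 and the random forest at ~0.787, with the ensembles slightly ahead. A more robust comparison would average accuracy over several folds; a minimal sketch using cross_val_score (which lives in model_selection on scikit-learn >= 0.18, and clones each estimator internally, so the fitted models above are left untouched):

from sklearn.cross_validation import cross_val_score

for name, model in [('DecisionTree', dtc),
                    ('GradientBoosting', gbc),
                    ('RandomForest', rfc)]:
    #5-fold cross-validated accuracy on the training portion
    scores = cross_val_score(model, X_train, Y_train, cv=5)
    print name, scores.mean()
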
In [ ]: