In [1]:
%matplotlib inline
#Titanic survival prediction example
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

In [3]:
titanic.head()


Out[3]:
   row.names pclass  survived                                             name      age     embarked                        home.dest room      ticket   boat     sex
0          1    1st         1                     Allen, Miss Elisabeth Walton  29.0000  Southampton                     St Louis, MO  B-5  24160 L221      2  female
1          2    1st         0                      Allison, Miss Helen Loraine   2.0000  Southampton  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female
2          3    1st         0              Allison, Mr Hudson Joshua Creighton  30.0000  Southampton  Montreal, PQ / Chesterville, ON  C26         NaN  (135)    male
3          4    1st         0  Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female
4          5    1st         1                    Allison, Master Hudson Trevor   0.9167  Southampton  Montreal, PQ / Chesterville, ON  C22         NaN     11    male

In [4]:
#Inspect the dataset summary; data loaded with pandas is stored as a DataFrame
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 112.9+ KB

In [5]:
#The samples contain missing values and some features are not numeric, so preprocessing is required
#Feature selection is a crucial step in machine learning, yet beginners often overlook it;
#it usually takes some domain knowledge to choose features well
#Here we select sex, age and pclass as the features for analysis
X = titanic[['pclass', 'age', 'sex']]
Y = titanic['survived']

In [6]:
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       633 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.8+ KB

In [7]:
#1. The info above shows that age has only 633 non-null values, so it must be filled in
#2. sex and pclass are categorical and need to be converted to numeric 0/1 features
#First fill in the missing ages, using the mean (the median is another common choice)
X['age'].fillna(X['age'].mean(), inplace=True)


/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/pandas/core/generic.py:3660: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
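
The SettingWithCopyWarning above is raised because X is a slice of the titanic frame, so the inplace fill may operate on a copy. A minimal way to avoid it (assuming a reasonably recent pandas) is to take an explicit copy and assign the result instead of using inplace=True:

#Take an explicit copy so later assignments do not touch a view of the original frame
X = titanic[['pclass', 'age', 'sex']].copy()
#Assign the filled column back rather than filling in place
X['age'] = X['age'].fillna(X['age'].mean())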

In [8]:
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass    1313 non-null object
age       1313 non-null float64
sex       1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.8+ KB

In [9]:
#As shown above, the age feature in X is now complete
#Next, split the data into a training set and a test set
from sklearn.cross_validation import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,
                                             Y,
                                            test_size=0.25,
                                             random_state=33)


/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [10]:
#Use the feature transformer from the feature extraction module to convert the features
from sklearn.feature_extraction import DictVectorizer

In [11]:
dvec = DictVectorizer()

In [12]:
#orient='records' yields one dict per row, which DictVectorizer expects
X_train = dvec.fit_transform(X_train.to_dict('records'))
X_test = dvec.transform(X_test.to_dict('records'))

In [13]:
#Note that every categorical value has been expanded into its own binary feature
print dvec.feature_names_


['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
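
To make the one-hot expansion concrete, here is a tiny standalone sketch with two toy rows (not taken from the Titanic data): numeric values pass through unchanged, while every distinct string value gets its own 0/1 column.

from sklearn.feature_extraction import DictVectorizer

demo = DictVectorizer(sparse=False)
toy_rows = [{'pclass': '1st', 'sex': 'female', 'age': 29.0},
            {'pclass': '3rd', 'sex': 'male', 'age': 30.0}]
#Prints a 2x5 dense matrix and the expanded feature names
print demo.fit_transform(toy_rows)
print demo.feature_names_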

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
dtc = DecisionTreeClassifier()

In [16]:
dtc.fit(X_train, Y_train)


Out[16]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [17]:
y_predict = dtc.predict(X_test)

In [18]:
print y_predict


[0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0
 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0
 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0]

In [19]:
from sklearn.metrics import classification_report

In [20]:
print 'The Accuracy of DecisionTreeClassifier is',dtc.score(X_test, Y_test)


The Accuracy of DecisionTreeClassifier is 0.781155015198

In [21]:
#Training results with a single algorithm, the decision tree
print classification_report(Y_test, y_predict, target_names=['died', 'survived'])


             precision    recall  f1-score   support

       died       0.78      0.91      0.84       202
   survived       0.80      0.58      0.67       127

avg / total       0.78      0.78      0.77       329
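
The report's per-class numbers come straight from the confusion matrix; for example, recall for 'died' is the fraction of actual deaths the model predicted correctly. A quick sketch to verify, reusing Y_test and y_predict from above:

from sklearn.metrics import confusion_matrix

#Rows are the true classes (died, survived); columns are the predictions
cm = confusion_matrix(Y_test, y_predict)
print cm
#Recall for 'died': correctly predicted deaths / all actual deaths, ~0.91
print float(cm[0, 0]) / cm[0].sum()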


In [22]:
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
gbc = GradientBoostingClassifier()

In [24]:
gbc.fit(X_train, Y_train)


Out[24]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [25]:
gbc_y_pred = gbc.predict(X_test)

In [26]:
from sklearn.metrics import classification_report

In [27]:
print 'The Accuracy of GradientBoostingClassifier is',gbc.score(X_test, Y_test)


The Accuracy of GradientBoostingClassifier is 0.790273556231

In [28]:
print classification_report(Y_test, gbc_y_pred, target_names=['died', 'survived'])


             precision    recall  f1-score   support

       died       0.78      0.92      0.84       202
   survived       0.82      0.58      0.68       127

avg / total       0.80      0.79      0.78       329


In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rfc = RandomForestClassifier()

In [31]:
rfc.fit(X_train, Y_train)


Out[31]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [32]:
rfc_y_pred = rfc.predict(X_test)

In [33]:
print 'The Accuracy of RandomForestClassifier is',rfc.score(X_test, Y_test)


The Accuracy of RandomForestClassifier is 0.787234042553

In [34]:
print classification_report(Y_test, rfc_y_pred, target_names=['died', 'survived'])


             precision    recall  f1-score   support

       died       0.78      0.92      0.84       202
   survived       0.81      0.58      0.68       127

avg / total       0.79      0.79      0.78       329


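On this single 25% split the three models land close together: the decision tree at ~0.781, gradient boosting at ~0.790 and the random forest at ~0.787, with the ensembles slightly ahead. A more robust comparison would average accuracy over several folds; a minimal sketch using cross_val_score (which lives in model_selection on scikit-learn >= 0.18, and clones each estimator internally, so the fitted models above are left untouched):

from sklearn.cross_validation import cross_val_score

for name, model in [('DecisionTree', dtc),
                    ('GradientBoosting', gbc),
                    ('RandomForest', rfc)]:
    #5-fold cross-validated accuracy on the training portion
    scores = cross_val_score(model, X_train, Y_train, cv=5)
    print name, scores.mean()
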
In [ ]: