In [1]:
%matplotlib inline
# Titanic passenger survival prediction example (decision tree vs. ensemble models)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [2]:
# Load the Titanic passenger table straight from its remote text file.
# NOTE(review): plain-HTTP third-party URL - confirm it is still reachable,
# or point this at a local copy of the file.
titanic_source = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt'
titanic = pd.read_csv(titanic_source)
In [3]:
# Preview the first rows of the raw table as a quick sanity check on the load.
titanic.head()
Out[3]:
In [4]:
# Inspect the dataset summary (column dtypes and non-null counts).
# Data loaded through pandas is held in a DataFrame.
titanic.info()
In [5]:
# The raw samples have missing values and some features are not numeric,
# so the data needs preprocessing before modelling.
# Feature selection is an important step in machine learning that beginners
# often overlook; choosing good features takes some domain knowledge.
# Here we keep sex, age and pclass as the features to analyse.
# .copy() gives X its own data, so cleaning X in later cells never acts on a
# slice of `titanic` (which is what triggers pandas' SettingWithCopyWarning).
X = titanic[['pclass', 'age', 'sex']].copy()
# Target labels: the 'survived' column.
Y = titanic['survived']
In [6]:
# Check the selected feature columns for dtypes and missing values.
X.info()
In [7]:
# 1. The summary above shows that 'age' has missing entries (the original
#    note cites only 633 non-null values), so it has to be completed.
# 2. 'sex' and 'pclass' are categorical and must be converted to numeric
#    0/1 indicator features.
# Fill the missing ages first; the column mean is used here (the median is a
# common alternative).
# X is rebound to a new frame instead of calling fillna(inplace=True) on
# X['age']: that chained in-place call raises SettingWithCopyWarning on older
# pandas and leaves X unchanged under copy-on-write. Re-running this cell is
# harmless because no missing ages remain after the first run.
X = X.fillna({'age': X['age'].mean()})
In [8]:
# Re-inspect X to confirm that 'age' no longer has missing values.
X.info()
In [9]:
# The summary above confirms that the 'age' feature has been filled in.
# Next, split the data into a training set and a test set.
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Hold out 25% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=33,
)
In [10]:
#使用特征提取模块中的特征转换器进行特征转换
from sklearn.feature_extraction import DictVectorizer
In [11]:
# Vectorizer that turns per-sample feature dicts into a numeric feature matrix.
dvec = DictVectorizer()
In [12]:
# Convert each row to a {column: value} dict and vectorize it.
# The orient must be spelled 'records': the misspelt 'recore' only worked
# through pandas' legacy prefix matching and raises ValueError on pandas >= 2.0.
# The vectorizer is fitted on the training rows only and then reused for the
# test rows, so both matrices share the same feature columns.
# NOTE: X_train / X_test are rebound from DataFrames to feature matrices here,
# so this cell cannot be re-run without re-running the split cell first.
X_train = dvec.fit_transform(X_train.to_dict('records'))
X_test = dvec.transform(X_test.to_dict('records'))
In [13]:
# Every value of a categorical feature has been split out into its own
# feature, while numeric features are kept as they are.
# print(...) with a single argument behaves the same on Python 2 and 3; the
# bare print statement is a SyntaxError on Python 3.
print(dvec.feature_names_)
In [14]:
from sklearn.tree import DecisionTreeClassifier
In [15]:
# Single decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) so that the
# fitted tree, and therefore the reported scores, are reproducible.
dtc = DecisionTreeClassifier(random_state=33)
In [16]:
# Train the decision tree on the vectorized training features.
dtc.fit(X_train, Y_train)
Out[16]:
In [17]:
# Predict labels for the held-out test samples with the decision tree.
y_predict = dtc.predict(X_test)
In [18]:
# Show the predicted labels; print(...) with one argument works on both
# Python 2 and Python 3, unlike the bare print statement.
print(y_predict)
In [19]:
from sklearn.metrics import classification_report
In [20]:
# Accuracy of the decision tree on the test set. The message is built with
# %-formatting so the call prints the same text on Python 2 and Python 3
# (a multi-argument print(...) would print a tuple on Python 2).
print('The Accuracy of DecisionTreeClassifier is %s' % dtc.score(X_test, Y_test))
In [21]:
# Results of the single-model baseline (decision tree): per-class
# precision, recall and F1 on the test set.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(classification_report(Y_test, y_predict, target_names=['died', 'survived']))
In [22]:
from sklearn.ensemble import GradientBoostingClassifier
In [23]:
# Gradient-boosted trees with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) so repeated
# runs of the notebook report the same scores.
gbc = GradientBoostingClassifier(random_state=33)
In [24]:
# Train the gradient boosting model on the vectorized training features.
gbc.fit(X_train, Y_train)
Out[24]:
In [25]:
# Predict labels for the held-out test samples with gradient boosting.
gbc_y_pred = gbc.predict(X_test)
In [26]:
from sklearn.metrics import classification_report
In [27]:
# Accuracy of gradient boosting on the test set. %-formatting keeps the
# printed text identical on Python 2 and Python 3 (a multi-argument
# print(...) would print a tuple on Python 2).
print('The Accuracy of GradientBoostingClassifier is %s' % gbc.score(X_test, Y_test))
In [28]:
# Per-class precision, recall and F1 for gradient boosting on the test set.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(classification_report(Y_test, gbc_y_pred, target_names=['died', 'survived']))
In [29]:
from sklearn.ensemble import RandomForestClassifier
In [30]:
# Random forest with default hyper-parameters.
# A random forest is inherently stochastic (bootstrap samples and random
# feature subsets), so random_state is pinned (same seed as the train/test
# split) to make the reported scores reproducible.
rfc = RandomForestClassifier(random_state=33)
In [31]:
# Train the random forest on the vectorized training features.
rfc.fit(X_train, Y_train)
Out[31]:
In [32]:
# Predict labels for the held-out test samples with the random forest.
rfc_y_pred = rfc.predict(X_test)
In [33]:
# Accuracy of the random forest on the test set. %-formatting keeps the
# printed text identical on Python 2 and Python 3 (a multi-argument
# print(...) would print a tuple on Python 2).
print('The Accuracy of RandomForestClassifier is %s' % rfc.score(X_test, Y_test))
In [34]:
# Per-class precision, recall and F1 for the random forest on the test set.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(classification_report(Y_test, rfc_y_pred, target_names=['died', 'survived']))
In [ ]: