In [2]:
# NOTE(review): %pylab inline performs a wildcard import of numpy/matplotlib
# into the global namespace; %matplotlib inline plus the explicit imports
# below is the preferred modern form.
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [41]:
# Load the Titanic passenger dataset from a local CSV file.
# NOTE(review): 'titantic' is a typo for 'titanic', but the name is reused by
# every later cell, so it is kept to avoid breaking them.
titantic = pd.read_csv('titanic.txt')
titantic.head()  # preview the first rows
Out[41]:
In [29]:
# Preview only the columns relevant to the survival analysis.
print titantic.head()[['pclass', 'survived', 'age', 'embarked', 'boat', 'sex']]
In [30]:
# One-hot encode the given categorical columns of a DataFrame.
# For example a column 'sex' with values Female/Male becomes two
# binary columns, 'sex=female' and 'sex=male'.
from sklearn import feature_extraction

def one_hot_dataframe(data, cols, replace=True):
    """One-hot encode `cols` of `data` via DictVectorizer.

    Returns a tuple (data, encoded, vec) where `encoded` is a DataFrame
    holding only the new binary columns and `vec` is the fitted
    DictVectorizer. When `replace` is True, the returned `data` has
    `cols` dropped and the encoded columns joined in their place.
    """
    vec = feature_extraction.DictVectorizer()
    records = data[cols].to_dict(outtype='records')
    encoded = pd.DataFrame(vec.fit_transform(records).toarray(),
                           columns=vec.get_feature_names(),
                           index=data.index)
    if replace:
        data = data.drop(cols, axis=1).join(encoded)
    return (data, encoded, vec)
In [43]:
# Encode every categorical column; titanic_n keeps just the encoded columns,
# vec is the fitted DictVectorizer (kept for potential inverse lookups).
titantic, titanic_n, vec = one_hot_dataframe(titantic, ['pclass','embarked','sex','home.dest','room','ticket','boat'], replace=True)
In [44]:
# Inspect how many columns one-hot encoding produced.
titantic.shape
Out[44]:
In [42]:
# Fill N/A values
# Fill missing values: impute 'age' with its column mean, then zero-fill
# every remaining NaN (the one-hot indicator columns and other numerics).
mean_age = titantic['age'].mean()
titantic['age'] = titantic['age'].fillna(mean_age)
titantic = titantic.fillna(0)
titantic.head()
Out[42]:
In [45]:
# split into Train and Test
# Split the dataset into train and test partitions.
# NOTE(review): sklearn.cross_validation was deprecated in favour of
# sklearn.model_selection (0.18+); kept for the old sklearn this
# notebook targets.
from sklearn.cross_validation import train_test_split
titanic_target = titantic['survived']
# Drop the label plus free-text identifier columns that would leak the
# target or explode the feature space.
titanic_data = titantic.drop(['name', 'row.names','survived'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(titanic_data, titanic_target,test_size=0.25,random_state=33)
In [47]:
# Train an entropy-based decision tree on the full feature set and report
# its held-out accuracy.
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)
from sklearn import metrics
# NOTE(review): 'y_perd' is a typo for 'y_pred', but a later cell reads
# this name, so it is kept unchanged here.
y_perd = dt.predict(X_test)
print "Accuracy: {0:.3f}".format(metrics.accuracy_score(y_test, y_perd)), "\n"
In [48]:
# Keep only the top 20% of features ranked by the chi-squared statistic
# against the target.
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(
feature_selection.chi2, percentile=20)
X_train_fs = fs.fit_transform(X_train, y_train)
In [49]:
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
y_pred_fs = dt.predict(X_test_fs)
print "Accuracy: {0:.3f}".format(metrics.accuracy_score(y_test, y_perd)), "\n"
In [53]:
# Number of columns before feature selection.
print X_test.shape
In [52]:
# Number of columns after keeping only the top 20% of features.
print X_test_fs.shape
In [55]:
# try to find the optimal number of features
from sklearn import cross_validation
percentiles = range(1,100,5)
results = []
for i in range(1,100,5):
fs = feature_selection.SelectPercentile(
feature_selection.chi2, percentile=i)
x_train_fs = fs.fit_transform(X_train, y_train)
scores = cross_validation.cross_val_score(dt, x_train_fs, y_train, cv=5)
results = np.append(results, scores.mean())
optimal_percentil = np.where(results == results.max())[0]
print "Optimal number of features:{0}".format(
percentiles[optimal_percentil]), "\n"
In [56]:
# NOTE(review): redundant -- %pylab inline was already run in the first cell.
%pylab inline
In [57]:
# Plot cross-validated accuracy against the percentile of features kept,
# using the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Cross-validation scores")
ax.plot(percentiles, results)