In [218]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from pylab import rcParams
rcParams['figure.figsize'] = 5, 10
# use prettier ggplot style
matplotlib.style.use('ggplot')
% matplotlib inline
In [219]:
#print(os.getcwd() + "\n")
# load the train/test CSV splits from the sibling data directory
DATA_DIR = "../data"
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
In [220]:
uniq = pd.Series([len(pd.unique(train[x].values.ravel())) for x in train.columns.values], train.columns)
print len(train.index), "observations,", len(train.columns), "dimensions\n", \
pd.DataFrame([train.dtypes, train.count(), uniq, train.isnull().sum()], index=['type', 'count', 'UNIQUE', 'NA']).transpose()
In [221]:
print train.select_dtypes(['object']).describe().transpose(), '\n'
print train.select_dtypes(['int64']).describe().transpose(), '\n'
print train.select_dtypes(['float64']).describe().transpose()
In [222]:
# print counts per category
#for col in ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']:
# print pd.DataFrame(train[col].value_counts(dropna=False)), '\n' # automatically sorts by count (desc)
In [223]:
# descriptive statistics: column means per passenger class
# numeric_only=True: pandas 2.x raises a TypeError when averaging object
# columns (older pandas silently dropped them, which this cell relied on)
train.groupby('Pclass').mean(numeric_only=True)
Out[223]:
In [224]:
# clean data (fill missing Embarked and Age values)
# NOTE: Series.fillna(..., inplace=True) on a column mutates a view and does
# not reliably write back to the parent DataFrame (SettingWithCopy /
# copy-on-write semantics) — assign the filled column back instead.
train['Embarked'] = train['Embarked'].fillna('S')  # assume 'S'
meanAge = train.Age.mean()
train['Age'] = train['Age'].fillna(meanAge)
In [225]:
# encode the discrete variables as pandas categoricals (requires pandas 0.15+)
for categorical in ('Survived', 'Pclass', 'Sex', 'Embarked'):
    train[categorical] = train[categorical].astype('category')
In [226]:
# correlation matrix of the numeric columns only; correlation is undefined for
# category/object columns, and pandas 2.x raises on them unless
# numeric_only=True is passed explicitly
corr = train.corr(numeric_only=True)
# linewidths expects a number, not the string '1'
sns.heatmap(corr, square=True, annot=True, linewidths=1, cmap="RdBu")
pass
In [227]:
matplotlib.style.use('ggplot')
# Fare vs Age scatter, explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(train.Age, train.Fare, alpha=0.2, color="blue")
ax.set_xlim(-2, 85)
ax.set_ylim(-10, 550)
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
pass
In [228]:
# Random Forest
# scikit-learn needs numeric features, so encode the string/category
# columns as integer codes first
# NOTE(review): re-running this cell refits the encoders on the already-encoded
# integer columns; the values stay stable, but the original string labels are
# lost after the first run — rerun the cleaning cells before re-executing
sexEncoder = preprocessing.LabelEncoder()
train['Sex'] = sexEncoder.fit_transform(train.Sex)
embEncoder = preprocessing.LabelEncoder()
train['Embarked'] = embEncoder.fit_transform(train.Embarked)
columns = np.array(['Sex', 'Pclass', 'Age', 'Fare', 'SibSp', 'Parch', 'Embarked'])
# fixed random_state so the fitted forest (and its feature importances,
# plotted below) are reproducible across Restart & Run All
forest = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42)
fit = forest.fit(train[columns], train.Survived)
In [229]:
# sanity check: no column should still contain missing values (isna == isnull)
train.isna().any()
Out[229]:
In [230]:
# analyze variable importance
# (inspired by http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html)
# feature_importances_ aligns positionally with the `columns` array passed to
# fit(); sorting ascending puts the most important variable on top of the
# barh plot drawn in the next cell
varImportance = pd.Series(fit.feature_importances_, index=columns).sort_values()
# NOTE(review): Python 2 print statement — use print(varImportance) when porting to Python 3
print varImportance
In [231]:
# horizontal bar chart of the sorted variable importances
varImportance.plot.barh() # color='skyblue'
ax = plt.gca()
ax.yaxis.grid(False)
# tick visibility flags must be booleans: matplotlib 3.x raises a ValueError
# on the legacy string values 'off'/'on'
plt.tick_params(axis='y', left=False, right=False)
plt.tick_params(axis='x', top=False, direction='out')
plt.title('Variable Importance')
plt.xlim(-0.015, 0.515)
pass
In [ ]: