In [221]:
%matplotlib inline
import matplotlib.pyplot as plt
from __future__ import division
import pandas as pd
import numpy as np
import gzip
import sklearn as skl
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
#nice defaults for matplotlib
from matplotlib import rcParams
# Eight RGB tuples (components in the 0-1 range) used as the default colour cycle.
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in matplotlib 1.5 and later removed, so
# prefer 'axes.prop_cycle' and only fall back to the legacy key on versions
# that have neither `plt.cycler` nor the new rcParam.
try:
    rcParams['axes.prop_cycle'] = plt.cycler('color', dark2_colors)
except (AttributeError, KeyError):
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
In [254]:
def to_years(string):
    """Convert an age string such as ``'3 months'`` into a number of years.

    The string is split on single spaces; the first token is parsed as an
    integer count and the second token is the unit. Month, week and day
    units are divided by 12, 52 and 365 respectively. Any other unit
    (e.g. ``'years'``) leaves the integer count untouched.
    """
    per_year = {
        'month': 12.0, 'months': 12.0,
        'week': 52.0, 'weeks': 52.0,
        'day': 365.0, 'days': 365.0,
    }
    tokens = string.split(' ')
    count = int(tokens[0])
    unit = tokens[1]
    if unit in per_year:
        return count / per_year[unit]
    return count
def is_sterile(string):
    """Return the first space-separated word of ``string``.

    When ``string`` contains no space at all (a single word), the literal
    ``'unknown'`` is returned instead.
    """
    first_word, separator, _ = string.partition(' ')
    return first_word if separator else 'unknown'
def gender(string):
    """Return the second space-separated word of ``string``.

    When ``string`` contains no space at all (a single word), the literal
    ``'Unknown'`` is returned instead.
    """
    if ' ' not in string:
        return 'Unknown'
    return string.split(' ')[1]
def has_name(string):
    """Return 1 if ``string`` is a text value, otherwise 0.

    Used to turn the ``name`` column into a has-a-name flag: any non-text
    value (such as a float NaN for a missing cell) maps to 0.

    ``isinstance`` is used instead of comparing ``type(...)`` against
    ``np.str``: that alias was removed from NumPy, and an exact type
    comparison also rejects ``str`` subclasses. ``type(u'')`` additionally
    covers ``unicode`` values on Python 2.
    """
    text_types = (str, type(u''))
    return 1 if isinstance(string, text_types) else 0
def date_to_month(timestamp):
    """Return the ``month`` attribute of ``timestamp``."""
    month = timestamp.month
    return month
# Load the training data and give its columns short names.
# NOTE: the rename assumes train.csv has exactly these ten columns, in this order.
df = pd.read_csv("train.csv")
df.columns = ['id','name','date','outcome','outcomesub','species','sex','age','breed', 'color']
df['date'] = pd.to_datetime(df['date'])
# `axis` is passed by keyword: the positional form (`drop(labels, 1)`) is
# rejected by pandas 2.0, while the keyword form works on every version.
df = df.drop(['outcomesub', 'id'], axis=1)
# Rows without an age or a sex cannot be parsed by the helpers below.
df.dropna(axis=0, subset=['age', 'sex'], inplace=True)
df['age'] = df['age'].map(to_years)
# Derive `sterile` from the original `sex` text BEFORE `sex` is overwritten.
df['sterile'] = df['sex'].map(is_sterile)
df['sex'] = df['sex'].map(gender)
df['name'] = df['name'].map(has_name)
# From here on `date` holds only the month number.
df['date'] = df['date'].map(date_to_month)
# Split the target off: `labels` stays positionally aligned with the rows of df.
labels = df['outcome'].values
df = df.drop('outcome', axis=1)
In [255]:
# Preview the first five rows of the cleaned feature frame.
df.head(n=5)
Out[255]:
In [281]:
# Replace each text feature with integer codes (one encoder object, refit per column).
le = preprocessing.LabelEncoder()
for column in ['color', 'sex', 'breed', 'species', 'sterile']:
    df[column] = le.fit_transform(df[column])
# Hold out 40% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df, labels, test_size=0.4, random_state=0)
# Seed the forest as well: without `random_state` the fitted model, and so the
# reported accuracy, changes on every run even though the split is fixed.
clf = RandomForestClassifier(n_estimators=15, random_state=0)
clf = clf.fit(X_train, y_train)
Out[281]:
In [284]:
def get_properly_class(yhat, y):
    """Return the fraction of predictions in ``yhat`` that equal ``y``.

    Both arguments are converted to NumPy arrays and compared element by
    element, position by position. This also makes the function safe for a
    pandas Series whose index is not ``0..n-1`` (e.g. after a shuffled
    train/test split), where ``y[i]`` would be a label lookup rather than a
    positional one.

    Parameters
    ----------
    yhat : array-like
        Predicted labels.
    y : array-like
        True labels, same shape as ``yhat``.

    Returns
    -------
    float
        Share of positions where prediction and truth agree, in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    predicted = np.asarray(yhat)
    actual = np.asarray(y)
    if predicted.shape != actual.shape:
        raise ValueError(
            "yhat and y must have the same shape, got %s and %s"
            % (predicted.shape, actual.shape))
    if actual.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")
    return float(np.mean(predicted == actual))
# Predict on the held-out rows; the cell's last expression displays the accuracy.
yhat = clf.predict(X_test)
get_properly_class(yhat=yhat, y=y_test)
Out[284]:
In [ ]:
In [178]:
# Histogram of all ages, using bin edges 0, 1, ..., 19.
age_bins = np.arange(0, 20, 1)
plt.hist(df['age'], bins=age_bins)
plt.xlabel("Ages")
plt.title("Age distribution")
plt.show()
In [179]:
# `outcome` is no longer a column of df here (it was split off into `labels`
# during data preparation), so rows are selected through `labels`, which is
# positionally aligned with df.
euthanasia_ages = df.loc[labels == 'Euthanasia', 'age']
# NOTE: `normed` is kept for the matplotlib version this notebook targets;
# newer matplotlib releases removed it in favour of `density=True`.
plt.hist(euthanasia_ages, bins=np.arange(0, 20, 1), normed=1)
plt.title("Age distribution for Euthanasia")
plt.xlabel("Ages")
plt.show()
In [180]:
# `outcome` is no longer a column of df here (it was split off into `labels`
# during data preparation), so rows are selected through `labels`, which is
# positionally aligned with df.
adoption_ages = df.loc[labels == 'Adoption', 'age']
# NOTE: `normed` is kept for the matplotlib version this notebook targets;
# newer matplotlib releases removed it in favour of `density=True`.
plt.hist(adoption_ages, bins=np.arange(0, 20, 1), normed=1)
plt.title("Age distribution for Adoption")
plt.xlabel("Ages")
plt.show()
In [181]:
# Number of distinct (non-null) values in the colour column.
df['color'].nunique()
Out[181]:
In [208]:
# Count missing entries in the colour column.
pd.isnull(df['color']).sum()
Out[208]:
In [275]:
# Count missing labels in the training target. `y_train` is a NumPy array, so
# it is wrapped in a DataFrame once and the columns are read from that wrapper;
# indexing the raw array would return a single element with no `.isnull()`.
y_train_frame = pd.DataFrame(y_train)
for col in y_train_frame:
    print(y_train_frame[col].isnull().sum())
In [ ]:
# NOTE(review): the original cell was left unfinished (`df[]` is a syntax
# error). Presumably it was meant to inspect missing values per column, like
# the cells before it — TODO confirm the intent or delete this cell.
for col in df:
    print(col, df[col].isnull().sum())