In [221]:
%matplotlib inline
import matplotlib.pyplot as plt
from __future__ import division
import pandas as pd
import numpy as np
import gzip
import sklearn as skl
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
#nice defaults for matplotlib
from matplotlib import rcParams

dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
               (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
               (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
               (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
               (0.4, 0.6509803921568628, 0.11764705882352941),
               (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
               (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
               (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'


/opt/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [254]:
def to_years(string):
    words = string.split(' ')
    retVal = int(words[0])
    if words[1] == 'month' or words[1] == 'months':
        retVal /= 12.0
    elif words[1] == 'week' or words[1] =='weeks':
        retVal /= 52.0
    elif words[1] == 'day' or words[1] == 'days':
        retVal /= 365.0
    return retVal

def is_sterile(string):
    words = string.split(' ')
    if len(words) == 1:
        return 'unknown'
    else:
        return words[0]

def gender(string):
    words = string.split(' ')
    if len(words) == 1:
        return 'Unknown'
    else:
        return words[1]

def has_name(string):
    if type(string) != np.str:
        return 0
    else:
        return 1
    
def date_to_month(timestamp):
    return timestamp.month
    
df = pd.read_csv("train.csv")
df.columns = ['id','name','date','outcome','outcomesub','species','sex','age','breed', 'color']
df['date'] = pd.to_datetime(df['date'])
df = df.drop(['outcomesub', 'id'], 1)
df.dropna(axis=0, subset=['age', 'sex'],inplace=True)
df['age'] = df['age'].map(to_years)
df['sterile'] = df['sex']
df['sterile'] = df['sterile'].map(is_sterile)
df['sex'] = df['sex'].map(gender)
df['name'] = df['name'].map(has_name)
df['date'] = df['date'].map(date_to_month)
labels = df['outcome'].values
df = df.drop('outcome', 1)

In [255]:
df.head()


Out[255]:
name date species sex age breed color sterile
0 1 2 Dog Male 1.000000 Shetland Sheepdog Mix Brown/White Neutered
1 1 10 Cat Female 1.000000 Domestic Shorthair Mix Cream Tabby Spayed
2 1 1 Dog Male 2.000000 Pit Bull Mix Blue/White Neutered
3 0 7 Cat Male 0.057692 Domestic Shorthair Mix Blue Cream Intact
4 0 11 Dog Male 2.000000 Lhasa Apso/Miniature Poodle Tan Neutered

In [281]:
le = preprocessing.LabelEncoder()
for column in ['color', 'sex', 'breed', 'species', 'sterile']:
    le.fit(df[column])
    df[column] = le.transform(df[column])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(df, labels, test_size=0.4, random_state=0)
clf = RandomForestClassifier(n_estimators=15)
clf = clf.fit(X_train,y_train)


Out[281]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [284]:
def get_properly_class(yhat,y):
    return float(sum([yhat[i]==y[i] for i in range(len(y))]))/len(y)

yhat = clf.predict(X_test)
get_properly_class(yhat, y_test)


Out[284]:
0.6014601272931487

In [ ]:


In [178]:
plt.hist(df['age'], bins = np.arange(0,20,1))
plt.title("Age distribution")
plt.xlabel("Ages")
plt.show()



In [179]:
plt.hist(df[df['outcome'] == 'Euthanasia']['age'], bins = np.arange(0,20,1), normed=1)
plt.title("Age distribution for Euthanasia")
plt.xlabel("Ages")
plt.show()



In [180]:
plt.hist(df[df['outcome'] == 'Adoption']['age'], bins = np.arange(0,20,1), normed=1)
plt.title("Age distribution for Adoption")
plt.xlabel("Ages")
plt.show()



In [181]:
len(df['color'].value_counts())


Out[181]:
366

In [208]:
df['color'].isnull().sum()


Out[208]:
0

In [275]:
for col in pd.DataFrame(y_train):
    print(y_train[col].isnull().sum())


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-275-aaa3810e6b88> in <module>()
      1 for col in pd.DataFrame(y_train):
----> 2     print(y_train[col].isnull().sum())

AttributeError: 'str' object has no attribute 'isnull'

In [ ]:
for col in df:
    df[]