notebook.community

Edit and run



In [221]:

    
%matplotlib inline
import matplotlib.pyplot as plt
from __future__ import division
import pandas as pd
import numpy as np
import gzip
import sklearn as skl
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
#nice defaults for matplotlib
from matplotlib import rcParams

dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
               (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
               (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
               (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
               (0.4, 0.6509803921568628, 0.11764705882352941),
               (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
               (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
               (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'









    



/opt/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))



In [254]:

    
def to_years(string):
    words = string.split(' ')
    retVal = int(words[0])
    if words[1] == 'month' or words[1] == 'months':
        retVal /= 12.0
    elif words[1] == 'week' or words[1] =='weeks':
        retVal /= 52.0
    elif words[1] == 'day' or words[1] == 'days':
        retVal /= 365.0
    return retVal

def is_sterile(string):
    words = string.split(' ')
    if len(words) == 1:
        return 'unknown'
    else:
        return words[0]

def gender(string):
    words = string.split(' ')
    if len(words) == 1:
        return 'Unknown'
    else:
        return words[1]

def has_name(string):
    if type(string) != np.str:
        return 0
    else:
        return 1
    
def date_to_month(timestamp):
    return timestamp.month
    
df = pd.read_csv("train.csv")
df.columns = ['id','name','date','outcome','outcomesub','species','sex','age','breed', 'color']
df['date'] = pd.to_datetime(df['date'])
df = df.drop(['outcomesub', 'id'], 1)
df.dropna(axis=0, subset=['age', 'sex'],inplace=True)
df['age'] = df['age'].map(to_years)
df['sterile'] = df['sex']
df['sterile'] = df['sterile'].map(is_sterile)
df['sex'] = df['sex'].map(gender)
df['name'] = df['name'].map(has_name)
df['date'] = df['date'].map(date_to_month)
labels = df['outcome'].values
df = df.drop('outcome', 1)



In [255]:

    
df.head()









    Out[255]:






  
    
      
      name
      date
      species
      sex
      age
      breed
      color
      sterile
    
  
  
    
      0
      1
      2
      Dog
      Male
      1.000000
      Shetland Sheepdog Mix
      Brown/White
      Neutered
    
    
      1
      1
      10
      Cat
      Female
      1.000000
      Domestic Shorthair Mix
      Cream Tabby
      Spayed
    
    
      2
      1
      1
      Dog
      Male
      2.000000
      Pit Bull Mix
      Blue/White
      Neutered
    
    
      3
      0
      7
      Cat
      Male
      0.057692
      Domestic Shorthair Mix
      Blue Cream
      Intact
    
    
      4
      0
      11
      Dog
      Male
      2.000000
      Lhasa Apso/Miniature Poodle
      Tan
      Neutered



In [281]:

    
le = preprocessing.LabelEncoder()
for column in ['color', 'sex', 'breed', 'species', 'sterile']:
    le.fit(df[column])
    df[column] = le.transform(df[column])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(df, labels, test_size=0.4, random_state=0)
clf = RandomForestClassifier(n_estimators=15)
clf = clf.fit(X_train,y_train)









    Out[281]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [284]:

    
def get_properly_class(yhat,y):
    return float(sum([yhat[i]==y[i] for i in range(len(y))]))/len(y)

yhat = clf.predict(X_test)
get_properly_class(yhat, y_test)









    Out[284]:





0.6014601272931487



In [ ]:



In [178]:

    
plt.hist(df['age'], bins = np.arange(0,20,1))
plt.title("Age distribution")
plt.xlabel("Ages")
plt.show()



In [179]:

    
plt.hist(df[df['outcome'] == 'Euthanasia']['age'], bins = np.arange(0,20,1), normed=1)
plt.title("Age distribution for Euthanasia")
plt.xlabel("Ages")
plt.show()



In [180]:

    
plt.hist(df[df['outcome'] == 'Adoption']['age'], bins = np.arange(0,20,1), normed=1)
plt.title("Age distribution for Adoption")
plt.xlabel("Ages")
plt.show()



In [181]:

    
len(df['color'].value_counts())









    Out[181]:





366



In [208]:

    
df['color'].isnull().sum()









    Out[208]:





0



In [275]:

    
for col in pd.DataFrame(y_train):
    print(y_train[col].isnull().sum())









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-275-aaa3810e6b88> in <module>()
      1 for col in pd.DataFrame(y_train):
----> 2     print(y_train[col].isnull().sum())

AttributeError: 'str' object has no attribute 'isnull'



In [ ]:

    
for col in df:
    df[]

	name	date	species	sex	age	breed	color	sterile
0	1	2	Dog	Male	1.000000	Shetland Sheepdog Mix	Brown/White	Neutered
1	1	10	Cat	Female	1.000000	Domestic Shorthair Mix	Cream Tabby	Spayed
2	1	1	Dog	Male	2.000000	Pit Bull Mix	Blue/White	Neutered
3	0	7	Cat	Male	0.057692	Domestic Shorthair Mix	Blue Cream	Intact
4	0	11	Dog	Male	2.000000	Lhasa Apso/Miniature Poodle	Tan	Neutered