notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split



In [2]:

    
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Preview the first five training rows.
train.head(n=5)









    Out[2]:






  
    
      
      AnimalID
      Name
      DateTime
      OutcomeType
      OutcomeSubtype
      AnimalType
      SexuponOutcome
      AgeuponOutcome
      Breed
      Color
    
  
  
    
      0
      A671945
      Hambone
      2014-02-12 18:22:00
      Return_to_owner
      NaN
      Dog
      Neutered Male
      1 year
      Shetland Sheepdog Mix
      Brown/White
    
    
      1
      A656520
      Emily
      2013-10-13 12:44:00
      Euthanasia
      Suffering
      Cat
      Spayed Female
      1 year
      Domestic Shorthair Mix
      Cream Tabby
    
    
      2
      A686464
      Pearce
      2015-01-31 12:28:00
      Adoption
      Foster
      Dog
      Neutered Male
      2 years
      Pit Bull Mix
      Blue/White
    
    
      3
      A683430
      NaN
      2014-07-11 19:09:00
      Transfer
      Partner
      Cat
      Intact Male
      3 weeks
      Domestic Shorthair Mix
      Blue Cream
    
    
      4
      A667013
      NaN
      2013-11-15 12:52:00
      Transfer
      Partner
      Dog
      Neutered Male
      2 years
      Lhasa Apso/Miniature Poodle
      Tan



In [3]:

    
# Save the ID column for later use, then remove it from the test frame.
id_test = test.loc[:, 'ID'].values
test = test.drop(labels=['ID'], axis=1)



In [4]:

    
# Remove the AnimalID column from the training frame.
train = train.drop(labels=['AnimalID'], axis=1)



In [5]:

    
# fix the DateTime column

def fix_date_time(df):
    """Replace the ``DateTime`` column with numeric calendar components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DateTime`` column of timestamps (strings such as
        ``'2014-02-12 18:22:00'``).

    Returns
    -------
    pandas.DataFrame
        A new frame without ``DateTime`` and with integer ``Year``, ``Month``,
        ``Day``, ``Hour`` and ``Minute`` columns appended in that order.
        The input frame is left unmodified.
    """
    # Parse once instead of slicing fixed character positions out of every
    # string, so any timestamp layout pandas understands is accepted.
    stamps = pd.to_datetime(df['DateTime'])

    # drop() returns a new frame, so the new columns never leak into `df`.
    result = df.drop(['DateTime'], axis=1)
    for column, component in (('Year', 'year'), ('Month', 'month'), ('Day', 'day'),
                              ('Hour', 'hour'), ('Minute', 'minute')):
        result[column] = getattr(stamps.dt, component).astype('int64')
    return result

# Expand DateTime into numeric columns for both data sets.
train, test = (fix_date_time(frame) for frame in (train, test))



In [6]:

    
# Add a column with a Name frequency count: how many animals (train and test
# combined) share each name. Animals without a name get -1.

names = pd.concat([test['Name'], train['Name']])
values = dict(names.value_counts())

# Names missing from the lookup (NaN) map to NaN, which is replaced by -1.
train['_NameFreq'] = train['Name'].map(values).fillna(-1)
test['_NameFreq'] = test['Name'].map(values).fillna(-1)

# print(...) with parentheses runs under both Python 2 and Python 3.
print(train.head())
# info() writes its report itself and returns None, so wrapping it in print()
# would only append a stray "None" line.
train.info()









    



      Name      OutcomeType OutcomeSubtype AnimalType SexuponOutcome  \
0  Hambone  Return_to_owner            NaN        Dog  Neutered Male   
1    Emily       Euthanasia      Suffering        Cat  Spayed Female   
2   Pearce         Adoption         Foster        Dog  Neutered Male   
3      NaN         Transfer        Partner        Cat    Intact Male   
4      NaN         Transfer        Partner        Dog  Neutered Male   

  AgeuponOutcome                        Breed        Color  Year  Month  Day  \
0         1 year        Shetland Sheepdog Mix  Brown/White  2014      2   12   
1         1 year       Domestic Shorthair Mix  Cream Tabby  2013     10   13   
2        2 years                 Pit Bull Mix   Blue/White  2015      1   31   
3        3 weeks       Domestic Shorthair Mix   Blue Cream  2014      7   11   
4        2 years  Lhasa Apso/Miniature Poodle          Tan  2013     11   15   

   Hour  Minute  _NameFreq  
0    18      22        6.0  
1    12      44       25.0  
2    12      28        2.0  
3    19       9       -1.0  
4    12      52       -1.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 14 columns):
Name              19038 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
Year              26729 non-null int64
Month             26729 non-null int64
Day               26729 non-null int64
Hour              26729 non-null int64
Minute            26729 non-null int64
_NameFreq         26729 non-null float64
dtypes: float64(1), int64(5), object(8)
memory usage: 2.9+ MB
None



In [7]:

    
# Binary flags: Name becomes 1 when present and 0 when null;
# AnimalType becomes 1 for 'Dog' and 0 otherwise (cat).
for frame in (train, test):
    # A null (NaN) is the only value that does not compare equal to itself.
    frame['Name'] = frame['Name'].apply(lambda name: int(name == name))
    frame['AnimalType'] = frame['AnimalType'].apply(lambda kind: int(kind == 'Dog'))



In [8]:

    
# Label-encode the categorical (object) columns.

from sklearn import preprocessing

shapeTrain = train.shape[0]
shapeTest = test.shape[0]

# Encode train and test together so both share one integer code per category.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
train = pd.concat([train, test])

# NOTE(review): OutcomeSubtype describes the outcome itself; if test.csv has no
# such column, the test rows are all-NaN here and the feature leaks the target
# during training -- confirm and consider dropping it.
toTransform = ['Breed', 'Color', 'SexuponOutcome', 'OutcomeSubtype']
for f in toTransform:
    # A fresh encoder per column; the fitted feature encoders are not kept.
    encoder = preprocessing.LabelEncoder()
    train[f] = encoder.fit_transform(list(train[f].values))

# Split back by position (the combined index contains duplicate labels).
# .copy() makes both parts independent frames, so the column assignment
# below does not write into a slice of the combined frame.
test = train.iloc[shapeTrain:shapeTrain + shapeTest].copy()
train = train.iloc[:shapeTrain].copy()

# `lbl` stays bound to the target encoder so that its inverse_transform can
# map encoded class indices back to outcome names.
lbl = preprocessing.LabelEncoder()
train['OutcomeType'] = lbl.fit_transform(list(train['OutcomeType'].values))

# set target columns
target = train['OutcomeType']
train = train.drop(['OutcomeType'], axis=1)
test = test.drop(['OutcomeType'], axis=1)



In [9]:

    
# Target labels in encoded order (index -> outcome name); seems to be correct.
# print(...) with parentheses runs under both Python 2 and Python 3.
print(lbl.inverse_transform([0, 1, 2, 3, 4]))









    



['Adoption' 'Died' 'Euthanasia' 'Return_to_owner' 'Transfer']



In [10]:

    
#pd.get_dummies(train['OutcomeSubtype']).head()



In [11]:

    
# train = train.append(test)

# train = pd.concat([train, pd.get_dummies(train['SexuponOutcome'])], axis=1, join_axes=[train.index])
# train = train.drop(['Unknown', 'SexuponOutcome'],axis=1)

# test = train[shapeTrain:shapeTrain+shapeTest]
# train = train[0:shapeTrain]



In [12]:

    
# train = train.append(test)

# train = pd.concat([train, pd.get_dummies(train['OutcomeSubtype'])], axis=1, join_axes=[train.index])
# train = train.drop(['Suffering', 'OutcomeSubtype'],axis=1)

# test = train[shapeTrain:shapeTrain + shapeTest]
# train = train[0:shapeTrain]



In [13]:

    
# train = train.append(test)

# train = pd.concat([train, pd.get_dummies(train['OutcomeType'])], axis=1, join_axes=[train.index])
# train = train.drop(['Unknown', 'OutcomeType'],axis=1)

# test = train[shapeTrain:shapeTrain + shapeTest]
# train = train[0:shapeTrain]



In [14]:

    
# Convert AgeuponOutcome strings (e.g. "1 year") to an age in days
def age_to_days(item):
    """Convert age descriptions such as ``'2 years'`` to ages in days.

    Parameters
    ----------
    item : str, float or sequence
        A single description (``'<count> <unit>'`` with a unit of day(s),
        week(s), month(s) or year(s)), a float standing in for a missing
        value, or a sequence of such entries.

    Returns
    -------
    numpy.ndarray
        One float per entry. Months count as 30 days and years as 365.
        A bare float input is treated as ``'1 day'``; non-string entries
        inside a sequence, and strings without a known unit, yield 0.
    """
    # isinstance (rather than an exact type comparison) also accepts subclasses
    # such as numpy.float64, which would otherwise reach len() and raise.
    if isinstance(item, float):
        item = '1 day'
    if isinstance(item, str):
        # Wrap a single description so the loop below handles one code path.
        item = [item]

    # Checked in this order; if several units matched, the last one would win.
    days_per_unit = (('day', 1), ('week', 7), ('month', 30), ('year', 365))

    ages_in_days = np.zeros(len(item))
    for i, entry in enumerate(item):
        if not isinstance(entry, str):
            # Not a string (e.g. NaN): leave the age at 0.
            continue
        for unit, days in days_per_unit:
            if unit in entry:
                # The count is only parsed when a known unit is present.
                ages_in_days[i] = int(entry.split(' ')[0]) * days
    return ages_in_days

# Replace the textual ages in both frames with their numeric value in days.
for frame in (train, test):
    frame['AgeuponOutcome'] = frame['AgeuponOutcome'].map(lambda age: age_to_days(age)[0])



In [ ]:

    
# Split off 15% of the training rows as an evaluation set.
X_fit, X_eval, y_fit, y_eval = train_test_split(
    train, target, test_size=0.15, random_state=1
)

# Train the model.
# np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
clf = xgb.XGBClassifier(max_depth=4, missing=np.nan,
                        n_estimators=500, learning_rate=0.05,
                        subsample=1, colsample_bytree=0.9, seed=2100,
                        objective='multi:softprob')

# The held-out split is the evaluation set monitored (mlogloss) for early stopping.
clf.fit(X_fit, y_fit, early_stopping_rounds=35, eval_metric="mlogloss",
        eval_set=[(X_eval, y_eval)])



In [17]:

    
# Write the predicted class probabilities to the submission file.
# Probability columns are taken in this order:
# Adoption, Died, Euthanasia, Return_to_owner, Transfer
cols = ['ID', "Adoption", "Died", "Euthanasia", "Return_to_owner", "Transfer"]

k = clf.predict_proba(test)
columns_by_name = {"ID": id_test}
for position, outcome in enumerate(cols[1:]):
    columns_by_name[outcome] = k[:, position]

# Index with `cols` to fix the column order of the output file.
submission = pd.DataFrame(columns_by_name)[cols]
submission.to_csv("submission_cats.csv", index=False)

	AnimalID	Name	DateTime	OutcomeType	OutcomeSubtype	AnimalType	SexuponOutcome	AgeuponOutcome	Breed	Color
0	A671945	Hambone	2014-02-12 18:22:00	Return_to_owner	NaN	Dog	Neutered Male	1 year	Shetland Sheepdog Mix	Brown/White
1	A656520	Emily	2013-10-13 12:44:00	Euthanasia	Suffering	Cat	Spayed Female	1 year	Domestic Shorthair Mix	Cream Tabby
2	A686464	Pearce	2015-01-31 12:28:00	Adoption	Foster	Dog	Neutered Male	2 years	Pit Bull Mix	Blue/White
3	A683430	NaN	2014-07-11 19:09:00	Transfer	Partner	Cat	Intact Male	3 weeks	Domestic Shorthair Mix	Blue Cream
4	A667013	NaN	2013-11-15 12:52:00	Transfer	Partner	Dog	Neutered Male	2 years	Lhasa Apso/Miniature Poodle	Tan