In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
try:
    # scikit-learn >= 0.18 provides train_test_split here
    from sklearn.model_selection import train_test_split
except ImportError:
    # older scikit-learn releases only ship the original module
    from sklearn.cross_validation import train_test_split

In [2]:
# Load both CSV files from the working directory.
train, test = [pd.read_csv(path) for path in ("train.csv", "test.csv")]

# Preview the first five training rows.
train.head()


Out[2]:
AnimalID Name DateTime OutcomeType OutcomeSubtype AnimalType SexuponOutcome AgeuponOutcome Breed Color
0 A671945 Hambone 2014-02-12 18:22:00 Return_to_owner NaN Dog Neutered Male 1 year Shetland Sheepdog Mix Brown/White
1 A656520 Emily 2013-10-13 12:44:00 Euthanasia Suffering Cat Spayed Female 1 year Domestic Shorthair Mix Cream Tabby
2 A686464 Pearce 2015-01-31 12:28:00 Adoption Foster Dog Neutered Male 2 years Pit Bull Mix Blue/White
3 A683430 NaN 2014-07-11 19:09:00 Transfer Partner Cat Intact Male 3 weeks Domestic Shorthair Mix Blue Cream
4 A667013 NaN 2013-11-15 12:52:00 Transfer Partner Dog Neutered Male 2 years Lhasa Apso/Miniature Poodle Tan

In [3]:
# Save the ID column for later use (it is written into the submission file),
# then remove it from the test frame.
id_test = test['ID'].values
test = test.drop('ID', axis=1)

In [4]:
# Remove the AnimalID column from the training frame.
train = train.drop('AnimalID', axis=1)

In [5]:
# Split the 'DateTime' text column into separate integer calendar columns.

def fix_date_time(df):
    """Return a copy of ``df`` with 'DateTime' replaced by integer date parts.

    The 'DateTime' column must hold strings laid out as
    'YYYY-MM-DD HH:MM:SS'; each part is cut out by fixed character
    position. Seconds are not extracted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'DateTime' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without 'DateTime' and with the int64 columns 'Year',
        'Month', 'Day', 'Hour' and 'Minute' appended in that order.

    Raises
    ------
    KeyError
        If ``df`` has no 'DateTime' column.
    ValueError
        If a sliced field cannot be converted to an integer.
    """
    # (new column, start offset, stop offset) within the timestamp string
    layout = (
        ('Year', 0, 4),
        ('Month', 5, 7),
        ('Day', 8, 10),
        ('Hour', 11, 13),
        ('Minute', 14, 16),
    )
    stamps = df['DateTime']

    # drop() hands back a new frame, so the columns added below never
    # show up on the caller's DataFrame
    result = df.drop(['DateTime'], axis=1)
    for column, start, stop in layout:
        result[column] = stamps.str[start:stop].astype('int64')
    return result

# Replace the DateTime column by its integer parts in both frames (train first).
train, test = [fix_date_time(frame) for frame in (train, test)]

In [6]:
# add a column with a Name frequency count

# How often each name occurs across the test and train frames together.
names = pd.concat([test['Name'], train['Name']])
values = dict(names.value_counts())

# Names without an entry in the lookup come back from map() as NaN;
# those rows are flagged with -1.
train['_NameFreq'] = train['Name'].map(values).fillna(-1)
test['_NameFreq'] = test['Name'].map(values).fillna(-1)

# print(...) with parentheses runs unchanged on Python 2 and Python 3.
print(train.head())
# info() writes its report itself and returns None, which is why the output
# ends with a line reading "None".
print(train.info())


      Name      OutcomeType OutcomeSubtype AnimalType SexuponOutcome  \
0  Hambone  Return_to_owner            NaN        Dog  Neutered Male   
1    Emily       Euthanasia      Suffering        Cat  Spayed Female   
2   Pearce         Adoption         Foster        Dog  Neutered Male   
3      NaN         Transfer        Partner        Cat    Intact Male   
4      NaN         Transfer        Partner        Dog  Neutered Male   

  AgeuponOutcome                        Breed        Color  Year  Month  Day  \
0         1 year        Shetland Sheepdog Mix  Brown/White  2014      2   12   
1         1 year       Domestic Shorthair Mix  Cream Tabby  2013     10   13   
2        2 years                 Pit Bull Mix   Blue/White  2015      1   31   
3        3 weeks       Domestic Shorthair Mix   Blue Cream  2014      7   11   
4        2 years  Lhasa Apso/Miniature Poodle          Tan  2013     11   15   

   Hour  Minute  _NameFreq  
0    18      22        6.0  
1    12      44       25.0  
2    12      28        2.0  
3    19       9       -1.0  
4    12      52       -1.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 14 columns):
Name              19038 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
Year              26729 non-null int64
Month             26729 non-null int64
Day               26729 non-null int64
Hour              26729 non-null int64
Minute            26729 non-null int64
_NameFreq         26729 non-null float64
dtypes: float64(1), int64(5), object(8)
memory usage: 2.9+ MB
None

In [7]:
# if name not null make it 1, else 0. If Dog 1, cat 0
# Both frames get the same treatment, so the two conversions are written once.
# notnull() replaces the `x == x` self-comparison trick for spotting NaN, and
# the comparisons run vectorised instead of through a Python lambda per row.
for frame in (train, test):
    frame['Name'] = frame['Name'].notnull().astype('int64')
    frame['AnimalType'] = (frame['AnimalType'] == 'Dog').astype('int64')

In [8]:
# label object types.

from sklearn import preprocessing

shapeTrain = train.shape[0]
shapeTest = test.shape[0]

# Stack train on top of test so every encoder sees the categories of both
# frames. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
combined = pd.concat([train, test])

# OutcomeSubtype is a detail of the outcome being predicted (see the preview
# of the training data: 'Suffering' with Euthanasia, 'Foster' with Adoption),
# so using it as a feature leaks the target into the model.
# NOTE(review): test.csv is presumed not to carry this column, which would
# leave it all-NaN for the test rows -- confirm against the file.
combined = combined.drop(['OutcomeSubtype'], axis=1)

toTransform = ['Breed', 'Color', 'SexuponOutcome']
for f in toTransform:
    lbl = preprocessing.LabelEncoder()
    # astype(str) turns a missing value into the literal category 'nan', so
    # the encoder never has to sort a mix of strings and float NaN.
    combined[f] = lbl.fit_transform(combined[f].astype(str).values)

# Split back by row position. The copies keep the later column assignments
# from writing into a slice of `combined` (SettingWithCopyWarning).
test = combined.iloc[shapeTrain:shapeTrain + shapeTest].copy()
train = combined.iloc[:shapeTrain].copy()

# This encoder is kept in `lbl`: it maps the integer classes back to the
# outcome names.
lbl = preprocessing.LabelEncoder()
train['OutcomeType'] = lbl.fit_transform(list(train['OutcomeType'].values))

#set target columns
target = train['OutcomeType']
train = train.drop(['OutcomeType'], axis=1)
test = test.drop(['OutcomeType'], axis=1)

In [9]:
# target labels, seems to be correct
# Decodes the class codes 0-4 back to outcome names; the recorded output lists
# them as Adoption, Died, Euthanasia, Return_to_owner, Transfer.
# print(...) with parentheses runs unchanged on Python 2 and Python 3.
print(lbl.inverse_transform([0, 1, 2, 3, 4]))


['Adoption' 'Died' 'Euthanasia' 'Return_to_owner' 'Transfer']

In [10]:
#pd.get_dummies(train['OutcomeSubtype']).head()

In [11]:
# train = train.append(test)

# train = pd.concat([train, pd.get_dummies(train['SexuponOutcome'])], axis=1, join_axes=[train.index])
# train = train.drop(['Unknown', 'SexuponOutcome'],axis=1)

# test = train[shapeTrain:shapeTrain+shapeTest]
# train = train[0:shapeTrain]

In [12]:
# train = train.append(test)

# train = pd.concat([train, pd.get_dummies(train['OutcomeSubtype'])], axis=1, join_axes=[train.index])
# train = train.drop(['Suffering', 'OutcomeSubtype'],axis=1)

# test = train[shapeTrain:shapeTrain + shapeTest]
# train = train[0:shapeTrain]

In [13]:
# train = train.append(test)

# train = pd.concat([train, pd.get_dummies(train['OutcomeType'])], axis=1, join_axes=[train.index])
# train = train.drop(['Unknown', 'OutcomeType'],axis=1)

# test = train[shapeTrain:shapeTrain + shapeTest]
# train = train[0:shapeTrain]

In [14]:
# converting   AgeuponOutcome (1 years) to days 
def age_to_days(item):
    """Convert age descriptions such as '3 weeks' or '2 years' into days.

    Parameters
    ----------
    item : str, float, or sequence
        * a single string like '1 year' or '5 months';
        * a float scalar (in practice the NaN that stands for a missing
          age), which is treated as '1 day';
        * a sequence of such strings; elements that are not strings
          (e.g. NaN) contribute 0.

        Note the asymmetry kept from the original behaviour: a missing
        scalar yields 1 day, a missing element inside a sequence yields 0.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per input element (length 1 for a scalar).
        A month counts as 30 days and a year as 365. Strings naming no
        known unit give 0.

    Raises
    ------
    ValueError
        If a string names a known unit but does not start with an integer.
    """
    # Checked in this order; a later match overrides an earlier one.
    days_per_unit = (('day', 1), ('week', 7), ('month', 30), ('year', 365))

    # isinstance (rather than an exact type check) also accepts str/float
    # subclasses such as numpy.str_ and numpy.float64.
    if isinstance(item, str):
        item = [item]
    elif isinstance(item, float):
        item = ['1 day']

    ages_in_days = np.zeros(len(item))
    for i, entry in enumerate(item):
        if not isinstance(entry, str):
            # not a string but a nan: leave the 0 already in place
            continue
        for unit, factor in days_per_unit:
            if unit in entry:
                ages_in_days[i] = int(entry.split(' ')[0]) * factor
    return ages_in_days

# age_to_days returns a one-element array for a single value; [0] unwraps it.
for frame in (train, test):
    frame['AgeuponOutcome'] = frame['AgeuponOutcome'].map(lambda age: age_to_days(age)[0])

In [ ]:
#splitting
# 15% of the training rows are held out; they are only used as the
# evaluation set that drives early stopping below.
X_fit, X_eval, y_fit, y_eval = train_test_split(
    train, target, test_size=0.15, random_state=1
)
#training model
# np.nan is the canonical lowercase spelling provided by every NumPy version;
# the upper-case alias used before is not guaranteed in newer releases.
clf = xgb.XGBClassifier(max_depth=4, missing=np.nan,
                        n_estimators=500, learning_rate=0.05,
                        subsample=1, colsample_bytree=0.9, seed=2100, objective='multi:softprob')

# Training stops early once mlogloss on the held-out rows has not improved
# for 35 consecutive rounds.
clf.fit(X_fit, y_fit, early_stopping_rounds=35, eval_metric="mlogloss", eval_set=[(X_eval, y_eval)])

In [17]:
#printing result into file
# Class probabilities for the test rows; the columns follow the encoded class
# order: Adoption	Died	Euthanasia	Return_to_owner	Transfer
probabilities = clf.predict_proba(test)

outcome_columns = ["Adoption", "Died", "Euthanasia", "Return_to_owner", "Transfer"]
submission = pd.DataFrame(probabilities[:, :5], columns=outcome_columns)
submission['ID'] = id_test

# ID goes first, followed by the five outcome columns.
submission = submission[['ID'] + outcome_columns]
submission.to_csv("submission_cats.csv", index=False)