In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head(5)
Out[2]:
In [3]:
id_test = test['ID'].values # saving the id column for later use
test = test.drop(['ID'],axis=1)
In [4]:
train = train.drop(['AnimalID'],axis=1)
In [5]:
# split the DateTime column into numeric Year/Month/Day/Hour/Minute columns
def fix_date_time(df):
    def extract_field(_df, start, stop):
        # DateTime strings are 'YYYY-MM-DD HH:MM:SS', so fixed slices work
        return _df['DateTime'].map(lambda dt: int(dt[start:stop]))
    df['Year'] = extract_field(df, 0, 4)
    df['Month'] = extract_field(df, 5, 7)
    df['Day'] = extract_field(df, 8, 10)
    df['Hour'] = extract_field(df, 11, 13)
    df['Minute'] = extract_field(df, 14, 16)
    return df.drop(['DateTime'], axis=1)

train = fix_date_time(train)
test = fix_date_time(test)
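A quick sanity check of the slicing above on a toy frame (the sample timestamp here is made up, not taken from the data; the real DateTime column is assumed to follow the same 'YYYY-MM-DD HH:MM:SS' format):

import pandas as pd

demo = pd.DataFrame({'DateTime': ['2014-02-12 18:22:00']})
demo = fix_date_time(demo)
print(demo)  # expected: Year=2014, Month=2, Day=12, Hour=18, Minute=22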
In [6]:
# add a column with a Name frequency count
names = pd.concat([test['Name'], train['Name']])
values = dict(names.value_counts())
train['_NameFreq'] = train['Name'].map(values)
test['_NameFreq'] = test['Name'].map(values)
train['_NameFreq'] = train['_NameFreq'].fillna(-1)
test['_NameFreq'] = test['_NameFreq'].fillna(-1)
print(train.head())
print(train.info())
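The frequency feature is just a value_counts lookup mapped back onto each row; a minimal sketch of the same pattern with made-up names (not from the data):

import pandas as pd

toy = pd.Series(['Max', 'Bella', 'Max', None])
counts = dict(toy.value_counts())
print(toy.map(counts).fillna(-1).tolist())  # [2.0, 1.0, 2.0, -1.0]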
In [7]:
# Name: 1 if present, 0 if missing (x == x is False only for NaN). AnimalType: Dog -> 1, Cat -> 0
train['Name'] = train['Name'].apply(lambda x: 1 if x == x else 0)
train['AnimalType'] = train['AnimalType'].apply(lambda x: 1 if x == 'Dog' else 0)
test['Name'] = test['Name'].apply(lambda x: 1 if x == x else 0)
test['AnimalType'] = test['AnimalType'].apply(lambda x: 1 if x == 'Dog' else 0)
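The x == x test works because NaN is the only value that is not equal to itself, so the lambda returns 0 exactly for missing names; a minimal illustration (the name string is a made-up example):

import numpy as np

print(np.nan == np.nan)    # False -> missing name becomes 0
print('Bella' == 'Bella')  # True  -> present name becomes 1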
In [8]:
# label-encode the remaining object (string) columns
from sklearn import preprocessing

shapeTrain = train.shape[0]
shapeTest = test.shape[0]
train = pd.concat([train, test])  # encode on the combined frame so train and test share one label mapping
toTransform = ['Breed', 'Color', 'SexuponOutcome', 'OutcomeSubtype']
for f in toTransform:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[f].values))
    train[f] = lbl.transform(list(train[f].values))
test = train[shapeTrain:shapeTrain + shapeTest]
train = train[0:shapeTrain]

# encode the target as integer class labels
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['OutcomeType'].values))
train['OutcomeType'] = lbl.transform(list(train['OutcomeType'].values))

# set the target column and drop it from the features
target = train['OutcomeType']
train = train.drop(['OutcomeType'], axis=1)
test = test.drop(['OutcomeType'], axis=1)
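Fitting the encoder on the combined frame means a category that appears only in the test rows still gets an integer code; a small sketch of the same idea with made-up breed strings (not from the data):

from sklearn import preprocessing

enc = preprocessing.LabelEncoder()
enc.fit(['Beagle', 'Poodle', 'Beagle', 'Husky'])  # combined train + test values
print(enc.transform(['Poodle', 'Husky']))         # [2 1] (classes are sorted alphabetically)
print(enc.inverse_transform([0, 1, 2]))           # ['Beagle' 'Husky' 'Poodle']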
In [9]:
# check the target label mapping; the order looks correct
print(lbl.inverse_transform([0, 1, 2, 3, 4]))
In [10]:
#pd.get_dummies(train['OutcomeSubtype']).head()
In [11]:
# train = train.append(test)
# train = pd.concat([train, pd.get_dummies(train['SexuponOutcome'])], axis=1, join_axes=[train.index])
# train = train.drop(['Unknown', 'SexuponOutcome'],axis=1)
# test = train[shapeTrain:shapeTrain+shapeTest]
# train = train[0:shapeTrain]
In [12]:
# train = train.append(test)
# train = pd.concat([train, pd.get_dummies(train['OutcomeSubtype'])], axis=1, join_axes=[train.index])
# train = train.drop(['Suffering', 'OutcomeSubtype'],axis=1)
# test = train[shapeTrain:shapeTrain + shapeTest]
# train = train[0:shapeTrain]
In [13]:
# train = train.append(test)
# train = pd.concat([train, pd.get_dummies(train['OutcomeType'])], axis=1, join_axes=[train.index])
# train = train.drop(['Unknown', 'OutcomeType'],axis=1)
# test = train[shapeTrain:shapeTrain + shapeTest]
# train = train[0:shapeTrain]
In [14]:
# convert AgeuponOutcome strings (e.g. '1 year', '3 weeks') to a number of days
def age_to_days(item):
    # wrap a single string in a list
    if type(item) is str:
        item = [item]
    # a missing age comes through as a float NaN; treat it as '1 day'
    if type(item) is float:
        item = '1 day'
        item = [item]
    ages_in_days = np.zeros(len(item))
    for i in range(len(item)):
        # only parse string entries
        if type(item[i]) is str:
            if 'day' in item[i]:
                ages_in_days[i] = int(item[i].split(' ')[0])
            if 'week' in item[i]:
                ages_in_days[i] = int(item[i].split(' ')[0]) * 7
            if 'month' in item[i]:
                ages_in_days[i] = int(item[i].split(' ')[0]) * 30
            if 'year' in item[i]:
                ages_in_days[i] = int(item[i].split(' ')[0]) * 365
        else:
            # item[i] is not a string but a NaN
            ages_in_days[i] = 0
    return ages_in_days

train['AgeuponOutcome'] = train['AgeuponOutcome'].map(lambda x: age_to_days(x)[0])
test['AgeuponOutcome'] = test['AgeuponOutcome'].map(lambda x: age_to_days(x)[0])
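A spot check of the conversion (the age strings follow the '<number> <unit>' pattern used in the AgeuponOutcome column):

print(age_to_days('1 year')[0])      # 365.0
print(age_to_days('3 weeks')[0])     # 21.0
print(age_to_days('2 months')[0])    # 60.0
print(age_to_days(float('nan'))[0])  # 1.0 (missing ages are treated as '1 day')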
In [ ]:
# split off a hold-out set for validation / early stopping
X_fit, X_eval, y_fit, y_eval = train_test_split(
    train, target, test_size=0.15, random_state=1
)

# train the model
clf = xgb.XGBClassifier(max_depth=4, missing=np.nan,
                        n_estimators=500, learning_rate=0.05,
                        subsample=1, colsample_bytree=0.9,
                        seed=2100, objective='multi:softprob')
clf.fit(X_fit, y_fit, early_stopping_rounds=35, eval_metric="mlogloss",
        eval_set=[(X_eval, y_eval)])
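To see the hold-out score outside of xgboost's own logging, the same multiclass log loss can be recomputed with sklearn (a quick check, run after the fit above):

from sklearn.metrics import log_loss

eval_pred = clf.predict_proba(X_eval)
print("hold-out mlogloss:", log_loss(y_eval, eval_pred))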
In [17]:
# write the class probabilities to a submission file
k = clf.predict_proba(test)
# column order follows the encoded labels: Adoption, Died, Euthanasia, Return_to_owner, Transfer
submission = pd.DataFrame({"ID": id_test,
                           "Adoption": k[:, 0], "Died": k[:, 1], "Euthanasia": k[:, 2],
                           "Return_to_owner": k[:, 3], "Transfer": k[:, 4]})
cols = ['ID', "Adoption", "Died", "Euthanasia", "Return_to_owner", "Transfer"]
submission = submission[cols]
submission.to_csv("submission_cats.csv", index=False)
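A final sanity check before uploading: there should be one row per test ID, and each probability row should sum to roughly 1:

print(submission.shape)  # (number of test rows, 6)
print(submission.drop('ID', axis=1).sum(axis=1).round(3).unique())  # expected: approximately [1.]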