In [1]:
import pandas as pd
import numpy as np
In [17]:
# The first three CSVs are read with pandas' default header inference,
# so their first line supplies the column names.
df_train, df_test, df_dangerousDogs = (
    pd.read_csv(csv_path)
    for csv_path in ('../train.csv', '../test.csv', '../dangerousDogs.csv')
)
# names= supplies the column labels for dogbreeds.csv, so its first line is
# read as data rather than as a header.
df_dogBreedGroup = pd.read_csv('../dogbreeds.csv', names=['BreedName', 'Group'])
In [2]:
def getAncestry(data, pos):
    """Return one ancestor from a '/'-separated breed string.

    Parameters
    ----------
    data : str or any
        Breed text such as ``"Beagle/Pug"``. Values that cannot be split
        with ``'/'`` (for example a float NaN) are treated as having no
        ancestry information.
    pos : int
        ``1`` selects the first ancestor; any other value selects the second.

    Returns
    -------
    object
        The requested ancestor string. For ``pos == 1`` on an unsplittable
        value, ``data`` itself is returned unchanged. ``np.nan`` is returned
        when a second ancestor is requested but does not exist.
    """
    try:
        ancestors = data.split('/')
    except (AttributeError, TypeError):
        # Not a text value: only these errors mean "nothing to split".
        # Anything else (e.g. KeyboardInterrupt while mapping over a large
        # column) must propagate instead of being silently swallowed.
        return data if pos == 1 else np.nan

    if pos == 1:
        return ancestors[0]
    # A breed without '/' has exactly one part, hence no second ancestor.
    return ancestors[1] if len(ancestors) > 1 else np.nan
In [4]:
def coatLength(breed):
    """Encode the coat length named in a breed string.

    Returns 1 for "Shorthair", 2 for "Medium Hair", 3 for "Longhair" and 0
    when none of these substrings occurs. The markers are checked in that
    order, so the first one present wins.
    """
    coat_markers = ("Shorthair", "Medium Hair", "Longhair")
    for code, marker in enumerate(coat_markers, start=1):
        if marker in breed:
            return code
    return 0
In [9]:
def dangerousDogBreed(breed):
    """Flag a breed whose first or second ancestor is listed as dangerous.

    Each ancestor obtained from ``getAncestry`` is compared for exact
    equality against the ``Dangerous_Breed`` column of the module-level
    ``df_dangerousDogs`` frame. Returns 1 if either ancestor matches any
    row, otherwise 0.
    """
    listed_breeds = df_dangerousDogs['Dangerous_Breed']
    # The generator keeps the second lookup lazy: it only runs when the
    # first ancestor did not match.
    is_listed = any(
        (listed_breeds == getAncestry(breed, position)).any()
        for position in (1, 2)
    )
    return 1 if is_listed else 0
In [12]:
def getAncestorGroup(breed):
    """Look up the group recorded for a breed name.

    Selects the ``Group`` values of the module-level ``df_dogBreedGroup``
    rows whose ``BreedName`` equals ``breed``. Returns the first such value
    with surrounding whitespace stripped, or ``np.nan`` when no row matches.
    """
    name_matches = df_dogBreedGroup['BreedName'].values == breed
    matching_groups = df_dogBreedGroup['Group'].values[name_matches]
    return matching_groups[0].strip() if matching_groups.size else np.nan
In [18]:
def munge(data, train):
    """Build the encoded feature frame from a raw shelter-outcome frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. ``Name``, ``SexuponOutcome``, ``AnimalType`` and
        ``Breed`` are read; ``AgeuponOutcome``, ``DateTime`` and ``Color``
        must also be present because they are dropped. When ``train`` is
        truthy, ``OutcomeType``, ``OutcomeSubtype`` and ``AnimalID`` are
        required as well.
    train : bool
        When truthy, ``OutcomeType`` is encoded into ``Outcome`` and the
        outcome/ID columns are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``NameLength``, ``Gender``,
        ``IsCat``, ``IsMix``, ``CoatType``, ``IsDangerous``,
        ``Ancestor1_group``, ``Ancestor2_group`` (plus ``Outcome`` when
        ``train`` is truthy) and without the raw text columns. The frame
        passed in is not modified.
    """
    # Work on a copy so the caller's frame keeps its raw ``Breed`` text and
    # re-running the cell on the same input yields the same features.
    data = data.copy()

    # Assign the filled result directly; an in-place fillna on a selected
    # column is a chained assignment and does nothing under copy-on-write.
    data['NameLength'] = data['Name'].str.len().fillna(0)

    gender = {'Neutered Male':1, 'Spayed Female':2, 'Intact Male':3, 'Intact Female':4, 'Unknown':5, np.nan:0}
    data['Gender'] = data['SexuponOutcome'].map(gender)
    data['IsCat'] = data['AnimalType'].map({'Cat':1, 'Dog':0})

    # Everything related to Breed
    raw_breed = data['Breed']
    data['IsMix'] = raw_breed.str.contains("/|Mix", na=False).map({True: 1, False: 0})
    # Coat length has to be read from the raw text: the clean-up below
    # removes the very words ("Shorthair", "Longhair") that coatLength
    # looks for.
    data['CoatType'] = raw_breed.map(coatLength)
    # Strip the qualifiers only as whole words; a plain substring replace
    # would turn "German Shorthaired Pointer" into "Germaned Pointer".
    data['Breed'] = (
        raw_breed
        .str.replace(r' (?:Mix|Shorthair|Longhair)\b', '', regex=True)
        .str.strip()
    )
    data['IsDangerous'] = data['Breed'].map(dangerousDogBreed)
    data['Ancestor1'] = data['Breed'].map(lambda x: getAncestry(x, 1))
    data['Ancestor2'] = data['Breed'].map(lambda x: getAncestry(x, 2))

    # np.nan (no group found / no such ancestor) is encoded as 0.
    group = {np.nan: 0, 'Herding':1, 'Non-Sporting':2, 'Terrier':3, 'Toy':4, 'Working':5,
             'Sporting':6, 'Hound':7, 'Terrier & Toy':8}
    data['Ancestor1_group'] = data['Ancestor1'].map(getAncestorGroup).map(group)
    data['Ancestor2_group'] = data['Ancestor2'].map(getAncestorGroup).map(group)

    if train:
        data['Outcome'] = data['OutcomeType'].map({'Adoption':1, 'Died':2, 'Euthanasia':3, 'Return_to_owner':4, 'Transfer':5})
        # columns= instead of a positional axis, which pandas 2.0 removed.
        data = data.drop(columns=['OutcomeSubtype', 'OutcomeType', 'AnimalID'])

    # dropping breed for now
    return data.drop(columns=['SexuponOutcome', 'AnimalType', 'AgeuponOutcome', 'Name',
                              'Breed', 'DateTime', 'Color', 'Ancestor1', 'Ancestor2'])
In [19]:
# Build the training features; train=True also encodes OutcomeType into
# Outcome and drops the outcome/ID columns.
out_train = munge(df_train, train=True)
out_train.head()
Out[19]:
In [20]:
# Same feature pipeline for the test frame; train=False skips the outcome
# encoding and the outcome/ID column drops.
out_test = munge(df_test, train=False)
out_test.head()
Out[20]:
In [21]:
# Write the engineered test features one directory up, without the row index.
out_test.to_csv('../Shelter_test.csv', index= False)
In [22]:
# Write the engineered training features one directory up, without the row index.
out_train.to_csv('../Shelter_train.csv', index= False)