Shelter Animal Outcomes 2

Data cleaning & preprocessing


In [1]:
import pandas as pd
import numpy as np

In [17]:
df_train = pd.read_csv('../train.csv')
df_test  = pd.read_csv('../test.csv')
df_dangerousDogs = pd.read_csv('../dangerousDogs.csv')
df_dogBreedGroup = pd.read_csv('../dogbreeds.csv', names=['BreedName', 'Group'])

In [2]:
def getAncestry(data, pos):
    try:
        ancestors = data.split('/')
        if (pos == 1):
            return ancestors[0]
        else:
            return ancestors[1]
    except:
        if (pos == 1):
            return data
        else:
            return np.nan

In [4]:
def coatLength(breed):
    if "Shorthair" in breed:
        return 1
    elif "Medium Hair" in breed:
        return 2
    elif "Longhair" in breed:
        return 3
    else :
        return 0

In [9]:
def dangerousDogBreed(breed):
    if ((df_dangerousDogs['Dangerous_Breed'] == getAncestry(breed, 1)).any() or
        (df_dangerousDogs['Dangerous_Breed'] == getAncestry(breed, 2)).any()):
        return 1
    else:
        return 0

In [12]:
def getAncestorGroup(breed):
    groupArray = df_dogBreedGroup['Group'].values[df_dogBreedGroup['BreedName'].values == breed]
    if groupArray.size:
        return groupArray[0].strip()
    else:
        return np.nan

In [18]:
def munge(data, train):
    data['NameLength'] = data['Name'].str.len()
    data['NameLength'].fillna(0, inplace=True)
    gender = {'Neutered Male':1, 'Spayed Female':2, 'Intact Male':3, 'Intact Female':4, 'Unknown':5, np.nan:0}
    data['Gender'] = data['SexuponOutcome'].map(gender)
    data['IsCat'] = data['AnimalType'].map({'Cat':1, 'Dog':0})
    
    #Everything related to Breed
    data['IsMix'] = data['Breed'].str.contains("/|Mix", na=False).map({True: 1, False: 0})
    data['Breed'] = data['Breed'].map( lambda x: x.replace(' Mix', '').replace(' Shorthair', '').replace(' Longhair', '').strip())
    data['CoatType'] = data['Breed'].map(coatLength)
    data['IsDangerous'] = data['Breed'].map(dangerousDogBreed)
    data['Ancestor1'] = data['Breed'].map( lambda x: getAncestry(x, 1))
    data['Ancestor2'] = data['Breed'].map( lambda x: getAncestry(x, 2))
    group = {np.nan: 0, 'Herding':1, 'Non-Sporting':2, 'Terrier':3, 'Toy':4, 'Working':5, 
             'Sporting':6, 'Hound':7, 'Terrier & Toy':8}
    data['Ancestor1_group'] = data['Ancestor1'].map(getAncestorGroup).map(group)
    data['Ancestor2_group'] = data['Ancestor2'].map(getAncestorGroup).map(group)
    
    if (train):
        data['Outcome'] = data['OutcomeType'].map({'Adoption':1, 'Died':2, 'Euthanasia':3, 'Return_to_owner':4, 'Transfer':5})
        data = data.drop('OutcomeSubtype', 1)
        data = data.drop('OutcomeType', 1)
        data = data.drop('AnimalID', 1)
                         
    # dropping breed for now 
    return data.drop(['SexuponOutcome', 'AnimalType','AgeuponOutcome', 'Name', 'Breed', 'DateTime', 'Color', 'Ancestor1', 'Ancestor2'], 1)

In [19]:
out_train = munge(df_train, True)
out_train.head()


Out[19]:
NameLength Gender IsCat IsMix CoatType IsDangerous Ancestor1_group Ancestor2_group Outcome
0 7.0 1 0 1 0 0 1 0 4
1 5.0 2 1 1 0 0 0 0 3
2 6.0 1 0 1 0 1 0 0 1
3 0.0 3 1 1 0 0 0 0 5
4 0.0 1 0 1 0 0 2 0 5

In [20]:
out_test = munge(df_test, False)
out_test.head()


Out[20]:
ID NameLength Gender IsCat IsMix CoatType IsDangerous Ancestor1_group Ancestor2_group
0 1 6.0 4 0 1 0 0 6 0
1 2 8.0 2 0 1 0 1 1 5
2 3 3.0 1 1 1 0 0 0 0
3 4 5.0 3 0 1 0 0 0 0
4 5 7.0 1 0 1 0 0 0 0

In [21]:
out_test.to_csv('../Shelter_test.csv', index= False)

In [22]:
out_train.to_csv('../Shelter_train.csv', index= False)