In [53]:
import pandas as pd
import numpy as np
import pylab as pl

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Stand-alone estimators built with library defaults; no cell shown in this
# notebook reads these two names.
svm_clf = SVC()
neighbors_clf = KNeighborsClassifier()

# (display name, estimator) pairs in the shape comp_clfs iterates over.
# These are fresh instances, separate from the two objects above.
clfs = [
    ("svc", SVC()),
    ("KNN", KNeighborsClassifier()),
]

def comp_clfs(clfs, data, data_labels):
    """Fit every classifier on ``data`` and print its score on that same data.

    Parameters
    ----------
    clfs : iterable of (name, estimator) pairs
        Each estimator must provide ``fit(X, y)`` and ``score(X, y)``.
    data : features only (the notebook passes a DataFrame); the labels are
        supplied separately.
    data_labels : true labels, one per row of ``data``.

    Returns
    -------
    None. For each classifier one line ``"<name> <score>"`` is printed,
    followed by a line of 80 asterisks.

    Note: the score is measured on the rows used for fitting, so it is a
    training score, not an estimate of performance on unseen data.
    """
    for name, clf in clfs:
        clf.fit(data, data_labels)
        # Score against the true labels. Passing clf.predict(data) as the
        # reference would compare the model with itself and say nothing
        # about how well it fits the data.
        score = clf.score(data, data_labels)
        # Single-argument print calls behave the same on Python 2 and 3.
        print("%s %s" % (name, score))
        print("*" * 80)

In [82]:
# Location of the labelled building-violations extract (absolute path on the
# original author's machine).
TRAIN_CSV_PATH = "/home/keer/DSSG/data-challenges/BuildingInspections/data/Building_Violations_50000_features_and_labels.csv"
train = pd.read_csv(TRAIN_CSV_PATH)

In [83]:
# Parse the violation dates into datetimes so the frame can be ordered and
# compared by date. (A bare `train.columns` used to precede this line; it was
# not the last expression of the cell, so it never displayed anything.)
train['violation_date'] = pd.to_datetime(train['violation_date'])

In [84]:
# Order rows chronologically by violation date. DataFrame.sort was deprecated
# in pandas 0.17 and later removed; sort_values is its replacement and keeps
# the same defaults (ascending, missing values last).
train = train.sort_values("violation_date")

In [85]:
def roll_window(data, date_col, beginning, ending):
    """Return the rows of ``data`` whose ``date_col`` lies strictly inside a window.

    Parameters
    ----------
    data : DataFrame to filter.
    date_col : name of the column compared against the bounds.
    beginning : exclusive lower bound (rows must satisfy ``> beginning``).
    ending : exclusive upper bound (rows must satisfy ``< ending``).

    Returns
    -------
    DataFrame with the matching rows, in their original order. Rows whose
    ``date_col`` value is missing never match.
    """
    dates = data[date_col]
    # Combine both conditions into one mask built on the unfiltered frame.
    # Chaining two boolean selections applies a full-length mask to an
    # already shortened frame, which relies on implicit index realignment
    # and breaks when the index contains duplicate labels.
    in_window = (dates > beginning) & (dates < ending)
    return data[in_window]

In [86]:
# Show every column of the loaded frame, before any columns are dropped.
train.columns


Out[86]:
Index([u'Unnamed: 0', u'unnamed:_0', u'id', u'violation_last_modified_date', u'violation_date', u'violation_code', u'violation_status', u'violation_status_date', u'violation_description', u'violation_location', u'violation_inspector_comments', u'violation_ordinance', u'inspector_id', u'inspection_number', u'inspection_status', u'inspection_waived', u'inspection_category', u'department_bureau', u'address', u'property_group', u'ssa', u'latitude', u'longitude', u'location', u'log_lat', u'new_lat', u'closed', u'failed', u'hold', u'passed', u'boiler', u'conservation', u'construction_equipment', u'demolition', u'electrical', u'elevator', u'iron', u'new_construction', u'plumbing', u'refrigeration', u'signs', u'special_inspection_program', u'special_task_force', u'ventilation', u'water', u'complied', u'no_entry', u'open'], dtype='object')

In [ ]:


In [89]:
# Columns discarded before modelling. 'passed' is in this list so that
# 'failed' is the only outcome column left to serve as the label.
remove_these = [
    'Unnamed: 0',
    'violation_code',
    'property_group',
    'inspector_id',
    'unnamed:_0',
    'inspection_status',
    'violation_description',
    'violation_inspector_comments',
    'address',
    'ssa',
    'location',
    'department_bureau',
    'violation_status',
    'passed',
    'violation_status_date',
    'violation_location',
    'violation_ordinance',
    'inspection_number',
]

# Status flag columns used as a row filter: a row is kept only when every one
# of these flags equals 0. The columns themselves stay in the frame.
remove_these_entries = ['closed', 'hold']
train = train[(train[remove_these_entries] == 0).all(axis=1)]

In [90]:
# Remove the unwanted columns (axis=1 means the labels name columns, not rows)
# and preview the result. Running this cell a second time fails, because the
# labels no longer exist after the first drop.
train = train.drop(labels=remove_these, axis=1)
train.head()


Out[90]:
id violation_last_modified_date violation_date inspection_waived inspection_category latitude longitude log_lat new_lat closed ... plumbing refrigeration signs special_inspection_program special_task_force ventilation water complied no_entry open
44796 1834762 01/12/2007 2006-01-01 N PERIODIC 41.883310 -87.735709 3.734887 42.512678 0 ... 0 0 0 0 0 0 0 1 0 0
44787 1439036 04/16/2009 2006-01-01 N PERIODIC 41.782263 -87.610447 3.732472 42.432114 0 ... 0 0 0 0 0 0 0 1 0 0
44788 1474102 11/12/2009 2006-01-01 N PERIODIC 41.802877 -87.609908 3.732965 42.796032 0 ... 0 0 0 0 0 0 0 1 0 0
44789 1473213 10/25/2007 2006-01-01 N PERIODIC 41.778666 -87.615737 3.732386 42.716735 0 ... 0 0 0 0 0 0 0 1 0 0
44790 1474071 11/12/2009 2006-01-01 N PERIODIC 41.802877 -87.609908 3.732965 41.804273 0 ... 0 0 0 0 0 0 0 1 0 0

5 rows × 30 columns


In [91]:
# Show the columns that remain after the drop above.
train.columns


Out[91]:
Index([u'id', u'violation_last_modified_date', u'violation_date', u'inspection_waived', u'inspection_category', u'latitude', u'longitude', u'log_lat', u'new_lat', u'closed', u'failed', u'hold', u'boiler', u'conservation', u'construction_equipment', u'demolition', u'electrical', u'elevator', u'iron', u'new_construction', u'plumbing', u'refrigeration', u'signs', u'special_inspection_program', u'special_task_force', u'ventilation', u'water', u'complied', u'no_entry', u'open'], dtype='object')

In [93]:
# Convert the last-modified dates from text (shown as MM/DD/YYYY in the
# preview above) into datetimes.
train['violation_last_modified_date'] = pd.to_datetime(train['violation_last_modified_date'])

# Categorical columns that get expanded into one indicator column per value.
dummy_cols = ['inspection_waived', 'inspection_category']
def make_dummies(data, cats):
    """Replace each categorical column in ``cats`` with indicator columns.

    Parameters
    ----------
    data : DataFrame containing the columns named in ``cats``. It is not
        modified; a new frame is returned.
    cats : list of column names to expand.

    Returns
    -------
    DataFrame holding the remaining original columns in their original order,
    followed by the indicator columns for each entry of ``cats`` in turn.
    Indicator columns are named after the raw category value (no prefix is
    added), so values shared between two categorical columns produce
    duplicate column names.
    """
    # Encode every requested column from the untouched input first...
    indicator_frames = [pd.get_dummies(data[col]) for col in cats]
    # ...then strip the originals once and append all encodings in one concat.
    remaining = data.drop(list(cats), axis=1)
    return pd.concat([remaining] + indicator_frames, axis=1)
# Build the encoded frame from the cleaned data and preview its first rows.
try_dumm = make_dummies(data=train, cats=dummy_cols)
try_dumm.head()


Out[93]:
id violation_last_modified_date violation_date latitude longitude log_lat new_lat closed failed hold ... ventilation water complied no_entry open N COMPLAINT PERIODIC PERMIT REGISTRATION
44796 1834762 2007-01-12 2006-01-01 41.883310 -87.735709 3.734887 42.512678 0 1 0 ... 0 0 1 0 0 1 0 1 0 0
44787 1439036 2009-04-16 2006-01-01 41.782263 -87.610447 3.732472 42.432114 0 1 0 ... 0 0 1 0 0 1 0 1 0 0
44788 1474102 2009-11-12 2006-01-01 41.802877 -87.609908 3.732965 42.796032 0 1 0 ... 0 0 1 0 0 1 0 1 0 0
44789 1473213 2007-10-25 2006-01-01 41.778666 -87.615737 3.732386 42.716735 0 0 0 ... 0 0 1 0 0 1 0 1 0 0
44790 1474071 2009-11-12 2006-01-01 41.802877 -87.609908 3.732965 41.804273 0 1 0 ... 0 0 1 0 0 1 0 1 0 0

5 rows × 33 columns


In [112]:
# Re-display of the encoded frame's first rows; the output matches the preview
# already shown when try_dumm was created.
try_dumm.head()


Out[112]:
id violation_last_modified_date violation_date latitude longitude log_lat new_lat closed failed hold ... ventilation water complied no_entry open N COMPLAINT PERIODIC PERMIT REGISTRATION
44796 1834762 2007-01-12 2006-01-01 41.883310 -87.735709 3.734887 42.512678 0 1 0 ... 0 0 1 0 0 1 0 1 0 0
44787 1439036 2009-04-16 2006-01-01 41.782263 -87.610447 3.732472 42.432114 0 1 0 ... 0 0 1 0 0 1 0 1 0 0
44788 1474102 2009-11-12 2006-01-01 41.802877 -87.609908 3.732965 42.796032 0 1 0 ... 0 0 1 0 0 1 0 1 0 0
44789 1473213 2007-10-25 2006-01-01 41.778666 -87.615737 3.732386 42.716735 0 0 0 ... 0 0 1 0 0 1 0 1 0 0
44790 1474071 2009-11-12 2006-01-01 41.802877 -87.609908 3.732965 41.804273 0 1 0 ... 0 0 1 0 0 1 0 1 0 0

5 rows × 33 columns


In [ ]:
# Columns kept out of the feature matrix: the label ('failed') and the two
# datetime columns.
drop_these = ['failed', 'violation_date', 'violation_last_modified_date']

# Keep only rows whose longitude is a finite number (drops NaN and +/-inf).
# No other column is checked for missing values here.
has_finite_longitude = np.isfinite(try_dumm['longitude'])
tester = try_dumm[has_finite_longitude]

In [ ]:
# Features are every column of `tester` except those in drop_these; the label
# is the 'failed' column. comp_clfs fits and scores on these same rows, so the
# printed numbers describe fit to the training data, not held-out accuracy.
features = tester.drop(drop_these, axis=1)
labels = tester['failed']
comp_clfs(clfs, features, labels)