notebook.community

Edit and run



In [53]:

    
import pandas as pd
import numpy as np
import pylab as pl

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Stand-alone estimator instances, each built with its default settings.
svm_clf = SVC()
neighbors_clf = KNeighborsClassifier()

# (display name, estimator) pairs in the form comp_clfs iterates over.
clfs = [("svc", SVC()), ("KNN", KNeighborsClassifier())]

def comp_clfs(clfs, data, data_labels):
    """Fit each classifier on ``data`` and print its accuracy on that same data.

    Parameters
    ----------
    clfs : iterable of (name, estimator) pairs
        Every estimator must provide ``fit(X, y)`` and ``score(X, y)``.
    data : feature matrix (for example a DataFrame) without the label column.
    data_labels : true labels, one per row of ``data``.

    Returns
    -------
    None. For every classifier one ``"<name> <score>"`` line is printed,
    followed by a separator line of 80 asterisks.

    Notes
    -----
    The score is computed on the data the estimator was just fitted on, so
    it is a training score, not an estimate of out-of-sample performance.
    """
    for name, clf in clfs:
        clf.fit(data, data_labels)
        # Score against the TRUE labels. Scoring the predictions against
        # themselves (the previous behaviour) always yields a perfect score.
        score = clf.score(data, data_labels)
        # Single pre-formatted argument: prints the same text on Python 2 and 3.
        print("%s %s" % (name, score))
        print("*" * 80)



In [82]:

    
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
train_csv_path = "/home/keer/DSSG/data-challenges/BuildingInspections/data/Building_Violations_50000_features_and_labels.csv"
train = pd.read_csv(train_csv_path)



In [83]:

    
# Parse violation_date into datetime values so rows can be ordered and
# compared chronologically.
train['violation_date'] = pd.to_datetime(train['violation_date'])



In [84]:

    
# Order the rows chronologically by violation date.
# DataFrame.sort was deprecated and later removed from pandas; sort_values is
# its replacement. The fallback keeps the cell working on old installations
# that predate sort_values.
if hasattr(train, "sort_values"):
    train = train.sort_values("violation_date")
else:
    train = train.sort("violation_date")



In [85]:

    
def roll_window(data, date_col, beginning, ending):
    """Return the rows of ``data`` whose ``date_col`` lies strictly inside a window.

    Parameters
    ----------
    data : DataFrame to filter.
    date_col : label of the column compared against the bounds.
    beginning : exclusive lower bound.
    ending : exclusive upper bound.

    Returns
    -------
    The rows with ``beginning < data[date_col] < ending``, in their original
    order and with their original index labels.
    """
    dates = data[date_col]
    # One mask built over the full frame. Chaining two [] lookups applied a
    # full-length mask to an already filtered frame, which makes pandas warn
    # that the boolean key is being reindexed.
    in_window = (dates > beginning) & (dates < ending)
    return data[in_window]



In [86]:

    
# Display the column labels currently present in train.
train.columns









    Out[86]:





Index([u'Unnamed: 0', u'unnamed:_0', u'id', u'violation_last_modified_date', u'violation_date', u'violation_code', u'violation_status', u'violation_status_date', u'violation_description', u'violation_location', u'violation_inspector_comments', u'violation_ordinance', u'inspector_id', u'inspection_number', u'inspection_status', u'inspection_waived', u'inspection_category', u'department_bureau', u'address', u'property_group', u'ssa', u'latitude', u'longitude', u'location', u'log_lat', u'new_lat', u'closed', u'failed', u'hold', u'passed', u'boiler', u'conservation', u'construction_equipment', u'demolition', u'electrical', u'elevator', u'iron', u'new_construction', u'plumbing', u'refrigeration', u'signs', u'special_inspection_program', u'special_task_force', u'ventilation', u'water', u'complied', u'no_entry', u'open'], dtype='object')



In [ ]:



In [89]:

    
# Columns dropped from the frame in a later cell. 'passed' is on this list so
# that 'failed' is the only outcome column left to act as the label.
remove_these = [
    'Unnamed: 0', 'violation_code', 'property_group', 'inspector_id',
    'unnamed:_0', 'inspection_status', 'violation_description',
    'violation_inspector_comments', 'address', 'ssa', 'location',
    'department_bureau', 'violation_status', 'passed',
    'violation_status_date', 'violation_location', 'violation_ordinance',
    'inspection_number',
]
# Indicator columns whose flagged rows are discarded.
remove_these_entries = ['closed', 'hold']
# Keep only the rows in which every one of those indicators equals 0.
train = train[(train[remove_these_entries] == 0).all(axis=1)]



In [90]:

    
# Remove the columns named in remove_these (axis=1 selects columns), then
# preview the first five rows of what is left.
train = train.drop(labels=remove_these, axis=1)
train.head(5)









    Out[90]:






  
    
      
      id
      violation_last_modified_date
      violation_date
      inspection_waived
      inspection_category
      latitude
      longitude
      log_lat
      new_lat
      closed
      ...
      plumbing
      refrigeration
      signs
      special_inspection_program
      special_task_force
      ventilation
      water
      complied
      no_entry
      open
    
  
  
    
      44796
       1834762
       01/12/2007
      2006-01-01
       N
       PERIODIC
       41.883310
      -87.735709
       3.734887
       42.512678
       0
      ...
       0
       0
       0
       0
       0
       0
       0
       1
       0
       0
    
    
      44787
       1439036
       04/16/2009
      2006-01-01
       N
       PERIODIC
       41.782263
      -87.610447
       3.732472
       42.432114
       0
      ...
       0
       0
       0
       0
       0
       0
       0
       1
       0
       0
    
    
      44788
       1474102
       11/12/2009
      2006-01-01
       N
       PERIODIC
       41.802877
      -87.609908
       3.732965
       42.796032
       0
      ...
       0
       0
       0
       0
       0
       0
       0
       1
       0
       0
    
    
      44789
       1473213
       10/25/2007
      2006-01-01
       N
       PERIODIC
       41.778666
      -87.615737
       3.732386
       42.716735
       0
      ...
       0
       0
       0
       0
       0
       0
       0
       1
       0
       0
    
    
      44790
       1474071
       11/12/2009
      2006-01-01
       N
       PERIODIC
       41.802877
      -87.609908
       3.732965
       41.804273
       0
      ...
       0
       0
       0
       0
       0
       0
       0
       1
       0
       0
    
  

5 rows × 30 columns



In [91]:

    
# Display the column labels that remain in train after the drop.
train.columns









    Out[91]:





Index([u'id', u'violation_last_modified_date', u'violation_date', u'inspection_waived', u'inspection_category', u'latitude', u'longitude', u'log_lat', u'new_lat', u'closed', u'failed', u'hold', u'boiler', u'conservation', u'construction_equipment', u'demolition', u'electrical', u'elevator', u'iron', u'new_construction', u'plumbing', u'refrigeration', u'signs', u'special_inspection_program', u'special_task_force', u'ventilation', u'water', u'complied', u'no_entry', u'open'], dtype='object')



In [93]:

    
# Parse the last-modified date strings into datetime values.
train['violation_last_modified_date'] = pd.to_datetime(train['violation_last_modified_date'])
# Categorical columns that get expanded into indicator columns.
dummy_cols = ['inspection_waived', 'inspection_category']
def make_dummies(data, cats, prefix=False):
    """Replace categorical columns with one indicator column per category.

    Parameters
    ----------
    data : DataFrame holding the columns named in ``cats``.
    cats : iterable of column labels to expand.
    prefix : bool, default False
        When False each indicator column is named after the bare category
        value. When True the name is ``"<column>_<value>"``, which keeps the
        new columns distinct when two source columns share a category value.

    Returns
    -------
    A new DataFrame in which every column in ``cats`` has been removed and
    its indicator columns appended on the right, in the order of ``cats``.
    ``data`` itself is not modified; if ``cats`` is empty it is returned as is.
    """
    for cat in cats:
        dummies = pd.get_dummies(data[cat], prefix=cat if prefix else None)
        # Rebinds the local name only; the caller's frame is left intact.
        data = pd.concat([data.drop(cat, axis=1), dummies], axis=1)
    return data
# Expand the categorical columns listed in dummy_cols, then preview the
# first five rows of the result.
try_dumm = make_dummies(data=train, cats=dummy_cols)
try_dumm.head(5)









    Out[93]:






  
    
      
      id
      violation_last_modified_date
      violation_date
      latitude
      longitude
      log_lat
      new_lat
      closed
      failed
      hold
      ...
      ventilation
      water
      complied
      no_entry
      open
      N
      COMPLAINT
      PERIODIC
      PERMIT
      REGISTRATION
    
  
  
    
      44796
       1834762
      2007-01-12
      2006-01-01
       41.883310
      -87.735709
       3.734887
       42.512678
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44787
       1439036
      2009-04-16
      2006-01-01
       41.782263
      -87.610447
       3.732472
       42.432114
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44788
       1474102
      2009-11-12
      2006-01-01
       41.802877
      -87.609908
       3.732965
       42.796032
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44789
       1473213
      2007-10-25
      2006-01-01
       41.778666
      -87.615737
       3.732386
       42.716735
       0
       0
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44790
       1474071
      2009-11-12
      2006-01-01
       41.802877
      -87.609908
       3.732965
       41.804273
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
  

5 rows × 33 columns



In [112]:

    
# Preview the first rows of the dummy-encoded frame.
try_dumm.head()









    Out[112]:






  
    
      
      id
      violation_last_modified_date
      violation_date
      latitude
      longitude
      log_lat
      new_lat
      closed
      failed
      hold
      ...
      ventilation
      water
      complied
      no_entry
      open
      N
      COMPLAINT
      PERIODIC
      PERMIT
      REGISTRATION
    
  
  
    
      44796
       1834762
      2007-01-12
      2006-01-01
       41.883310
      -87.735709
       3.734887
       42.512678
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44787
       1439036
      2009-04-16
      2006-01-01
       41.782263
      -87.610447
       3.732472
       42.432114
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44788
       1474102
      2009-11-12
      2006-01-01
       41.802877
      -87.609908
       3.732965
       42.796032
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44789
       1473213
      2007-10-25
      2006-01-01
       41.778666
      -87.615737
       3.732386
       42.716735
       0
       0
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
    
      44790
       1474071
      2009-11-12
      2006-01-01
       41.802877
      -87.609908
       3.732965
       41.804273
       0
       1
       0
      ...
       0
       0
       1
       0
       0
       1
       0
       1
       0
       0
    
  

5 rows × 33 columns



In [ ]:

    
#try_dumm.dtypes
#train['inspection_waived']=[float(i) for i in train['inspection_waived'] ]
# Columns kept out of the feature matrix: 'failed' (used as the label) and
# the two datetime columns.
drop_these = ['failed', 'violation_date', 'violation_last_modified_date']
# Keep only the rows whose longitude is a finite number.
# NOTE(review): no other column is checked for missing values here.
tester = try_dumm.loc[np.isfinite(try_dumm['longitude'])]



In [ ]:

    
# Fit and score every classifier in clfs, using all columns except
# drop_these as features and 'failed' as the label.
comp_clfs(clfs, data=tester.drop(drop_these, axis=1), data_labels=tester['failed'])

	id	violation_last_modified_date	violation_date	inspection_waived	inspection_category	latitude	longitude	log_lat	new_lat	...	complied
44796	1834762	01/12/2007	2006-01-01	N	PERIODIC	41.883310	-87.735709	3.734887	42.512678	...	1
44787	1439036	04/16/2009	2006-01-01	N	PERIODIC	41.782263	-87.610447	3.732472	42.432114	...	1
44788	1474102	11/12/2009	2006-01-01	N	PERIODIC	41.802877	-87.609908	3.732965	42.796032	...	1
44789	1473213	10/25/2007	2006-01-01	N	PERIODIC	41.778666	-87.615737	3.732386	42.716735	...	1
44790	1474071	11/12/2009	2006-01-01	N	PERIODIC	41.802877	-87.609908	3.732965	41.804273	...	1

	id	violation_last_modified_date	violation_date	latitude	longitude	log_lat	new_lat	failed	...	complied	N	PERIODIC
44796	1834762	2007-01-12	2006-01-01	41.883310	-87.735709	3.734887	42.512678	1	...	1	1	1
44787	1439036	2009-04-16	2006-01-01	41.782263	-87.610447	3.732472	42.432114	1	...	1	1	1
44788	1474102	2009-11-12	2006-01-01	41.802877	-87.609908	3.732965	42.796032	1	...	1	1	1
44789	1473213	2007-10-25	2006-01-01	41.778666	-87.615737	3.732386	42.716735	0	...	1	1	1
44790	1474071	2009-11-12	2006-01-01	41.802877	-87.609908	3.732965	41.804273	1	...	1	1	1