In [53]:
import pandas as pd
import numpy as np
import pylab as pl
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# Stand-alone estimator instances, built with default hyper-parameters.
svm_clf = SVC()
neighbors_clf = KNeighborsClassifier()

# (label, estimator) pairs to compare. Each entry is a fresh instance of its
# own, separate from svm_clf / neighbors_clf above.
clfs = [
    ("svc", SVC()),
    ("KNN", KNeighborsClassifier()),
]
def comp_clfs(clfs, data, data_labels):
    """Fit each classifier and print its score on the data it was fitted on.

    Parameters
    ----------
    clfs : iterable of (name, estimator) pairs
        Every estimator must provide ``fit(X, y)`` and ``score(X, y)``.
    data : feature table (for example a DataFrame), one row per sample.
    data_labels : the true labels for ``data``, passed separately from the
        features.

    Returns
    -------
    None. For every pair a line ``"<name> <score>"`` is printed, followed by a
    row of 80 asterisks.

    Notes
    -----
    The score is computed on the same rows that were used for fitting, so it
    is a training score and not an estimate of performance on unseen data.
    """
    for name, clf in clfs:
        clf.fit(data, data_labels)
        # Score against the TRUE labels. Scoring against clf.predict(data)
        # compares the model's predictions with themselves, which yields a
        # trivially perfect score for every classifier.
        score = clf.score(data, data_labels)
        # A single pre-formatted string prints "name score" identically under
        # the Python 2 print statement and the Python 3 print function.
        print("%s %s" % (name, score))
        print("*" * 80)
In [82]:
# Load the building-violations features-and-labels CSV into `train`.
# NOTE(review): the path is an absolute location under one user's home
# directory, so this cell only runs on a machine with that exact layout.
train= pd.read_csv("/home/keer/DSSG/data-challenges/BuildingInspections/data/Building_Violations_50000_features_and_labels.csv")
In [83]:
# Bare attribute access; it is not the last expression of the cell, so it
# displays nothing.
train.columns
# Parse the violation_date column into pandas datetimes, replacing it in place.
train['violation_date'] = pd.to_datetime(train['violation_date'])
In [84]:
# Order the rows by violation date (ascending).
# DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20;
# sort_values is its replacement and also sorts ascending by default.
train = train.sort_values("violation_date")
In [85]:
def roll_window(data, date_col, beginning, ending):
    """Return the rows of ``data`` whose ``date_col`` lies strictly inside a window.

    Parameters
    ----------
    data : pandas.DataFrame
    date_col : label of the column to compare against the bounds.
    beginning : lower bound, exclusive.
    ending : upper bound, exclusive.

    Returns
    -------
    pandas.DataFrame
        The rows with ``beginning < data[date_col] < ending``, in their
        original order and with their original index labels.
    """
    values = data[date_col]
    # Build ONE mask over the full frame. Chaining two boolean selections
    # applies a mask computed on the full frame to an already-filtered frame,
    # which makes pandas realign the mask (with a warning) and can fail when
    # the index contains duplicate labels.
    in_window = (values > beginning) & (values < ending)
    return data[in_window]
In [86]:
# Display the column labels of `train` as the cell output.
train.columns
Out[86]:
In [ ]:
In [89]:
# Column labels that are not wanted in `train`.
# 'passed' is listed so that 'failed' is the only outcome column left to
# serve as the label once the rows below have been filtered out.
remove_these = [
    'Unnamed: 0',
    'violation_code',
    'property_group',
    'inspector_id',
    'unnamed:_0',
    'inspection_status',
    'violation_description',
    'violation_inspector_comments',
    'address',
    'ssa',
    'location',
    'department_bureau',
    'violation_status',
    'passed',
    'violation_status_date',
    'violation_location',
    'violation_ordinance',
    'inspection_number',
]

# Names of the two flag columns used for the row filter below.
remove_these_entries = ['closed', 'hold']

# Keep only the rows where both the `closed` and the `hold` flag equal 0.
train = train[(train.closed == 0) & (train.hold == 0)]
In [90]:
# Drop every column named in remove_these and rebind `train` to the result.
# NOTE(review): because `train` is overwritten, running this cell a second
# time asks drop() for columns that are already gone - confirm the cell is
# only ever executed once per kernel session.
train=train.drop(remove_these, axis=1)
# Show the first rows of the reduced frame as the cell output.
train.head()
Out[90]:
In [91]:
# Display the column labels that remain in `train` as the cell output.
train.columns
Out[91]:
In [93]:
# Columns that will be expanded into indicator (dummy) columns.
dummy_cols = ['inspection_waived', 'inspection_category']

# Parse violation_last_modified_date into pandas datetimes, replacing the
# column in place.
train['violation_last_modified_date'] = pd.to_datetime(
    train['violation_last_modified_date']
)
def make_dummies(data, cats):
    """Replace each column listed in ``cats`` with its indicator columns.

    The columns are handled one after another: the indicator columns produced
    by ``pd.get_dummies`` for a column are appended on the right of the frame
    and the source column itself is removed. No prefix is passed to
    ``pd.get_dummies``, so the new columns are named after the raw category
    values.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    cats : iterable of column labels to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with the expanded columns, or ``data`` itself when
        ``cats`` is empty.
    """
    encoded = data
    for column in cats:
        indicator_cols = pd.get_dummies(encoded[column])
        remaining = encoded.drop(column, axis=1)
        encoded = pd.concat([remaining, indicator_cols], axis=1)
    return encoded
# Expand the columns named in dummy_cols into indicator columns.
try_dumm = make_dummies(data=train, cats=dummy_cols)
# Show the first rows of the expanded frame as the cell output.
try_dumm.head()
Out[93]:
In [112]:
# Show the first rows of try_dumm again (same expression that ends the
# previous cell).
try_dumm.head()
Out[112]:
In [ ]:
# Columns kept out of the feature matrix: the label ('failed') and the two
# columns that were converted to datetimes.
drop_these = ['failed', 'violation_date', 'violation_last_modified_date']

# Keep only the rows whose longitude is a finite number (np.isfinite is False
# for NaN and for +/-inf).
tester = try_dumm.loc[np.isfinite(try_dumm['longitude'])]
In [ ]:
# Compare the classifiers in `clfs` on `tester`: every column except those in
# drop_these is used as a feature and the 'failed' column supplies the labels.
# comp_clfs fits each classifier and prints a score for it.
comp_clfs(clfs,tester.drop(drop_these,axis=1) , tester['failed'])