In [8]:
    
import zipfile
from urllib.request import urlopen
import os
import shutil

# Remote location of the GES 2012 flat-file archive and the local paths it is
# stored under (a GES2012 directory inside the current working directory).
source_url = 'ftp://ftp.nhtsa.gov/GES/GES12/GES12_Flatfile.zip'
zip_name = 'GES12_Flatfile.zip'
cwd = os.getcwd()
dir_path  = os.path.join(cwd, 'GES2012')
zip_path = os.path.join(dir_path, zip_name)
# We'll make a directory for you to play around with,
# then when you're done playing you can just delete the directory
os.makedirs(dir_path, exist_ok=True)
# Download the file from GES website if you haven't already.
# The data is streamed to a temporary ".part" file and only renamed to its
# final name once the transfer has finished: an interrupted download must not
# leave a truncated archive behind that the existence check above would then
# mistake for a complete one on the next run.
if not os.path.exists(zip_path):
    partial_path = zip_path + '.part'
    with urlopen(source_url) as response, open(partial_path, 'wb') as fh:
        # Copy in chunks instead of holding the whole archive in memory.
        shutil.copyfileobj(response, fh)
    os.replace(partial_path, zip_path)
# Extract all the files from that zipfile
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(dir_path)
    
    
In [10]:
    
# See what we just unzipped: show the entries now present in dir_path,
# the directory the archive was extracted into.
os.listdir(dir_path)
    
    Out[10]:
In [11]:
    
import pandas as pd
import numpy as np
import sklearn
# Rebuild the data directory path so this cell does not depend on the
# download cell having been run in the same session.
cwd = os.getcwd()
dir_path = os.path.join(cwd, 'GES2012')
# PERSON.TXT is read as a tab-separated table.
input_file_path = os.path.join(dir_path, 'PERSON.TXT')
input_data = pd.read_csv(input_file_path, sep='\t')
    
In [12]:
    
# Display the column names of the loaded table in sorted order.
sorted(input_data.columns)
    
    Out[12]:
In [13]:
    
# Show how many rows fall under each distinct INJSEV_IM value.
input_data['INJSEV_IM'].value_counts()
    
    Out[13]:
In [15]:
    
# Drop those odd cases: keep only the rows whose INJSEV_IM is not 6.
input_data = input_data.loc[input_data['INJSEV_IM'] != 6]
# Count the missing values of every column in one pass, then report only
# the columns that actually contain some.
nan_counts = input_data.isnull().sum()
for column_name, n_nans in nan_counts[nan_counts > 0].items():
    print(column_name, n_nans)
    
    
In [21]:
    
# Shape before filtering, for comparison with the shape printed below.
print(input_data.shape)
# Keep only the rows that have a MAKE value.
data = input_data.dropna(subset=['MAKE'])
# INJSEV_IM becomes the prediction target; INJ_SEV is removed from the
# feature table and kept aside under `discarded`.
target = data.pop('INJSEV_IM')
discarded = data.pop('INJ_SEV')
print(data.shape)
    
    
In [22]:
    
# Turn the target into a 0.0/1.0 indicator of "INJSEV_IM equals 4".
# NOTE: this overwrites `target`, so running the cell a second time without
# rebuilding `target` first compares 0/1 values with 4 and yields all zeros.
target = target.eq(4).astype('float')
    
In [26]:
    
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
# The splitting and grid-search helpers live in sklearn.model_selection since
# scikit-learn 0.18, and the older modules were removed in 0.20.  Import them
# explicitly (instead of reaching through the bare `sklearn` package, which
# only works if something else happened to load the submodule) and fall back
# to the old location on installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import train_test_split
# Train on half of the data while reserving the other half for
# model comparisons.  The split is seeded so that every model in the
# notebook is compared on the same rows from one run to the next.
xtrain, xtest, ytrain, ytest = train_test_split(
    data.values, target.values, train_size=0.5, random_state=0)
# Ordinary least squares baseline: its raw predictions are used as ranking
# scores for the ROC curve.
linreg = LinearRegression()
linreg.fit(xtrain, ytrain)
lr_preds = linreg.predict(xtest)
lr_perf = roc_auc_score(ytest, lr_preds)
print('OLS: Area under the ROC curve = {}'.format(lr_perf))
    
    
In [27]:
    
from sklearn.linear_model import Ridge
# Candidate penalties: ten log-spaced values from 1e-10 to 1e10.
ridge_alphas = np.logspace(-10, 10, 10)
ridge = GridSearchCV(Ridge(), {'alpha': ridge_alphas})
ridge.fit(xtrain, ytrain)
# Evaluate on the test split with the same ROC AUC metric as the other models.
ridge_preds = ridge.predict(xtest)
ridge_performance = roc_auc_score(ytest, ridge_preds)
print('Ridge: Area under the ROC curve = {}'.format(ridge_performance))
    
    
In [29]:
    
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
# Candidate penalties: five log-spaced values from 1e-10 to 1e-8.
lasso_alphas = np.logspace(-10, -8, 5)
lasso = GridSearchCV(Lasso(), {'alpha': lasso_alphas})
lasso.fit(xtrain, ytrain)
# Evaluate on the test split with the same ROC AUC metric as the other models.
lasso_preds = lasso.predict(xtest)
lasso_performance = roc_auc_score(ytest, lasso_preds)
print('Lasso: Area under the ROC curve = {}'.format(lasso_performance))
    
    
    
In [30]:
    
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
# Gradient boosting with 500 estimators, fitted on the training split.
gbm = GradientBoostingClassifier(n_estimators=500)
gbm.fit(xtrain, ytrain)
# Column 1 of predict_proba is used as the score for the ROC curve.
gbm_probabilities = gbm.predict_proba(xtest)
gbm_preds = gbm_probabilities[:, 1]
gbm_performance = roc_auc_score(ytest, gbm_preds)
print('GBM: Area under the ROC curve = {}'.format(gbm_performance))
    
    
In [31]:
    
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
# Search over tree depths 3 through 9.
tree_depths = np.arange(3, 10)
tree = GridSearchCV(DecisionTreeClassifier(), {'max_depth': tree_depths})
tree.fit(xtrain, ytrain)
# Column 1 of predict_proba is used as the score for the ROC curve.
tree_probabilities = tree.predict_proba(xtest)
tree_preds = tree_probabilities[:, 1]
tree_performance = roc_auc_score(ytest, tree_preds)
print('DecisionTree: Area under the ROC curve = {}'.format(tree_performance))
    
    
In [33]:
    
# Label the fitted GBM's feature importances with the feature column names.
importances = pd.Series(gbm.feature_importances_, index=data.columns)
# Print the ten largest importances, highest first.  Series.order() was
# removed from pandas in 0.20; nlargest() gives the same top-ten view and is
# available on both the old and the current pandas releases.
print(importances.nlargest(10))
    
    
In [ ]: