In [1]:
    
# Import everything we need
import pandas as pd
import numpy as np
# Raise pandas' display width to 1000 characters so more data fits per row
pd.options.display.width = 1000
    
In [2]:
    
# Load the dataset
tlo_data_file = 'data/tlo_checks_07.28.15_cleaned.csv'
# Read the CSV into a pandas DataFrame.
# pd.read_csv replaces DataFrame.from_csv, which was deprecated in pandas 0.21
# and removed in 1.0. The arguments keep the same behaviour: the first row is
# the header, the first column becomes the index, and that index is parsed as
# dates. `tupleize_cols` and `infer_datetime_format` are dropped because
# current pandas no longer accepts / needs them.
raw_data = pd.read_csv(tlo_data_file,
                       header=0,
                       sep=',',
                       index_col=0,
                       parse_dates=True,
                       encoding=None)
raw_data.head()
    
    Out[2]:
In [3]:
    
# Normalise the failure explanation text to lower case; the label encoding
# applied later compares against lowercase strings only
explanation_text = raw_data['failure_explanation']
raw_data['failure_explanation'] = explanation_text.str.lower()
    
In [5]:
    
# Failure Explanations: 'dob', 'name', 'ssn dob name', 'ssn', 'ssn name', 'ssn dob','dob name', nan
# Ordered (label, code) pairs used to encode the explanation text
_FAILURE_EXPLANATION_CODES = (
    ('dob', 0),
    ('name', 1),
    ('ssn dob name', 2),
    ('ssn', 3),
    ('ssn name', 4),
    ('ssn dob', 5),
    ('dob name', 6),
)


def update_failure_explanations(type):
    """Translate a failure explanation label into its integer code.

    Parameters
    ----------
    type : object
        The explanation value to encode. It is compared for equality
        against each known lowercase label in turn.

    Returns
    -------
    int or None
        The code paired with the first matching label, or None when the
        value matches no label (for example NaN or differently-cased text).
    """
    for label, code in _FAILURE_EXPLANATION_CODES:
        if type == label:
            return code
    # No label matched: fall through with None, as an unmatched value has no code
    return None
    
In [6]:
    
# Replace each explanation label with its integer code.
# NOTE: this overwrites the text column, so the cell is not idempotent --
# running it a second time feeds integer codes to the encoder, which match
# no label and therefore all come back as missing values.
encoded_explanations = raw_data['failure_explanation'].apply(update_failure_explanations)
raw_data['failure_explanation'] = encoded_explanations
raw_data.head()
    
    Out[6]:
In [7]:
    
# Handle missing values: every missing cell in the frame becomes 0.
# NOTE(review): a missing failure_explanation therefore ends up as 0, the
# same code update_failure_explanations assigns to 'dob' -- confirm that
# merging the two cases is intended.
raw_data = raw_data.fillna(0)
raw_data.head()
    
    Out[7]:
In [8]:
    
# Build the two arrays the model consumes: a feature matrix and a label vector
feature_columns = raw_data.iloc[:, :22]  # the first 22 columns, selected by position
tlo_data = feature_columns.values
tlo_targets = raw_data['verified'].values
# NOTE(review): confirm 'verified' is not one of the first 22 columns;
# if it is, the label is leaking into the features.
    
In [9]:
    
# Display the feature matrix as the cell output for a quick visual check
tlo_data
    
    Out[9]:
In [10]:
    
from sklearn import linear_model

# Logistic regression classifier with C=1 and a fixed random_state so that
# repeated runs use the same seed
logClassifier = linear_model.LogisticRegression(
    C=1,
    random_state=111
)
    
In [11]:
    
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# Fall back to the old location only for installations that predate
# model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    tlo_data, tlo_targets, test_size=0.20, random_state=111)
# Fit on the training portion only; the fitted estimator is the cell output
logClassifier.fit(X_train, y_train)
    
    Out[11]:
In [12]:
    
# Run the test data: predict a label for every held-out row in X_test
predicted = logClassifier.predict(X_test)
# Show the predictions as the cell output
predicted
    
    Out[12]:
In [13]:
    
# Evaluate the model: compare the predictions with the held-out labels
from sklearn import metrics
test_accuracy = metrics.accuracy_score(y_test, predicted)
test_accuracy
    
    Out[13]:
In [14]:
    
# Confusion matrix for the held-out labels against the predictions
test_confusion = metrics.confusion_matrix(y_test, predicted)
test_confusion
    
    Out[14]:
In [15]:
    
import pickle
tlo_classifier_file = "models/tlo_lr_classifier_02.18.16.dat"
# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open(tlo_classifier_file, "wb") as classifier_out:
    pickle.dump(logClassifier, classifier_out)
    
In [16]:
    
# Recreate it as a test: read the pickled classifier back and print it.
# The context manager closes the file handle once loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open(tlo_classifier_file, "rb") as classifier_in:
    logClassifier2 = pickle.load(classifier_in)
print(logClassifier2)
    
    
In [ ]: