In [1]:
from sklearn import ensemble

In [4]:
from sklearn import cross_validation

In [47]:
from sklearn import metrics
from sklearn import linear_model

In [2]:
%matplotlib inline
import pandas as pd
import seaborn as sns
from collections import Counter
from pmareport import pmareport
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Instantiate the Clinic helper from the project-local pmareport module and
# call its drop_redundant() method before the data is used below.
# NOTE(review): presumably Clinic() loads the appointment records and
# drop_redundant() prunes duplicate data in place -- confirm against pmareport.
clinic = pmareport.Clinic()
clinic.drop_redundant()

In [7]:
# Working alias for the clinic's `df` attribute (presumably a pandas DataFrame,
# given how it is indexed in later cells). make_int() adds columns to this object.
df = clinic.df

In [13]:
# First-pass modelling inputs: five candidate columns as features and the
# `appt_time` column as the response. Both names are reassigned in a later cell.
data = df.loc[:, ['AGE', 'PATIENT_CONDITION', 'SEX', 'INSURANCE_TYPE', 'PROVIDER_NAME']]
responses = df['appt_time']

In [29]:
def make_int(nm, frame=None):
    """Add an integer-coded copy of a categorical column.

    The codes are written to a new column named ``nm + 'i'`` on the frame,
    which is modified in place. Nothing is returned.

    Parameters
    ----------
    nm : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to encode. Defaults to the notebook-level ``df`` so existing
        ``make_int(name)`` calls keep working.

    Notes
    -----
    Codes follow the sorted order of the distinct values, so the same data
    always yields the same codes (a ``list(set(...))`` ordering is arbitrary
    and can differ between interpreter runs). Missing values are coded as -1.
    """
    if frame is None:
        frame = df
    # factorize builds the value -> code table once, instead of performing a
    # linear list.index() scan for every row.
    codes = pd.factorize(frame[nm], sort=True)[0]
    frame[nm + 'i'] = codes

In [32]:


In [30]:
# Give every categorical field an integer-coded companion column via make_int.
for column in ('PATIENT_CONDITION', 'SEX', 'INSURANCE_TYPE', 'PROVIDER_NAME'):
    make_int(column)

In [56]:
# Feature subset used for the model below: the AGE column plus the
# integer-coded PATIENT_CONDITION column ('PATIENT_CONDITIONi') added by make_int.
feat = ['AGE', 'PATIENT_CONDITIONi']

In [57]:
# Rebuild the design matrix from the selected feature columns; the response
# remains the `appt_time` column.
data = df.loc[:, feat]
responses = df['appt_time']

In [58]:
# 10-fold splitter sized to the number of rows in `data`, built with the
# sklearn.cross_validation API (sample count plus `n_folds`). The object is
# iterated directly for (train, test) index pairs in the evaluation loop.
cv = cross_validation.KFold(len(data), n_folds = 10)

In [59]:
# Random-forest regressor with otherwise default hyperparameters. The seed is
# pinned so that re-running the notebook grows the same trees and the per-fold
# errors are reproducible.
model = ensemble.RandomForestRegressor(random_state=0)

In [60]:
# Refit the model on each training fold and print the mean squared error on
# the corresponding held-out fold (one line of output per fold).
for train, test in cv:
    # The fold indices are used positionally, hence .iloc.
    X_train = data.iloc[train]
    X_test = data.iloc[test]
    y_train = responses.iloc[train]
    y_test = responses.iloc[test]
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # print(...) with one argument behaves the same on Python 2 and 3, and the
    # metric comes from the public sklearn.metrics namespace rather than the
    # internal `regression` submodule.
    print(metrics.mean_squared_error(y_true=y_test, y_pred=pred))


6.42061211569
6.54205812407
5.7141137378
5.93013043853
6.08116550354
6.35753793651
6.37316595341
6.83181778258
5.61663102621
6.35848447564

In [1]:



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-458d5f1afc81> in <module>()
----> 1 model

NameError: name 'model' is not defined

In [42]:


In [ ]: