In [1]:
from sklearn import ensemble
In [4]:
from sklearn import cross_validation
In [47]:
from sklearn import metrics
from sklearn import linear_model
In [2]:
%matplotlib inline
import pandas as pd
import seaborn as sns
from collections import Counter
from pmareport import pmareport
import numpy as np
import matplotlib.pyplot as plt
In [3]:
clinic = pmareport.Clinic()
clinic.drop_redundant()
In [7]:
df = clinic.df
In [13]:
data = df[['AGE', 'PATIENT_CONDITION', 'SEX', 'INSURANCE_TYPE', 'PROVIDER_NAME']]
responses = df.appt_time
In [29]:
def make_int(nm):
    # Map each distinct value of column `nm` to an integer code in a new column nm+'i'
    categories = list(set(df[nm]))
    df[nm + 'i'] = df[nm].apply(lambda x: categories.index(x))
In [30]:
for i in ['PATIENT_CONDITION', 'SEX', 'INSURANCE_TYPE', 'PROVIDER_NAME']:
    make_int(i)
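Since list(set(...)) orders the categories arbitrarily from run to run, the resulting integer mapping is not reproducible. A minimal alternative sketch (not part of the original notebook) using pandas' factorize, which assigns codes by order of first appearance:
# Sketch: deterministic integer encoding of the same categorical columns
for col in ['PATIENT_CONDITION', 'SEX', 'INSURANCE_TYPE', 'PROVIDER_NAME']:
    df[col + 'i'], uniques = pd.factorize(df[col])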
In [56]:
feat = ['AGE', 'PATIENT_CONDITIONi']
In [57]:
data = df[feat]
responses = df.appt_time
In [58]:
# 10-fold CV index splits (sklearn < 0.18 API; newer versions use model_selection.KFold)
cv = cross_validation.KFold(len(data), n_folds=10)
In [59]:
model = ensemble.RandomForestRegressor()
In [60]:
for train, test in cv:
    X_train = data.iloc[train]
    X_test = data.iloc[test]
    y_train = responses.iloc[train]
    y_test = responses.iloc[test]
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Held-out mean squared error for this fold
    print(metrics.mean_squared_error(y_true=y_test, y_pred=pred))
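The per-fold loop above can also be condensed with scikit-learn's cross_val_score helper. A minimal sketch, assuming scikit-learn >= 0.18, where the cross-validation utilities live in model_selection and MSE is exposed as a negated score:
from sklearn import model_selection

# Equivalent 10-fold evaluation in one call; scores are negative MSE by convention
scores = model_selection.cross_val_score(
    ensemble.RandomForestRegressor(), data, responses,
    scoring='neg_mean_squared_error', cv=10)
print(-scores.mean())  # average MSE across the 10 folds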