In [2]:
%matplotlib inline
import pandas as pd
import seaborn as sns
from collections import Counter
from pmareport import pmareport
import numpy as np
import matplotlib.pyplot as plt
import calendar
In [3]:
from sklearn import ensemble
from sklearn import cross_validation
from sklearn import metrics
from sklearn import linear_model
In [4]:
# Build the Clinic helper from the project-local pmareport package and keep a
# handle on its DataFrame; every later cell reads from `df`.
# NOTE(review): presumably `df` is the same object as `clinic.df` (not a copy),
# in which case columns added to `df` later also show up on `clinic` -- confirm.
clinic = pmareport.Clinic()
df = clinic.df
In [6]:
# Show the first rows of the frame as a quick look at the available columns.
df.head()
Out[6]:
In [7]:
# For every condition, report its shortest appt_time together with the number
# of rows whose since_prev_sched falls below that minimum, then the overall total.
min_appt_times = df.groupby('cond').appt_time.min()
short_counts = []
for cond_name in min_appt_times.index:
    shortest = min_appt_times[cond_name]
    cond_rows = df[df.cond == cond_name]
    is_short = cond_rows.since_prev_sched < shortest
    n_short = len(cond_rows[is_short])
    print('Condition: {}\n\tmin appt time: {}\n\tnum appts less: {}'.format(cond_name, shortest, n_short))
    short_counts.append(n_short)
# Grand total over all conditions.
print(sum(short_counts))
In [8]:
# Count the (provider, date, scheduled slot) groups that hold exactly two or
# exactly three non-null PATIENT_IDs, i.e. double- and triple-booked slots.
booking_keys = ['PROVIDER_NAME', 'date', 'schedd']
df_doc_date_sched = df.groupby(booking_keys)
df_doc_date_sched_cnt = df_doc_date_sched.count()
patients_per_slot = df_doc_date_sched_cnt['PATIENT_ID']
double_booked = df_doc_date_sched_cnt[patients_per_slot == 2]
triple_booked = df_doc_date_sched_cnt[patients_per_slot == 3]
(len(double_booked), len(triple_booked))
Out[8]:
In [9]:
# Turn the categorical variables into ints for decision trees and one hot encoding
def make_int(nm, frame=None):
    """Add an integer-coded copy of a categorical column, named ``nm + 'i'``.

    Codes are handed out in order of first appearance, so the same data always
    yields the same mapping. (Building the lookup from ``list(set(...))`` gave
    an arbitrary order and cost a linear ``list.index`` search per row.)

    Parameters
    ----------
    nm : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The new column is written onto ``frame``.

    Notes
    -----
    Missing values receive the code -1, the sentinel used by ``pd.factorize``.
    """
    if frame is None:
        frame = df
    codes, _ = pd.factorize(frame[nm])
    frame[nm + 'i'] = codes
# Encode each categorical feature; every call adds a '<column>i' integer column to df.
categorical_columns = ['PATIENT_CONDITION', 'SEX', 'INSURANCE_TYPE', 'PROVIDER_NAME']
for column in categorical_columns:
    make_int(column)
In [34]:
# Make the dataset to go into the model
feat = ['AGE', 'PATIENT_CONDITIONi']
data = df[feat]  # feature-only view; the split below works on the full frame
rc = 'appt_time'
# Train/test split the dataset (90% train / 10% test).  The whole frame is
# split, not just `data`, so the response column stays attached to each row.
# A fixed seed keeps the split, and every score computed from it, reproducible
# across kernel restarts.
SPLIT_SEED = 0
train, test = cross_validation.train_test_split(df, test_size=0.1, random_state=SPLIT_SEED)
In [35]:
# Separate the feature columns (X) from the response column (y) for each split.
Xtrain = train[feat]
ytrain = train[rc]
Xtest = test[feat]
ytest = test[rc]
In [36]:
# CV split the dataset: 10 folds over the rows of the training set.
# The fold count is sized from Xtrain, the exact frame the fold indices are
# applied to, so the indices always match the data they select from.
cv = cross_validation.KFold(len(Xtrain), n_folds=10)
In [37]:
from sklearn.tree import DecisionTreeRegressor
In [38]:
# Regressor under evaluation: a decision tree limited to depth 3.
# Alternative left commented out: model = ensemble.RandomForestRegressor()
model = DecisionTreeRegressor(max_depth=3)
In [39]:
# Cross-validate the model on the training set.  Each fold is scored with
# percent_within (percentage of predictions less than 5 away from the true
# value), not mean squared error, hence the neutral name `cv_scores`.
# NOTE: percent_within is defined in a later cell of this notebook; on a fresh
# kernel that cell has to be run before this one.
cv_scores = []
# The fold index arrays are named *_idx so they do not overwrite the
# train/test DataFrames created by train_test_split.
for train_idx, test_idx in cv:
    cvXtrain = Xtrain.iloc[train_idx]
    cvXtest = Xtrain.iloc[test_idx]
    cvytrain = ytrain.iloc[train_idx]
    cvytest = ytrain.iloc[test_idx]
    model.fit(cvXtrain, cvytrain)
    pred = model.predict(cvXtest)
    cv_scores.append(percent_within(y_true=cvytest, y_pred=pred, thresh=5))
# Per-fold scores followed by their mean.
print('%s %s' % (cv_scores, np.mean(cv_scores)))
In [40]:
# Refit on the whole training set, predict the held-out test rows and report
# the percentage of predictions that land less than 5 away from the truth.
pred2 = model.fit(Xtrain, ytrain).predict(Xtest)
percent_within(ytest, pred2, thresh=5)
Out[40]:
In [41]:
def percent_within(y_true, y_pred, thresh=5):
    """Return the percentage of predictions strictly within ``thresh`` of the truth.

    Both inputs are converted to NumPy arrays first. Plain lists are therefore
    accepted, and two pandas Series are compared position by position instead
    of being aligned on their index labels (alignment would silently turn
    mismatched labels into NaN differences, i.e. misses).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    thresh : number, optional
        A prediction counts as a hit when ``abs(y_true - y_pred) < thresh``.

    Returns
    -------
    float
        ``100 * hits / len(y_true)``; ``nan`` when ``y_true`` is empty.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)
    if truth.size == 0:
        # Nothing to score: report "undefined" instead of dividing by zero.
        return float('nan')
    hits = np.abs(truth - guess) < thresh
    return np.sum(hits) / float(len(truth)) * 100
In [48]:
# Scratch frame: 20 rows of standard-normal noise in columns a, b and c.
# Built from a dict because pandas reads a tuple of three length-20 arrays as
# 3 rows x 20 columns, which does not fit the three column names.
random_frame = pd.DataFrame(
    {'a': np.random.randn(20), 'b': np.random.randn(20), 'c': np.random.randn(20)},
    columns=['a', 'b', 'c']
)
random_frame
In [22]:
# Call the Clinic object's own make_scatter method with its default arguments.
# NOTE(review): presumably the packaged counterpart of the make_scatter function
# defined in a later cell of this notebook -- confirm against pmareport.
clinic.make_scatter()
Out[22]:
In [108]:
def make_scatter(
    df=None,
    hue='PATIENT_CONDITION',
    size=4,
    xvar='AGE',
    yvar='appt_time',
    file_name=None
):
    """Scatter ``yvar`` against ``xvar`` with one colour per ``hue`` level.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the ``hue``, ``xvar`` and ``yvar`` columns.
        Required despite the ``None`` default.
    hue : str
        Column whose levels are drawn in different colours.
    size : number
        Passed to ``sns.FacetGrid`` as ``size`` (newer seaborn releases name
        this argument ``height``).
    xvar, yvar : str
        Columns plotted on the x and y axes.
    file_name : str, optional
        When given, the figure is saved to this path at 300 dpi and nothing is
        returned.

    Returns
    -------
    seaborn.FacetGrid or None
        The grid when ``file_name`` is not given, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``df`` is not supplied.
    """
    if df is None:
        # Fail early with a clear message instead of an obscure error inside seaborn.
        raise ValueError('make_scatter needs a DataFrame: pass df=<frame>')
    g = sns.FacetGrid(data=df, hue=hue, size=size)
    g = g.map(plt.scatter, xvar, yvar, edgecolor='w')
    g.add_legend(fontsize=10, markerscale=2)
    plt.title('Appointment duration vs age by condition')
    plt.ylabel('min')
    plt.xlabel('')
    if file_name:
        # The savefig keyword is bbox_inches; 'bbox' is not a recognised option.
        g.savefig(file_name, bbox_inches='tight', dpi=300)
        return None
    return g
In [109]:
# Plot the full dataset and save the figure to age_appt_cond.png; with a file
# name given, make_scatter returns nothing, so this cell has no Out value.
make_scatter(df, file_name='age_appt_cond.png')
In [110]:
# Tally how many rows there are for each distinct AGE value.
Counter(df.AGE)
Out[110]:
In [111]:
# NOTE(review): ad-hoc ratio; where 731 and 246 come from is not shown in this
# notebook -- derive them from df (or name them) so the figure can be reproduced.
731/246.0
Out[111]:
In [112]:
# Restrict to rows whose cond is 'tvp', then count non-null PATIENT_IDs for
# every (provider, age) combination.
tvp_rows = df[df.cond == 'tvp']
tvp_counts = tvp_rows.groupby(['PROVIDER_NAME', 'AGE']).count()
tvp_counts.PATIENT_ID
Out[112]:
In [113]:
# NOTE(review): ad-hoc ratio; the origin of 653 and 731 is not shown in this
# notebook -- derive them from df (or name them) so the figure can be reproduced.
653/731.0
Out[113]:
In [ ]: