This is an analysis of complaints data, munged here.
The fields are:
In [17]:
import pandas as pd
import numpy as np
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML

# Inject CSS that sets the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Show full cell contents instead of truncating long strings.
# `None` is the documented "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
In [18]:
# Read the processed complaints scrape into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
complaints_csv = '../../data/processed/complaints-3-29-scrape.csv'
df = pd.read_csv(complaints_csv)
In [19]:
# Total number of complaint rows.
# `len(df)` counts rows directly; `df.count()[0]` counted non-null values of
# the first column and relied on deprecated positional Series indexing.
len(df)
Out[19]:
In [20]:
# Number of rows whose `public` value is 'offline'.
# Summing the boolean mask counts matching rows without depending on nulls
# in any other column or on positional Series indexing.
(df['public'] == 'offline').sum()
Out[20]:
In [21]:
# Number of rows whose `public` value is 'online'.
# Summing the boolean mask counts matching rows without depending on nulls
# in any other column or on positional Series indexing.
(df['public'] == 'online').sum()
Out[21]:
In [22]:
# Percentage of all complaint rows whose `public` value is 'offline'.
# Row counts come from the mask sum and `len(df)` rather than `.count()[0]`,
# which counted non-null values of the first column via deprecated
# positional Series indexing.
is_offline = df['public'] == 'offline'
is_offline.sum() / len(df) * 100
Out[22]:
In [23]:
# Number of rows whose `outcome` is one of the two listed values.
# `isin` replaces the chained `==` / `|` comparison, and the mask sum replaces
# `.count()[0]` (first-column non-null count via deprecated positional lookup).
df['outcome'].isin(['Exposed to Potential Harm', 'No Negative Outcome']).sum()
Out[23]:
In [24]:
# Rows with one of the two listed `outcome` values, expressed as a percentage
# of the number of 'offline' rows.
# NOTE(review): the numerator is not restricted to offline rows while the
# denominator is; this matches the original calculation — confirm that these
# outcomes only occur on offline complaints, otherwise the ratio is overstated.
has_listed_outcome = df['outcome'].isin(['Exposed to Potential Harm', 'No Negative Outcome'])
is_offline = df['public'] == 'offline'
# Mask sums count rows directly instead of `.count()[0]`, which depended on
# the first column's nulls and on deprecated positional Series indexing.
has_listed_outcome.sum() / is_offline.sum() * 100
Out[24]:
In [25]:
# Count non-null `abuse_number` values for each (omg_outcome, public) pair,
# then pivot the `public` level into columns and restore a flat index.
totals = (
    df.groupby(['omg_outcome', 'public'])['abuse_number']
    .count()
    .unstack()
    .reset_index()
)
In [26]:
# Combinations that never occur come out of `unstack` as NaN; treat them as 0.
totals = totals.fillna(0)
In [27]:
# Per-outcome total: online plus offline counts.
totals['total'] = totals['online'].add(totals['offline'])
In [28]:
# Offline share of each outcome's total, as a percentage rounded to 0 decimals.
totals['pct_offline'] = totals['offline'].div(totals['total']).mul(100).round()
In [29]:
# Display outcomes ordered from highest to lowest offline percentage.
totals.sort_values(by='pct_offline', ascending=False)
Out[29]:
In [30]:
# Replace missing outcome notes with empty strings so later string matching
# yields plain booleans.
# Assign the result back explicitly: `df[col].fillna(..., inplace=True)` is a
# chained in-place operation that does not update `df` under pandas
# Copy-on-Write (and emits a FutureWarning in pandas 2.x).
df['outcome_notes'] = df['outcome_notes'].fillna('')
In [31]:
# Count offline rows whose outcome notes contain one of the listed phrases.
# The pattern is a regex alternation, passed to `str.contains` unchanged.
finding_pattern = 'constitute neglect|constitutes neglect|constitute abuse|constitutes abuse|constitutes exploitation|constitutes financial exploitation'
# `na=False` makes missing notes count as "no match", so this cell does not
# depend on an earlier cell having filled NaNs in `outcome_notes`.
has_finding_language = df['outcome_notes'].str.contains(finding_pattern, na=False)
is_offline = df['public'] == 'offline'
# Sum the combined mask to count rows, instead of `.count()[0]` (first-column
# non-null count via deprecated positional Series indexing).
(has_finding_language & is_offline).sum()
Out[31]:
"The state fined the facilities in hundreds of those cases."
In [32]:
# Count offline rows with `omg_outcome == 'Potential harm'` and a positive
# `fine` (rows with a missing fine compare as False and are excluded).
is_potential_harm = df['omg_outcome'] == 'Potential harm'
was_fined = df['fine'] > 0
is_offline = df['public'] == 'offline'
# Sum the combined mask to count rows, instead of `.count()[0]` (first-column
# non-null count via deprecated positional Series indexing).
(is_potential_harm & was_fined & is_offline).sum()
Out[32]: