A notebook for making some quick plots of the SeeClickFix data for New Haven.
In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import nhrc2
In [2]:
# IPython magic: render matplotlib figures inline, in the notebook output.
%matplotlib inline
In [6]:
# Locate the directory of the imported nhrc2 package; the data file read
# later is addressed relative to it.
print(nhrc2.__file__)
# os.path copes with the platform's path separator instead of assuming '/'.
# Joining with '' keeps the trailing separator that later string
# concatenation (nhrc2dir + 'data/...') relies on.
nhrc2dir = os.path.join(os.path.dirname(str(nhrc2.__file__)), '')
print(nhrc2dir)
In [8]:
# Read the full SeeClickFix CSV stored under the nhrc2 package directory.
scf_data_path = nhrc2dir + 'data/scf_data_full.csv'
scf_df = pd.read_csv(scf_data_path)
In [9]:
# Peek at the first five rows (head() defaults to n=5).
scf_df.head()
Out[9]:
In [10]:
# Scatter every issue by longitude/latitude, zoomed to a fixed window.
# An explicit Axes plus labels lets the figure stand on its own when the
# notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(scf_df['lng'], scf_df['lat'], '.')
ax.set_xlim([-73.0, -72.85])
ax.set_ylim([41.24, 41.355])
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('SeeClickFix issue locations');
Out[10]:
In [35]:
# Build an integer version of issue_id, mapping the literal 'other' to -1.
# The conversion iterates over the column rather than writing through
# `.values`, so the original scf_df['issue_id'] column is left untouched
# and the cell gives the same result every time it is re-run.
scf_df['int_issue_id'] = [
    -1 if issue_id == 'other' else int(issue_id)
    for issue_id in scf_df['issue_id']
]
In [38]:
# Keep only the first row seen for each distinct issue_id.
match_cats = scf_df.drop_duplicates(subset=['issue_id'], keep='first')
In [39]:
# Show the issue_id / category pairing for the de-duplicated rows.
match_cats.loc[:, ['issue_id', 'category']]
Out[39]:
In [27]:
# Report how many distinct issue ids remain, then display them.
unique_issue_ids = match_cats['issue_id'].values
print(len(unique_issue_ids))
unique_issue_ids
Out[27]:
In [ ]:
In [88]:
# Add a float placeholder column for the per-category issue counts.
# assign() returns a new frame instead of setting a column on the result of
# drop_duplicates(), which can trigger pandas' SettingWithCopyWarning.
match_cats = match_cats.assign(num_issues=0.)
In [91]:
Out[91]:
In [94]:
total_issues = len(scf_df)

# Count the issues filed under each category in a single pass, then look up
# each match_cats row's category in that table. This avoids mixing a
# positional index (from np.where) with label-based .loc, which would write
# to the wrong row because match_cats keeps scf_df's original row labels.
issues_per_category = scf_df['category'].value_counts()
match_cats = match_cats.assign(
    # Rows whose category is missing get 0; the column stays float.
    num_issues=match_cats['category'].map(issues_per_category).fillna(0).astype(float)
)
In [93]:
# Display the per-row issue counts.
match_cats.loc[:, 'num_issues']
Out[93]:
In [ ]:
# Print a plain-text table of issue counts per category, smallest first.
# Uses total_issues as computed from len(scf_df) in an earlier cell.
print('total_issues: {}'.format(total_issues))
print('Percent | Category | Issues ')
print('============================================================')

# value_counts() tallies every category in one pass and returns the counts
# already sorted; rows with a missing category are left out.
category_totals = scf_df['category'].value_counts(ascending=True)
for cat, cat_total in category_totals.items():
    # float() keeps this a true division on Python 2 as well.
    percent = round(float(cat_total) / total_issues * 1e2, 2)
    print('{:7s} | {:40s} | {}'.format(str(percent), cat, cat_total))