In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
plt.style.use('ggplot')


/usr/lib64/python3.4/site-packages/matplotlib/backends/backend_gtk3agg.py:18: UserWarning: The Gtk3Agg backend is known to not work on Python 3.x with pycairo. Try installing cairocffi.
  "The Gtk3Agg backend is known to not work on Python 3.x with pycairo. "

In [2]:
checks = pd.read_csv('VizDraft1.csv')

Check types


In [28]:
checks = checks[checks['Amount'] != 'No data']

In [92]:
checks['Amount'] = checks['Amount'].map(lambda x: float(x))
checks['Check'] = checks['Check'].map(lambda x: int(x))

Strip whitespace and make case insensitive


In [93]:
checks['Purchase'] = checks.Purchase.map(lambda x: x if type(x)!=str else x.lower())
checks['Category'] = checks.Category.map(lambda x: x if type(x)!=str else x.lower())

In [94]:
checks['Purchase'] = checks['Purchase'].str.strip()
checks['Category'] = checks['Category'].str.strip()

In [95]:
checks = checks.fillna('No data')

Remove columns we don't want to visualize


In [96]:
checks.drop(['Notes'], axis=1, inplace=True)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-96-09f4adaa15e1> in <module>()
----> 1 checks.drop(['Notes'], axis=1, inplace=True)

/usr/lib64/python3.4/site-packages/pandas/core/generic.py in drop(self, labels, axis, level, inplace, errors)
   1871                 new_axis = axis.drop(labels, level=level, errors=errors)
   1872             else:
-> 1873                 new_axis = axis.drop(labels, errors=errors)
   1874             dropped = self.reindex(**{axis_name: new_axis})
   1875             try:

/usr/lib64/python3.4/site-packages/pandas/indexes/base.py in drop(self, labels, errors)
   2964             if errors != 'ignore':
   2965                 raise ValueError('labels %s not contained in axis' %
-> 2966                                  labels[mask])
   2967             indexer = indexer[~mask]
   2968         return self.delete(indexer)

ValueError: labels ['Notes'] not contained in axis

In [97]:
checks.drop(['Redaction'], axis=1, inplace=True)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-97-68906b259deb> in <module>()
----> 1 checks.drop(['Redaction'], axis=1, inplace=True)

/usr/lib64/python3.4/site-packages/pandas/core/generic.py in drop(self, labels, axis, level, inplace, errors)
   1871                 new_axis = axis.drop(labels, level=level, errors=errors)
   1872             else:
-> 1873                 new_axis = axis.drop(labels, errors=errors)
   1874             dropped = self.reindex(**{axis_name: new_axis})
   1875             try:

/usr/lib64/python3.4/site-packages/pandas/indexes/base.py in drop(self, labels, errors)
   2964             if errors != 'ignore':
   2965                 raise ValueError('labels %s not contained in axis' %
-> 2966                                  labels[mask])
   2967             indexer = indexer[~mask]
   2968         return self.delete(indexer)

ValueError: labels ['Redaction'] not contained in axis

In [98]:
checks = checks.rename(columns={'Unnamed: 0': 'Date', 'check': 'Check', 'amount': 'Amount', 
                                'requested by': 'Requested by', 'requested date': 'Requested Date'})

In [99]:
checks.drop(['Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Requested by'], axis=1, inplace=True)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-99-0ef978daba29> in <module>()
----> 1 checks.drop(['Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Requested by'], axis=1, inplace=True)

/usr/lib64/python3.4/site-packages/pandas/core/generic.py in drop(self, labels, axis, level, inplace, errors)
   1871                 new_axis = axis.drop(labels, level=level, errors=errors)
   1872             else:
-> 1873                 new_axis = axis.drop(labels, errors=errors)
   1874             dropped = self.reindex(**{axis_name: new_axis})
   1875             try:

/usr/lib64/python3.4/site-packages/pandas/indexes/base.py in drop(self, labels, errors)
   2964             if errors != 'ignore':
   2965                 raise ValueError('labels %s not contained in axis' %
-> 2966                                  labels[mask])
   2967             indexer = indexer[~mask]
   2968         return self.delete(indexer)

ValueError: labels ['Unnamed: 14' 'Unnamed: 15' 'Unnamed: 16' 'Unnamed: 17' 'Requested by'] not contained in axis

In [100]:
checks.columns


Out[100]:
Index(['Date', 'Check', 'Amount', 'Purchase', 'Category', 'FOIA No', 'Link',
       'Requested Date', 'First Response', 'Received Date', 'Muckrock Link'],
      dtype='object')

De-duplication


In [101]:
checks.Purchase.replace('ballistics labs', 'ballistics lab', inplace=True)
checks.Purchase.replace('car repair and seizure', 'car repair', inplace=True)
checks.Purchase.replace('file maker license', 'filemaker', inplace=True)
checks.Purchase.replace('gps kit', 'gps trackers', inplace=True)
checks.Purchase.replace('accurint', 'lexis nexis', inplace=True)
checks.Purchase.replace('k9 training', 'k9s', inplace=True)
checks.Purchase.replace('gps kit', 'gps trackers', inplace=True)
checks.Purchase.replace('repair for seized vehicles', 'car repair', inplace=True)
checks.Purchase.replace('repair of seized vehicles', 'car repair', inplace=True)
checks.Purchase.replace('raid apparel', 'raid jacket', inplace=True)
checks.Purchase.replace('seized vehicles', 'car repair', inplace=True)
checks.Purchase.replace('seized car repair', 'car repair', inplace=True)
checks.Purchase.replace('seized cars', 'car repair', inplace=True)
checks.Purchase.replace('undercover car purchase', 'undercover car', inplace=True)
checks.Purchase.replace('server purchase', 'server', inplace=True)

In [102]:
np.unique(checks.Purchase)


Out[102]:
array(['"clear services" subscription', '"realinfo accounts"',
       '1000 nik drug test pouches', '1505 audit',
       '20 unmarked ocd vehicles', '48 computers and monitors',
       '48tb hard drive for homan square security system', '49 vehicles',
       '?? web access', '??? of st petersburg fl', '??????',
       '??????????  "bulk narcotics destruction"???',
       '??????????  "narcotics related investigations"???',
       '??????????????', 'ads invoice', 'alpr', 'at & t mobility', 'at&t',
       'at&t boc phones', 'at&t mobility', 'at&t trunking',
       'audio / video supplies', 'auto repair', 'auto repairs',
       'avid video processing and tower', 'ballistics lab',
       'barracuda backup server', 'black widow surveillance',
       'boc police department vehicles', 'camera equipment',
       'canon cameras from b&h video', 'car repair', 'cat 5e cable',
       'clear', 'comcast', 'computer purchase',
       'conference registration cost', 'cpd dept of fleet mngmt',
       'dare & great program funding',
       'davis sign company of seatle washington', 'dea conference',
       'dell computers', 'dell storage array', 'drug testing',
       'electronics services agency invoice', 'enterprise',
       'enterprise fleet', 'enterprise fleet services',
       'enterprise rentals', 'extended warranty for filemaker servers',
       'filemaker', 'finance division check request',
       'freight security net', 'freight security trackers',
       'gang intelligence reporting', 'gas credit card charges',
       'gps air time', 'gps trackers', 'hard drives',
       'harris corp of melbourne fl', 'harris equipment invoice',
       'harris training invoice', 'hdmi video retrieval kits',
       'hidta distribution correction', 'hotline services',
       'hp monitors: "data color spyder3"', 'ideoa',
       'ideoa training conference', 'ideoa training fee for 40 officers',
       'independent audit 1505/657 funds',
       'ir cam, pole cams, counter surveillance', 'k9s',
       'keyw corporation of hanover', 'lexis nexis',
       'lodging, fbi training', 'lodging, ideoa conference',
       'membership renewal 12 officers', 'meridian it', 'money counter',
       'motorola apex 6000 portable radios', 'motorola radios',
       'narcotics destruction', 'no data', 'ocd vision', 'pen', 'pen link',
       'pen reg', 'pen-link pen registers', 'penlink',
       'penlink bluerayjukebox and one dex processor', 'phones',
       'pinhole camera', 'pinhole cameras', 'pinhole cams?', 'pods',
       'pole cams', 'postal equipment', 'pro-tech security sales invoice',
       'raid jacket', 'real estate database', 'redacted',
       'redxdefense narcotics test kits', 'redxdefense training',
       'reimbursement for city sticker', 'repairs for forefeited vehicles',
       'research electronics international',
       'research electronics international invoice',
       'sandoval custom creations for ?? cameral and ??',
       'school prevention programs', 'server',
       'servers from dhpc technologies', 'shredder', 'simplynas',
       'sony cameras and equipment', 'sprint', 'sprint phones',
       'sprint services', 'star witness', 'stingray',
       'subscription to ?? web access', 'swat equipment', 't mobile',
       't-mobile (for pen register investigation)',
       't-mobile wireless for pen register investigation', 't1 data line',
       't1 internet access', 'team cameras from b&h video',
       'technical training course', 'tmobile', 'training equipment',
       'trap and trace', 'trunking', 'trunking line',
       'unclear, bought from research electronics international',
       'unclear: "utilized during narcotic related investigations"',
       'undercover car',
       'unlimited web access to public record payable to?',
       'unlimited web access to? payable to?',
       'us cellular pen register coverage', 'us marshalls service/cpd',
       'vehicle repair', 'vehicle repairs', 'verizon boc phones',
       'verizon wireless',
       'video retrieval, 12 dell laptops, cover hardware conflict',
       'vision database', 'windows administrator training',
       'worldwide technologies invoice',
       'x100 cocaine test kits and swaps',
       'x14 license plate renewal stickers', 'x2 server rack power strips',
       'x2 video forensic enhancement software',
       'x37 hours filemaker consulting', 'x40 lemart kits', 'x60 tasers',
       'xfm server'], dtype=object)

Group by purchase type


In [103]:
purchase_obj = checks.groupby('Purchase')

In [104]:
purchase_obj['Amount']


Out[104]:
<pandas.core.groupby.SeriesGroupBy object at 0x7f44555a05f8>

In [105]:
amounts_spent = purchase_obj['Amount'].agg('sum')

In [106]:
amounts = pd.DataFrame(amounts_spent)

In [107]:
amounts.sort_values('Amount', ascending=False)


Out[107]:
Amount
Purchase
enterprise 7204635.31
phones 2681492.34
lexis nexis 949351.27
no data 919346.59
undercover car 897918.00
ballistics lab 869668.11
motorola radios 728355.00
finance division check request 665950.75
20 unmarked ocd vehicles 619088.00
penlink 413209.34
hidta distribution correction 389823.46
enterprise rentals 348830.63
boc police department vehicles 340628.75
swat equipment 335000.00
dell computers 257333.60
pen-link pen registers 245920.89
car repair 241158.99
49 vehicles 213754.83
pods 205000.00
computer purchase 184571.60
verizon wireless 167172.34
stingray 157300.00
verizon boc phones 143260.06
pen link 137171.67
enterprise fleet 136448.53
harris corp of melbourne fl 136375.00
meridian it 131783.84
enterprise fleet services 105749.29
x60 tasers 101639.05
server 99489.19
... ...
"clear services" subscription 5884.00
hard drives 5586.00
unlimited web access to? payable to? 5569.25
trunking 5494.61
audio / video supplies 5487.40
hdmi video retrieval kits 5385.00
us marshalls service/cpd 5267.68
pinhole cameras 5240.00
gang intelligence reporting 5239.08
black widow surveillance 5220.00
sony cameras and equipment 5154.90
x40 lemart kits 4039.20
clear 3445.20
x100 cocaine test kits and swaps 3142.65
lodging, fbi training 1872.00
x2 video forensic enhancement software 1800.00
x14 license plate renewal stickers 1414.00
pen reg 1359.22
comcast 950.74
cat 5e cable 948.91
t1 data line 937.51
pen 932.11
membership renewal 12 officers 600.00
hotline services 478.40
real estate database 400.00
x2 server rack power strips 374.66
trunking line 256.93
at&t 188.67
reimbursement for city sticker 35.42
gas credit card charges 33.17

156 rows × 1 columns


In [108]:
amounts.sort_values('Amount', ascending=False)[0:10].plot(kind='barh', figsize=(9,5), title='1505 Expenditures by Purchase')


Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f44559c10f0>

In [109]:
import matplotlib
fig = plt.figure(figsize=(16,12))
ax = fig.add_subplot(111)
a = 0.7

# Remove grid lines
ax.grid(False)
# Remove plot frame
ax.set_frame_on(False)
 
# Title
ax.set_title(ax.get_title(), fontsize=36, alpha=a, ha='left')
plt.subplots_adjust(top=0.8)
# ax.title.set_position((0.5,1.08))
 
# Axis labels
# ax.xaxis.set_label_position('bottom')
# ylab = 'Amount Seized (USD)'
# ax.set_ylabel(ylab, fontsize=20, alpha=a, ha='left')

# People don't understand scientific notation
ax.get_yaxis().get_major_formatter().set_scientific(False)
ax.get_xaxis().get_major_formatter().set_scientific(False)

ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

amounts.sort_values('Amount', ascending=False)[1:6].plot(ax=ax, kind='barh', figsize=(9,5), title='1505 Expenditures by Purchase')


Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4455440908>

In [110]:
amounts.sum()


Out[110]:
Amount    22463428.36
dtype: float64

Group by routine/surveillance classification


In [111]:
class_obj = checks.groupby('Category')

In [112]:
amounts_by_class = class_obj.Amount.agg('sum')

In [113]:
amounts_category = pd.DataFrame(amounts_by_class)

In [114]:
amounts_category['Category'] = amounts_category.index

In [115]:
amounts_category.sort_values('Amount', ascending=False)[0:10].plot(kind='barh', figsize=(9,5), title='1505 Expenditures by Category')


Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f44553a5c50>

In [116]:
checks_to_plot_sur = checks[checks.Category == 'surveillance']
checks_to_plot_rou = checks[checks.Category == 'routine']

In [117]:
checks_to_plot = checks_to_plot_sur.append(checks_to_plot_rou)
class_obj = checks.groupby('Category')
amounts_by_class = class_obj.Amount.agg('sum')
amounts_category = pd.DataFrame(amounts_by_class)
amounts_category['Category'] = amounts_category.index

In [118]:
import matplotlib
fig = plt.figure(figsize=(16,12))
ax = fig.add_subplot(111)
a = 0.7

# Remove grid lines
ax.grid(False)
# Remove plot frame
ax.set_frame_on(False)
 
# Title
ax.set_title(ax.get_title(), fontsize=36, alpha=a, ha='left')
plt.subplots_adjust(top=0.8)
# ax.title.set_position((0.5,1.08))
 
# Axis labels
# ax.xaxis.set_label_position('bottom')
# ylab = 'Amount Seized (USD)'
# ax.set_ylabel(ylab, fontsize=20, alpha=a, ha='left')

# People don't understand scientific notation
ax.get_yaxis().get_major_formatter().set_scientific(False)
ax.get_xaxis().get_major_formatter().set_scientific(False)

ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

amounts_category.sort_values('Amount', ascending=False)[0:10].plot(ax=ax, kind='barh', figsize=(9,5), title='1505 Expenditures by Category')


Out[118]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f44559aa390>

In [119]:
# Compute % routine and surveillance
surveillance_amount = amounts_category[amounts_category['Category'] == 'surveillance']['Amount'].values
routine_amount = amounts_category[amounts_category['Category'] == 'routine']['Amount'].values

In [120]:
result_by_cat = amounts_category.sort_values('Amount', ascending=False)[0:10]

In [121]:
surveillance_amount


Out[121]:
array([ 1606035.7])

In [122]:
print('Percent surveillance: {}'.format(surveillance_amount/(surveillance_amount + routine_amount) * 100)) 
print('Percent routine: {}'.format(routine_amount/(surveillance_amount + routine_amount) * 100))


Percent surveillance: [ 8.16362909]
Percent routine: [ 91.83637091]

In [123]:
checks.to_csv('checks_aug24_cleaned.csv', index=False)

In [ ]:


In [ ]:


In [ ]:


In [ ]: