In [1]:
from pandas import *
import matplotlib as mpl
mpl.rc('figure', figsize=(12, 6))
set_printoptions(max_columns=18)
months = {'JAN' : 1, 'FEB' : 2, 'MAR' : 3, 'APR' : 4, 'MAY' : 5, 'JUN' : 6,
          'JUL' : 7, 'AUG' : 8, 'SEP' : 9, 'OCT': 10, 'NOV': 11, 'DEC' : 12}
of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick', 
               'Paul, Ron', 'Gingrich, Newt']
parties = {
  'Bachmann, Michelle': 'Republican',
  'Romney, Mitt': 'Republican',
  'Obama, Barack': 'Democrat',
  "Roemer, Charles E. 'Buddy' III": 'Reform',
  'Pawlenty, Timothy': 'Republican',
  'Johnson, Gary Earl': 'Libertarian',
  'Paul, Ron': 'Republican',
  'Santorum, Rick': 'Republican',
  'Cain, Herman': 'Republican',
  'Gingrich, Newt': 'Republican',
  'McCotter, Thaddeus G': 'Republican',
  'Huntsman, Jon': 'Republican',
  'Perry, Rick': 'Republican'           
 }

In [2]:
fec = read_csv('P00000001-ALL.txt')

In [3]:
fec


Out[3]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 536041 entries, 0 to 536040
Data columns:
cmte_id              536041  non-null values
cand_id              536041  non-null values
cand_nm              536041  non-null values
contbr_nm            536041  non-null values
contbr_city          536026  non-null values
contbr_st            536040  non-null values
contbr_zip           535973  non-null values
contbr_employer      531417  non-null values
contbr_occupation    531448  non-null values
contb_receipt_amt    536041  non-null values
contb_receipt_dt     536041  non-null values
receipt_desc         8479  non-null values
memo_cd              49718  non-null values
memo_text            52740  non-null values
form_tp              536041  non-null values
file_num             536041  non-null values
dtypes: float64(1), int64(1), object(14)

In [4]:
parties    # dictionary


Out[4]:
{'Bachmann, Michelle': 'Republican',
 'Cain, Herman': 'Republican',
 'Gingrich, Newt': 'Republican',
 'Huntsman, Jon': 'Republican',
 'Johnson, Gary Earl': 'Libertarian',
 'McCotter, Thaddeus G': 'Republican',
 'Obama, Barack': 'Democrat',
 'Paul, Ron': 'Republican',
 'Pawlenty, Timothy': 'Republican',
 'Perry, Rick': 'Republican',
 "Roemer, Charles E. 'Buddy' III": 'Reform',
 'Romney, Mitt': 'Republican',
 'Santorum, Rick': 'Republican'}

In [5]:
fec.cand_nm


Out[5]:
0     Bachmann, Michelle
1     Bachmann, Michelle
2     Bachmann, Michelle
3     Bachmann, Michelle
4     Bachmann, Michelle
5     Bachmann, Michelle
6     Bachmann, Michelle
7     Bachmann, Michelle
8     Bachmann, Michelle
9     Bachmann, Michelle
10    Bachmann, Michelle
11    Bachmann, Michelle
12    Bachmann, Michelle
13    Bachmann, Michelle
14    Bachmann, Michelle
...
536026    Perry, Rick
536027    Perry, Rick
536028    Perry, Rick
536029    Perry, Rick
536030    Perry, Rick
536031    Perry, Rick
536032    Perry, Rick
536033    Perry, Rick
536034    Perry, Rick
536035    Perry, Rick
536036    Perry, Rick
536037    Perry, Rick
536038    Perry, Rick
536039    Perry, Rick
536040    Perry, Rick
Name: cand_nm, Length: 536041

In [6]:
fec['party'] = fec.cand_nm.map(parties)  # map can take dictionary, series... 
                                         # Alternatively, use list comprehension: [parties[x] for x in fec.cand_nm]

In [8]:
fec.ix[0]    # A single record.


Out[8]:
cmte_id                       C00410118
cand_id                       P20002978
cand_nm              Bachmann, Michelle
contbr_nm               HARVEY, WILLIAM
contbr_city                      MOBILE
contbr_st                            AL
contbr_zip                    366010290
contbr_employer                 RETIRED
contbr_occupation               RETIRED
contb_receipt_amt                   250
contb_receipt_dt              20-JUN-11
receipt_desc                        NaN
memo_cd                             NaN
memo_text                           NaN
form_tp                           SA17A
file_num                         736166
party                        Republican
Name: 0

In [9]:
fec.party.value_counts()   # Doing it high level way using histogram.


Out[9]:
Democrat       292400
Republican     237575
Reform           5364
Libertarian       702

In [10]:
np.unique(fec.party)


Out[10]:
134010       Democrat
361727    Libertarian
356462         Reform
6067       Republican
Name: party

In [12]:
fec.groupby('party')['contb_receipt_amt'].sum()


Out[12]:
party
Democrat       8.105758e+07
Libertarian    4.132769e+05
Reform         3.390338e+05
Republican     1.192255e+08
Name: contb_receipt_amt

In [13]:
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()   # Not a python date object.


Out[13]:
party     contb_receipt_dt
Democrat  01-AUG-11           175281.00
          01-DEC-11           651532.82
          01-JAN-12            58098.80
          01-JUL-11           165961.00
          01-JUN-11           145459.00
          01-MAY-11            82644.00
          01-NOV-11           122529.87
          01-OCT-11           148977.00
          01-SEP-11           403297.62
          02-AUG-11           164510.11
          02-DEC-11           216056.96
          02-JAN-12            89743.60
          02-JUL-11            17105.00
          02-JUN-11           422453.00
          02-MAY-11           396675.00
...
Republican  30-JUL-11             12249.04
            30-JUN-11           2744932.63
            30-MAR-11             50240.00
            30-MAY-11             17803.60
            30-NOV-11            809014.83
            30-OCT-11             43913.16
            30-SEP-11           4886331.76
            31-AUG-11           1017735.02
            31-DEC-11           1094376.72
            31-JAN-11              6000.00
            31-JAN-12            869890.41
            31-JUL-11             12781.02
            31-MAR-11             62475.00
            31-MAY-11            301339.80
            31-OCT-11            734601.83
Name: contb_receipt_amt, Length: 1183

In [14]:
fec.contb_receipt_dt[0]


Out[14]:
'20-JUN-11'

In [15]:
print months


{'FEB': 2, 'AUG': 8, 'JAN': 1, 'DEC': 12, 'OCT': 10, 'MAR': 3, 'SEP': 9, 'MAY': 5, 'JUN': 6, 'JUL': 7, 'APR': 4, 'NOV': 11}

In [16]:
def convert_date(val):    # function to convert date to python's date object.
    d, m, y = val.split('-')
    m = months[m]
    return datetime(int ('20' + y), m, int(d))

fec.contb_receipt_dt   # output a list of date (not yet in python objects).
fec['contb_receipt_dt'] = fec.contb_receipt_dt.map(convert_date)

In [17]:
fec.contb_receipt_dt[0]


Out[17]:
datetime.datetime(2011, 6, 20, 0, 0)

In [18]:
fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum() # Same as above, however, date converted to python object.


Out[18]:
party     contb_receipt_dt
Democrat  2011-04-04          640235.12
          2011-04-05          307641.00
          2011-04-06          247542.59
          2011-04-07          252336.00
          2011-04-08          295452.00
          2011-04-09          105879.00
          2011-04-10          100373.00
          2011-04-11          360728.00
          2011-04-12          405160.46
          2011-04-13          382836.33
          2011-04-14          306643.50
          2011-04-15          408377.00
          2011-04-16           82658.00
          2011-04-17           70802.00
          2011-04-18          501971.37
...
Republican  2012-01-17           625365.77
            2012-01-18           888681.17
            2012-01-19          1066250.23
            2012-01-20           401298.03
            2012-01-21           374261.81
            2012-01-22           507168.71
            2012-01-23           645477.15
            2012-01-24           462233.66
            2012-01-25           416931.39
            2012-01-26           256406.86
            2012-01-27           368441.82
            2012-01-28            82775.80
            2012-01-29            75220.02
            2012-01-30           255204.80
            2012-01-31           869890.41
Name: contb_receipt_amt, Length: 1183

In [19]:
by_date = fec.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()

In [22]:
by_date.unstack('party')   # Indexes are dates. Columns are political parties.


Out[22]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 376 entries, 2011-01-01 00:00:00 to 2012-01-31 00:00:00
Data columns:
Democrat       303  non-null values
Libertarian    190  non-null values
Reform         314  non-null values
Republican     376  non-null values
dtypes: float64(4)

In [23]:
by_date.unstack('party').cumsum().plot()


Out[23]:
<matplotlib.axes.AxesSubplot at 0x106e6ef90>

In [24]:
s = by_date.unstack('party')[-5:].stack()

In [25]:
s    # Note the inner level indexes. Not every date has 3 parties. Top level indexes are dates. stack() makes 1 dimensional.
     # Labels propagated down for all 3 observations: political parties.


Out[25]:
contb_receipt_dt  party     
2012-01-27        Democrat       305785.47
                  Reform           3176.37
                  Republican     368441.82
2012-01-28        Democrat       235492.85
                  Reform            175.00
                  Republican      82775.80
2012-01-29        Democrat        93177.00
                  Reform            200.00
                  Republican      75220.02
2012-01-30        Democrat       435921.72
                  Reform            130.00
                  Republican     255204.80
2012-01-31        Democrat      1418410.31
                  Reform            150.00
                  Republican     869890.41

In [30]:
set_printoptions(notebook_repr_html=False)  # turn off html, i.e., without table.
s.unstack()    # unstack() 2 dimensional.


Out[30]:
party               Democrat   Reform  Republican
contb_receipt_dt                                 
2012-01-27         305785.47  3176.37   368441.82
2012-01-28         235492.85   175.00    82775.80
2012-01-29          93177.00   200.00    75220.02
2012-01-30         435921.72   130.00   255204.80
2012-01-31        1418410.31   150.00   869890.41

In [31]:
s.unstack().stack()  # 2 dimensional to 1 dimensional


Out[31]:
contb_receipt_dt  party     
2012-01-27        Democrat       305785.47
                  Reform           3176.37
                  Republican     368441.82
2012-01-28        Democrat       235492.85
                  Reform            175.00
                  Republican      82775.80
2012-01-29        Democrat        93177.00
                  Reform            200.00
                  Republican      75220.02
2012-01-30        Democrat       435921.72
                  Reform            130.00
                  Republican     255204.80
2012-01-31        Democrat      1418410.31
                  Reform            150.00
                  Republican     869890.41

In [34]:
df = s.unstack('party')   # data frame

In [33]:
s.unstack('contb_receipt_dt')


Out[33]:
contb_receipt_dt  2012-01-27  2012-01-28  2012-01-29  2012-01-30  2012-01-31
party                                                                       
Democrat           305785.47   235492.85    93177.00   435921.72  1418410.31
Reform               3176.37      175.00      200.00      130.00      150.00
Republican         368441.82    82775.80    75220.02   255204.80   869890.41

In [35]:
concat([df, df], axis=1, keys=['A', 'B'])  # join 2 copies of data frame.
                                           # standard hierarchical indexing stuff.


Out[35]:
                           A                                B                     
party               Democrat   Reform  Republican    Democrat   Reform  Republican
contb_receipt_dt                                                                  
2012-01-27         305785.47  3176.37   368441.82   305785.47  3176.37   368441.82
2012-01-28         235492.85   175.00    82775.80   235492.85   175.00    82775.80
2012-01-29          93177.00   200.00    75220.02    93177.00   200.00    75220.02
2012-01-30         435921.72   130.00   255204.80   435921.72   130.00   255204.80
2012-01-31        1418410.31   150.00   869890.41  1418410.31   150.00   869890.41

In [36]:
concat([df, df], axis=1, keys=['A', 'B'])['A']   # select out columns by the first key 'A'. Or select out the 'B' group.conc


Out[36]:
party               Democrat   Reform  Republican
contb_receipt_dt                                 
2012-01-27         305785.47  3176.37   368441.82
2012-01-28         235492.85   175.00    82775.80
2012-01-29          93177.00   200.00    75220.02
2012-01-30         435921.72   130.00   255204.80
2012-01-31        1418410.31   150.00   869890.41

In [37]:
concat([df, df], axis=1, keys=['A', 'B']).stack('party')


Out[37]:
                                      A           B
contb_receipt_dt party                             
2012-01-27       Democrat     305785.47   305785.47
                 Reform         3176.37     3176.37
                 Republican   368441.82   368441.82
2012-01-28       Democrat     235492.85   235492.85
                 Reform          175.00      175.00
                 Republican    82775.80    82775.80
2012-01-29       Democrat      93177.00    93177.00
                 Reform          200.00      200.00
                 Republican    75220.02    75220.02
2012-01-30       Democrat     435921.72   435921.72
                 Reform          130.00      130.00
                 Republican   255204.80   255204.80
2012-01-31       Democrat    1418410.31  1418410.31
                 Reform          150.00      150.00
                 Republican   869890.41   869890.41

In [38]:
concat([df, df], axis=1, keys=['A', 'B']).stack(0)  # The group indexes have no labels.


Out[38]:
party                 Democrat   Reform  Republican
contb_receipt_dt                                   
2012-01-27       A   305785.47  3176.37   368441.82
                 B   305785.47  3176.37   368441.82
2012-01-28       A   235492.85   175.00    82775.80
                 B   235492.85   175.00    82775.80
2012-01-29       A    93177.00   200.00    75220.02
                 B    93177.00   200.00    75220.02
2012-01-30       A   435921.72   130.00   255204.80
                 B   435921.72   130.00   255204.80
2012-01-31       A  1418410.31   150.00   869890.41
                 B  1418410.31   150.00   869890.41

In [40]:
result = fec.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()   # Top contributors to candidates by occupation.

In [41]:
result  # is a data frame instead of series.


Out[41]:
cand_nm             contbr_occupation             
Bachmann, Michelle  100% DISABLED VETERAN              1869.50
                    100% DISABLED VIETNAM VETERAN       236.25
                    A/C AND HEAT                        250.00
                    ACADEMIC EMPLOYMENT SPECIALIST       25.00
                    ACCOUNT                              50.00
                    ACCOUNT ANALYST                     200.00
                    ACCOUNT EXECUTIVE                   100.00
                    ACCOUNT MANAGER                     230.00
                    ACCOUNTANT                        12979.00
                    ACCOUNTANT TRAINER                  250.00
                    ACCOUNTING                          350.00
                    ACCOUNTING MANAGER                   50.00
                    ACCOUNTING SPECIALIST               250.00
                    ACCOUNTING SUPERVISOR               125.00
                    ACCOUNTING/AUDITING                 700.00
...
Santorum, Rick  WILDLIFE REFUGE MANAGER       200.00
                WIND FARM DEVELOPER           400.00
                WINDOW CLEANER                 85.00
                WINERY                        300.00
                WIRELESS NETWORK ENGINEER     220.15
                WITHHELD                     1000.00
                WNG                           700.00
                WORKER II                     300.00
                WRITER                       4894.55
                WRITER ACTIVIST              1112.00
                WRITER EDITOR                 250.00
                WRITER/ACCOUNT EXEC           500.00
                WRITER/JOURNALIST             250.00
                WRITER/LAWYER                 300.00
                YOUTH MINISTER                250.00
Name: contb_receipt_amt, Length: 40171

In [42]:
result[:, 'ZOMBIE SLAYER']


Out[42]:
cand_nm
Paul, Ron    1556
Name: contb_receipt_amt

In [43]:
result = fec.groupby(['cand_nm', 'contbr_occupation']).sum()  # When doing this aggregation, we don't need to select the columns as above.
# There are columns that are not possible to sum. The code actually tries to sum them all: nuisance columns. Whenever it fails, it just silently drops them out.


Out[43]:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 40171 entries, ('Bachmann, Michelle', '100% DISABLED VETERAN') to ('Santorum, Rick', 'YOUTH MINISTER')
Data columns:
contb_receipt_amt    40171  non-null values
file_num             40171  non-null values
dtypes: float64(2)

In [44]:
result = result.reset_index()  # opposite of set_index(). Took 'cand_nm' and 'contbr_occupation' and set them as col indexes again, so that the indexes are just simple integer indexes.

In [45]:
result


Out[45]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40171 entries, 0 to 40170
Data columns:
cand_nm              40171  non-null values
contbr_occupation    40171  non-null values
contb_receipt_amt    40171  non-null values
dtypes: float64(1), object(2)

In [48]:
# We have so far aggregated total amount donated to each political party by occupation (the first groupby).
# Additional groupby let us split them up by candidate, sort them in descending order by total amount donated and take the top 5.
set_printoptions(notebook_repr_html=True)
def top5_donors(group):
    return group.sort_index(by='contb_receipt_amt')[-5:]
# top5_donors(result)
result.groupby('cand_nm').apply(top5_donors)   # The retired donated the most to Barack Obama.


Out[48]:
cand_nm contbr_occupation contb_receipt_amt
cand_nm None
Bachmann, Michelle 921 Bachmann, Michelle PRESIDENT 68781.00
Bachmann, Michelle 893 Bachmann, Michelle PHYSICIAN 76134.00
Bachmann, Michelle 555 Bachmann, Michelle HOMEMAKER 102540.00
Bachmann, Michelle 594 Bachmann, Michelle INFORMATION REQUESTED 147638.89
Bachmann, Michelle 1059 Bachmann, Michelle RETIRED 824618.05
Cain, Herman 1537 Cain, Herman CEO 125097.00
Cain, Herman 2227 Cain, Herman PHYSICIAN 125790.96
Cain, Herman 1898 Cain, Herman HOMEMAKER 153964.07
Cain, Herman 2391 Cain, Herman RETIRED 779635.51
Cain, Herman 1929 Cain, Herman INFORMATION REQUESTED 3838719.32
Gingrich, Newt 4068 Gingrich, Newt OWNER 273310.00
Gingrich, Newt 4222 Gingrich, Newt PRESIDENT 332403.00
Gingrich, Newt 3561 Gingrich, Newt HOMEMAKER 426118.26
Gingrich, Newt 3611 Gingrich, Newt INFORMATION REQUESTED PER BEST EFFORTS 910147.00
Gingrich, Newt 4484 Gingrich, Newt RETIRED 2279602.27
Huntsman, Jon 5470 Huntsman, Jon PRESIDENT 92251.00
Huntsman, Jon 4984 Huntsman, Jon ATTORNEY 143532.50
Huntsman, Jon 5288 Huntsman, Jon HOMEMAKER 241524.79
Huntsman, Jon 5563 Huntsman, Jon RETIRED 352669.00
Huntsman, Jon 5301 Huntsman, Jon INFORMATION REQUESTED PER BEST EFFORTS 354500.00
Johnson, Gary Earl 5773 Johnson, Gary Earl HOUSEWIFE 11000.00
Johnson, Gary Earl 5855 Johnson, Gary Earl SELF-EMPLOYED 14750.00
Johnson, Gary Earl 5735 Johnson, Gary Earl CONSULTANT 16767.76
Johnson, Gary Earl 5853 Johnson, Gary Earl SELF EMPLOYED 18781.00
Johnson, Gary Earl 5845 Johnson, Gary Earl RETIRED 85763.96
McCotter, Thaddeus G 5906 McCotter, Thaddeus G HOMEMAKER 2400.00
McCotter, Thaddeus G 5918 McCotter, Thaddeus G RETIRED 2450.00
McCotter, Thaddeus G 5904 McCotter, Thaddeus G GOVERNMENT 2500.00
McCotter, Thaddeus G 5912 McCotter, Thaddeus G PHYSICIAN 3650.00
McCotter, Thaddeus G 5902 McCotter, Thaddeus G EXECUTIVE 3750.00
Obama, Barack 16658 Obama, Barack PHYSICIAN 2244445.92
Obama, Barack 13527 Obama, Barack INFORMATION REQUESTED 2647247.90
Obama, Barack 13113 Obama, Barack HOMEMAKER 2780203.60
Obama, Barack 7091 Obama, Barack ATTORNEY 7112343.35
Obama, Barack 18862 Obama, Barack RETIRED 15086766.92
Paul, Ron 25806 Paul, Ron HOMEMAKER 310823.42
Paul, Ron 28349 Paul, Ron REQUESTED 381440.72
Paul, Ron 27649 Paul, Ron PHYSICIAN 509300.85
Paul, Ron 24957 Paul, Ron ENGINEER 612693.61
Paul, Ron 28461 Paul, Ron RETIRED 1799513.76
Pawlenty, Timothy 30758 Pawlenty, Timothy INFORMATION REQUESTED PER BEST EFFORTS 197841.18
Pawlenty, Timothy 30902 Pawlenty, Timothy PRESIDENT 211708.92
Pawlenty, Timothy 30517 Pawlenty, Timothy ATTORNEY 238331.10
Pawlenty, Timothy 30978 Pawlenty, Timothy RETIRED 584195.00
Pawlenty, Timothy 30750 Pawlenty, Timothy HOMEMAKER 769514.95
Perry, Rick 31205 Perry, Rick ATTORNEY 768778.80
Perry, Rick 32352 Perry, Rick PRESIDENT 845150.00
Perry, Rick 32546 Perry, Rick RETIRED 1091671.00
Perry, Rick 31880 Perry, Rick HOMEMAKER 1818443.00
Perry, Rick 31916 Perry, Rick INFORMATION REQUESTED PER BEST EFFORTS 1975508.76
Roemer, Charles E. 'Buddy' III 34273 Roemer, Charles E. 'Buddy' III STUDENT 6733.34
Roemer, Charles E. 'Buddy' III 33538 Roemer, Charles E. 'Buddy' III HOMEMAKER 8195.00
Roemer, Charles E. 'Buddy' III 33004 Roemer, Charles E. 'Buddy' III ATTORNEY 14186.00
Roemer, Charles E. 'Buddy' III 33584 Roemer, Charles E. 'Buddy' III INFORMATION REQUESTED 18707.37
Roemer, Charles E. 'Buddy' III 34125 Roemer, Charles E. 'Buddy' III RETIRED 73000.31
Romney, Mitt 37415 Romney, Mitt PRESIDENT 1522751.99
Romney, Mitt 34633 Romney, Mitt ATTORNEY 3662610.21
Romney, Mitt 36325 Romney, Mitt HOMEMAKER 5442013.48
Romney, Mitt 36419 Romney, Mitt INFORMATION REQUESTED PER BEST EFFORTS 6394753.06
Romney, Mitt 37919 Romney, Mitt RETIRED 6483596.24
Santorum, Rick 38865 Santorum, Rick ATTORNEY 107130.00
Santorum, Rick 39189 Santorum, Rick EXECUTIVE 131456.55
Santorum, Rick 39350 Santorum, Rick INFORMATION REQUESTED (BEST EFFORTS) 284718.10
Santorum, Rick 39314 Santorum, Rick HOMEMAKER 350597.75
Santorum, Rick 39822 Santorum, Rick RETIRED 421952.98

In [49]:
df._data   # Internal data structure. The back data frame. Having hierarchical indexing medicates a lot of the need for higher dimensional objects (because they're very sparse).


Out[49]:
BlockManager
Items: array([Democrat, Reform, Republican], dtype=object)
Axis 1: <class 'pandas.tseries.index.DatetimeIndex'>
[2012-01-27 00:00:00, ..., 2012-01-31 00:00:00]
Length: 5, Freq: None, Timezone: None
FloatBlock: array([Democrat, Reform, Republican], dtype=object), 3 x 5, dtype float64

In [ ]:
# Time Series for financial data will be improved next.