In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline
from pandas import DataFrame, Series

In [17]:
fec = pd.read_csv('ch09/P00000001-ALL.csv')


Out[17]:
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num
0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166

In [19]:
fec[-5:-1]


Out[19]:
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num
1001726 C00500587 P20003281 Perry, Rick GORMAN, CHRIS D. MR. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 5000.0 29-SEP-11 REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... NaN REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... SA17A 751678
1001727 C00500587 P20003281 Perry, Rick DUFFY, DAVID A. MR. INFO REQUESTED XX 99999 DUFFY EQUIPMENT COMPANY INC. BUSINESS OWNER 2500.0 30-SEP-11 NaN NaN NaN SA17A 751678
1001728 C00500587 P20003281 Perry, Rick GRANE, BRYAN F. MR. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 500.0 29-SEP-11 NaN NaN NaN SA17A 751678
1001729 C00500587 P20003281 Perry, Rick TOLBERT, DARYL MR. INFO REQUESTED XX 99999 T.A.C.C. LONGWALL MAINTENANCE FOREMAN 500.0 30-SEP-11 NaN NaN NaN SA17A 751678

In [20]:
fec.ix[123456]


Out[20]:
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object

In [21]:
unique_cands = fec.cand_nm.unique()
unique_cands


Out[21]:
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick', 'Cain, Herman',
       'Gingrich, Newt', 'McCotter, Thaddeus G', 'Huntsman, Jon',
       'Perry, Rick'], dtype=object)

In [22]:
unique_cands[2]


Out[22]:
'Obama, Barack'

In [23]:
parties = {'Backmann, Michelle': 'Republican',
          'Cain, Herman': 'Republican',
          'Gingrich, Newt': 'Republican',
          'Huntsman, Jon': 'Republican',
            'Johnson, Gary Earl': 'Republican',
          'McCotter, Thaddeus G': 'Republican',
          'Obama, Barack': 'Democrat',
          'Paul, Ron':'Republican',
          'Pawlenty, Timothy':'Republican',
          'Perry, Rick':'Republican',
          "Roemer, Charles E. 'Buddy' III": 'Republican',
          'Romney, Mitt': 'Republican',
          'Santorum, Rick': 'Republican'}

In [24]:
fec.cand_nm[123456:123461]


Out[24]:
123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object

In [25]:
fec.cand_nm[123456:123461].map(parties)


Out[25]:
123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object

In [26]:
# 将其添加为一个新列
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()


Out[26]:
Democrat      593746
Republican    394845
Name: party, dtype: int64

In [27]:
(fec.contb_receipt_amt > 0).value_counts()


Out[27]:
True     991475
False     10256
Name: contb_receipt_amt, dtype: int64

In [28]:
fec = fec[fec.contb_receipt_amt > 0]

In [29]:
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]

In [41]:
fec.groupby('cand_nm')['cand_nm'].count().sort_values(ascending=False)[:2]


Out[41]:
cand_nm
Obama, Barack    589127
Paul, Ron        143161
Name: cand_nm, dtype: int64

In [43]:
Series.sort_values?

In [44]:
fec.contbr_occupation.value_counts()[:10]


Out[44]:
RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: contbr_occupation, dtype: int64

In [45]:
fec.cand_nm.value_counts()[:2]


Out[45]:
Obama, Barack    589127
Paul, Ron        143161
Name: cand_nm, dtype: int64

In [50]:
occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O.':'CEO'
}
#如果没有提供相关映射,则返回x
f = lambda x: occ_mapping.get(x, x)
fec.loc[:, 'contbr_occupation']  = fec.loc[:, 'contbr_occupation'].map(f)

In [55]:
fec.contbr_occupation.value_counts()[:10]


Out[55]:
RETIRED         233990
NOT PROVIDED     57151
ATTORNEY         34286
HOMEMAKER        29931
PHYSICIAN        23432
ENGINEER         14334
TEACHER          13990
CONSULTANT       13273
PROFESSOR        12555
NOT EMPLOYED      9828
Name: contbr_occupation, dtype: int64

In [58]:
by_occupation = fec.pivot_table('contb_receipt_amt',
                               index='contbr_occupation',
                               columns='party', aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
over_2mm


Out[58]:
party Democrat Republican
contbr_occupation
ATTORNEY 11141982.97 7.430110e+06
CEO 2074974.79 4.157392e+06
CONSULTANT 2459912.71 2.511957e+06
ENGINEER 951525.55 1.768609e+06
EXECUTIVE 1355161.05 4.100324e+06
HOMEMAKER 4248875.80 1.352865e+07
INVESTOR 884133.00 2.410658e+06
LAWYER 3160478.87 3.828063e+05
MANAGER 762883.22 1.419911e+06
NOT PROVIDED 4866973.96 2.041612e+07
OWNER 1001567.36 2.344497e+06
PHYSICIAN 3735124.94 3.517001e+06
PRESIDENT 1878509.95 4.652093e+06
PROFESSOR 2165071.08 2.887827e+05
REAL ESTATE 528902.09 1.616627e+06
RETIRED 25305116.38 2.270646e+07
SELF-EMPLOYED 672393.40 1.632071e+06

In [59]:
over_2mm.plot(kind='barh')


Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x4c34c0b8>

In [61]:
fec_mrbo.groupby('cand_nm')['contb_receipt_amt'].value_counts()[:10]


Out[61]:
cand_nm        contb_receipt_amt
Obama, Barack  100.0                99976
               50.0                 86219
               25.0                 73993
               250.0                44853
               35.0                 33753
               500.0                29070
               10.0                 28320
               200.0                17899
               2500.0               16087
               15.0                 15174
Name: contb_receipt_amt, dtype: int64

In [66]:
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    
    #根据key对totals进行降序排列
    return totals.sort_values(ascending=False)[:7]

In [67]:
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)


Out[67]:
cand_nm        contbr_occupation                     
Obama, Barack  RETIRED                                   25305116.38
               ATTORNEY                                  11141982.97
               INFORMATION REQUESTED                      4866973.96
               HOMEMAKER                                  4248875.80
               PHYSICIAN                                  3735124.94
               LAWYER                                     3160478.87
               CONSULTANT                                 2459912.71
Romney, Mitt   RETIRED                                   11508473.59
               INFORMATION REQUESTED PER BEST EFFORTS    11396894.84
               HOMEMAKER                                  8147446.22
               ATTORNEY                                   5364718.82
               PRESIDENT                                  2491244.89
               EXECUTIVE                                  2300947.03
               C.E.O.                                     1968386.11
Name: contb_receipt_amt, dtype: float64

In [68]:
bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
labels


Out[68]:
411           (10, 100]
412         (100, 1000]
413         (100, 1000]
414           (10, 100]
415           (10, 100]
416           (10, 100]
417         (100, 1000]
418           (10, 100]
419         (100, 1000]
420           (10, 100]
421           (10, 100]
422         (100, 1000]
423         (100, 1000]
424         (100, 1000]
425         (100, 1000]
426         (100, 1000]
427       (1000, 10000]
428         (100, 1000]
429         (100, 1000]
430           (10, 100]
431       (1000, 10000]
432         (100, 1000]
433         (100, 1000]
434         (100, 1000]
435         (100, 1000]
436         (100, 1000]
437           (10, 100]
438         (100, 1000]
439         (100, 1000]
440           (10, 100]
              ...      
701356        (10, 100]
701357          (1, 10]
701358        (10, 100]
701359        (10, 100]
701360        (10, 100]
701361        (10, 100]
701362      (100, 1000]
701363        (10, 100]
701364        (10, 100]
701365        (10, 100]
701366        (10, 100]
701367        (10, 100]
701368      (100, 1000]
701369        (10, 100]
701370        (10, 100]
701371        (10, 100]
701372        (10, 100]
701373        (10, 100]
701374        (10, 100]
701375        (10, 100]
701376    (1000, 10000]
701377        (10, 100]
701378        (10, 100]
701379      (100, 1000]
701380    (1000, 10000]
701381        (10, 100]
701382      (100, 1000]
701383          (1, 10]
701384        (10, 100]
701385      (100, 1000]
Name: contb_receipt_amt, dtype: category
Categories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]

In [69]:
grouped = fec_mrbo.groupby(['cand_nm', labels])
grouped.size().unstack(0)


Out[69]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 493.0 77.0
(1, 10] 40070.0 3681.0
(10, 100] 372280.0 31853.0
(100, 1000] 153991.0 43357.0
(1000, 10000] 22284.0 26186.0
(10000, 100000] 2.0 1.0
(100000, 1000000] 3.0 NaN
(1000000, 10000000] 4.0 NaN

In [70]:
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums


Out[70]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 318.24 77.00
(1, 10] 337267.62 29819.66
(10, 100] 20288981.41 1987783.76
(100, 1000] 54798531.46 22363381.69
(1000, 10000] 51753705.67 63942145.42
(10000, 100000] 59100.00 12700.00
(100000, 1000000] 1490683.08 NaN
(1000000, 10000000] 7148839.76 NaN

In [71]:
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
normed_sums


Out[71]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 0.805182 0.194818
(1, 10] 0.918767 0.081233
(10, 100] 0.910769 0.089231
(100, 1000] 0.710176 0.289824
(1000, 10000] 0.447326 0.552674
(10000, 100000] 0.823120 0.176880
(100000, 1000000] 1.000000 NaN
(1000000, 10000000] 1.000000 NaN

In [72]:
normed_sums[:-2].plot(kind='barh', stacked=True)


Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0x46e6ccc0>

In [73]:
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals[:10]


Out[73]:
cand_nm Obama, Barack Romney, Mitt
contbr_st
AA 56405.00 135.00
AB 2048.00 0.00
AE 42973.75 5680.00
AK 281840.15 86204.24
AL 543123.48 527303.51
AP 37130.50 1655.00
AR 359247.28 105556.00
AS 2955.00 0.00
AZ 1506476.98 1888436.23
CA 23824984.24 11237636.60

In [74]:
percent = totals.div(totals.sum(1), axis=0)
percent[:10]


Out[74]:
cand_nm Obama, Barack Romney, Mitt
contbr_st
AA 0.997612 0.002388
AB 1.000000 0.000000
AE 0.883257 0.116743
AK 0.765778 0.234222
AL 0.507390 0.492610
AP 0.957329 0.042671
AR 0.772902 0.227098
AS 1.000000 0.000000
AZ 0.443745 0.556255
CA 0.679498 0.320502

In [77]:
from mpl_toolkits.basemap import Basemap, cm
from matplotlib import rcParams
from matplotlib.collections import LineCollection
#from shapelib import ShapeFile
#import dbflib

In [78]:
obama = percent['Obama, Barack']

fig = plt.figure(figsize=(12, 12))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

lllat = 21; urlat = 53; lllon=-118; urlon=-62

m = Basemap(ax=ax, projection='stere',
           lon_0=(urlon+lllon)/2, lat_0=(urlat + lllat) / 2,
           llcrnrlat=lllat, urcrnrlat=urlat, llcrnrlon=lllon,
           urcrnrlon=urlon, resolution='l')
m.drawcoastlines()
m.drawcountries()

plt.show()



In [ ]:


In [ ]: