In [1]:

    
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

Quantile and bucket analysis



In [2]:

    
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
factor = pd.cut(frame.data1, 4)
factor[:10]









    Out[2]:





0    (0.0745, 1.679]
1    (-1.53, 0.0745]
2    (-1.53, 0.0745]
3    (-1.53, 0.0745]
4     (1.679, 3.284]
5    (0.0745, 1.679]
6    (-1.53, 0.0745]
7    (-1.53, 0.0745]
8    (-1.53, 0.0745]
9    (0.0745, 1.679]
Name: data1, dtype: category
Categories (4, object): [(-3.142, -1.53] < (-1.53, 0.0745] < (0.0745, 1.679] < (1.679, 3.284]]



In [3]:

    
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()









    Out[3]:






  
    
      
      count
      max
      mean
      min
    
    
      data1
      
      
      
      
    
  
  
    
      (-3.142, -1.53]
      56.0
      1.755423
      -0.043593
      -2.616105
    
    
      (-1.53, 0.0745]
      477.0
      3.291334
      -0.047229
      -3.429891
    
    
      (0.0745, 1.679]
      419.0
      2.502201
      -0.005529
      -2.895748
    
    
      (1.679, 3.284]
      48.0
      2.242241
      0.063398
      -2.087416



In [4]:

    
grouping = pd.qcut(frame.data1, 10, labels=False)

grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Example: Filling missing values with group-specific values



In [5]:

    
s = Series(np.random.randn(6))
s[::2] = np.nan
s









    Out[5]:





0         NaN
1    0.177312
2         NaN
3   -0.815817
4         NaN
5    1.319567
dtype: float64



In [6]:

    
s.fillna(s.mean())









    Out[6]:





0    0.227021
1    0.177312
2    0.227021
3   -0.815817
4    0.227021
5    1.319567
dtype: float64



In [7]:

    
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data









    Out[7]:





Ohio         -0.166692
New York     -0.327077
Vermont            NaN
Florida       0.588951
Oregon       -0.425951
Nevada             NaN
California    1.268341
Idaho              NaN
dtype: float64



In [8]:

    
data.groupby(group_key).mean()









    Out[8]:





East    0.031727
West    0.421195
dtype: float64



In [9]:

    
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)









    Out[9]:





Ohio         -0.166692
New York     -0.327077
Vermont       0.031727
Florida       0.588951
Oregon       -0.425951
Nevada        0.421195
California    1.268341
Idaho         0.421195
dtype: float64



In [10]:

    
fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])

data.groupby(group_key).apply(fill_func)









    Out[10]:





Ohio         -0.166692
New York     -0.327077
Vermont       0.500000
Florida       0.588951
Oregon       -0.425951
Nevada       -1.000000
California    1.268341
Idaho        -1.000000
dtype: float64

Example: Random sampling and permutation



In [13]:

    
suits = ['H', 'S', 'C', 'D']
card_val = ([i for i in range(1, 11)] + [10] * 3) * 4
base_names = ['A'] + [i for i in range(2, 11)] + ['J', 'K', 'Q']
cards = []
for suit in ['H', 'S', 'C', 'D']:
    cards.extend(str(num) + suit for num in base_names)

deck = Series(card_val, index=cards)
deck[:13]









    Out[13]:





AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64



In [14]:

    
def draw(deck, n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)









    Out[14]:





JD    10
9D     9
8C     8
7H     7
QS    10
dtype: int64



In [15]:

    
get_suit = lambda card: card[-1]
deck.groupby(get_suit).apply(draw, n=2)









    Out[15]:





C  5C    5
   3C    3
D  8D    8
   7D    7
H  4H    4
   5H    5
S  7S    7
   8S    8
dtype: int64



In [16]:

    
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)









    Out[16]:





JC     10
2C      2
6D      6
8D      8
10H    10
JH     10
7S      7
9S      9
dtype: int64

Example: Group weighted average and correlation



In [17]:

    
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})
df



In [18]:

    
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)









    Out[18]:





category
a    0.078119
b    0.445973
dtype: float64



In [19]:

    
close_px = pd.read_csv('stock_px.csv', parse_dates=True, index_col=0)
close_px.info()









    



<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB



In [20]:

    
close_px[-4:]



In [21]:

    
rets = close_px.pct_change().dropna()
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)



In [22]:

    
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))









    Out[22]:





2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

Example: Group-wise linear regression



In [23]:

    
import statsmodels.api as sm
def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params
by_year.apply(regress, 'AAPL', ['SPX'])

Pivot tables and Cross-tabulation



In [25]:

    
tips = pd.read_csv('tips.csv')
tips.pivot_table(index=['sex', 'smoker'])



In [26]:

    
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                 columns='smoker')



In [27]:

    
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                 columns='smoker', margins=True)









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-27-b52005ad67d6> in <module>()
      1 tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
----> 2                  columns='smoker', margins=True)

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/tools/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
    162         table = _add_margins(table, data, values, rows=index,
    163                              cols=columns, aggfunc=aggfunc,
--> 164                              margins_name=margins_name)
    165 
    166     # discard the top level

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/tools/pivot.py in _add_margins(table, data, values, rows, cols, aggfunc, margins_name)
    187             raise ValueError(exception_msg)
    188 
--> 189     grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
    190 
    191     # could be passed a Series object with no 'columns'

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/tools/pivot.py in _compute_grand_margin(data, values, aggfunc, margins_name)
    248     if values:
    249         grand_margin = {}
--> 250         for k, v in data[values].iteritems():
    251             try:
    252                 if isinstance(aggfunc, compat.string_types):

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2051         if isinstance(key, (Series, np.ndarray, Index, list)):
   2052             # either boolean or fancy integer index
-> 2053             return self._getitem_array(key)
   2054         elif isinstance(key, DataFrame):
   2055             return self._getitem_frame(key)

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_array(self, key)
   2095             return self.take(indexer, axis=0, convert=False)
   2096         else:
-> 2097             indexer = self.ix._convert_to_indexer(key, axis=1)
   2098             return self.take(indexer, axis=1, convert=True)
   2099 

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1228                 mask = check == -1
   1229                 if mask.any():
-> 1230                     raise KeyError('%s not in index' % objarr[mask])
   1231 
   1232                 return _values_from_object(indexer)

KeyError: "['tip_pct'] not in index"



In [28]:

    
tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',
                 aggfunc=len, margins=True)









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-28-6d28cc32871b> in <module>()
      1 tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',
----> 2                  aggfunc=len, margins=True)

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/tools/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)
    162         table = _add_margins(table, data, values, rows=index,
    163                              cols=columns, aggfunc=aggfunc,
--> 164                              margins_name=margins_name)
    165 
    166     # discard the top level

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/tools/pivot.py in _add_margins(table, data, values, rows, cols, aggfunc, margins_name)
    187             raise ValueError(exception_msg)
    188 
--> 189     grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
    190 
    191     # could be passed a Series object with no 'columns'

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/tools/pivot.py in _compute_grand_margin(data, values, aggfunc, margins_name)
    248     if values:
    249         grand_margin = {}
--> 250         for k, v in data[values].iteritems():
    251             try:
    252                 if isinstance(aggfunc, compat.string_types):

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2051         if isinstance(key, (Series, np.ndarray, Index, list)):
   2052             # either boolean or fancy integer index
-> 2053             return self._getitem_array(key)
   2054         elif isinstance(key, DataFrame):
   2055             return self._getitem_frame(key)

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_array(self, key)
   2095             return self.take(indexer, axis=0, convert=False)
   2096         else:
-> 2097             indexer = self.ix._convert_to_indexer(key, axis=1)
   2098             return self.take(indexer, axis=1, convert=True)
   2099 

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1228                 mask = check == -1
   1229                 if mask.any():
-> 1230                     raise KeyError('%s not in index' % objarr[mask])
   1231 
   1232                 return _values_from_object(indexer)

KeyError: "['tip_pct'] not in index"



In [29]:

    
tips.pivot_table('size', index=['time', 'sex', 'smoker'],
                 columns='day', aggfunc='sum', fill_value=0)

Cross-tabulations: crosstab



In [30]:

    
from StringIO import StringIO
data = """\
Sample    Gender    Handedness
1    Female    Right-handed
2    Male    Left-handed
3    Female    Right-handed
4    Male    Right-handed
5    Male    Left-handed
6    Male    Right-handed
7    Female    Right-handed
8    Female    Left-handed
9    Male    Right-handed
10    Female    Right-handed"""
data = pd.read_table(StringIO(data), sep='\s+')









    



---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-30-bac5d2e861fe> in <module>()
----> 1 from StringIO import StringIO
      2 data = """Sample    Gender    Handedness
      3 1    Female    Right-handed
      4 2    Male    Left-handed
      5 3    Female    Right-handed

ModuleNotFoundError: No module named 'StringIO'

Example: 2012 Federal Election Commission Database



In [31]:

    
fec = pd.read_csv('P00000001-ALL.csv')
fec.info()









    



/Users/alexkirnas/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)






    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
cmte_id              1001731 non-null object
cand_id              1001731 non-null object
cand_nm              1001731 non-null object
contbr_nm            1001731 non-null object
contbr_city          1001712 non-null object
contbr_st            1001727 non-null object
contbr_zip           1001620 non-null object
contbr_employer      988002 non-null object
contbr_occupation    993301 non-null object
contb_receipt_amt    1001731 non-null float64
contb_receipt_dt     1001731 non-null object
receipt_desc         14166 non-null object
memo_cd              92482 non-null object
memo_text            97770 non-null object
form_tp              1001731 non-null object
file_num             1001731 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB



In [32]:

    
fec.ix[123456]









    Out[32]:





cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object



In [33]:

    
unique_cands = fec.cand_nm.unique()
unique_cands









    Out[33]:





array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick', 'Cain, Herman',
       'Gingrich, Newt', 'McCotter, Thaddeus G', 'Huntsman, Jon',
       'Perry, Rick'], dtype=object)



In [34]:

    
parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}



In [35]:

    
fec.cand_nm[123456:123461]









    Out[35]:





123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object



In [36]:

    
fec.cand_nm[123456:123461].map(parties)









    Out[36]:





123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object



In [37]:

    
fec['party'] = fec.cand_nm.map(parties)



In [38]:

    
fec['party'].value_counts()









    Out[38]:





Democrat      593746
Republican    407985
Name: party, dtype: int64



In [39]:

    
(fec.contb_receipt_amt > 0).value_counts()









    Out[39]:





True     991475
False     10256
Name: contb_receipt_amt, dtype: int64



In [40]:

    
fec = fec[fec.contb_receipt_amt > 0]



In [41]:

    
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]

Donation statistics by occupation and employer



In [42]:

    
fec.contbr_occupation.value_counts()[:10]









    Out[42]:





RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: contbr_occupation, dtype: int64



In [43]:

    
occ_mapping = {
   'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
   'INFORMATION REQUESTED' : 'NOT PROVIDED',
   'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',
   'C.E.O.': 'CEO'
}

f = lambda x: occ_mapping.get(x, x)
fec.contbr_occupation = fec.contbr_occupation.map(f)



In [44]:

    
emp_mapping = {
   'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
   'INFORMATION REQUESTED' : 'NOT PROVIDED',
   'SELF' : 'SELF-EMPLOYED',
   'SELF EMPLOYED' : 'SELF-EMPLOYED',
}

f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)



In [45]:

    
by_occupation = fec.pivot_table('contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party', aggfunc='sum')



In [46]:

    
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
over_2mm









    Out[46]:






  
    
      party
      Democrat
      Republican
    
    
      contbr_occupation
      
      
    
  
  
    
      ATTORNEY
      11141982.97
      7.477194e+06
    
    
      CEO
      2074974.79
      4.211041e+06
    
    
      CONSULTANT
      2459912.71
      2.544725e+06
    
    
      ENGINEER
      951525.55
      1.818374e+06
    
    
      EXECUTIVE
      1355161.05
      4.138850e+06
    
    
      HOMEMAKER
      4248875.80
      1.363428e+07
    
    
      INVESTOR
      884133.00
      2.431769e+06
    
    
      LAWYER
      3160478.87
      3.912243e+05
    
    
      MANAGER
      762883.22
      1.444532e+06
    
    
      NOT PROVIDED
      4866973.96
      2.056547e+07
    
    
      OWNER
      1001567.36
      2.408287e+06
    
    
      PHYSICIAN
      3735124.94
      3.594320e+06
    
    
      PRESIDENT
      1878509.95
      4.720924e+06
    
    
      PROFESSOR
      2165071.08
      2.967027e+05
    
    
      REAL ESTATE
      528902.09
      1.625902e+06
    
    
      RETIRED
      25305116.38
      2.356124e+07
    
    
      SELF-EMPLOYED
      672393.40
      1.640253e+06



In [48]:

    
%matplotlib inline
over_2mm.plot(kind='barh')









    Out[48]:





<matplotlib.axes._subplots.AxesSubplot at 0x11300d160>



In [55]:

    
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()

    return totals.order(ascending=False)[-n:]



In [56]:

    
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)









    



/Users/alexkirnas/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:4: FutureWarning: order is deprecated, use sort_values(...)






    Out[56]:





cand_nm        contbr_occupation                     
Obama, Barack  COMPUTER ASSISTANT                        3.0
               SPRINKLER FITTER FIRE PROTECTION SPECI    3.0
               ADMINISTRATION/INSTRUCTOR                 3.0
               LEAD UI/UX DEVELOPER                      3.0
               POLICY/ LAWYER                            3.0
               LAN/WAN ANALYST                           3.0
               SR MGR                                    3.0
Romney, Mitt   MD - UROLOGIST                            5.0
               DISTRICT REPRESENTATIVE                   5.0
               INDEPENDENT PROFESSIONAL                  3.0
               REMODELER & SEMI RETIRED                  3.0
               AFFORDABLE REAL ESTATE DEVELOPER          3.0
               IFC CONTRACTING SOLUTIONS                 3.0
               3RD GENERATION FAMILY BUSINESS OWNER      3.0
Name: contb_receipt_amt, dtype: float64



In [57]:

    
grouped.apply(get_top_amounts, 'contbr_employer', n=10)









    



/Users/alexkirnas/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:4: FutureWarning: order is deprecated, use sort_values(...)






    Out[57]:





cand_nm        contbr_employer                   
Obama, Barack  SOLIYA                                3.0
               CARR ENTERPRISES                      3.0
               PENN STATE DICKINSON SCHOOL OF LAW    3.0
               CADUCEUS OCCUPATIONAL MEDICINE        3.0
               N.A.                                  3.0
               REAL ENERGY CONSULTING SERVICES       3.0
               JPDSYSTEMS, LLC                       3.0
               CASS REGIONAL MED. CENTER             2.5
               ARCON CORP                            2.0
               THE VICTORIA GROUP, INC.              2.0
Romney, Mitt   EASTHAM CAPITAL                       5.0
               GREGORY GALLIVAN                      5.0
               DIRECT LENDERS LLC                    5.0
               LOUGH INVESTMENT ADVISORY LLC         4.0
               WATERWORKS INDUSRTIES                 3.0
               WILL MERRIFIELD                       3.0
               HONOLD COMMUNICTAIONS                 3.0
               INDEPENDENT PROFESSIONAL              3.0
               UPTOWN CHEAPSKATE                     3.0
               UN                                    3.0
Name: contb_receipt_amt, dtype: float64

Bucketing donation amounts



In [58]:

    
bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
labels









    Out[58]:





411           (10, 100]
412         (100, 1000]
413         (100, 1000]
414           (10, 100]
415           (10, 100]
416           (10, 100]
417         (100, 1000]
418           (10, 100]
419         (100, 1000]
420           (10, 100]
421           (10, 100]
422         (100, 1000]
423         (100, 1000]
424         (100, 1000]
425         (100, 1000]
426         (100, 1000]
427       (1000, 10000]
428         (100, 1000]
429         (100, 1000]
430           (10, 100]
431       (1000, 10000]
432         (100, 1000]
433         (100, 1000]
434         (100, 1000]
435         (100, 1000]
436         (100, 1000]
437           (10, 100]
438         (100, 1000]
439         (100, 1000]
440           (10, 100]
              ...      
701356        (10, 100]
701357          (1, 10]
701358        (10, 100]
701359        (10, 100]
701360        (10, 100]
701361        (10, 100]
701362      (100, 1000]
701363        (10, 100]
701364        (10, 100]
701365        (10, 100]
701366        (10, 100]
701367        (10, 100]
701368      (100, 1000]
701369        (10, 100]
701370        (10, 100]
701371        (10, 100]
701372        (10, 100]
701373        (10, 100]
701374        (10, 100]
701375        (10, 100]
701376    (1000, 10000]
701377        (10, 100]
701378        (10, 100]
701379      (100, 1000]
701380    (1000, 10000]
701381        (10, 100]
701382      (100, 1000]
701383          (1, 10]
701384        (10, 100]
701385      (100, 1000]
Name: contb_receipt_amt, dtype: category
Categories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]



In [59]:

    
grouped = fec_mrbo.groupby(['cand_nm', labels])
grouped.size().unstack(0)









    Out[59]:






  
    
      cand_nm
      Obama, Barack
      Romney, Mitt
    
    
      contb_receipt_amt
      
      
    
  
  
    
      (0, 1]
      493.0
      77.0
    
    
      (1, 10]
      40070.0
      3681.0
    
    
      (10, 100]
      372280.0
      31853.0
    
    
      (100, 1000]
      153991.0
      43357.0
    
    
      (1000, 10000]
      22284.0
      26186.0
    
    
      (10000, 100000]
      2.0
      1.0
    
    
      (100000, 1000000]
      3.0
      NaN
    
    
      (1000000, 10000000]
      4.0
      NaN



In [60]:

    
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums









    Out[60]:






  
    
      cand_nm
      Obama, Barack
      Romney, Mitt
    
    
      contb_receipt_amt
      
      
    
  
  
    
      (0, 1]
      318.24
      77.00
    
    
      (1, 10]
      337267.62
      29819.66
    
    
      (10, 100]
      20288981.41
      1987783.76
    
    
      (100, 1000]
      54798531.46
      22363381.69
    
    
      (1000, 10000]
      51753705.67
      63942145.42
    
    
      (10000, 100000]
      59100.00
      12700.00
    
    
      (100000, 1000000]
      1490683.08
      NaN
    
    
      (1000000, 10000000]
      7148839.76
      NaN



In [61]:

    
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
normed_sums









    Out[61]:






  
    
      cand_nm
      Obama, Barack
      Romney, Mitt
    
    
      contb_receipt_amt
      
      
    
  
  
    
      (0, 1]
      0.805182
      0.194818
    
    
      (1, 10]
      0.918767
      0.081233
    
    
      (10, 100]
      0.910769
      0.089231
    
    
      (100, 1000]
      0.710176
      0.289824
    
    
      (1000, 10000]
      0.447326
      0.552674
    
    
      (10000, 100000]
      0.823120
      0.176880
    
    
      (100000, 1000000]
      1.000000
      NaN
    
    
      (1000000, 10000000]
      1.000000
      NaN



In [62]:

    
normed_sums[:-2].plot(kind='barh', stacked=True)









    Out[62]:





<matplotlib.axes._subplots.AxesSubplot at 0x113453400>

Donation statistics by state



In [63]:

    
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals = totals[totals.sum(1) > 100000]
totals[:10]









    Out[63]:






  
    
      cand_nm
      Obama, Barack
      Romney, Mitt
    
    
      contbr_st
      
      
    
  
  
    
      AK
      281840.15
      86204.24
    
    
      AL
      543123.48
      527303.51
    
    
      AR
      359247.28
      105556.00
    
    
      AZ
      1506476.98
      1888436.23
    
    
      CA
      23824984.24
      11237636.60
    
    
      CO
      2132429.49
      1506714.12
    
    
      CT
      2068291.26
      3499475.45
    
    
      DC
      4373538.80
      1025137.50
    
    
      DE
      336669.14
      82712.00
    
    
      FL
      7318178.58
      8338458.81



In [64]:

    
percent = totals.div(totals.sum(1), axis=0)
percent[:10]









    Out[64]:






  
    
      cand_nm
      Obama, Barack
      Romney, Mitt
    
    
      contbr_st
      
      
    
  
  
    
      AK
      0.765778
      0.234222
    
    
      AL
      0.507390
      0.492610
    
    
      AR
      0.772902
      0.227098
    
    
      AZ
      0.443745
      0.556255
    
    
      CA
      0.679498
      0.320502
    
    
      CO
      0.585970
      0.414030
    
    
      CT
      0.371476
      0.628524
    
    
      DC
      0.810113
      0.189887
    
    
      DE
      0.802776
      0.197224
    
    
      FL
      0.467417
      0.532583



In [ ]:

	count	max	mean	min
data1
0	100.0	1.755423	-0.043725	-2.616105
1	100.0	2.928873	0.088564	-1.964032
2	100.0	2.444503	-0.134126	-3.429891
3	100.0	2.280154	-0.025908	-2.658135
4	100.0	3.291334	-0.083880	-2.053819
5	100.0	2.424807	-0.067503	-2.527094
6	100.0	2.502201	-0.007038	-2.348952
7	100.0	2.024848	-0.013882	-2.555263
8	100.0	2.280538	0.026527	-2.895748
9	100.0	2.242241	0.018543	-2.102425

	category	data	weights
0	a	0.232154	0.523259
1	a	-1.642629	0.720095
2	a	1.688881	0.693117
3	a	0.175747	0.430776
4	b	0.181601	0.874108
5	b	1.168295	0.712755
6	b	0.030540	0.542980
7	b	0.287650	0.367464

	AAPL	MSFT	XOM	SPX
2011-10-11	400.29	27.00	76.27	1195.54
2011-10-12	402.19	26.96	77.16	1207.25
2011-10-13	408.43	27.18	76.37	1203.66
2011-10-14	422.00	27.27	78.11	1224.58

	AAPL	MSFT	XOM	SPX
2003	0.541124	0.745174	0.661265	1.0
2004	0.374283	0.588531	0.557742	1.0
2005	0.467540	0.562374	0.631010	1.0
2006	0.428267	0.406126	0.518514	1.0
2007	0.508118	0.658770	0.786264	1.0
2008	0.681434	0.804626	0.828303	1.0
2009	0.707103	0.654902	0.797921	1.0
2010	0.710105	0.730118	0.839057	1.0
2011	0.691931	0.800996	0.859975	1.0

	SPX	intercept
2003	1.195406	0.000710
2004	1.363463	0.004201
2005	1.766415	0.003246
2006	1.645496	0.000080
2007	1.198761	0.003438
2008	0.968016	-0.001110
2009	0.879103	0.002954
2010	1.052608	0.001261
2011	0.806605	0.001514

	count	max	mean	min
data1
(-3.142, -1.53]	56.0	1.755423	-0.043593	-2.616105
(-1.53, 0.0745]	477.0	3.291334	-0.047229	-3.429891
(0.0745, 1.679]	419.0	2.502201	-0.005529	-2.895748
(1.679, 3.284]	48.0	2.242241	0.063398	-2.087416

		size	tip	total_bill
sex	smoker
Female	No	2.592593	2.773519	18.105185
Female	Yes	2.242424	2.931515	17.977879
Male	No	2.711340	3.113402	19.791237
Male	Yes	2.500000	3.051167	22.284500

		day	Fri	Sat	Sun	Thur
time	sex	smoker
Dinner	Female	No	2	30	43	2
	Female	Yes	8	33	10	0
	Male	No	4	85	124	0
	Male	Yes	12	71	39	0
Lunch	Female	No	3	0	0	60
	Female	Yes	6	0	0	17
	Male	No	0	0	0	50
	Male	Yes	5	0	0	23

party	Democrat	Republican
contbr_occupation
ATTORNEY	11141982.97	7.477194e+06
CEO	2074974.79	4.211041e+06
CONSULTANT	2459912.71	2.544725e+06
ENGINEER	951525.55	1.818374e+06
EXECUTIVE	1355161.05	4.138850e+06
HOMEMAKER	4248875.80	1.363428e+07
INVESTOR	884133.00	2.431769e+06
LAWYER	3160478.87	3.912243e+05
MANAGER	762883.22	1.444532e+06
NOT PROVIDED	4866973.96	2.056547e+07
OWNER	1001567.36	2.408287e+06
PHYSICIAN	3735124.94	3.594320e+06
PRESIDENT	1878509.95	4.720924e+06
PROFESSOR	2165071.08	2.967027e+05
REAL ESTATE	528902.09	1.625902e+06
RETIRED	25305116.38	2.356124e+07
SELF-EMPLOYED	672393.40	1.640253e+06

cand_nm	Obama, Barack	Romney, Mitt
contb_receipt_amt
(0, 1]	493.0	77.0
(1, 10]	40070.0	3681.0
(10, 100]	372280.0	31853.0
(100, 1000]	153991.0	43357.0
(1000, 10000]	22284.0	26186.0
(10000, 100000]	2.0	1.0
(100000, 1000000]	3.0	NaN
(1000000, 10000000]	4.0	NaN

cand_nm	Obama, Barack	Romney, Mitt
contb_receipt_amt
(0, 1]	318.24	77.00
(1, 10]	337267.62	29819.66
(10, 100]	20288981.41	1987783.76
(100, 1000]	54798531.46	22363381.69
(1000, 10000]	51753705.67	63942145.42
(10000, 100000]	59100.00	12700.00
(100000, 1000000]	1490683.08	NaN
(1000000, 10000000]	7148839.76	NaN

cand_nm	Obama, Barack	Romney, Mitt
contb_receipt_amt
(0, 1]	0.805182	0.194818
(1, 10]	0.918767	0.081233
(10, 100]	0.910769	0.089231
(100, 1000]	0.710176	0.289824
(1000, 10000]	0.447326	0.552674
(10000, 100000]	0.823120	0.176880
(100000, 1000000]	1.000000	NaN
(1000000, 10000000]	1.000000	NaN

cand_nm	Obama, Barack	Romney, Mitt
contbr_st
AK	281840.15	86204.24
AL	543123.48	527303.51
AR	359247.28	105556.00
AZ	1506476.98	1888436.23
CA	23824984.24	11237636.60
CO	2132429.49	1506714.12
CT	2068291.26	3499475.45
DC	4373538.80	1025137.50
DE	336669.14	82712.00
FL	7318178.58	8338458.81

cand_nm	Obama, Barack	Romney, Mitt
contbr_st
AK	0.765778	0.234222
AL	0.507390	0.492610
AR	0.772902	0.227098
AZ	0.443745	0.556255
CA	0.679498	0.320502
CO	0.585970	0.414030
CT	0.371476	0.628524
DC	0.810113	0.189887
DE	0.802776	0.197224
FL	0.467417	0.532583