In [1]:
import re # regex
import pandas as pd 
from requests import get
import matplotlib.pyplot as plt

from libextract import extract # our standard extraction method
from libextract.strategies import TABULAR # for when dealing with tables
from libextract import prototypes # currently found in 'fuzzy-table-formatter' branch

In [185]:
# this function will be "applied" to every element in the column we designate
def convert_num(n, scale=1000000):
    """Convert a formatted currency value into a float expressed in `scale` units.

    Thousands separators (",") and dollar signs ("$") are stripped before the
    value is parsed, so "$124,080,000" becomes 124.08 with the default scale.

    Parameters
    ----------
    n : str or number
        Value to convert. Non-string input is passed through ``str`` first, so
        values that are already numeric are accepted as well.
    scale : int or float, optional
        Divisor applied to the parsed number (default 1000000, i.e. millions).

    Returns
    -------
    float
        The parsed value divided by `scale`.

    Raises
    ------
    ValueError
        If the cleaned text cannot be parsed as a float.
    """
    cleaned = str(n).replace(',', '').replace('$', '')
    return float(cleaned) / scale

def add_brand(ax, title, source, brand="github.com/datalib/libextract"):
    """Decorate a plot with a brand line, a title and a data-source line.

    The x-axis ticks are moved to the top of the axes, a legend is created and
    anchored by its upper-right corner at (1, 1.15) in axes coordinates, and
    three centered text lines are drawn above the axes: the brand at y=1.15,
    the title at y=1.1 and the source at y=1.05.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to decorate; modified in place.
    title : str
        Headline text (font size 20).
    source : str
        Data-source text (font size 12).
    brand : str, optional
        Branding text (font size 16). Defaults to the libextract repository.

    Returns
    -------
    The same `ax` that was passed in, to allow chaining.
    """
    ax.xaxis.tick_top()
    leg = ax.legend(loc="upper right")

    # Shared placement: horizontally centered, positioned in axes coordinates
    # so y > 1 lands above the plotting area.
    centered = dict(horizontalalignment='center', transform=ax.transAxes)
    ax.text(0.5, 1.1, title, fontsize=20, **centered)
    ax.text(0.5, 1.15, brand, fontsize=16, **centered)
    ax.text(0.5, 1.05, source, fontsize=12, **centered)

    # Move the legend by giving its anchor point directly. This avoids the
    # legendPatch / Bbox.inverse_transformed APIs and the malformed
    # Bbox.set_points call the earlier implementation depended on.
    leg.set_bbox_to_anchor((1, 1.15), transform=ax.transAxes)
    return ax

In [141]:


In [207]:
url = "https://www.opensecrets.org/lobby/top.php?indexType=s&showYear=2012"
# A timeout keeps "Run All" from hanging forever on an unresponsive server.
r = get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to extract tables from an error page.
r.raise_for_status()
# convert_table is something not yet in the master branch
strat = TABULAR + (prototypes.convert_table,)
# top 5 predicted html elements/dicts of tabular data
tabs = list(extract(r.content, strategy=strat))

In [112]:
# Inspect the extraction candidates; the next code cell builds its DataFrame from tabs[0].
tabs


Out[112]:
[{'1': ['$124,080,000',
   '$55,057,053',
   '$21,298,774',
   '$20,753,146',
   '$19,650,000',
   '$18,440,000',
   '$17,460,000',
   '$16,970,000',
   '$16,830,000',
   '$16,800,000',
   '$16,640,000',
   '$16,350,000',
   '$15,738,000',
   '$14,840,000',
   '$14,787,640',
   '$14,581,800',
   '$14,430,000',
   '$14,020,000',
   '$13,800,000',
   '$13,414,536'],
  '0': ['US Chamber of Commerce',
   'National Assn of Realtors',
   'Blue Cross/Blue Shield',
   'American Hospital Assn',
   'American Medical Assn',
   'National Assn of Broadcasters',
   'National Cable & Telecommunications Assn',
   'Comcast Corp',
   'Google Inc',
   'Boeing Co',
   'Pharmaceutical Rsrch & Mfrs of America',
   'General Electric',
   'United Technologies',
   'Business Roundtable',
   'CVS Health',
   'Lockheed Martin',
   'Dow Chemical',
   'AT&T Inc',
   'Koch Industries',
   'FedEx Corp']},
 <SelectElement bbfb138 name='cycle'>,
 <Element ul at 0xbbf6d18>,
 <Element ul at 0xb5af548>,
 <Element ul at 0xbbe33b8>]

In [208]:
# Build a frame from the first extracted candidate: a dict whose '0' key holds
# the client names and whose '1' key holds the formatted dollar totals.
df1 = pd.DataFrame.from_dict(tabs[0])

# Parse the "$1,234,567" strings into floats expressed in millions.
df1['1'] = df1['1'].apply(convert_num)

# Rename by key rather than by position: the column order produced from a dict
# is not guaranteed to be '0', '1', and a positional assignment would silently
# swap the labels when it is not.
df1 = df1.rename(columns={'0': 'Lobbying Client', '1': 'Total ($)'})
df1 = df1.set_index('Lobbying Client')
# DataFrame.sort was removed from pandas; sort_values is its replacement.
df1 = df1.sort_values('Total ($)')

In [116]:
# Show the frame indexed by 'Lobbying Client' and sorted ascending by 'Total ($)'.
df1


Out[116]:
Total ($)
Lobbying Client
CVS/Caremark Corp 13.128502
Exxon Mobil 13.420000
Verizon Communications 13.703000
United Technologies 13.900373
Grocery Manufacturers Assn 14.300000
National Assn of Broadcasters 14.450000
Lockheed Martin 14.516226
Boeing Co 15.230000
Google Inc 15.800000
AT&T Inc 15.935000
General Electric 16.240000
Pharmaceutical Rsrch & Mfrs of America 17.882500
American Medical Assn 18.250000
Comcast Corp 18.810000
American Hospital Assn 19.173813
National Cable & Telecommunications Assn 19.870000
Northrop Grumman 20.590000
Blue Cross/Blue Shield 22.618980
National Assn of Realtors 38.584580
US Chamber of Commerce 74.470000

In [209]:
# pandas removed the `display.mpl_style` option, so assigning to it now raises;
# styling goes through matplotlib directly instead.
# NOTE(review): 'ggplot' is presumably the closest match to the old pandas
# 'default' look -- confirm the colors are what you want.
plt.style.use('ggplot')
figsize = [16,8]

totals = df1['Total ($)']
max_value = int(totals.max()) + 10
# Not used by the plot below; kept in case other cells read it.
min_value = int(totals.min()) - 50

# One x tick every 5 (million) from 0 up to just past the largest total.
ax = df1.plot(kind='barh', figsize=figsize,
              xticks=list(range(0, max_value, 5)))

ax = add_brand(ax,"Top Spenders (in millions) 2012", "opensecrets.org/lobby/top.php?indexType=s&showYear=2012")

In [210]:
# Enable matplotlib's GUI backend integration (the recorded run reports Qt4Agg),
# then display the figure that owns the branded axes.
# NOTE(review): this magic runs after the plot was created; it would normally
# sit in the first cell so the backend is chosen before any figure exists.
%matplotlib
ax.figure


Using matplotlib backend: Qt4Agg
Out[210]: