In [1]:
import re # regex
import pandas as pd
from requests import get
import matplotlib.pyplot as plt
from libextract import extract # our standard extraction method
from libextract.strategies import TABULAR # for when dealing with tables
from libextract import prototypes # currently found in 'fuzzy-table-formatter' branch
In [185]:
# this function will be "applied" to every element in the column we designate
def convert_num(n):
    """Turn a formatted dollar string into a figure expressed in millions.

    Every comma and dollar sign is removed from ``n``, the remainder is
    parsed as a float, and the result is divided by 1,000,000.
    """
    digits = n
    for symbol in (',', '$'):
        digits = digits.replace(symbol, '')
    amount = float(digits)
    return amount / 1000000
def add_brand(ax, title, source):
    """Decorate a plot with a title, the libextract credit and a data source.

    The x-axis ticks are moved to the top edge, a legend is added, and three
    centred lines of text are written above the axes (all positioned in axes
    coordinates via ``ax.transAxes``).

    Parameters
    ----------
    ax : axes object to decorate (presumably the matplotlib Axes returned by
        ``DataFrame.plot`` -- confirm against callers).
    title : text drawn at y=1.1 with font size 20.
    source : text drawn at y=1.05 with font size 12.

    Returns
    -------
    The same ``ax`` object, so the call can be chained or re-assigned.
    """
    ax.xaxis.tick_top()

    # Anchor the legend's upper-right corner at (1, 1.15) in axes coordinates.
    # This replaces the previous approach of fetching the legend patch's bbox
    # with `inverse_transformed` (no longer available in current Matplotlib)
    # and overwriting its points with a flat pair; `bbox_to_anchor` is the
    # documented way to express the same placement.
    ax.legend(loc="upper right", bbox_to_anchor=(1, 1.15))

    # Shared keyword arguments: centre each line horizontally and interpret
    # the (x, y) position in axes coordinates.
    centred = dict(horizontalalignment='center', transform=ax.transAxes)
    ax.text(0.5, 1.1, title, fontsize=20, **centred)
    ax.text(0.5, 1.15, "github.com/datalib/libextract", fontsize=16, **centred)
    ax.text(0.5, 1.05, source, fontsize=12, **centred)
    return ax
In [141]:
In [207]:
# Download the lobbying page and run libextract's tabular strategy over it.
url = "https://www.opensecrets.org/lobby/top.php?indexType=s&showYear=2012"
# A timeout stops the cell from hanging forever if the server never answers.
r = get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to extract tables from an
# error page.
r.raise_for_status()
# convert_table is something not yet in the master branch
strat = TABULAR + (prototypes.convert_table,)
# top 5 predicted html elements/dicts of tabular data
tabs = list(extract(r.content, strategy=strat))
In [112]:
# Inspect the list of tables produced by the extraction step.
tabs
Out[112]:
In [208]:
# Build the spending table from the first extracted candidate.
df1 = pd.DataFrame.from_dict(tabs[0])
# Column '1' holds the formatted dollar strings; convert_num turns each into
# a float expressed in millions.
df1['1'] = df1['1'].apply(convert_num)
# assumes the frame has exactly two columns, client first -- TODO confirm
df1.columns = ['Lobbying Client', 'Total ($)']
# `DataFrame.sort` no longer exists in pandas; `sort_values` is the
# replacement for sorting rows by a column (ascending by default).
df1 = df1.set_index('Lobbying Client').sort_values('Total ($)')
In [116]:
# Show the table indexed by lobbying client and ordered by total.
df1
Out[116]:
In [209]:
# The pandas option `display.mpl_style` has been removed; styling is now done
# through matplotlib itself. The 'bmh' sheet is used as the closest built-in
# match for the more neutral 'blue' the old option gave.
# NOTE(review): confirm the resulting look is the one wanted.
plt.style.use('bmh')
figsize = [16, 8]
max_value = int(df1['Total ($)'].max()) + 10
# Not used by the plot below; kept in case a later cell reads it.
min_value = int(df1['Total ($)'].min()) - 50
# Horizontal bars with an x tick every 5 units from 0 up to max_value.
ax = df1.plot(kind='barh', figsize=figsize,
              xticks=list(range(0, max_value, 5)))
ax = add_brand(ax, "Top Spenders (in millions) 2012",
               "opensecrets.org/lobby/top.php?indexType=s&showYear=2012")
In [210]:
# IPython magic that sets up matplotlib for interactive use (with no argument
# it presumably selects the default GUI backend -- verify for this setup).
%matplotlib
# Last expression of the cell: display the figure that owns `ax`.
ax.figure
Out[210]: