In [1]:
import requests
from bs4 import BeautifulSoup

from joblib import Parallel, delayed

In [2]:
# A-Z directory of federal agencies on usa.gov, starting from the "A" page.
FEDERAL_WEBSITES = "https://www.usa.gov/federal-agencies/a"

In [3]:
response = requests.get(FEDERAL_WEBSITES)
response


Out[3]:
<Response [200]>

In [4]:
data = BeautifulSoup(response.text, "lxml")

In [5]:
# The A-Z letter navigation; note the current letter ("A") is not a link here.
print data.select("#content > div > div > ul.az-list")[0].select("li > a")


[<a class="atoz_letter" href="/federal-agencies/b">B</a>, <a class="atoz_letter" href="/federal-agencies/c">C</a>, <a class="atoz_letter" href="/federal-agencies/d">D</a>, <a class="atoz_letter" href="/federal-agencies/e">E</a>, <a class="atoz_letter" href="/federal-agencies/f">F</a>, <a class="atoz_letter" href="/federal-agencies/g">G</a>, <a class="atoz_letter" href="/federal-agencies/h">H</a>, <a class="atoz_letter" href="/federal-agencies/i">I</a>, <a class="atoz_letter" href="/federal-agencies/j">J</a>, <a class="atoz_letter" href="/federal-agencies/k">K</a>, <a class="atoz_letter" href="/federal-agencies/l">L</a>, <a class="atoz_letter" href="/federal-agencies/m">M</a>, <a class="atoz_letter" href="/federal-agencies/n">N</a>, <a class="atoz_letter" href="/federal-agencies/o">O</a>, <a class="atoz_letter" href="/federal-agencies/p">P</a>, <a class="atoz_letter" href="/federal-agencies/r">R</a>, <a class="atoz_letter" href="/federal-agencies/s">S</a>, <a class="atoz_letter" href="/federal-agencies/t">T</a>, <a class="atoz_letter" href="/federal-agencies/u">U</a>, <a class="atoz_letter" href="/federal-agencies/v">V</a>, <a class="atoz_letter" href="/federal-agencies/w">W</a>]

In [6]:
BASE_URL = "https://www.usa.gov"
# The nav above only links to the other letters (B-W), so seed the list with
# the "A" directory page itself before appending the rest.
dir_urls = [FEDERAL_WEBSITES]
for url in data.select("#content > div > div > ul.az-list")[0].select("li > a"):
    dir_urls.append("%s%s" % (BASE_URL, url["href"]))
dir_urls


Out[6]:
['https://www.usa.gov/federal-agencies/a',
 'https://www.usa.gov/federal-agencies/b',
 'https://www.usa.gov/federal-agencies/c',
 'https://www.usa.gov/federal-agencies/d',
 'https://www.usa.gov/federal-agencies/e',
 'https://www.usa.gov/federal-agencies/f',
 'https://www.usa.gov/federal-agencies/g',
 'https://www.usa.gov/federal-agencies/h',
 'https://www.usa.gov/federal-agencies/i',
 'https://www.usa.gov/federal-agencies/j',
 'https://www.usa.gov/federal-agencies/k',
 'https://www.usa.gov/federal-agencies/l',
 'https://www.usa.gov/federal-agencies/m',
 'https://www.usa.gov/federal-agencies/n',
 'https://www.usa.gov/federal-agencies/o',
 'https://www.usa.gov/federal-agencies/p',
 'https://www.usa.gov/federal-agencies/r',
 'https://www.usa.gov/federal-agencies/s',
 'https://www.usa.gov/federal-agencies/t',
 'https://www.usa.gov/federal-agencies/u',
 'https://www.usa.gov/federal-agencies/v',
 'https://www.usa.gov/federal-agencies/w']
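
As an aside, building the absolute URLs with "%s%s" assumes every href in the nav is site-relative. A minimal sketch (not executed here) of the same loop using urlparse.urljoin, which also copes with absolute hrefs; dir_urls_alt is an illustrative name and the original dir_urls is what is used below:

In [ ]:
# Sketch (not run): urljoin-based variant of the loop above.
from urlparse import urljoin

dir_urls_alt = [FEDERAL_WEBSITES]
for anchor in data.select("#content > div > div > ul.az-list")[0].select("li > a"):
    dir_urls_alt.append(urljoin(BASE_URL, anchor["href"]))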

In [7]:
def get_agencies(url):
    """Scrape one A-Z directory page; return {agency name: usa.gov detail URL}."""
    response = requests.get(url)
    data = BeautifulSoup(response.text, "lxml")
    agencies = {}
    for anchor in data.select("#content > div > div > ul.one_column_bullet > li > a"):
        agencies[anchor.text] = "%s%s" % (BASE_URL, anchor["href"])
    return agencies

In [8]:
agencies = get_agencies(dir_urls[0])
agencies


Out[8]:
{u'AbilityOne Commission': 'https://www.usa.gov/federal-agencies/u-s-abilityone-commission',
 u'Access Board': 'https://www.usa.gov/federal-agencies/u-s-access-board',
 u'Administration for Children and Families (ACF)': 'https://www.usa.gov/federal-agencies/administration-for-children-and-families',
 u'Administration for Community Living': 'https://www.usa.gov/federal-agencies/administration-for-community-living',
 u'Administration for Native Americans': 'https://www.usa.gov/federal-agencies/administration-for-native-americans',
 u'Administration on Aging': 'https://www.usa.gov/federal-agencies/administration-on-aging',
 u'Administration on Intellectual and Developmental Disabilities': 'https://www.usa.gov/federal-agencies/administration-on-intellectual-and-developmental-disabilities',
 u'Administrative Conference of the United States': 'https://www.usa.gov/federal-agencies/administrative-conference-of-the-united-states',
 u'Administrative Office of the U.S. Courts': 'https://www.usa.gov/federal-agencies/administrative-office-of-the-u-s-courts',
 u'Advisory Council on Historic Preservation': 'https://www.usa.gov/federal-agencies/advisory-council-on-historic-preservation',
 u'African Development Foundation': 'https://www.usa.gov/federal-agencies/african-development-foundation',
 u'Agency for Healthcare Research and Quality (AHRQ)': 'https://www.usa.gov/federal-agencies/agency-for-healthcare-research-and-quality',
 u'Agency for International Development (USAID)': 'https://www.usa.gov/federal-agencies/u-s-agency-for-international-development',
 u'Agency for Toxic Substances and Disease Registry': 'https://www.usa.gov/federal-agencies/agency-for-toxic-substances-and-disease-registry',
 u'Agricultural Marketing Service (AMS)': 'https://www.usa.gov/federal-agencies/agricultural-marketing-service',
 u'Agricultural Research Service': 'https://www.usa.gov/federal-agencies/agricultural-research-service',
 u'Agriculture Department (USDA)': 'https://www.usa.gov/federal-agencies/u-s-department-of-agriculture',
 u'Air Force': 'https://www.usa.gov/federal-agencies/u-s-air-force',
 u'Air Force Reserve': 'https://www.usa.gov/federal-agencies/u-s-air-force-reserve-command',
 u'Alabama': 'https://www.usa.gov/state-government/alabama',
 u'Alaska': 'https://www.usa.gov/state-government/alaska',
 u'Alcohol and Tobacco Tax and Trade Bureau': 'https://www.usa.gov/federal-agencies/alcohol-and-tobacco-tax-and-trade-bureau',
 u'Alcohol, Tobacco, Firearms and Explosives Bureau (ATF)': 'https://www.usa.gov/federal-agencies/bureau-of-alcohol-tobacco-firearms-and-explosives',
 u'AmeriCorps': 'https://www.usa.gov/federal-agencies/americorps',
 u'American Battle Monuments Commission': 'https://www.usa.gov/federal-agencies/american-battle-monuments-commission',
 u'American Samoa': 'https://www.usa.gov/state-government/american-samoa',
 u'Amtrak (AMTRAK)': 'https://www.usa.gov/federal-agencies/national-railroad-passenger-corporation',
 u'Animal and Plant Health Inspection Service (APHIS)': 'https://www.usa.gov/federal-agencies/animal-and-plant-health-inspection-service',
 u'Antitrust Division': 'https://www.usa.gov/federal-agencies/antitrust-division',
 u'Architect of the Capitol': 'https://www.usa.gov/federal-agencies/architect-of-the-capitol',
 u'Archives, National Archives and Records Administration (NARA)': 'https://www.usa.gov/federal-agencies/national-archives-and-records-administration',
 u'Arctic Research Commission': 'https://www.usa.gov/federal-agencies/u-s-arctic-research-commission',
 u'Arizona': 'https://www.usa.gov/state-government/arizona',
 u'Arkansas': 'https://www.usa.gov/state-government/arkansas',
 u'Armed Forces Retirement Home': 'https://www.usa.gov/federal-agencies/armed-forces-retirement-home',
 u'Arms Control and International Security': 'https://www.usa.gov/federal-agencies/arms-control-and-international-security',
 u'Army': 'https://www.usa.gov/federal-agencies/u-s-army',
 u'Army Corps of Engineers': 'https://www.usa.gov/federal-agencies/u-s-army-corps-of-engineers',
 u'Arthritis, Musculoskeletal and Skin Diseases, National Institute of ': 'https://www.usa.gov/federal-agencies/national-institute-of-arthritis-musculoskeletal-and-skin-diseases'}
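
The "A" listing above is not purely federal: keys such as u'Alabama', u'Alaska', and u'American Samoa' point at /state-government/ pages. A hedged sketch of filtering those out, in case only federal agency pages are wanted (federal_only is illustrative and not used in the rest of the notebook):

In [ ]:
# Sketch (not run): keep only entries whose detail page lives under /federal-agencies/.
federal_only = {name: url for name, url in agencies.iteritems()
                if "/federal-agencies/" in url}
print len(federal_only)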

In [9]:
# Crawl every letter page and merge the per-page dicts into one lookup,
# printing each URL and its agency count as a progress indicator.
all_agencies = {}
for url in dir_urls:
    print url,
    agencies = get_agencies(url)
    print len(agencies),
    all_agencies.update(agencies)

print len(all_agencies)


https://www.usa.gov/federal-agencies/a 39 https://www.usa.gov/federal-agencies/b 24 https://www.usa.gov/federal-agencies/c 50 https://www.usa.gov/federal-agencies/d 41 https://www.usa.gov/federal-agencies/e 22 https://www.usa.gov/federal-agencies/f 60 https://www.usa.gov/federal-agencies/g 10 https://www.usa.gov/federal-agencies/h 12 https://www.usa.gov/federal-agencies/i 27 https://www.usa.gov/federal-agencies/j 18 https://www.usa.gov/federal-agencies/k 2 https://www.usa.gov/federal-agencies/l 6 https://www.usa.gov/federal-agencies/m 29 https://www.usa.gov/federal-agencies/n 74 https://www.usa.gov/federal-agencies/o 38 https://www.usa.gov/federal-agencies/p 22 https://www.usa.gov/federal-agencies/r 16 https://www.usa.gov/federal-agencies/s 28 https://www.usa.gov/federal-agencies/t 13 https://www.usa.gov/federal-agencies/u 54 https://www.usa.gov/federal-agencies/v 9 https://www.usa.gov/federal-agencies/w 14 608
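
The per-letter crawl above runs sequentially; since get_agencies is already a standalone function, it could be dispatched with joblib just like the per-agency fetch below. A sketch (not executed here), with all_agencies_alt as an illustrative name so the original result is left untouched:

In [ ]:
# Sketch (not run): parallel variant of the per-letter crawl.
letter_pages = Parallel(n_jobs=4, verbose=5)(
    delayed(get_agencies)(url) for url in dir_urls)
all_agencies_alt = {}
for agencies in letter_pages:
    all_agencies_alt.update(agencies)
print len(all_agencies_alt)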

In [29]:
def get_agency_details(item):
    """Fetch one agency's usa.gov page; return (name, url, anchors in its detail section)."""
    agency, url = item
    response = requests.get(url)
    data = BeautifulSoup(response.text, "lxml")
    details = data.select("#content > div > div > article > section a")
    return agency, url, details

In [30]:
get_agency_details((u'AbilityOne Commission', "https://www.usa.gov/federal-agencies/u-s-abilityone-commission"))


Out[30]:
(u'AbilityOne Commission',
 'https://www.usa.gov/federal-agencies/u-s-abilityone-commission',
 [<a href="http://www.abilityone.gov">U.S. AbilityOne Commission </a>,
  <a href="http://www.abilityone.gov/contact_us/index.html">Contact the U.S. AbilityOne Commission </a>,
  <a href="mailto:info@abilityone.gov" target="_top">info@abilityone.gov</a>])
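
Before fanning out over roughly 600 pages, it may be worth guarding each request. get_agency_details_safe below is a hypothetical variant (not part of the original run) that adds a timeout and returns an empty anchor list on failure, so one slow or broken page cannot stall or abort the parallel crawl:

In [ ]:
# Sketch (not run): defensive variant of get_agency_details.
def get_agency_details_safe(item, timeout=10):
    agency, url = item
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Return an empty anchor list rather than raising inside the worker.
        return agency, url, []
    data = BeautifulSoup(response.text, "lxml")
    return agency, url, data.select("#content > div > div > article > section a")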

In [31]:
# Fetch every agency page concurrently: 20 worker processes, one task per agency.
agency_details = Parallel(n_jobs=20, verbose=10)(
    delayed(get_agency_details)(item) for item in all_agencies.iteritems())


[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    1.4s
[Parallel(n_jobs=20)]: Done  21 tasks      | elapsed:    2.6s
[Parallel(n_jobs=20)]: Done  32 tasks      | elapsed:    3.7s
[Parallel(n_jobs=20)]: Done  45 tasks      | elapsed:    5.2s
[Parallel(n_jobs=20)]: Done  58 tasks      | elapsed:    6.7s
[Parallel(n_jobs=20)]: Done  73 tasks      | elapsed:    8.4s
[Parallel(n_jobs=20)]: Done  88 tasks      | elapsed:    9.8s
[Parallel(n_jobs=20)]: Done 105 tasks      | elapsed:   13.0s
[Parallel(n_jobs=20)]: Done 122 tasks      | elapsed:   15.1s
[Parallel(n_jobs=20)]: Done 141 tasks      | elapsed:   17.6s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   19.9s
[Parallel(n_jobs=20)]: Done 181 tasks      | elapsed:   22.4s
[Parallel(n_jobs=20)]: Done 202 tasks      | elapsed:   24.6s
[Parallel(n_jobs=20)]: Done 225 tasks      | elapsed:   27.3s
[Parallel(n_jobs=20)]: Done 248 tasks      | elapsed:   31.6s
[Parallel(n_jobs=20)]: Done 273 tasks      | elapsed:   34.5s
[Parallel(n_jobs=20)]: Done 298 tasks      | elapsed:   37.3s
[Parallel(n_jobs=20)]: Done 325 tasks      | elapsed:   40.4s
[Parallel(n_jobs=20)]: Done 352 tasks      | elapsed:   43.3s
[Parallel(n_jobs=20)]: Done 381 tasks      | elapsed:   46.8s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:   51.9s
[Parallel(n_jobs=20)]: Done 441 tasks      | elapsed:   55.5s
[Parallel(n_jobs=20)]: Done 472 tasks      | elapsed:   59.1s
[Parallel(n_jobs=20)]: Done 505 tasks      | elapsed:  1.0min
[Parallel(n_jobs=20)]: Done 538 tasks      | elapsed:  1.1min
[Parallel(n_jobs=20)]: Done 608 out of 608 | elapsed:  1.3min finished

In [32]:
agency_details[3]


Out[32]:
(u'Government Ethics, Office of',
 'https://www.usa.gov/federal-agencies/office-of-government-ethics',
 [<a href="http://www.oge.gov/">Office of Government Ethics </a>,
  <a href="https://www.oge.gov/web/oge.nsf/Organization/Contact%20Us?opendocument">Contact the Office of Government Ethics</a>,
  <a href="https://www.oge.gov/Web/OGE.nsf/Resources/Where+to+Report+Misconduct">Where to Report Misconduct</a>,
  <a href="mailto:ContactOGE@oge.gov" target="_top">ContactOGE@oge.gov</a>])

In [33]:
import urlparse

In [34]:
urlparse.urlsplit("mailto:ContactOGE@oge.gov")


Out[34]:
SplitResult(scheme='mailto', netloc='', path='ContactOGE@oge.gov', query='', fragment='')

In [35]:
urlparse.urlsplit("https://www.oge.gov/Web/OGE.nsf/Resources/Where+to+Report+Misconduct")


Out[35]:
SplitResult(scheme='https', netloc='www.oge.gov', path='/Web/OGE.nsf/Resources/Where+to+Report+Misconduct', query='', fragment='')

In [43]:
def get_domain(item):
    """Return the set of domains referenced by one (agency, url, anchors) tuple."""
    agency, url, details = item
    links = set()
    for anchor in details:
        link = anchor["href"]
        split_link = urlparse.urlsplit(link)
        if split_link.scheme == "mailto":
            # mailto: links have an empty netloc; take the domain after the "@".
            link = split_link.path.split("@")[-1]
        else:
            link = split_link.netloc
        if link == "":
            # Relative links (empty netloc) stay on usa.gov itself; skip them.
            continue
        links.add(link)
    return links

In [44]:
get_domain(agency_details[3])


Out[44]:
{'oge.gov', 'www.oge.gov'}

In [45]:
all_domains = set()
for details in agency_details:
    all_domains.update(get_domain(details))
print len(all_domains)


1058
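
Out[44] already hints that the same site can appear twice ('oge.gov' and 'www.oge.gov'). A small sketch of collapsing a leading "www." before counting; strip_www is illustrative and the 1058 figure above is left as computed:

In [ ]:
# Sketch (not run): collapse "www.<domain>" and "<domain>" into one entry.
def strip_www(domain):
    return domain[4:] if domain.startswith("www.") else domain

deduped = set(strip_www(d) for d in all_domains)
print len(deduped)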

In [46]:
list(all_domains)[:10]


Out[46]:
['governor.hawaii.gov',
 'www.traveltex.com',
 'presidiotrust.gov',
 'www.visitdelaware.com',
 'gsa.gov',
 'www.sos.state.nm.us',
 'www.ustr.gov',
 'www.federallabs.org',
 'www.oregonlottery.org',
 'www.rd.usda.gov']

In [47]:
# Write one domain per line; plain "w" is enough since the file is only written.
with open("DomainDataset/USGov_domain+suffix.txt", "w") as fp:
    for domain in all_domains:
        print >> fp, domain
        
! head DomainDataset/USGov_domain+suffix.txt


governor.hawaii.gov
www.traveltex.com
presidiotrust.gov
www.visitdelaware.com
gsa.gov
www.sos.state.nm.us
www.ustr.gov
www.federallabs.org
www.oregonlottery.org
www.rd.usda.gov
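
The flat domain list drops the agency-to-domain association. If that mapping is worth keeping, a sketch of persisting it as JSON alongside the text file (the JSON file name is illustrative):

In [ ]:
# Sketch (not run): save {agency name: sorted list of its domains} as JSON.
import json

agency_domains = {}
for details in agency_details:
    agency_domains[details[0]] = sorted(get_domain(details))

with open("DomainDataset/USGov_agency_domains.json", "w") as fp:
    json.dump(agency_domains, fp, indent=2)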

In [ ]: