In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import re
import io
import urllib
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from collections import defaultdict
import pickle

In [3]:
# text cleaning imports
import nltk
# fetch the Punkt tokenizer models used by word_tokenize (no-op if already present)
nltk.download('punkt')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aregel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

Collect and Clean Twitter Data

The Twitter data was obtained from the Trump Twitter Archive; it covers 01/20/2017 through 03/02/2018 2:38 PM MST. I used the Federal Register's website to obtain all of the actions published by the Executive Office over the same time frame.


In [9]:
# load json twitter data
twitter_json = r'data/twitter_01_20_17_to_3-2-18.json'
# Convert to pandas dataframe
# NOTE(review): this read is repeated in the next cell; the path variable
# defined here is what the later cells actually rely on
tweet_data = pd.read_json(twitter_json)

Using pandas I will read the Twitter JSON file, convert it to a dataframe, and set the index to 'created_at' as datetime objects; later the processed dataframe is pickled for reuse.


In [10]:
# read the json data into a pandas dataframe
tweet_data = pd.read_json(twitter_json)
# set column 'created_at' to the index (avoid inplace=True for clarity)
tweet_data = tweet_data.set_index('created_at', drop=True)
# convert the timestamp index to a DatetimeIndex and KEEP it -- the original
# call discarded the converted index, so the conversion was a no-op
tweet_data.index = pd.to_datetime(tweet_data.index)
# last expression: display the resulting DatetimeIndex
tweet_data.index


Out[10]:
DatetimeIndex(['2018-03-02 03:04:51', '2018-03-02 02:59:37',
               '2018-03-02 02:58:35', '2018-03-02 00:52:43',
               '2018-03-02 00:51:28', '2018-03-01 21:31:32',
               '2018-03-01 20:40:52', '2018-03-01 18:26:55',
               '2018-03-01 18:06:30', '2018-03-01 12:12:42',
               ...
               '2017-01-20 18:00:43', '2017-01-20 17:58:24',
               '2017-01-20 17:55:44', '2017-01-20 17:54:36',
               '2017-01-20 17:54:00', '2017-01-20 17:53:17',
               '2017-01-20 17:52:45', '2017-01-20 17:51:58',
               '2017-01-20 17:51:25', '2017-01-20 12:31:53'],
              dtype='datetime64[ns]', name='created_at', length=2792, freq=None)

The next step is to add columns with the tokenized text and to identify Twitter-specific tokens such as hashtags and @-mentions.


In [11]:
# function to identify hash tags
def hash_tag(text):
    """Return every hashtag token (e.g. '#MAGA!') appearing in `text`."""
    pattern = r'(#[^\s]+)'
    return re.findall(pattern, text)
# function to identify @mentions
def at_tag(text):
    """Return every @-mention token appearing in `text`.

    Fixes the original pattern ``(@[A-Za-z_]+)[^s]`` which required one
    trailing non-'s' character -- silently dropping mentions at the end of
    the tweet -- and which excluded digits, although digits are legal in
    Twitter handles.
    """
    return re.findall(r'(@[A-Za-z0-9_]+)', text)

In [12]:
# add derived columns: lowercase word tokens, hashtags, and @-mentions
tweet_data['text_tokenized'] = tweet_data['text'].apply(lambda t: word_tokenize(t.lower()))
# passing the functions directly is equivalent to wrapping them in a lambda
tweet_data['hash_tags'] = tweet_data['text'].apply(hash_tag)
tweet_data['@_tags'] = tweet_data['text'].apply(at_tag)

In [14]:
# pickle data: persist the processed tweets so tokenization runs only once
tweet_pickle_path = r'data/twitter_01_20_17_to_3-2-18.pickle'
tweet_data.to_pickle(tweet_pickle_path)

Scrape Data from the Federal Register

This has already been done; all of the PDFs published by the Executive Office of the U.S.A. from 2017/01/20 through 2018/03/02 are already in the data folder.

Don't execute this code unless you need more up-to-date information


In [ ]:
# Define the 2017 and 2018 url that contains all of the Executive Office of the President's published documents
executive_office_url_2017 = r'https://www.federalregister.gov/index/2017/executive-office-of-the-president'
executive_office_url_2018 = r'https://www.federalregister.gov/index/2018/executive-office-of-the-president'
# scrape all urls for pdf documents published in 2017 and 2018 by the U.S.A. Executive Office.
# The pattern is compiled once (hoisted out of the loop) and made NON-greedy:
# the original greedy '.*' merged two pdf urls on the same html line into a
# single bogus match.
pdf_pattern = re.compile(r'https:.*?\.pdf')
pdf_urls = []
for url in [executive_office_url_2017, executive_office_url_2018]:
    response = requests.get(url)
    # keep one sub-list per year so the download cell's nested loop still works
    pdf_urls.append(pdf_pattern.findall(response.text))

In [ ]:
# writes all of the pdfs to the data folder as data/<n>.pdf, numbering them
# sequentially across both year groups
num = 0
for pdf_group in pdf_urls:
    for url in pdf_group:
        pdf_path = 'data/' + str(num) + '.pdf'
        r = requests.get(url)
        # context manager guarantees the file is closed even if the write fails
        # (the original open/close pair leaked the handle on error)
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(r.content)
        num += 1

Create dataframe with the date the pdf was published and the text of each pdf


In [9]:
# function to convert pdf to text from stack overflow (https://stackoverflow.com/questions/26494211/extracting-text-from-a-pdf-file-using-pdfminer-in-python/44476759#44476759)
def convert_pdf_to_txt(path):
    """Extract the full text of the PDF file at `path` using pdfminer.

    Parameters
    ----------
    path : str
        Filesystem path to a pdf file.

    Returns
    -------
    str
        The text extracted from every page, concatenated.

    The original version leaked the file handle, the TextConverter device
    and the StringIO buffer whenever pdfminer raised mid-extraction; all
    three are now released via `with` / try-finally.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # pagenos=set() and maxpages=0 mean "process every page"
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
# finds the first time the name of a day appears in the txt, and returns that name

def find_day(word_generator):
    """Scan `word_generator` and return the first day-of-week token found.

    The tokens carry a trailing comma ('Monday,', 'Tuesday,', ...) exactly
    as they appear in Federal Register date headers, so the caller can
    locate the date substring with str.index.

    Raises IndexError when no day name is present, matching the original
    failure mode (indexing an empty list).

    The original implementation read a module-level `txt` variable (a
    hidden-state dependency on a later notebook cell) to rebuild a 3-letter
    abbreviation and map it back to the full name -- which simply returned
    the matched token the long way round.
    """
    day_tokens = ('Monday,', 'Tuesday,', 'Wednesday,', 'Thursday,',
                  'Friday,', 'Saturday,', 'Sunday,')
    for word in word_generator:
        if word in day_tokens:
            return word
    raise IndexError('no day-of-week token found in text')
# takes text and returns the first date in the document
def extract_date(txt):
    """Parse the first publication date header in `txt` into a datetime.

    Locates the first day-of-week token, takes a 40-character window from
    that point, and parses it as e.g. 'Friday, March, 3, 2017'.
    """
    word_generator = (word for word in txt.split())
    day_name = find_day(word_generator)
    window_start = txt.index(day_name)
    date_txt = txt[window_start:window_start + 40].replace('\n', '')
    # grab everything up to the 4-digit year within the window
    cleaned_txt = re.findall('.* \d{4}', date_txt)
    tokens = [tok.strip(",") for tok in cleaned_txt[0].split()]
    clean_date_string = ", ".join(tokens)
    return datetime.strptime(clean_date_string, '%A, %B, %d, %Y')

Create a dictionary using DefaultDict where the date of publication is the key, and the text of the pdf is the value.


In [10]:
# extract the date and text of every downloaded pdf (data/0.pdf .. data/269.pdf)
# NOTE(review): 270 matches the number of pdfs downloaded by the scrape cells
# above -- update this constant if the scrape is re-run with fresher data
NUM_PDFS = 270
data_dict = defaultdict(list)
for i in range(NUM_PDFS):
    file_path = 'data/' + str(i) + '.pdf'
    # txt is assigned at module level here, which the original find_day
    # implementation depends on
    txt = convert_pdf_to_txt(file_path)
    date_obj = extract_date(txt)
    data_dict[date_obj].append(txt)

Create a list of tuples, where the date is the first entry and the text of a pdf is the second entry, skipping over any values of None


In [11]:
# flatten {date: [texts]} into (date, text) records.  The values of a
# defaultdict(list) are always lists, never None, so the original
# `if v != None` guard was dead code (and `!= None` is non-idiomatic).
tuple_lst = []
for date_key, texts in data_dict.items():
    for text in texts:
        tuple_lst.append((date_key, text))

In [12]:
# create a dataframe of pdf texts indexed by their publication date
column_names = ['date', 'str_text']
fed_reg_dataframe = pd.DataFrame.from_records(tuple_lst,
                                              columns=column_names,
                                              index='date')

In [13]:
# tokenize each pdf's text into lowercase word tokens
fed_reg_dataframe['token_text'] = fed_reg_dataframe['str_text'].apply(
    lambda doc: word_tokenize(doc.lower()))

In [14]:
# final dataframe: keep only documents published after the inauguration
# (index > '2017-01-20').  The original cell displayed this slice without
# assigning it, yet the pickling cell below references `final_df`, which
# was otherwise never defined (NameError).
final_df = fed_reg_dataframe[fed_reg_dataframe.index > '2017-01-20']
final_df


Out[14]:
str_text token_text
date
2017-08-22 Federal Register / Vol. 82, No. 161 / Tues... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-09-29 Federal Register / Vol. 82, No. 188 / Frid... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-09-11 42706 \n\nFederal Register / Vol. 82, No. ... [42706, federal, register, /, vol, ., 82, ,, n...
2017-10-04 Federal Register / Vol. 82, No. 191 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-04 Federal Register / Vol. 82, No. 191 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-04 Federal Register / Vol. 82, No. 191 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-04 Federal Register / Vol. 82, No. 191 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-04 Federal Register / Vol. 82, No. 191 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-04 Federal Register / Vol. 82, No. 191 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-04 Federal Register \n\nVol. 82, No. 191 \n\n... [federal, register, vol, ., 82, ,, no, ., 191,...
2017-10-04 Federal Register / Vol. 82, No. 191 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-16 Federal Register / Vol. 82, No. 198 / Mond... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-10-16 Federal Register \n\nVol. 82, No. 198 \n\n... [federal, register, vol, ., 82, ,, no, ., 198,...
2017-06-29 29584 \n\nFederal Register / Vol. 82, No. ... [29584, federal, register, /, vol, ., 82, ,, n...
2017-06-23 Federal Register / Vol. 82, No. 120 / Frid... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-06-23 Federal Register \n\nVol. 82, No. 120 \n\n... [federal, register, vol, ., 82, ,, no, ., 120,...
2017-06-23 Federal Register / Vol. 82, No. 120 / Frid... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-05-31 Federal Register / Vol. 82, No. 103 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-05-31 Federal Register \n\nVol. 82, No. 103 \n\n... [federal, register, vol, ., 82, ,, no, ., 103,...
2017-07-14 Federal Register / Vol. 82, No. 134 / Frid... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-07-14 Presidential Documents\n\n32611 \n\nExecutive ... [presidential, documents, 32611, executive, or...
2017-08-21 Federal Register / Vol. 82, No. 160 / Mond... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-03-03 Federal Register \n\nVol. 82, No. 41 \n\nF... [federal, register, vol, ., 82, ,, no, ., 41, ...
2017-03-03 Federal Register / Vol. 82, No. 41 / Frida... [federal, register, /, vol, ., 82, ,, no, ., 4...
2017-10-30 Federal Register \n\nVol. 82, No. 208 \n\n... [federal, register, vol, ., 82, ,, no, ., 208,...
2017-05-03 Federal Register \n\nVol. 82, No. 84 \n\nW... [federal, register, vol, ., 82, ,, no, ., 84, ...
2017-05-03 Federal Register / Vol. 82, No. 84 / Wedne... [federal, register, /, vol, ., 82, ,, no, ., 8...
2017-05-03 Federal Register \n\nVol. 82, No. 84 \n\nW... [federal, register, vol, ., 82, ,, no, ., 84, ...
2017-05-03 Federal Register / Vol. 82, No. 84 / Wedne... [federal, register, /, vol, ., 82, ,, no, ., 8...
2017-05-03 Federal Register / Vol. 82, No. 84 / Wedne... [federal, register, /, vol, ., 82, ,, no, ., 8...
... ... ...
2017-02-06 Federal Register \n\nVol. 82, No. 23 \n\nM... [federal, register, vol, ., 82, ,, no, ., 23, ...
2017-09-06 Federal Register \n\nVol. 82, No. 171 \n\n... [federal, register, vol, ., 82, ,, no, ., 171,...
2017-09-07 Federal Register \n\nVol. 82, No. 172 \n\n... [federal, register, vol, ., 82, ,, no, ., 172,...
2017-09-14 Federal Register \n\nVol. 82, No. 177 \n\n... [federal, register, vol, ., 82, ,, no, ., 177,...
2017-09-14 Federal Register / Vol. 82, No. 177 / Thur... [federal, register, /, vol, ., 82, ,, no, ., 1...
2017-11-22 Federal Register \n\nVol. 82, No. 224 \n\n... [federal, register, vol, ., 82, ,, no, ., 224,...
2017-11-22 Federal Register / Vol. 82, No. 224 / Wedn... [federal, register, /, vol, ., 82, ,, no, ., 2...
2017-04-13 Federal Register / Vol. 82, No. 70 / Thurs... [federal, register, /, vol, ., 82, ,, no, ., 7...
2017-04-13 Federal Register \n\nVol. 82, No. 70 \n\nT... [federal, register, vol, ., 82, ,, no, ., 70, ...
2017-12-05 Federal Register \n\nVol. 82, No. 232 \n\n... [federal, register, vol, ., 82, ,, no, ., 232,...
2017-12-05 Federal Register / Vol. 82, No. 232 / Tues... [federal, register, /, vol, ., 82, ,, no, ., 2...
2017-08-01 Presidential Documents\n\n35881 \n\nProclamati... [presidential, documents, 35881, proclamation,...
2017-04-19 Federal Register \n\nVol. 82, No. 74 \n\nW... [federal, register, vol, ., 82, ,, no, ., 74, ...
2017-03-22 Federal Register \n\nVol. 82, No. 54 \n\nW... [federal, register, vol, ., 82, ,, no, ., 54, ...
2017-09-05 Federal Register \n\nVol. 82, No. 170 \n\n... [federal, register, vol, ., 82, ,, no, ., 170,...
2017-04-27 Federal Register \n\nVol. 82, No. 80 \n\nT... [federal, register, vol, ., 82, ,, no, ., 80, ...
2017-11-13 Federal Register \n\nVol. 82, No. 217 \n\n... [federal, register, vol, ., 82, ,, no, ., 217,...
2017-11-17 Federal Register \n\nVol. 82, No. 221 \n\n... [federal, register, vol, ., 82, ,, no, ., 221,...
2017-11-14 Federal Register \n\nVol. 82, No. 218 \n\n... [federal, register, vol, ., 82, ,, no, ., 218,...
2017-11-02 Federal Register \n\nVol. 82, No. 211 \n\n... [federal, register, vol, ., 82, ,, no, ., 211,...
2017-04-26 Federal Register \n\nVol. 82, No. 79 \n\nW... [federal, register, vol, ., 82, ,, no, ., 79, ...
2017-09-19 Federal Register \n\nVol. 82, No. 180 \n\n... [federal, register, vol, ., 82, ,, no, ., 180,...
2017-09-13 Federal Register \n\nVol. 82, No. 176 \n\n... [federal, register, vol, ., 82, ,, no, ., 176,...
2017-09-13 Federal Register \n\nVol. 82, No. 176 \n\n... [federal, register, vol, ., 82, ,, no, ., 176,...
2017-08-17 Federal Register \n\nVol. 82, No. 158 \n\n... [federal, register, vol, ., 82, ,, no, ., 158,...
2017-06-30 Federal Register \n\nVol. 82, No. 125 \n\n... [federal, register, vol, ., 82, ,, no, ., 125,...
2017-07-21 Federal Register \n\nVol. 82, No. 139 \n\n... [federal, register, vol, ., 82, ,, no, ., 139,...
2017-07-26 Federal Register \n\nVol. 82, No. 142 \n\n... [federal, register, vol, ., 82, ,, no, ., 142,...
2017-08-29 Federal Register \n\nVol. 82, No. 166 \n\n... [federal, register, vol, ., 82, ,, no, ., 166,...
2017-05-02 Federal Register \n\nVol. 82, No. 83 \n\nT... [federal, register, vol, ., 82, ,, no, ., 83, ...

240 rows × 2 columns

Pickle the dataframe, so that you only need to process the text once


In [15]:
# pickle final data so the pdf text only has to be processed once
fed_reg_data = r'data/fed_reg_data.pickle'
# `final_df` was never defined anywhere in the original notebook (NameError);
# compute the post-inauguration slice from fed_reg_dataframe right here so
# this cell stands on its own
final_df = fed_reg_dataframe[fed_reg_dataframe.index > '2017-01-20']
final_df.to_pickle(fed_reg_data)

In [ ]: