``````

In [1]:

# imports — stdlib first, then third-party.
# (The original cell imported pandas twice; the duplicate is removed.)
import pickle
import re

import numpy as np
import pandas as pd

``````

## The following data was generated using code that can be found on GitHub

``````

In [2]:

# load federal document data from pickle file
fed_reg_data = r'data/fed_reg_data.pickle'
# BUG FIX: the original cell only stored the path string, so `fed_data`
# (used by the following cells, e.g. len(fed_data)) was never defined on a
# fresh Restart-&-Run-All.  Actually unpickle the file here.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(fed_reg_data, 'rb') as f:
    fed_data = pickle.load(f)

``````
``````

In [3]:

# number of federal documents loaded (Out[3] shows 281)
len(fed_data)

``````
``````

Out[3]:

281

``````

## In order to explore the twitter and executive document data I will look at the following:

1. Determine the most used hashtags
2. Determine who President Trump tweeted at (@) the most
3. Create a word frequency plot for the most used words in the twitter data and the presidential documents
4. Find words that both data sets have in common, and determine those words document frequency (what percentage of documents those words appear in)

``````

In [4]:

# imports for text analysis and plotting
import nltk
from nltk.corpus import stopwords
import itertools
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# The logged output under this cell shows an nltk corpus check, but the
# original code never called it -- a fresh environment would raise
# LookupError when stopwords.words() is first used.  Fetch it explicitly.
nltk.download('stopwords')

plt.style.use('ggplot')

``````
``````

[nltk_data]   Package stopwords is already up-to-date!

``````

### Plot the most used hashtags and @ tags

``````

In [5]:

# find the most used hashtags
# NOTE(review): `hashtag_freq` and `at_tag_freq` are Counter-like objects
# (they support .most_common) built from the tweet tokens in a cell that is
# missing from this export -- confirm they are defined earlier on a fresh run.
hashtag_top20 = hashtag_freq.most_common(20)
# find the most used @ tags
at_tags_top20 = at_tag_freq.most_common(20)

print(hashtag_top20)

``````
``````

[('#MAGA', 25), ('#USA🇺🇸', 25), ('#MAGA🇺🇸', 14), ('#FakeNews', 11), ('#TaxReform', 10), ('#HurricaneHarvey', 10), ('#UNGA', 9), ('#AmericaFirst🇺🇸', 8), ('#MAGA!', 7), ('#APEC2017', 7), ('#WeeklyAddress🇺🇸', 7), ('#WEF18', 6), ('#LESM', 6), ('#PuertoRico', 6), ('#MakeAmericaGreatAgain', 5), ('#MakeAmericaGreatAgain🇺🇸', 5), ('#Harvey', 5), ('#USA', 5), ('#ICYMI-', 5), ('#Fake', 4)]

``````
``````

In [6]:

# Bar chart of the 20 most frequent hashtags in the tweet data.
df = pd.DataFrame(hashtag_top20, columns=['Hashtag', 'frequency'])
ax = df.plot(kind='bar', x='Hashtag', legend=None, fontsize=15, figsize=(15, 5))
ax.set_ylabel('Frequency', fontsize=18)
ax.set_xlabel('Hashtag', fontsize=18)
ax.set_title('Most Common Hashtags', fontsize=15)
plt.show()

``````
``````

``````
``````

In [7]:

# Bar chart of the 20 most frequent @ mentions in the tweet data.
df = pd.DataFrame(at_tags_top20, columns=['@ Tag', 'frequency'])
ax = df.plot(kind='bar', x='@ Tag', legend=None, figsize=(15, 5))
ax.set_ylabel('Frequency', fontsize=18)
ax.set_xlabel('@ Tags', fontsize=18)
ax.set_title('Most Common @ Tags', fontsize=15)
plt.show()

``````
``````

``````

### Top used words for the twitter data and the federal document data

#### Define a list of words that have no meaning, such as 'a', 'the', and punctuation

``````

In [8]:

# use nltk's list of stopwords
stop_words = set(stopwords.words('english'))
# Extra tokens to drop: punctuation, twitter artifacts, and filler words.
# BUG FIX: the original list had `'like' '"'` (missing comma), which Python
# silently concatenates into the single token 'like"' -- so neither 'like'
# nor '"' was actually being filtered.  They are two entries now.
stop_words.update(['.', ',', 'get', 'going', 'one', 'amp', 'like', '"', '...',
                   "''", "'", "n't", '?', '!', ':', ';', '#', '@', '(', ')',
                   'https', '``', "'s", 'rt'])

``````

#### Make a list of hashtags and @ entities used in the twitter data

``````

In [9]:

# combine the hashtags and @ tags, flatten the list of lists, keep the unique items

``````

#### The federal document data also has some words that need to be removed. The words "Federal Register" and the date are at the top of every page so they should be removed. Also, words like 'shall', 'order', and 'act' are used quite a bit but don't convey much meaning, so I'm going to remove those words as well.

``````

In [10]:

# Corpus-specific stopwords for the Federal Register documents: page-header
# boilerplate (dates, volume/page codes like 'verdate'/'jkt'/'frm'/'sfmt'),
# day and month names, and high-frequency legal/government terms ('shall',
# 'order', 'act', ...) that carry little meaning for this analysis.
stop_fed_docs = ['united', 'states', '1','2','3','4','5','6','7','8','9','10', '11','12',
'13','14','15','16','17','18','19','20','21','22','23','24','25','26',
'27','28','29','30','31','2016', '2015','2014','federal','shall', '4790',
'national', '2017', 'order','president', 'presidential', 'sep',
'register','po','verdate', 'jkt','00000','frm','fmt','sfmt','vol',
'section','donald','act','america', 'executive','secretary', 'law',
'proclamation','81','day','including', 'code', '4705','authority', 'agencies',
'241001','americans','238001','year', 'amp','government','agency','hereby',
'people','public','person','state','american','two','nation', '82', 'sec',
'laws', 'policy','set','fr','appropriate','doc','new','filed','u.s.c',
'department','ii','also','office','country','within','memorandum',
'director', 'us', 'sunday','monday', 'tuesday','wednesday','thursday',
'friday', 'saturday','title','upon','constitution','support', 'vested',
'february', 'march', 'april','may','june','july','august', 'september',
'october', 'november', 'december', 'council','provide','consistent','pursuant',
'thereof','00001','documents','11:15', 'area','management',
'following','house','white','week','therefore','amended', 'continue',
'chapter','must','years', '00002', 'use','make','date','one',
'many','12', 'commission','provisions', 'every','u.s.','functions',
'follows','thousand','efforts','jan', 'trump','j.',
'applicable', '4717','whereof','hereunto', 'subject', 'report',
'3—', '3295–f7–p']

``````

#### Create functions that remove the stop words for each of the datasets

``````

In [11]:

def remove_from_fed_data(token_lst):
    """Filter a token list from the federal-document corpus.

    Drops corpus-specific stopwords (``stop_fed_docs``), general English /
    punctuation stopwords (``stop_words``), and one-letter tokens.  The
    membership check is case-insensitive; surviving tokens keep their
    original casing.

    (The export of this cell lost the function-body indentation, leaving
    invalid Python; the body is restored here with the same logic.)
    """
    filtered_lst = [word for word in token_lst
                    if word.lower() not in stop_fed_docs
                    and len(word) > 1
                    and word.lower() not in stop_words]
    return filtered_lst

``````
``````

In [12]:

def remove_stopwords(token_lst):
    """Filter a token list from the tweet corpus.

    Drops English/punctuation stopwords (``stop_words``) and one-letter
    tokens; the check is case-insensitive.

    NOTE(review): this cell was truncated in the export -- the ``def`` line
    and the comprehension's closing bracket are reconstructed from the usage
    described in the cell below ("apply the remove_stopwords function").
    Confirm against the original notebook.
    """
    # remove stopwords and one letter words
    filtered_lst = [word for word in token_lst
                    if word.lower() not in stop_words and len(word) > 1]
    return filtered_lst

``````

#### Remove all of the stop words from the tokenized twitter and document data

``````

In [13]:

# apply the remove_stopwords function to all of the tokenized twitter text
# NOTE(review): the twitter line is missing from this export -- a statement
# like `tweet_words = <tweets>.token_text.apply(remove_stopwords)` presumably
# ran here; confirm against the original notebook.
# apply the remove_stopwords function to all of the tokenized document text
document_words = fed_data.token_text.apply(lambda x: remove_from_fed_data(x))

# flatten the per-document word lists into one list
all_document_words =list(itertools.chain(*document_words))

``````

#### Count how many times each word is used for both datasets

``````

In [14]:

# create a dictionary using the Counter method, where the key is a word and the value is the number of times it was used
doc_freq = Counter(all_document_words)
# determine the top 30 words used in the federal document data
# (the original comment said "twitter data", but this Counter was built
# from all_document_words, i.e. the federal documents)
top_30_fed = doc_freq.most_common(30)

``````

### Plot the most used words for the twitter data and the federal document data

``````

In [15]:

# Bar chart of the 30 most frequent (filtered) words in the federal documents.
df = pd.DataFrame(top_30_fed, columns=['Federal Data', 'frequency'])
ax = df.plot(kind='bar', x='Federal Data', legend=None, figsize=(15, 5))
ax.set_ylabel('Frequency', fontsize=18)
ax.set_xlabel('Words', fontsize=18)
ax.set_title('Most Used Words that Occured in the Federal Data', fontsize=15)
plt.show()

``````
``````

``````
``````

In [16]:

# Bar chart of the 30 most frequent (filtered) words in the tweet data.
# NOTE(review): `top_30_tweet` comes from a cell missing in this export.
df = pd.DataFrame(top_30_tweet, columns=['Twitter Data', 'frequency'])
ax = df.plot(kind='bar', x='Twitter Data', legend=None, figsize=(15, 5))
ax.set_ylabel('Frequency', fontsize=18)
ax.set_xlabel('Words', fontsize=18)
ax.set_title('Most Used Words that Occured in the Twitter Data', fontsize=15)
plt.show()

``````
``````

``````

#### Determine all of the words that are used in both datasets

``````

In [17]:

# find the unique words in each dataset

``````

#### Create a dictionary with the unique joint words as keys

``````

In [18]:

# Initialise every shared word's count to zero (np.zeros keeps the original
# float dtype for the values).
zero_counts = np.zeros(len(joint_words))
joint_words_dict = {word: count for word, count in zip(joint_words, zero_counts)}

``````

#### Create dictionaries for both datasets with document frequency for each joint word

``````

In [19]:

# create a dictionary with a word as key, and a value = number of documents that contain the word for Twitter
# NOTE(review): this half of the cell was truncated in the export; it is
# reconstructed to mirror the Fed-data loop below.  The name of the
# tokenized-tweet Series (`tweet_data.token_text` here) is a guess --
# confirm it against the original notebook.
tweet_document_freq = joint_words_dict.copy()
for word in joint_words:
    for lst in tweet_data.token_text:
        if word in lst:
            # count each document at most... no cap here: a word occurring in
            # a doc increments once per doc because `in` is a membership test
            tweet_document_freq[word] = tweet_document_freq[word] + 1

# create a dictionary with a word as key, and a value = number of documents that contain the word for Fed Data
# (the export also lost this loop's indentation; restored here)
fed_document_freq = joint_words_dict.copy()
for word in joint_words:
    for lst in fed_data.token_text:
        if word in lst:
            fed_document_freq[word] = fed_document_freq[word] + 1

``````

#### Create dataframe with the word and the document percentage for each data set

``````

In [20]:

``````
``````

In [21]:

df.columns = ['Fed', 'Tweet']
# FIXME(review): len(df.Fed) is the number of joint WORDS (rows of df), not
# the number of documents, so these columns are not the "document percentage"
# the markdown above describes.  Dividing by the document count of each
# corpus (e.g. len(fed_data) == 281 for '% Fed') looks like the intended
# calculation -- confirm before relying on these figures.
df['% Fed'] = (df.Fed/len(df.Fed))*100
df['% Tweet'] = (df.Tweet/len(df.Tweet))*100

``````
``````

In [22]:

# Top 50 shared words, ranked once by federal-document usage and once by
# tweet usage; both frames keep the two percentage columns for comparison.
by_fed = df.sort_values(by='% Fed', ascending=False)
top_joint_fed = by_fed[['% Fed', '% Tweet']].head(50)
by_tweet = df.sort_values(by='% Tweet', ascending=False)
top_joint_tweet = by_tweet[['% Fed', '% Tweet']].head(50)

``````
``````

In [23]:

# Shared words most prevalent in the federal documents, both corpora shown.
ax = top_joint_fed.plot(kind='bar', figsize=(14, 5))
plt.show()

``````
``````

``````
``````

In [24]:

# Shared words most prevalent in the tweets, both corpora shown.
ax = top_joint_tweet.plot(kind='bar', figsize=(14, 5))
plt.show()

``````
``````

``````
``````

In [25]:

# Percentage-point gap between the two corpora for every shared word.
df['diff %'] = df['% Fed'].sub(df['% Tweet'])

``````
``````

In [26]:

# Shared words used at exactly the same rate in both corpora (gap of 0),
# ranked by prevalence; keep at most the top 50.
equal_usage = df[df['diff %'] == 0]
top_same = equal_usage.sort_values(by='% Fed', ascending=False)[0:50]

``````
``````

In [27]:

# Side-by-side bars for the equal-usage words' percentages.
ax = top_same[['% Fed', '% Tweet']].plot(kind='bar', figsize=(14, 5))
plt.show()

``````
``````

``````