In [1]:
# imports
import pandas as pd
import numpy as np
import pickle
import re
In [2]:
# load federal document data from pickle file
fed_reg_data = r'data/fed_reg_data.pickle'
fed_data = pd.read_pickle(fed_reg_data)
# load twitter data from pickle file
twitter_file_path = r'data/twitter_01_20_17_to_3-2-18.pickle'
twitter_data = pd.read_pickle(twitter_file_path)
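As a quick sanity check on the loaded frames (a minimal sketch; the exact columns depend on how the pickles were built, but later cells rely on `hash_tags`, `@_tags`, and `text_tokenized` in the Twitter frame and `token_text` in the federal frame):
# inspect the shape and column names of each DataFrame before processing
print(fed_data.shape, list(fed_data.columns))
print(twitter_data.shape, list(twitter_data.columns))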
In [3]:
len(fed_data)
Out[3]:
In [4]:
# imports
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import itertools
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
In [5]:
# find the most used hashtags
hashtag_freq = Counter(list(itertools.chain(*(twitter_data.hash_tags))))
hashtag_top20 = hashtag_freq.most_common(20)
# find the most used @ tags
at_tag_freq = Counter(list(itertools.chain(*(twitter_data['@_tags']))))
at_tags_top20 = at_tag_freq.most_common(20)
print(hashtag_top20)
In [6]:
# frequency plot for the most used hashtags
df = pd.DataFrame(hashtag_top20, columns=['Hashtag', 'frequency'])
df.plot(kind='bar', x='Hashtag', legend=None, fontsize=15, figsize=(15, 5))
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Hashtag', fontsize=18)
plt.title('Most Common Hashtags', fontsize=15)
plt.show()
In [7]:
# frequency plot for the most used @ tags
df = pd.DataFrame(at_tags_top20, columns=['@ Tag', 'frequency'])
df.plot(kind='bar', x='@ Tag', legend=None, figsize=(15, 5))
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('@ Tags', fontsize=18)
plt.title('Most Common @ Tags', fontsize=15)
plt.show()
In [8]:
# use nltk's list of stopwords
stop_words = set(stopwords.words('english'))
# add punctuation and other common, low-information tokens to the stopwords
stop_words.update(['.', ',', 'get', 'going', 'one', 'amp', 'like', '"', '...', "''", "'", "n't", '?', '!', ':', ';', '#', '@', '(', ')', 'https', '``', "'s", 'rt'])
In [9]:
# combine the hashtags and @ tags, flatten the list of lists, keep the unique items
stop_twitter = set(list(itertools.chain(*(twitter_data.hash_tags + twitter_data['@_tags']))))
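Adding the two Series concatenates each tweet's hashtag list and @-tag list element-wise before flattening; a minimal illustration with made-up tag values:
# list values in aligned Series rows are concatenated row by row
example = pd.Series([['maga']]) + pd.Series([['foxnews']])
print(example[0])  # ['maga', 'foxnews']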
In [10]:
stop_fed_docs = ['united', 'states', '1','2','3','4','5','6','7','8','9','10', '11','12',
'13','14','15','16','17','18','19','20','21','22','23','24','25','26',
'27','28','29','30','31','2016', '2015','2014','federal','shall', '4790',
'national', '2017', 'order','president', 'presidential', 'sep',
'register','po','verdate', 'jkt','00000','frm','fmt','sfmt','vol',
'section','donald','act','america', 'executive','secretary', 'law',
'proclamation','81','day','including', 'code', '4705','authority', 'agencies',
'241001','americans','238001','year', 'amp','government','agency','hereby',
'people','public','person','state','american','two','nation', '82', 'sec',
'laws', 'policy','set','fr','appropriate','doc','new','filed','u.s.c',
'department','ii','also','office','country','within','memorandum',
'director', 'us', 'sunday','monday', 'tuesday','wednesday','thursday',
'friday', 'saturday','title','upon','constitution','support', 'vested',
'part', 'month', 'subheading', 'foreign','general','january',
'february', 'march', 'april','may','june','july','august', 'september',
'october', 'november', 'december', 'council','provide','consistent','pursuant',
'thereof','00001','documents','11:15', 'area','management',
'following','house','white','week','therefore','amended', 'continue',
'chapter','must','years', '00002', 'use','make','date','one',
'many','12', 'commission','provisions', 'every','u.s.','functions',
'made','hand','necessary', 'witness','time','otherwise', 'proclaim',
'follows','thousand','efforts','jan', 'trump','j.',
'applicable', '4717','whereof','hereunto', 'subject', 'report',
'3—', '3295–f7–p']
In [11]:
def remove_from_fed_data(token_lst):
    # remove stopwords and one-letter words
    filtered_lst = [word for word in token_lst
                    if word.lower() not in stop_fed_docs
                    and len(word) > 1
                    and word.lower() not in stop_words]
    return filtered_lst
In [12]:
def remove_from_twitter_data(token_lst):
    # remove stopwords and one-letter words
    filtered_lst = [word for word in token_lst
                    if word.lower() not in stop_words
                    and len(word) > 1
                    and word.lower() not in stop_twitter]
    return filtered_lst
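The two removal functions differ only in which corpus-specific stopword set they consult, so a single helper could cover both; a sketch (the name remove_stopwords and the extra_stops parameter are illustrative, not from the original notebook):
def remove_stopwords(token_lst, extra_stops):
    # drop nltk stopwords, corpus-specific stopwords, and one-letter tokens
    return [word for word in token_lst
            if word.lower() not in stop_words
            and word.lower() not in extra_stops
            and len(word) > 1]

# remove_stopwords(tokens, stop_fed_docs) behaves like remove_from_fed_data(tokens)
# remove_stopwords(tokens, stop_twitter) behaves like remove_from_twitter_data(tokens)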
In [13]:
# apply remove_from_twitter_data to all of the tokenized tweet text
twitter_words = twitter_data.text_tokenized.apply(remove_from_twitter_data)
# apply remove_from_fed_data to all of the tokenized document text
document_words = fed_data.token_text.apply(remove_from_fed_data)
# flatten each of the word lists into one list
all_twitter_words = list(itertools.chain(*twitter_words))
all_document_words = list(itertools.chain(*document_words))
In [14]:
# create a Counter for each corpus, where the key is a word and the value is the number of times it was used
twitter_freq = Counter(all_twitter_words)
doc_freq = Counter(all_document_words)
# determine the top 30 words used in each corpus
top_30_tweet = twitter_freq.most_common(30)
top_30_fed = doc_freq.most_common(30)
In [15]:
# frequency plot for the most used words in the federal document data
df = pd.DataFrame(top_30_fed, columns=['Federal Data', 'frequency'])
df.plot(kind='bar', x='Federal Data', legend=None, figsize=(15, 5))
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Words', fontsize=18)
plt.title('Most Used Words that Occurred in the Federal Data', fontsize=15)
plt.show()
In [16]:
# frequency plot for the most used words in the twitter data
df = pd.DataFrame(top_30_tweet, columns=['Twitter Data', 'frequency'])
df.plot(kind='bar', x='Twitter Data', legend=None, figsize=(15, 5))
plt.ylabel('Frequency', fontsize=18)
plt.xlabel('Words', fontsize=18)
plt.title('Most Used Words that Occurred in the Twitter Data', fontsize=15)
plt.show()
In [17]:
# find the words that appear in both datasets (the intersection of the two vocabularies)
joint_words = list((set(all_document_words)).intersection(all_twitter_words))
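The intersection keeps only the shared vocabulary; the same set operations can also show what each corpus uses that the other does not (a short sketch, not part of the original analysis):
# words that appear in only one of the two corpora
fed_only = set(all_document_words) - set(all_twitter_words)
twitter_only = set(all_twitter_words) - set(all_document_words)
print(len(joint_words), len(fed_only), len(twitter_only))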
In [18]:
# make an array of zeros, one entry per joint word
values = np.zeros(len(joint_words))
# create a dictionary mapping each joint word to an initial count of zero
joint_words_dict = dict(zip(joint_words, values))
In [19]:
# for each joint word, count the number of tweets that contain it
twitter_document_freq = joint_words_dict.copy()
for word in joint_words:
    for lst in twitter_data.text_tokenized:
        if word in lst:
            twitter_document_freq[word] = twitter_document_freq[word] + 1
# for each joint word, count the number of federal documents that contain it
fed_document_freq = joint_words_dict.copy()
for word in joint_words:
    for lst in fed_data.token_text:
        if word in lst:
            fed_document_freq[word] = fed_document_freq[word] + 1
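The nested loops above rescan every document once per joint word. An equivalent but much faster approach (a sketch, assuming the same tokenized columns; the helper name is illustrative) walks each corpus once and counts set membership:
joint_set = set(joint_words)

def document_frequency(token_lists, vocabulary):
    # for each word in vocabulary, count how many documents contain it
    freq = Counter()
    for tokens in token_lists:
        freq.update(set(tokens) & vocabulary)
    return freq

# twitter_document_freq = document_frequency(twitter_data.text_tokenized, joint_set)
# fed_document_freq = document_frequency(fed_data.token_text, joint_set)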
In [20]:
# combine the two document-frequency dictionaries into one DataFrame with one row per word
df = pd.DataFrame([fed_document_freq, twitter_document_freq]).T
In [21]:
df.columns = ['Fed', 'Tweet']
# express each count as a percentage of the number of documents in each corpus
df['% Fed'] = (df.Fed / len(fed_data)) * 100
df['% Tweet'] = (df.Tweet / len(twitter_data)) * 100
In [22]:
# the 50 shared words with the highest document frequency in each corpus
top_joint_fed = df[['% Fed', '% Tweet']].sort_values(by='% Fed', ascending=False)[0:50]
top_joint_tweet = df[['% Fed', '% Tweet']].sort_values(by='% Tweet', ascending=False)[0:50]
In [23]:
top_joint_fed.plot.bar(figsize=(14,5))
plt.show()
In [24]:
top_joint_tweet.plot.bar(figsize=(14,5))
plt.show()
In [25]:
# difference between the two document-frequency percentages
df['diff %'] = df['% Fed'] - df['% Tweet']
In [26]:
# shared words that occur in the same share of documents in both corpora
top_same = df[df['diff %'] == 0].sort_values(by='% Fed', ascending=False)[0:50]
In [27]:
top_same[['% Fed', '% Tweet']].plot.bar(figsize=(14,5))
plt.show()