In [1]:
"""
Initialization
"""
'''
Standard modules
'''
import os
import pickle
import csv
import time
from pprint import pprint
import json
import pymongo
import multiprocessing
import logging
import collections
'''
Analysis modules
'''
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render double resolution plot output for Retina screens
import matplotlib.pyplot as plt
import pandas as pd
'''
Custom modules
'''
import config
import utilities
import mongodb
import multiprocessing_workers
'''
R magic and packages
'''
# hide all RRuntimeWarnings
import warnings
warnings.filterwarnings('ignore')
# add home for R in anaconda on PATH sys env
os.environ['PATH'] += ':/opt/anaconda3/bin'
# load R magic
%load_ext rpy2.ipython
# load R packages
%R require(ggplot2)
'''
Misc
'''
nb_name = '20171024-daheng-prepare_ibm_tweets_news_data'
# all tweets with keyword 'ibm' in tweet_text field from ND IBM dataset
ibm_tweets_file = os.path.join(config.IBM_TWEETS_NEWS_DIR, 'ibm_tweets.json')
# based on ibm_tweets_file, with duplicate tweets (same or similar tweet_text) removed
ibm_unique_tweets_file = os.path.join(config.IBM_TWEETS_NEWS_DIR, 'ibm_unique_tweets.json')
# news sources list, manually selected by examining the most common news sources among valid urls embedded in ibm unique tweets
# selected_news_sources_lst = ['www.forbes.com', 'finance.yahoo.com', 'venturebeat.com',
# 'medium.com', 'www.engadget.com', 'alltheinternetofthings.com',
# 'www.zdnet.com', 'www.wsj.com', 'www.cnbc.com']
selected_news_sources_lst = ['venturebeat', 'engadget', 'wsj', 'cnbc']
# manually collected ibm news data
ibm_news_file = os.path.join(config.HR_DIR, 'selected_ibm_news.csv')
# all tweets related to the 'social_capital_ceo_palihapitiya_watson_joke' news by cnbc
palihapitiya_watson_joke_tweets_file = os.path.join(config.HR_DIR, 'palihapitiya_watson_joke_tweets.csv')
# manually tagged information for all tweets related to the 'social_capital_ceo_palihapitiya_watson_joke' news by cnbc
palihapitiya_watson_joke_tweets_tag_file = os.path.join(config.HR_DIR, 'palihapitiya_watson_joke_tweets_tag.csv')
Copy the mongodb.py and multiprocessing_workers.py files to the project root dir.
mongodb.py provides connections to the local MongoDB database. multiprocessing_workers.py queries the MongoDB database in multiple processes to save time (it needs modifications for each task). Native tweets are stored in the tweets_ek-2 db, tw_nt collection.
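For reference, a minimal sketch of what multiprocessing_workers.find_keywords_in_tweet_text is assumed to do. The actual implementation lives in multiprocessing_workers.py; the id-based partitioning scheme and field names below are assumptions for illustration only (pymongo and json come from the Initialization cell).
In [ ]:
"""
Hypothetical sketch, NOT the actual multiprocessing_workers implementation
"""
def sketch_find_keywords_in_tweet_text(db_name, col_name, process_ind, process_num,
                                        output_file, keywords_lst):
    # connect to the local MongoDB instance (mongodb.py is assumed to wrap this)
    client = pymongo.MongoClient()
    col = client[db_name][col_name]
    with open(output_file, 'w') as output_f:
        for tweet in col.find():
            # assumed partitioning: each process only handles its own slice of tweet ids
            if tweet['id'] % process_num != process_ind:
                continue
            # keep tweets whose text contains any keyword (substring match)
            if any(keyword in tweet['text'].lower() for keyword in keywords_lst):
                tweet.pop('_id', None)  # ObjectId is not JSON serializable
                output_f.write(json.dumps(tweet) + '\n')
    client.close()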
In [2]:
%%time
"""
Register
IBM_TWEETS_NEWS_DIR = os.path.join(DATA_DIR, 'ibm_tweets_news')
in config
"""
DB_NAME = 'tweets_ek-2'
COL_NAME = 'tw_nt'
if 0 == 1:
    multiprocessing.log_to_stderr(logging.DEBUG)

    '''
    Use multiprocessing to parse tweet_text field for "ibm" keyword
    '''
    procedure_name = 'tag_native_tweets_text_ibm'

    # set the number of processes to CPU count minus 1
    process_num = multiprocessing.cpu_count() - 1

    process_file_names_lst = ['{}-{}.json'.format(process_ind, procedure_name)
                              for process_ind in range(process_num)]
    process_files_lst = [os.path.join(config.IBM_TWEETS_NEWS_DIR, process_file_name)
                         for process_file_name in process_file_names_lst]

    jobs = []
    for process_ind in range(process_num):
        p = multiprocessing.Process(target=multiprocessing_workers.find_keywords_in_tweet_text,
                                    args=(DB_NAME, COL_NAME, process_ind, process_num, process_files_lst[process_ind], ['ibm']),
                                    name='Process-{}/{}'.format(process_ind, process_num))
        jobs.append(p)

    for job in jobs:
        job.start()

    for job in jobs:
        job.join()
In [2]:
%%time
"""
Merge all process files into a single file
Register
ibm_tweets_file = os.path.join(config.IBM_TWEETS_NEWS_DIR, 'ibm_tweets.json')
in Initialization section.
"""
if 0 == 1:
    '''
    Re-generate process file names
    '''
    procedure_name = 'tag_native_tweets_text_ibm'
    process_num = multiprocessing.cpu_count() - 1
    process_file_names_lst = ['{}-{}.json'.format(process_ind, procedure_name)
                              for process_ind in range(process_num)]
    process_files_lst = [os.path.join(config.IBM_TWEETS_NEWS_DIR, process_file_name)
                         for process_file_name in process_file_names_lst]

    with open(ibm_tweets_file, 'w') as output_f:
        for process_file in process_files_lst:
            with open(process_file, 'r') as input_f:
                for line in input_f:
                    output_f.write(line)
In [3]:
%%time
"""
Remove tweets with the same or similar tweet_text field
Register
ibm_unique_tweets_file = os.path.join(config.IBM_TWEETS_NEWS_DIR, 'ibm_unique_tweets.json')
in Initialization section.
"""
if 0 == 1:
    with open(ibm_unique_tweets_file, 'w') as output_f:
        with open(ibm_tweets_file, 'r') as input_f:
            unique_tweet_texts = set()
            for line in input_f:
                tweet_json = json.loads(line)
                tweet_text = tweet_json['text']
                cleaned_tweet_text = utilities.clean_tweet_text(tweet_text)
                if cleaned_tweet_text not in unique_tweet_texts:
                    unique_tweet_texts.add(cleaned_tweet_text)
                    output_f.write(line)
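utilities.clean_tweet_text is defined outside this notebook. A minimal sketch of the kind of normalization assumed here (so near-duplicate tweets such as retweets compare equal) could look like the following; this is an illustration, not the actual utilities implementation.
In [ ]:
import re

def sketch_clean_tweet_text(tweet_text):
    # hypothetical stand-in for utilities.clean_tweet_text
    text = tweet_text.lower()
    text = re.sub(r'https?://\S+', '', text)    # drop embedded URLs
    text = re.sub(r'(rt\s+)?@\w+:?', '', text)  # drop RT markers and mentions
    text = re.sub(r'[^a-z0-9# ]', ' ', text)    # drop punctuation
    return re.sub(r'\s+', ' ', text).strip()    # collapse whitespace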
In [2]:
"""
Check number of ibm tweets and number of ibm unique tweets
"""
if 1 == 1:
    with open(ibm_tweets_file, 'r') as f:
        ibm_tweets_num = sum([1 for line in f])
    print('Number of ibm tweets: {}'.format(ibm_tweets_num))

    with open(ibm_unique_tweets_file, 'r') as f:
        ibm_unique_tweets_num = sum([1 for line in f])
    print('Number of unique ibm tweets: {}'.format(ibm_unique_tweets_num))
In [4]:
"""
Check number of ibm unique tweets with URL
"""
if 1 == 1:
    with open(ibm_unique_tweets_file, 'r') as f:
        # if entities.urls field is not empty
        ibm_unique_tweets_url_num = sum([1 for line in f
                                         if json.loads(line)['entities']['urls']])
    print('Number of unique ibm tweets with URL: {}'.format(ibm_unique_tweets_url_num))
In [6]:
%%time
"""
Check most popular domain names in URLs embedded in ibm unique tweets
"""
if 1 == 1:
    url_domain_names_counter = collections.Counter()
    with open(ibm_unique_tweets_file, 'r') as f:
        for line in f:
            tweet_json = json.loads(line)
            # if tweet contains at least one url, entities.urls is not empty
            entities_urls = tweet_json['entities']['urls']
            if entities_urls:
                for entities_url in entities_urls:
                    # expanded_url field may contain full unshortened url
                    expanded_url = entities_url['expanded_url']
                    url_domain_name = expanded_url.split('/')[2]
                    url_domain_names_counter.update([url_domain_name])
    pprint(url_domain_names_counter.most_common(50))
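Note that expanded_url.split('/')[2] only yields the domain when the URL has the usual 'scheme://host/path' shape. If that assumption is a concern, urllib.parse from the standard library is a more robust alternative; this is a sketch, not what the notebook actually uses.
In [ ]:
from urllib.parse import urlparse

def sketch_url_domain(expanded_url):
    # netloc is empty when the URL has no scheme; fall back to the split heuristic
    netloc = urlparse(expanded_url).netloc
    return netloc if netloc else expanded_url.split('/')[0]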
In [7]:
%%time
"""
Re-compute most popular domain names in URLs embedded in ibm unique tweets
- ignore misc irrelevant website domain names
- ignore all shortened urls
Register
selected_news_sources_lst
in Initialization section.
"""
misc_irrelevant_websites_lst = ['twitter', 'youtube', 'youtu.be', 'amazon', 'paper.li', 'linkedin', 'lnkd.in', 'instagram']
shortened_url_identifiers_lst = ['bit.ly', 'ift.tt', 'dlvr.it', 'ow.ly', 'buff.ly', 'oal.lu', 'goo.gl', 'ln.is', 'gag.gl', 'fb.me', 'trap.it', 'ibm.co',
'ibm.biz', 'shar.es', 'crwd.fr', 'klou.tt', 'tek.io', 'owler.us', 'upflow.co', 'hubs.ly', 'zd.net', 'spr.ly', 'flip.it']
if 0 == 1:
    valid_url_domain_names_counter = collections.Counter()
    ignore_lst = misc_irrelevant_websites_lst + shortened_url_identifiers_lst
    with open(ibm_unique_tweets_file, 'r') as f:
        for line in f:
            tweet_json = json.loads(line)
            # if tweet contains at least one url, entities.urls is not empty
            entities_urls = tweet_json['entities']['urls']
            if entities_urls:
                for entities_url in entities_urls:
                    # expanded_url field may contain full unshortened url
                    expanded_url = entities_url['expanded_url']
                    # ignore all urls containing manually selected tokens
                    if not any(token in expanded_url for token in ignore_lst):
                        # ignore all shortened urls by HEURISTIC
                        if len(expanded_url.split('/')) > 4:
                            valid_url_domain_name = expanded_url.split('/')[2]
                            valid_url_domain_names_counter.update([valid_url_domain_name])
    pprint(valid_url_domain_names_counter.most_common(50))
In [2]:
%%time
"""
Check most common valid links
"""
misc_irrelevant_websites_lst = ['twitter', 'youtube', 'youtu.be', 'amazon', 'paper.li', 'linkedin', 'lnkd.in', 'instagram']
shortened_url_identifiers_lst = ['bit.ly', 'ift.tt', 'dlvr.it', 'ow.ly', 'buff.ly', 'oal.lu', 'goo.gl', 'ln.is', 'gag.gl', 'fb.me', 'trap.it', 'ibm.co',
'ibm.biz', 'shar.es', 'crwd.fr', 'klou.tt', 'tek.io', 'owler.us', 'upflow.co', 'hubs.ly', 'zd.net', 'spr.ly', 'flip.it']
if 0 == 1:
    urls_counter = collections.Counter()
    ignore_lst = misc_irrelevant_websites_lst + shortened_url_identifiers_lst
    with open(ibm_unique_tweets_file, 'r') as f:
        for line in f:
            tweet_json = json.loads(line)
            # if tweet contains at least one url, entities.urls is not empty
            entities_urls = tweet_json['entities']['urls']
            if entities_urls:
                for entities_url in entities_urls:
                    # expanded_url field may contain full unshortened url
                    expanded_url = entities_url['expanded_url']
                    # ignore all urls containing manually selected tokens
                    if not any(token in expanded_url for token in ignore_lst):
                        # ignore all shortened urls by HEURISTIC
                        if len(expanded_url.split('/')) > 4:
                            urls_counter.update([expanded_url])
    pprint(urls_counter.most_common(50))
In [2]:
%%time
"""
Check most common links to selected news sources
"""
if 0 == 1:
    selected_news_sources_urls_counter = collections.Counter()
    with open(ibm_tweets_file, 'r') as f:
        for line in f:
            tweet_json = json.loads(line)
            # if tweet contains at least one url, entities.urls is not empty
            entities_urls = tweet_json['entities']['urls']
            if entities_urls:
                for entities_url in entities_urls:
                    # expanded_url field may contain full unshortened url
                    expanded_url = entities_url['expanded_url']
                    # keep only url links to selected news sources
                    if any(selected_news_source in expanded_url for selected_news_source in selected_news_sources_lst):
                        selected_news_sources_urls_counter.update([expanded_url])
    pprint(selected_news_sources_urls_counter.most_common(50))
After examining the most common links to the selected news sources, the corresponding IBM news articles were manually collected into ibm_news_file.
Note: in the collected file, the paragraphs of each article are stored in the NEWS_DOC field, joined by the '::::::::' delimiter.
In [ ]:
"""
Register
ibm_news_file
in Initialization section.
"""
In [2]:
"""
Load in csv file
"""
if 1 == 1:
    ibm_news_df = pd.read_csv(filepath_or_buffer=ibm_news_file, sep='\t')
In [3]:
with pd.option_context('display.max_colwidth', 100, 'expand_frame_repr', False):
    display(ibm_news_df[['NEWS_DATE', 'NEWS_NAME', 'NEWS_DOC']])
In [4]:
"""
Print a sample news_doc paragraph by paragraph
"""
test_lst = ibm_news_df.iloc[10]['NEWS_DOC'].split('::::::::')
for ind, item in enumerate(test_lst):
    print('({})'.format(ind + 1))
    print(item)
New objective: analyze the tweets related to the 'social_capital_ceo_palihapitiya_watson_joke' news by cnbc.
In [2]:
%%time
"""
Find out all tweets related to the 'social_capital_ceo_palihapitiya_watson_joke' news
News URL 1: https://www.cnbc.com/2017/05/08/ibms-watson-is-a-joke-says-social-capital-ceo-palihapitiya.html
News URL 2: https://www.cnbc.com/2017/05/09/no-joke-id-like-to-see-my-firm-go-head-to-head-with-ibm-on-a-i-palihapitiya.html
Register
palihapitiya_watson_joke_tweets_file
in Initialization section
"""
if 0 == 1:
    target_news_keywords_lst = ['social capital', 'chamath', 'palihapitiya']

    target_tweets_dict_lst = []
    with open(ibm_unique_tweets_file, 'r') as f:
        for line in f:
            tweet_json = json.loads(line)
            tweet_text = tweet_json['text'].replace('\n', ' ').replace('\r', ' ')
            tweet_user_screen_name = tweet_json['user']['screen_name']
            tweet_created_at = utilities.parse_tweet_post_time(tweet_json['created_at'])
            if any(kw.lower() in tweet_text.lower() for kw in target_news_keywords_lst):
                target_tweet_dict = {'tweet_created_at': tweet_created_at,
                                     'tweet_user_screen_name': tweet_user_screen_name,
                                     'tweet_text': tweet_text}
                target_tweets_dict_lst.append(target_tweet_dict)

    target_tweets_df = pd.DataFrame(target_tweets_dict_lst)
    target_tweets_df.to_csv(path_or_buf=palihapitiya_watson_joke_tweets_file, sep='\t', index=True, quoting=csv.QUOTE_MINIMAL)
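utilities.parse_tweet_post_time is defined outside this notebook. Twitter's created_at strings look like 'Mon May 08 16:45:00 +0000 2017', so a minimal stand-in could be the following; this is an assumption about the helper, not its actual code.
In [ ]:
from datetime import datetime

def sketch_parse_tweet_post_time(created_at_str):
    # e.g. 'Mon May 08 16:45:00 +0000 2017' -> timezone-aware datetime
    return datetime.strptime(created_at_str, '%a %b %d %H:%M:%S %z %Y')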
In [2]:
"""
Read in data
"""
if 1 == 1:
    target_tweets_df = pd.read_csv(filepath_or_buffer=palihapitiya_watson_joke_tweets_file,
                                   sep='\t',
                                   index_col=0,
                                   parse_dates=['tweet_created_at'],
                                   quoting=csv.QUOTE_MINIMAL)
In [3]:
with pd.option_context('display.max_rows', 260, 'display.max_colwidth', 150, 'expand_frame_repr', False):
    display(target_tweets_df)
Manually tag each tweet related to the 'social_capital_ceo_palihapitiya_watson_joke' news for: the news article it refers to (tweet_news: 1 = first news, 2 = second news) and its sentiment intensity (tweet_sentiment: 2 = mild, 3 = harsh), as used in the analysis below.
In [ ]:
"""
Register
palihapitiya_watson_joke_tweets_tag_file
in Initialization section
"""
In [2]:
"""
Load data
"""
if 1 == 1:
    '''
    Read in all tweets related to the 'social_capital_ceo_palihapitiya_watson_joke' news
    '''
    target_tweets_df = pd.read_csv(filepath_or_buffer=palihapitiya_watson_joke_tweets_file,
                                   sep='\t',
                                   index_col=0,
                                   parse_dates=['tweet_created_at'],
                                   quoting=csv.QUOTE_MINIMAL)

    '''
    Read in manually tagged information for all tweets just loaded
    '''
    target_tweets_tag_df = pd.read_csv(filepath_or_buffer=palihapitiya_watson_joke_tweets_tag_file,
                                       sep='\t',
                                       index_col=0)

    '''
    Combine dfs and set index
    '''
    test_tweets_df = target_tweets_df.join(target_tweets_tag_df)
    test_tweets_df['tweet_index'] = test_tweets_df.index
    test_tweets_df = test_tweets_df.set_index('tweet_created_at')
In [3]:
"""
Check tweets related to second news
"""
if 1 == 1:
    test_df = test_tweets_df[test_tweets_df['tweet_news'] == 2]
    display(test_df)
In [4]:
"""
For tweets related to the first news,
build tmp dfs for mild-sentiment and harsh-sentiment tweets separately
"""
if 1 == 1:
    mild_cond = (test_tweets_df['tweet_news'] == 1) & (test_tweets_df['tweet_sentiment'] == 2)
    harsh_cond = (test_tweets_df['tweet_news'] == 1) & (test_tweets_df['tweet_sentiment'] == 3)

    mild_tweets_df = test_tweets_df[mild_cond]
    harsh_tweets_df = test_tweets_df[harsh_cond]
In [5]:
"""
Check tweets in mild sentiment
"""
print(mild_tweets_df['tweet_index'].count())
with pd.option_context('display.max_rows', 100, 'display.max_colwidth', 150, 'expand_frame_repr', False):
    display(mild_tweets_df)
In [6]:
"""
Check tweets in harsh sentiment
"""
print(harsh_tweets_df['tweet_index'].count())
with pd.option_context('display.max_rows', 100, 'display.max_colwidth', 150, 'expand_frame_repr', False):
    display(harsh_tweets_df)
In [7]:
"""
Bin mild/harsh tweets by 4H period and count numbers
"""
if 1 == 1:
    mild_tweets_bin_count = mild_tweets_df['tweet_index'].resample('4H', convention='start').count().rename('mild_tweets_count')
    harsh_tweets_bin_count = harsh_tweets_df['tweet_index'].resample('4H', convention='start').count().rename('harsh_tweets_count')

    tweets_count = pd.concat([mild_tweets_bin_count, harsh_tweets_bin_count], axis=1)[:24]
In [8]:
with pd.option_context('display.max_rows', 100, 'display.max_colwidth', 150, 'expand_frame_repr', False):
    display(tweets_count)
In [9]:
if 1 == 1:
    tweets_count.plot(kind="bar", figsize=(12, 6), title='# of mild/harsh tweets', stacked=True)
In [2]:
"""
Prepare df data
"""
if 1 == 1:
    '''
    Read in all tweets related to the 'social_capital_ceo_palihapitiya_watson_joke' news
    '''
    target_tweets_df = pd.read_csv(filepath_or_buffer=palihapitiya_watson_joke_tweets_file,
                                   sep='\t',
                                   index_col=0,
                                   parse_dates=['tweet_created_at'],
                                   quoting=csv.QUOTE_MINIMAL)

    '''
    Read in manually tagged information for all tweets just loaded
    '''
    target_tweets_tag_df = pd.read_csv(filepath_or_buffer=palihapitiya_watson_joke_tweets_tag_file,
                                       sep='\t',
                                       index_col=0)

    '''
    Join dfs and set index
    '''
    test_tweets_df = target_tweets_df.join(target_tweets_tag_df)
    test_tweets_df['tweet_index'] = test_tweets_df.index
    test_tweets_df = test_tweets_df.set_index('tweet_created_at')

    '''
    Bin mild/harsh tweets by 4H period and count numbers
    '''
    mild_tweets_df = test_tweets_df[(test_tweets_df['tweet_news'] == 1) & (test_tweets_df['tweet_sentiment'] == 2)]
    harsh_tweets_df = test_tweets_df[(test_tweets_df['tweet_news'] == 1) & (test_tweets_df['tweet_sentiment'] == 3)]
    second_news_mild_tweets_df = test_tweets_df[(test_tweets_df['tweet_news'] == 2) & (test_tweets_df['tweet_sentiment'] == 2)]

    mild_tweets_bin_count = mild_tweets_df['tweet_index'].resample('4H', label='start', loffset='2H 1S').count().rename('mild_tweets_count')
    harsh_tweets_bin_count = harsh_tweets_df['tweet_index'].resample('4H', label='start', loffset='2H 1S').count().rename('harsh_tweets_count')
    second_news_mild_tweets_bin_count = second_news_mild_tweets_df['tweet_index'].resample('4H', label='start', loffset='2H 1S').count().rename('second_news_mild_tweets_count')

    tweets_count = pd.concat([mild_tweets_bin_count, harsh_tweets_bin_count, second_news_mild_tweets_bin_count], axis=1)

    '''
    Misc operations
    '''
    tweets_count = tweets_count.fillna(0)
    tweets_count['mild_tweets_count'] = tweets_count['mild_tweets_count'].astype(int)
    tweets_count['harsh_mild_diff'] = tweets_count['harsh_tweets_count'] - tweets_count['mild_tweets_count']
    tweets_count['mild_tweets_count_neg'] = - tweets_count['mild_tweets_count']
    tweets_count['second_news_mild_tweets_count'] = tweets_count['second_news_mild_tweets_count'].astype(int)
    tweets_count['second_news_mild_tweets_count_neg'] = - tweets_count['second_news_mild_tweets_count']

    tweets_count.reset_index(drop=False, inplace=True)
    tweets_r_df = tweets_count
In [3]:
tweets_r_df
Out[3]:
In [4]:
%%R -i tweets_r_df
#
# Prepare data
#
# cast data types
tweets_r_df$tweet_created_at <- as.POSIXct(strptime(tweets_r_df$tweet_created_at, format="%Y-%m-%d %H:%M:%S"))
#
# Plot and tweak histogram
#
# initialize new plot
# cols <- c('Harsh'='red', 'Mild'='blue', 'diff_line'='black')
plt <- ggplot(data=tweets_r_df, aes(x=tweet_created_at)) +
# layers of ref lines for publishing times of first and second news
geom_vline(xintercept=as.POSIXct(strptime('2017-05-08 16:45:00', format="%Y-%m-%d %H:%M:%S")), linetype='dashed', color='grey80') +
geom_vline(xintercept=as.POSIXct(strptime('2017-05-09 09:55:00', format="%Y-%m-%d %H:%M:%S")), linetype='dashed', color='grey80') +
# layer of geom_bar for harsh tweets
geom_bar(aes(y=harsh_tweets_count, fill='Harsh'), stat='identity', alpha=0.65) +
# layer of geom_rect for highlighting largest bar
geom_rect(aes(xmin=as.POSIXct(strptime('2017-05-09 12:15:00', format="%Y-%m-%d %H:%M:%S")),
xmax=as.POSIXct(strptime('2017-05-09 15:45:00', format="%Y-%m-%d %H:%M:%S")),
ymin=0, ymax=27), fill=NA, color="red", size=0.7, alpha=1) +
# layer of geom_bar for mild tweets
geom_bar(aes(y=mild_tweets_count_neg, fill='Mild'), stat='identity', alpha=0.65) +
# layer of geom_line for diff between harsh tweets and mild tweets
geom_line(aes(x=(tweet_created_at), y=harsh_mild_diff), stat='identity', linetype='solid') +
# layer of geom_bar for a few tweets related to second news in mild sentiment
geom_bar(aes(y=second_news_mild_tweets_count_neg), stat='identity', alpha=0.65, fill='green') +
# x-axis and y-axis
scale_x_datetime(name = 'Time',
date_labels = "%b %d %I%p",
date_breaks = "4 hour",
expand = c(0, 0),
limits = c(as.POSIXct(strptime('2017-05-08 12:00:00', format="%Y-%m-%d %H:%M:%S")),
as.POSIXct(strptime('2017-05-10 19:00:00', format="%Y-%m-%d %H:%M:%S")))) +
scale_y_continuous(name = 'Number of users',
breaks = c(-10, -5, 0, 5, 10, 15, 20, 25),
labels = c('10', '5', '0', '5', '10', '15', '20', '25'),
limits = c(-15, 30)) +
# legend
scale_fill_manual(name = "Sentiment Intensity",
values = c('Harsh'='red', 'Mild'='blue')) +
# theme
theme(panel.background = element_blank(),
axis.line = element_line(color='black'),
panel.grid.major.y = element_line(color='grey80'),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle=90),
legend.position = 'top')
#
# Output figure
#
ggsave('./fig/ibm_joke_or_not.png', plt, height=5, width=5, dpi=200)