In [1]:
"""
Initialization
"""
'''
Standard modules
'''
import os
import pickle
import csv
import time
from pprint import pprint
'''
Analysis modules
'''
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render double resolution plot output for Retina screens
import matplotlib.pyplot as plt
import pandas as pd
'''
Custom modules
'''
import config
import utilities
'''
Misc
'''
# Identifier string for this notebook. It is not referenced by any cell visible
# here; presumably used to name outputs elsewhere -- TODO confirm.
nb_name = '20171021-daheng-eda_topic_news_tweets_happiness'
In [2]:
%%time
"""
Compute h_avg scores for each topic_news and topic_tweets doc
"""
# Output table: one row per topic with the number of news/tweets docs and the
# h_avg score of the merged news doc and the merged tweets doc.
topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')

# Set to True to force a recomputation even when the CSV already exists.
# The computation also runs automatically when the CSV is missing, so that a
# fresh "Restart & Run All" can produce the file the following cells read.
RECOMPUTE_TOPICS_H_AVG = False

if RECOMPUTE_TOPICS_H_AVG or not os.path.isfile(topics_h_avg_csv_file):
    topics_h_avg_lst = []

    # Recover pkl info.
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced by this project.
    with open(config.TOPICS_NEWS_SHED_WORDS_FREQ_DICT_PKL, 'rb') as f:
        topics_news_shed_words_freq_dict = pickle.load(f)
    with open(config.IND_HAPPS_DICT_PKL, 'rb') as f:
        ind_happs_dict = pickle.load(f)

    topics_num = len(config.MANUALLY_SELECTED_TOPICS_LST)
    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        topic_dict = {'topic_ind': topic_ind, 'topic_name': topic['name']}

        localtime = time.asctime(time.localtime(time.time()))
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind + 1,
                                                           topics_num,
                                                           topic['name'],
                                                           localtime))

        # Compute h_avg score for the topic_news doc: merge the per-item word
        # frequency dicts of this topic, then score the merged dict.
        topic_news_shed_words_freq_dict = topics_news_shed_words_freq_dict[topic_ind]
        topic_dict['news_num'] = len(topic_news_shed_words_freq_dict)
        topic_news_merged_freq_dict = utilities.merge_shed_words_freq_dicts(
            topic_news_shed_words_freq_dict.values())
        topic_dict['news_h_avg'] = utilities.compute_h_score(topic_news_merged_freq_dict, ind_happs_dict)

        # Compute h_avg score for the topic_tweets doc: same procedure, but the
        # tweets frequency dict is stored in one pickle file per topic index.
        topic_tweets_shed_words_freq_dict_pkl_file = os.path.join(
            config.TOPICS_TWEETS_SHED_WORDS_FREQ_DICT_PKLS_DIR,
            '{}.updated.dict.pkl'.format(topic_ind))
        with open(topic_tweets_shed_words_freq_dict_pkl_file, 'rb') as f:
            topic_tweets_shed_words_freq_dict = pickle.load(f)
        topic_dict['tweets_num'] = len(topic_tweets_shed_words_freq_dict)
        topic_tweets_merged_freq_dict = utilities.merge_shed_words_freq_dicts(
            topic_tweets_shed_words_freq_dict.values())
        topic_dict['tweets_h_avg'] = utilities.compute_h_score(topic_tweets_merged_freq_dict, ind_happs_dict)

        topics_h_avg_lst.append(topic_dict)

    # Build the frame once from the list of records and persist it as TSV.
    topics_h_avg_df = pd.DataFrame(topics_h_avg_lst)
    topics_h_avg_df.to_csv(path_or_buf=topics_h_avg_csv_file,
                           columns=['topic_ind', 'topic_name', 'news_num', 'news_h_avg', 'tweets_num', 'tweets_h_avg'],
                           sep='\t',
                           quoting=csv.QUOTE_MINIMAL,
                           header=True,
                           index=False)
    print('Done')
In [3]:
"""
Load data
"""
# Read the tab-separated per-topic h_avg table written by the compute cell.
topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')
tsv_read_opts = {'sep': '\t', 'quoting': csv.QUOTE_MINIMAL}
topics_h_avg_df = pd.read_csv(topics_h_avg_csv_file, **tsv_read_opts)

# Attach each topic's category. Column assignment aligns on the row index, so
# this presumes the CSV rows are in the same order as the configured topic
# list -- TODO confirm.
topic_lst_df = pd.DataFrame(config.MANUALLY_SELECTED_TOPICS_LST)
topics_h_avg_df['category'] = topic_lst_df['category']
In [4]:
# Columns to show, in display order.
dis_lst = ['topic_ind', 'news_num', 'news_h_avg', 'tweets_num', 'tweets_h_avg', 'topic_name', 'category']

# Apply the display options only for the duration of this one table.
with pd.option_context('display.max_columns', 7,
                       'display.max_colwidth', 50,
                       'expand_frame_repr', False):
    topics_h_avg_view = topics_h_avg_df[dis_lst]
    display(topics_h_avg_view)
In [5]:
"""
Plot distribution of news_h_avg and tweets_h_avg
"""
# Prepare data: one box per platform, across all topics.
h_avg_series = [topics_h_avg_df[col] for col in ('news_h_avg', 'tweets_h_avg')]

# Plot on an explicit Axes object.
fig, ax = plt.subplots(figsize=(9, 6))
bp = ax.boxplot(h_avg_series, notch=False, sym='k+', vert=True)

# Restyle the box plot artists.
for part in ('boxes', 'whiskers'):
    plt.setp(bp[part], color='black')
plt.setp(bp['fliers'], color='red', marker='+', markersize=5)

# Titles and labels.
ax.set_xticklabels(['News', 'Tweets'])
ax.set_title('Distribution of news and tweets h_avg scores',
             fontdict={'weight': 'bold', 'size': 'x-large'})
label_fontdict = {'size': 'large'}
ax.set_xlabel('Platform', fontdict=label_fontdict)
ax.set_ylabel('Happiness scores', fontdict=label_fontdict)

# Light horizontal grid, drawn underneath the boxes.
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
ax.set_axisbelow(True)

# Annotate each median line with its value, anchored at the line's right end.
for median_line in bp['medians']:
    x_right, y_median = median_line.get_xydata()[1]
    ax.text(x_right, y_median, ' {:.4}'.format(y_median),
            horizontalalignment='left', verticalalignment='center')
In [6]:
"""
Plot distribution of news_h_avg and tweets_h_avg by category
"""
# Prepare data: for every category, one box for news and one for tweets.
# The tick labels are generated in the same loop as the data so the two
# sequences cannot drift out of sync when a category is added or reordered.
categories = ['politics', 'social', 'entertainment', 'tragedy']
platforms = [('news_h_avg', 'News'), ('tweets_h_avg', 'Tweets')]

data = []
tick_labels = []
for cat in categories:
    cat_df = topics_h_avg_df[topics_h_avg_df['category'] == cat]
    for h_avg_col, platform_name in platforms:
        data.append(cat_df[h_avg_col])
        # e.g. 'Politics News', 'Politics Tweets', ...
        tick_labels.append('{} {}'.format(cat.capitalize(), platform_name))

# Plot on an explicit Axes object.
fig, ax = plt.subplots(figsize=(15, 6))
bp = ax.boxplot(data, notch=False, sym='k+', vert=True)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+', markersize=5)

ax.set_xticklabels(tick_labels)

# Title and x-label name the per-category breakdown so that this figure is
# distinguishable from the overall news-vs-tweets box plot.
title_fontdict = {'weight': 'bold', 'size': 'x-large'}
ax.set_title('Distribution of news and tweets h_avg scores by category', fontdict=title_fontdict)
label_fontdict = {'size': 'large'}
ax.set_xlabel('Category and platform', fontdict=label_fontdict)
ax.set_ylabel('Happiness scores', fontdict=label_fontdict)

# Add a horizontal grid to the plot, but make it very light in color and
# keep it underneath the boxes.
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
ax.set_axisbelow(True)

# Add the median value as text next to each median line.
for line in bp['medians']:
    x, y = line.get_xydata()[1]  # right end point of the median line
    ax.text(x, y, ' {:.4}'.format(y), horizontalalignment='left', verticalalignment='center')
In [7]:
"""
Load data
"""
# Paths of the two per-topic h_avg tables compared below.
topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.csv')
topics_h_avg_updated_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')

# NOTE: from here on `topics_h_avg_df` is rebound to the table read from
# 'topics_h_avg.csv'; the '.updated.csv' table lives in `topics_h_avg_updated_df`.
tsv_read_opts = {'sep': '\t', 'quoting': csv.QUOTE_MINIMAL}
topics_h_avg_df = pd.read_csv(topics_h_avg_csv_file, **tsv_read_opts)
topics_h_avg_updated_df = pd.read_csv(topics_h_avg_updated_csv_file, **tsv_read_opts)

# Attach the category of each configured topic to both tables; the column
# assignment aligns on the row index.
topic_lst_df = pd.DataFrame(config.MANUALLY_SELECTED_TOPICS_LST)
for h_avg_frame in (topics_h_avg_df, topics_h_avg_updated_df):
    h_avg_frame['category'] = topic_lst_df['category']
In [8]:
"""
Pearson corr
"""
# Correlation matrix of news vs. tweets h_avg for each of the two tables.
h_avg_cols = ['news_h_avg', 'tweets_h_avg']
captioned_frames = (
    ('Original:', topics_h_avg_df),
    ('After removal duplicate tweets:', topics_h_avg_updated_df),
)
for caption, h_avg_frame in captioned_frames:
    print(caption)
    display(h_avg_frame[h_avg_cols].corr(method='pearson'))
In [9]:
"""
Spearman corr
"""
# Rank correlation matrix of news vs. tweets h_avg for each of the two tables.
h_avg_cols = ['news_h_avg', 'tweets_h_avg']
captioned_frames = (
    ('Original:', topics_h_avg_df),
    ('After removal duplicate tweets:', topics_h_avg_updated_df),
)
for caption, h_avg_frame in captioned_frames:
    print(caption)
    display(h_avg_frame[h_avg_cols].corr(method='spearman'))
In [10]:
"""
Pearson corr by category
"""
# Per-category Pearson correlation between news and tweets h_avg scores,
# reported for both tables.
#
# Each table is filtered with a boolean mask built from ITS OWN 'category'
# column. Masking one frame with another frame's column only works while both
# frames happen to share the same row index and category values.
print('Pearson corrs')
captioned_frames = (
    ('Original:', topics_h_avg_df),
    ('After removal duplicate tweets:', topics_h_avg_updated_df),
)
for caption, h_avg_frame in captioned_frames:
    print(caption)
    for cat in ['politics', 'social', 'entertainment', 'tragedy']:
        cat_df = h_avg_frame[h_avg_frame['category'] == cat]
        cat_df_corr = cat_df[['news_h_avg', 'tweets_h_avg']].corr(method='pearson')
        print('Topics category: {}'.format(cat))
        print(cat_df_corr)
        print('')
In [11]:
"""
Spearman corr by category
"""
# Per-category Spearman rank correlation between news and tweets h_avg scores,
# reported for both tables.
#
# Each table is filtered with a boolean mask built from ITS OWN 'category'
# column. Masking one frame with another frame's column only works while both
# frames happen to share the same row index and category values.
print('Spearman corrs')
captioned_frames = (
    ('Original:', topics_h_avg_df),
    ('After removal duplicate tweets:', topics_h_avg_updated_df),
)
for caption, h_avg_frame in captioned_frames:
    print(caption)
    for cat in ['politics', 'social', 'entertainment', 'tragedy']:
        cat_df = h_avg_frame[h_avg_frame['category'] == cat]
        cat_df_corr = cat_df[['news_h_avg', 'tweets_h_avg']].corr(method='spearman')
        print('Topics category: {}'.format(cat))
        print(cat_df_corr)
        print('')