Exploratory data analysis on happiness scores of topic_news and topic_tweets docs

Last modified: 2017-10-22

Roadmap

  1. Check happiness scores of topic_news and topic_tweets docs
  2. Check word coverage of shed words on topic_news and topic_tweets docs
  3. Check any topics of special interest

Steps


In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import csv
import time
from pprint import pprint

'''
Analysis modules
'''
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render double resolution plot output for Retina screens 
import matplotlib.pyplot as plt
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
nb_name = '20171021-daheng-eda_topic_news_tweets_happiness'

Check happiness scores of topic_news and topic_tweets docs

Avg. happiness scores
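Throughout this notebook, the h_avg score of a document collection is assumed to be the standard frequency-weighted average happiness over the shed (sentiment dictionary) words it contains, i.e.

$$ h_{avg} = \frac{\sum_{w} h(w)\, f(w)}{\sum_{w} f(w)} $$

where $h(w)$ is the individual happiness score of word $w$ (from ind_happs_dict) and $f(w)$ is its frequency in the merged topic document. The exact definition lives in utilities.compute_h_score.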


In [2]:
%%time
"""
Compute h_avg scores for each topic_news and topic_tweets doc
"""

topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')

if 0 == 1:  # one-off computation; flip to 1 == 1 to recompute and rewrite the CSV below
    topics_h_avg_lst = []
    
    '''
    Recover pkl info
    '''
    with open(config.TOPICS_NEWS_SHED_WORDS_FREQ_DICT_PKL, 'rb') as f:
        topics_news_shed_words_freq_dict = pickle.load(f)
    
    with open(config.IND_HAPPS_DICT_PKL, 'rb') as f:
        ind_happs_dict = pickle.load(f)
    
    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        topic_dict = {'topic_ind': topic_ind, 'topic_name': topic['name']}
        localtime = time.asctime(time.localtime(time.time()))
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind+1,
                                                           len(config.MANUALLY_SELECTED_TOPICS_LST),
                                                           topic['name'],
                                                           localtime))
        
        '''
        Compute h_avg score for topic_news doc
        '''
        topic_news_shed_words_freq_dict = topics_news_shed_words_freq_dict[topic_ind]
        
        topic_dict['news_num'] = len(topic_news_shed_words_freq_dict.keys())
        
        topic_news_merged_freq_dict = utilities.merge_shed_words_freq_dicts(topic_news_shed_words_freq_dict.values())
        topic_news_h_avg = utilities.compute_h_score(topic_news_merged_freq_dict, ind_happs_dict)
        
        topic_dict['news_h_avg'] = topic_news_h_avg
        
        '''
        Compute h_avg score for topic_tweets doc
        '''
        topic_tweets_shed_words_freq_dict_pkl_file = os.path.join(config.TOPICS_TWEETS_SHED_WORDS_FREQ_DICT_PKLS_DIR,
                                                                  '{}.updated.dict.pkl'.format(topic_ind))
        with open(topic_tweets_shed_words_freq_dict_pkl_file, 'rb') as f:
            topic_tweets_shed_words_freq_dict = pickle.load(f)
        
        topic_dict['tweets_num'] = len(topic_tweets_shed_words_freq_dict.keys())
        
        topic_tweets_merged_freq_dict = utilities.merge_shed_words_freq_dicts(topic_tweets_shed_words_freq_dict.values())
        topic_tweets_h_avg = utilities.compute_h_score(topic_tweets_merged_freq_dict, ind_happs_dict)
        
        topic_dict['tweets_h_avg'] = topic_tweets_h_avg
        
        topics_h_avg_lst.append(topic_dict)
    
    topics_h_avg_df = pd.DataFrame(topics_h_avg_lst)
    topics_h_avg_df.to_csv(path_or_buf=topics_h_avg_csv_file,
                           columns=['topic_ind', 'topic_name', 'news_num', 'news_h_avg', 'tweets_num', 'tweets_h_avg'],
                           sep='\t',
                           quoting=csv.QUOTE_MINIMAL,
                           header=True,
                           index=False)
    print('Done')


(1/51) processing topic: Hillary_Clinton_email_controversy ... Mon Oct 23 16:25:46 2017
(2/51) processing topic: Iran_nuclear_deal ... Mon Oct 23 16:25:53 2017
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Mon Oct 23 16:26:19 2017
(4/51) processing topic: Ukraine_cease_fire ... Mon Oct 23 16:26:25 2017
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Mon Oct 23 16:26:29 2017
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Mon Oct 23 16:26:30 2017
(7/51) processing topic: CIA_Torture_Report ... Mon Oct 23 16:26:31 2017
(8/51) processing topic: Obama_cybersecurity_plan ... Mon Oct 23 16:26:33 2017
(9/51) processing topic: DHS_funding_issue ... Mon Oct 23 16:26:37 2017
(10/51) processing topic: US_Cuba_relationship ... Mon Oct 23 16:26:39 2017
(11/51) processing topic: 2015_CPAC ... Mon Oct 23 16:26:54 2017
(12/51) processing topic: Iraq_free_ISIS_Tikrit ... Mon Oct 23 16:26:56 2017
(13/51) processing topic: Nigeria_Boko_Haram_terrorists ... Mon Oct 23 16:27:01 2017
(14/51) processing topic: Ferguson_unrest ... Mon Oct 23 16:27:04 2017
(15/51) processing topic: Hong_Kong_protest ... Mon Oct 23 16:27:37 2017
(16/51) processing topic: Sony_cyberattack ... Mon Oct 23 16:27:39 2017
(17/51) processing topic: Bill_Cosby_sexual_assault_allegation ... Mon Oct 23 16:27:57 2017
(18/51) processing topic: SpaceX_rocket_landing ... Mon Oct 23 16:28:03 2017
(19/51) processing topic: Brian_Williams_fake_story ... Mon Oct 23 16:28:06 2017
(20/51) processing topic: HSBC_tax_scandal ... Mon Oct 23 16:28:08 2017
(21/51) processing topic: David_Carr_death ... Mon Oct 23 16:28:09 2017
(22/51) processing topic: Patriots_Deflategate ... Mon Oct 23 16:28:10 2017
(23/51) processing topic: Delhi_Uber_driver_rape ... Mon Oct 23 16:28:13 2017
(24/51) processing topic: Superbug_spread ... Mon Oct 23 16:28:17 2017
(25/51) processing topic: Rudy_Giuliani_Obama_critique ... Mon Oct 23 16:28:20 2017
(26/51) processing topic: Oscar ... Mon Oct 23 16:28:24 2017
(27/51) processing topic: Super_Bowl ... Mon Oct 23 16:28:44 2017
(28/51) processing topic: Grammy ... Mon Oct 23 16:29:03 2017
(29/51) processing topic: Golden_Globe ... Mon Oct 23 16:29:10 2017
(30/51) processing topic: 500_million_Powerball ... Mon Oct 23 16:29:19 2017
(31/51) processing topic: Thanksgiving ... Mon Oct 23 16:29:24 2017
(32/51) processing topic: Black_Friday_and_Cyber_Monday ... Mon Oct 23 16:29:53 2017
(33/51) processing topic: Christmas ... Mon Oct 23 16:30:13 2017
(34/51) processing topic: New_Year ... Mon Oct 23 16:30:58 2017
(35/51) processing topic: Apple_Watch ... Mon Oct 23 16:31:13 2017
(36/51) processing topic: Yosemite_historic_climb ... Mon Oct 23 16:31:19 2017
(37/51) processing topic: Jon_Stewart_Daily_Show ... Mon Oct 23 16:31:19 2017
(38/51) processing topic: success_of_American_Sniper ... Mon Oct 23 16:31:20 2017
(39/51) processing topic: Ebola_virus_spread ... Mon Oct 23 16:31:28 2017
(40/51) processing topic: Indonesia_AirAsia_Flight_QZ8501_crash ... Mon Oct 23 16:31:37 2017
(41/51) processing topic: Paris_attacks ... Mon Oct 23 16:31:46 2017
(42/51) processing topic: Vanuatu_Cyclone_Pam ... Mon Oct 23 16:32:00 2017
(43/51) processing topic: Malaysia_Airlines_Flight_MH370_crash ... Mon Oct 23 16:32:02 2017
(44/51) processing topic: Colorado_NAACP_bombing ... Mon Oct 23 16:32:07 2017
(45/51) processing topic: FSU_shooting ... Mon Oct 23 16:32:09 2017
(46/51) processing topic: Chapel_Hill_shooting ... Mon Oct 23 16:32:11 2017
(47/51) processing topic: Bobbi_Kristina_Brown_death ... Mon Oct 23 16:32:12 2017
(48/51) processing topic: Taliban_Pakistan_school_massacre ... Mon Oct 23 16:32:13 2017
(49/51) processing topic: American_ISIS_Hostage_Kayla_Mueller ... Mon Oct 23 16:32:16 2017
(50/51) processing topic: TransAsia_Airways_Flight_GE235_crash ... Mon Oct 23 16:32:16 2017
(51/51) processing topic: Germanwings_Flight_9525_crash ... Mon Oct 23 16:32:17 2017
Done
CPU times: user 6min 29s, sys: 6.93 s, total: 6min 36s
Wall time: 6min 36s
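For reference, a minimal sketch of what the two utilities helpers above are assumed to do: merge the per-document shed-word frequency dicts, then compute the weighted average defined earlier. The real implementations live in utilities.py and may differ in detail.

from collections import Counter

def merge_shed_words_freq_dicts(freq_dicts):
    '''Sum shed-word frequencies across per-document dicts (assumed behavior).'''
    merged = Counter()
    for freq_dict in freq_dicts:
        merged.update(freq_dict)
    return dict(merged)

def compute_h_score(merged_freq_dict, ind_happs_dict):
    '''Frequency-weighted average happiness over words covered by the happiness dict (assumed behavior).'''
    covered = {w: f for w, f in merged_freq_dict.items() if w in ind_happs_dict}
    total_freq = sum(covered.values())
    if total_freq == 0:
        return float('nan')
    return sum(ind_happs_dict[w] * f for w, f in covered.items()) / total_freq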

Basic box-plots


In [3]:
"""
Load data
"""
if 1 == 1:
    topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')
    
    topics_h_avg_df = pd.read_csv(topics_h_avg_csv_file, sep='\t', quoting=csv.QUOTE_MINIMAL)
    
    topic_lst_df = pd.DataFrame(config.MANUALLY_SELECTED_TOPICS_LST)
    
    topics_h_avg_df['category'] = topic_lst_df['category']
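Note that the plain column assignment above relies on topics_h_avg_df and topic_lst_df sharing exactly the same row order. Should that guarantee ever break, an explicit name-based merge is safer; a sketch, assuming each entry of config.MANUALLY_SELECTED_TOPICS_LST carries 'name' and 'category' keys (as used in the computation cell above):

# Hypothetical alternative: align categories on topic name instead of row position
topics_h_avg_df = topics_h_avg_df.merge(topic_lst_df[['name', 'category']],
                                        left_on='topic_name', right_on='name',
                                        how='left').drop('name', axis=1)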

In [4]:
with pd.option_context('display.max_columns', 7, 'display.max_colwidth', 50, 'expand_frame_repr', False):
    dis_lst = ['topic_ind', 'news_num', 'news_h_avg', 'tweets_num', 'tweets_h_avg', 'topic_name', 'category']
    display(topics_h_avg_df[dis_lst])


topic_ind news_num news_h_avg tweets_num tweets_h_avg topic_name category
0 0 228 5.912027 367618 5.969572 Hillary_Clinton_email_controversy politics
1 1 406 5.542578 1238107 5.812523 Iran_nuclear_deal politics
2 2 101 5.649552 310280 5.962166 ISIS_Jihadi_John_identity_reveal politics
3 3 84 5.353950 233999 5.414554 Ukraine_cease_fire politics
4 4 50 5.463536 39845 5.832747 Egypt_free_Al_Jazeera_journalist politics
5 5 55 5.709798 37041 5.456286 Keystone_XL_Pipeline_bill politics
6 6 41 5.549009 84081 4.892545 CIA_Torture_Report politics
7 7 73 5.845357 249609 5.855015 Obama_cybersecurity_plan politics
8 8 45 5.518097 66855 5.790961 DHS_funding_issue politics
9 9 235 5.871696 746329 5.986725 US_Cuba_relationship politics
10 10 68 5.861402 111755 5.897799 2015_CPAC politics
11 11 94 5.090966 224805 5.619050 Iraq_free_ISIS_Tikrit politics
12 12 243 5.183748 187452 4.953919 Nigeria_Boko_Haram_terrorists politics
13 13 611 5.332389 1616426 5.669085 Ferguson_unrest social
14 14 157 5.311911 110839 5.640524 Hong_Kong_protest social
15 15 275 5.771321 902546 5.994466 Sony_cyberattack social
16 16 168 5.662904 241487 5.224992 Bill_Cosby_sexual_assault_allegation social
17 17 86 6.042245 159027 6.146797 SpaceX_rocket_landing social
18 18 69 5.626322 131549 5.299628 Brian_Williams_fake_story social
19 19 28 5.637849 39947 5.859146 HSBC_tax_scandal social
20 20 36 5.897026 23416 5.601713 David_Carr_death social
21 21 44 5.837253 159463 5.864361 Patriots_Deflategate social
22 22 36 5.369789 199832 5.538784 Delhi_Uber_driver_rape social
23 23 41 5.188100 159846 5.821115 Superbug_spread social
24 24 50 5.881939 195681 5.776534 Rudy_Giuliani_Obama_critique social
25 25 241 6.213340 993397 6.156303 Oscar entertainment
26 26 211 6.061495 947507 6.128209 Super_Bowl entertainment
27 27 99 6.248144 380804 6.159709 Grammy entertainment
28 28 79 6.303521 413222 6.244079 Golden_Globe entertainment
29 29 79 6.402999 246179 6.161410 500_million_Powerball entertainment
30 30 150 6.109283 1402625 6.546168 Thanksgiving entertainment
31 31 121 6.079777 995610 6.165598 Black_Friday_and_Cyber_Monday entertainment
32 32 237 6.040478 2123840 6.692437 Christmas entertainment
33 33 69 5.746321 698497 6.623734 New_Year entertainment
34 34 73 6.323466 260718 6.395007 Apple_Watch entertainment
35 35 41 6.008305 23084 6.391506 Yosemite_historic_climb entertainment
36 36 35 5.947272 66622 5.630606 Jon_Stewart_Daily_Show entertainment
37 37 155 5.429828 402621 5.877318 success_of_American_Sniper entertainment
38 38 173 5.469454 453159 5.715878 Ebola_virus_spread tragedy
39 39 258 5.651412 454324 5.898451 Indonesia_AirAsia_Flight_QZ8501_crash tragedy
40 40 225 5.389137 684566 5.756385 Paris_attacks tragedy
41 41 89 5.653128 81207 5.765197 Vanuatu_Cyclone_Pam tragedy
42 42 58 5.719108 237927 5.917334 Malaysia_Airlines_Flight_MH370_crash tragedy
43 43 38 5.188676 128994 5.391193 Colorado_NAACP_bombing tragedy
44 44 39 5.623797 106232 5.686700 FSU_shooting tragedy
45 45 37 5.230333 38829 4.828974 Chapel_Hill_shooting tragedy
46 46 49 5.798599 54124 5.906122 Bobbi_Kristina_Brown_death tragedy
47 47 80 5.060923 114761 5.139857 Taliban_Pakistan_school_massacre tragedy
48 48 38 5.745052 21529 5.368661 American_ISIS_Hostage_Kayla_Mueller tragedy
49 49 56 5.455274 77302 5.402689 TransAsia_Airways_Flight_GE235_crash tragedy
50 50 71 5.441900 225951 5.784150 Germanwings_Flight_9525_crash tragedy
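Before plotting, a quick per-category numeric summary of the same columns can be helpful; a small sketch:

# Mean and median h_avg per category for both platforms (descriptive summary only)
summary = topics_h_avg_df.groupby('category')[['news_h_avg', 'tweets_h_avg']].agg(['mean', 'median'])
display(summary)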

In [5]:
"""
Plot distribution of news_h_avg and tweets_h_avg
"""
if 1 == 1:
    '''
    Prepare data
    '''
    data = [topics_h_avg_df['news_h_avg'], topics_h_avg_df['tweets_h_avg']]
    
    '''
    Plot
    '''
    fig, ax = plt.subplots(figsize=(9, 6))
    
    bp = plt.boxplot(data, notch=False, sym='k+', vert=True)
    plt.setp(bp['boxes'], color='black')
    plt.setp(bp['whiskers'], color='black')
    plt.setp(bp['fliers'], color='red', marker='+', markersize=5)
    
    ax.set_xticklabels(['News', 'Tweets'])
    
    title_fontdict = {'weight': 'bold', 'size': 'x-large'}
    ax.set_title('Distribution of news and tweets h_avg scores', fontdict=title_fontdict)
    
    label_fontdict = {'size': 'large'}
    ax.set_xlabel('Platform', fontdict=label_fontdict)
    ax.set_ylabel('Happiness scores', fontdict=label_fontdict)
#     ax.set_yscale('log')
        
    # add a horizontal grid to the plot, but make it very light in color
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.set_axisbelow(True)
    
    # add median value text to the median line
    for line in bp['medians']:
        # get position data for median line
        x, y = line.get_xydata()[1] # right point of median line
        # overlay median value
        ax.text(x, y, ' {:.4}'.format(y), horizontalalignment='left', verticalalignment='center')



In [6]:
"""
Plot distribution of news_h_avg and tweets_h_avg by category
"""
if 1 == 1:
    '''
    Prepare data
    '''
    data = []
    for cat in ['politics', 'social', 'entertainment', 'tragedy']:
        cat_df = topics_h_avg_df[topics_h_avg_df['category'] == cat]
        data.extend([cat_df['news_h_avg'], cat_df['tweets_h_avg']])
    
    '''
    Plot
    '''
    fig, ax = plt.subplots(figsize=(15, 6))
    
    bp = plt.boxplot(data, notch=False, sym='k+', vert=True)
    plt.setp(bp['boxes'], color='black')
    plt.setp(bp['whiskers'], color='black')
    plt.setp(bp['fliers'], color='red', marker='+', markersize=5)
    
    ax.set_xticklabels(['Politics News', 'Politics Tweets', 'Social News', 'Social Tweets', 
                        'Entertainment News', 'Entertainment Tweets', 'Tragedy News', 'Tragedy Tweets'])
    
    title_fontdict = {'weight': 'bold', 'size': 'x-large'}
    ax.set_title('Distribution of news and tweets h_avg scores', fontdict=title_fontdict)
    
    label_fontdict = {'size': 'large'}
    ax.set_xlabel('Platform', fontdict=label_fontdict)
    ax.set_ylabel('Happiness scores', fontdict=label_fontdict)
#     ax.set_yscale('log')
        
    # add a horizontal grid to the plot, but make it very light in color
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.set_axisbelow(True)
    
    # add median value text to the median line
    for line in bp['medians']:
        # get position data for median line
        x, y = line.get_xydata()[1] # right point of median line
        # overlay median value
        ax.text(x, y, ' {:.4}'.format(y), horizontalalignment='left', verticalalignment='center')
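An equivalent grouped view can also be produced directly by pandas after reshaping the data to long form; a sketch of that alternative (the h_avg and platform column names are introduced here just for illustration):

# Reshape to one row per (topic, platform) observation, then let pandas draw grouped boxes
long_df = pd.melt(topics_h_avg_df,
                  id_vars=['category'],
                  value_vars=['news_h_avg', 'tweets_h_avg'],
                  var_name='platform', value_name='h_avg')
long_df.boxplot(column='h_avg', by=['category', 'platform'], figsize=(15, 6), rot=45)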


Check correlations


In [7]:
"""
Load data
"""
if 1 == 1:
    topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.csv')
    topics_h_avg_updated_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')
    
    topics_h_avg_df = pd.read_csv(topics_h_avg_csv_file, sep='\t', quoting=csv.QUOTE_MINIMAL)
    topics_h_avg_updated_df = pd.read_csv(topics_h_avg_updated_csv_file, sep='\t', quoting=csv.QUOTE_MINIMAL)
    
    topic_lst_df = pd.DataFrame(config.MANUALLY_SELECTED_TOPICS_LST)
    
    topics_h_avg_df['category'] = topic_lst_df['category']
    topics_h_avg_updated_df['category'] = topic_lst_df['category']

In [8]:
"""
Pearson corr
"""
print('Original:')
display(topics_h_avg_df[['news_h_avg', 'tweets_h_avg']].corr(method='pearson'))
print('After removing duplicate tweets:')
display(topics_h_avg_updated_df[['news_h_avg', 'tweets_h_avg']].corr(method='pearson'))


Original:
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.790032
tweets_h_avg    0.790032      1.000000
After removing duplicate tweets:
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.687085
tweets_h_avg    0.687085      1.000000

In [9]:
"""
Spearman corr
"""
print('Original:')
display(topics_h_avg_df[['news_h_avg', 'tweets_h_avg']].corr(method='spearman'))
print('After removing duplicate tweets:')
display(topics_h_avg_updated_df[['news_h_avg', 'tweets_h_avg']].corr(method='spearman'))


Original:
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.814932
tweets_h_avg    0.814932      1.000000
After removing duplicate tweets:
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.729321
tweets_h_avg    0.729321      1.000000
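With only 51 topics overall (and roughly a dozen per category below), it is worth checking whether these correlations are statistically distinguishable from zero; a sketch using scipy.stats, assuming SciPy is available in this environment:

from scipy import stats

for label, df in [('Original', topics_h_avg_df), ('Deduplicated', topics_h_avg_updated_df)]:
    r, r_p = stats.pearsonr(df['news_h_avg'], df['tweets_h_avg'])
    rho, rho_p = stats.spearmanr(df['news_h_avg'], df['tweets_h_avg'])
    print('{}: Pearson r={:.3f} (p={:.3g}), Spearman rho={:.3f} (p={:.3g})'.format(label, r, r_p, rho, rho_p))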

In [10]:
"""
Pearson corr by category
"""
if 1 == 1:
    print('Pearson corrs')
    print('Original:')
    for cat in ['politics', 'social', 'entertainment', 'tragedy']:
        cat_df = topics_h_avg_df[topics_h_avg_df['category'] == cat]
        cat_df_corr = cat_df[['news_h_avg', 'tweets_h_avg']].corr(method='pearson')
        print('Topics category: {}'.format(cat))
        print(cat_df_corr)
        print('')
        
    print('After removing duplicate tweets:')
    for cat in ['politics', 'social', 'entertainment', 'tragedy']:
        cat_df = topics_h_avg_updated_df[topics_h_avg_updated_df['category'] == cat]
        cat_df_corr = cat_df[['news_h_avg', 'tweets_h_avg']].corr(method='pearson')
        print('Topics category: {}'.format(cat))
        print(cat_df_corr)
        print('')


Pearson corrs
Original:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.681319
tweets_h_avg    0.681319      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.570608
tweets_h_avg    0.570608      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.422557
tweets_h_avg    0.422557      1.000000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg       1.00000       0.72168
tweets_h_avg     0.72168       1.00000

After removing duplicate tweets:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.560147
tweets_h_avg    0.560147      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.319429
tweets_h_avg    0.319429      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.174124
tweets_h_avg    0.174124      1.000000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.690493
tweets_h_avg    0.690493      1.000000


In [11]:
"""
Spearman corr by category
"""
if 1 == 1:
    print('Spearman corrs')
    print('Original:')
    for cat in ['politics', 'social', 'entertainment', 'tragedy']:
        cat_df = topics_h_avg_df[topics_h_avg_df['category'] == cat]
        cat_df_corr = cat_df[['news_h_avg', 'tweets_h_avg']].corr(method='spearman')
        print('Topics category: {}'.format(cat))
        print(cat_df_corr)
        print('')
    
    print('After removing duplicate tweets:')
    for cat in ['politics', 'social', 'entertainment', 'tragedy']:
        cat_df = topics_h_avg_updated_df[topics_h_avg_updated_df['category'] == cat]
        cat_df_corr = cat_df[['news_h_avg', 'tweets_h_avg']].corr(method='spearman')
        print('Topics category: {}'.format(cat))
        print(cat_df_corr)
        print('')


Spearman corrs
Original:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.741758
tweets_h_avg    0.741758      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.524476
tweets_h_avg    0.524476      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg       1.00000       0.17033
tweets_h_avg     0.17033       1.00000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.593407
tweets_h_avg    0.593407      1.000000

After removing duplicate tweets:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.703297
tweets_h_avg    0.703297      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.335664
tweets_h_avg    0.335664      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg       1.00000       0.10989
tweets_h_avg     0.10989       1.00000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.598901
tweets_h_avg    0.598901      1.000000
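The two per-category loops above can be condensed into one summary table with a groupby; a minimal sketch (corr_by_category is a hypothetical helper name introduced here):

def corr_by_category(df, method):
    # one correlation value per category between the news and tweets h_avg columns
    return df.groupby('category').apply(lambda g: g['news_h_avg'].corr(g['tweets_h_avg'], method=method))

summary_df = pd.DataFrame({'pearson_original': corr_by_category(topics_h_avg_df, 'pearson'),
                           'pearson_dedup': corr_by_category(topics_h_avg_updated_df, 'pearson'),
                           'spearman_original': corr_by_category(topics_h_avg_df, 'spearman'),
                           'spearman_dedup': corr_by_category(topics_h_avg_updated_df, 'spearman')})
display(summary_df)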