Exploratory data analysis on happiness scores of topic_news and topic_tweets docs

Last modified: 2017-10-22

Roadmap

Check happiness scores of topic_news and topic_tweets docs
Check word coverage of shed words on topic_news and topic_tweets docs
Check any special topic

Steps



In [1]:

    
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import csv
import time
from pprint import pprint

'''
Analysis modules
'''
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render double resolution plot output for Retina screens 
import matplotlib.pyplot as plt
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Identifier string for this notebook; none of the cells visible here read it.
nb_name = '20171021-daheng-eda_topic_news_tweets_happiness'

Check happiness scores of topic_news and topic_tweets docs

Avg. happiness scores



In [2]:

    
%%time
"""
Compute h_avg scores for each topic_news and topic_tweets doc
"""

# Destination of the per-topic summary table (tab-separated).
topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')

# Manual toggle: as written the branch is skipped, so only the path above is
# defined. Flip the condition to recompute the scores and overwrite the CSV.
if 0 == 1:
    topics_h_avg_lst = []

    # Inputs shared by every topic.
    # NOTE(review): pickle.load can execute arbitrary code; only point these
    # config paths at trusted, locally produced files.
    with open(config.TOPICS_NEWS_SHED_WORDS_FREQ_DICT_PKL, 'rb') as news_pkl:
        topics_news_shed_words_freq_dict = pickle.load(news_pkl)
    with open(config.IND_HAPPS_DICT_PKL, 'rb') as happs_pkl:
        ind_happs_dict = pickle.load(happs_pkl)

    topics_total = len(config.MANUALLY_SELECTED_TOPICS_LST)

    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        topic_name = topic['name']

        # Progress line stamped with the current local time.
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind + 1,
                                                           topics_total,
                                                           topic_name,
                                                           time.ctime()))

        # --- topic_news doc ---
        # Entry for this topic in the news pickle; its length is reported as
        # news_num (presumably one entry per news article - confirm).
        news_freq_by_doc = topics_news_shed_words_freq_dict[topic_ind]
        news_num = len(news_freq_by_doc)
        # The helpers live in `utilities` (not visible here); presumably they
        # merge the per-doc frequency dicts and score the merged result.
        news_merged_freq = utilities.merge_shed_words_freq_dicts(news_freq_by_doc.values())
        news_h_avg = utilities.compute_h_score(news_merged_freq, ind_happs_dict)

        # --- topic_tweets doc ---
        # Tweets are stored in one pickle per topic, named by topic index.
        tweets_pkl_path = os.path.join(config.TOPICS_TWEETS_SHED_WORDS_FREQ_DICT_PKLS_DIR,
                                       '{}.updated.dict.pkl'.format(topic_ind))
        with open(tweets_pkl_path, 'rb') as tweets_pkl:
            tweets_freq_by_doc = pickle.load(tweets_pkl)
        tweets_num = len(tweets_freq_by_doc)
        tweets_merged_freq = utilities.merge_shed_words_freq_dicts(tweets_freq_by_doc.values())
        tweets_h_avg = utilities.compute_h_score(tweets_merged_freq, ind_happs_dict)

        # One summary row per topic.
        topics_h_avg_lst.append({
            'topic_ind': topic_ind,
            'topic_name': topic_name,
            'news_num': news_num,
            'news_h_avg': news_h_avg,
            'tweets_num': tweets_num,
            'tweets_h_avg': tweets_h_avg,
        })

    # Persist the table with an explicit column order and no index column.
    csv_columns = ['topic_ind', 'topic_name', 'news_num', 'news_h_avg', 'tweets_num', 'tweets_h_avg']
    topics_h_avg_df = pd.DataFrame(topics_h_avg_lst)
    topics_h_avg_df.to_csv(path_or_buf=topics_h_avg_csv_file,
                           columns=csv_columns,
                           sep='\t',
                           quoting=csv.QUOTE_MINIMAL,
                           header=True,
                           index=False)
    print('Done')









    



(1/51) processing topic: Hillary_Clinton_email_controversy ... Mon Oct 23 16:25:46 2017
(2/51) processing topic: Iran_nuclear_deal ... Mon Oct 23 16:25:53 2017
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Mon Oct 23 16:26:19 2017
(4/51) processing topic: Ukraine_cease_fire ... Mon Oct 23 16:26:25 2017
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Mon Oct 23 16:26:29 2017
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Mon Oct 23 16:26:30 2017
(7/51) processing topic: CIA_Torture_Report ... Mon Oct 23 16:26:31 2017
(8/51) processing topic: Obama_cybersecurity_plan ... Mon Oct 23 16:26:33 2017
(9/51) processing topic: DHS_funding_issue ... Mon Oct 23 16:26:37 2017
(10/51) processing topic: US_Cuba_relationship ... Mon Oct 23 16:26:39 2017
(11/51) processing topic: 2015_CPAC ... Mon Oct 23 16:26:54 2017
(12/51) processing topic: Iraq_free_ISIS_Tikrit ... Mon Oct 23 16:26:56 2017
(13/51) processing topic: Nigeria_Boko_Haram_terrorists ... Mon Oct 23 16:27:01 2017
(14/51) processing topic: Ferguson_unrest ... Mon Oct 23 16:27:04 2017
(15/51) processing topic: Hong_Kong_protest ... Mon Oct 23 16:27:37 2017
(16/51) processing topic: Sony_cyberattack ... Mon Oct 23 16:27:39 2017
(17/51) processing topic: Bill_Cosby_sexual_assault_allegation ... Mon Oct 23 16:27:57 2017
(18/51) processing topic: SpaceX_rocket_landing ... Mon Oct 23 16:28:03 2017
(19/51) processing topic: Brian_Williams_fake_story ... Mon Oct 23 16:28:06 2017
(20/51) processing topic: HSBC_tax_scandal ... Mon Oct 23 16:28:08 2017
(21/51) processing topic: David_Carr_death ... Mon Oct 23 16:28:09 2017
(22/51) processing topic: Patriots_Deflategate ... Mon Oct 23 16:28:10 2017
(23/51) processing topic: Delhi_Uber_driver_rape ... Mon Oct 23 16:28:13 2017
(24/51) processing topic: Superbug_spread ... Mon Oct 23 16:28:17 2017
(25/51) processing topic: Rudy_Giuliani_Obama_critique ... Mon Oct 23 16:28:20 2017
(26/51) processing topic: Oscar ... Mon Oct 23 16:28:24 2017
(27/51) processing topic: Super_Bowl ... Mon Oct 23 16:28:44 2017
(28/51) processing topic: Grammy ... Mon Oct 23 16:29:03 2017
(29/51) processing topic: Golden_Globe ... Mon Oct 23 16:29:10 2017
(30/51) processing topic: 500_million_Powerball ... Mon Oct 23 16:29:19 2017
(31/51) processing topic: Thanksgiving ... Mon Oct 23 16:29:24 2017
(32/51) processing topic: Black_Friday_and_Cyber_Monday ... Mon Oct 23 16:29:53 2017
(33/51) processing topic: Christmas ... Mon Oct 23 16:30:13 2017
(34/51) processing topic: New_Year ... Mon Oct 23 16:30:58 2017
(35/51) processing topic: Apple_Watch ... Mon Oct 23 16:31:13 2017
(36/51) processing topic: Yosemite_historic_climb ... Mon Oct 23 16:31:19 2017
(37/51) processing topic: Jon_Stewart_Daily_Show ... Mon Oct 23 16:31:19 2017
(38/51) processing topic: success_of_American_Sniper ... Mon Oct 23 16:31:20 2017
(39/51) processing topic: Ebola_virus_spread ... Mon Oct 23 16:31:28 2017
(40/51) processing topic: Indonesia_AirAsia_Flight_QZ8501_crash ... Mon Oct 23 16:31:37 2017
(41/51) processing topic: Paris_attacks ... Mon Oct 23 16:31:46 2017
(42/51) processing topic: Vanuatu_Cyclone_Pam ... Mon Oct 23 16:32:00 2017
(43/51) processing topic: Malaysia_Airlines_Flight_MH370_crash ... Mon Oct 23 16:32:02 2017
(44/51) processing topic: Colorado_NAACP_bombing ... Mon Oct 23 16:32:07 2017
(45/51) processing topic: FSU_shooting ... Mon Oct 23 16:32:09 2017
(46/51) processing topic: Chapel_Hill_shooting ... Mon Oct 23 16:32:11 2017
(47/51) processing topic: Bobbi_Kristina_Brown_death ... Mon Oct 23 16:32:12 2017
(48/51) processing topic: Taliban_Pakistan_school_massacre ... Mon Oct 23 16:32:13 2017
(49/51) processing topic: American_ISIS_Hostage_Kayla_Mueller ... Mon Oct 23 16:32:16 2017
(50/51) processing topic: TransAsia_Airways_Flight_GE235_crash ... Mon Oct 23 16:32:16 2017
(51/51) processing topic: Germanwings_Flight_9525_crash ... Mon Oct 23 16:32:17 2017
Done
CPU times: user 6min 29s, sys: 6.93 s, total: 6min 36s
Wall time: 6min 36s

Basic box-plots



In [3]:

    
"""
Load data
"""
# Manual toggle kept from the original cell (always runs as written).
if 1 == 1:
    # Read back the per-topic summary table (tab-separated).
    topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')
    topics_h_avg_df = pd.read_csv(topics_h_avg_csv_file,
                                  sep='\t',
                                  quoting=csv.QUOTE_MINIMAL)

    # Attach each topic's category from the config list. The assignment aligns
    # on the default integer index, so it assumes the CSV rows are in the same
    # order as config.MANUALLY_SELECTED_TOPICS_LST.
    topic_lst_df = pd.DataFrame(config.MANUALLY_SELECTED_TOPICS_LST)
    topics_h_avg_df['category'] = topic_lst_df['category']



In [4]:

    
# Columns to show, in display order.
dis_lst = ['topic_ind', 'news_num', 'news_h_avg', 'tweets_num', 'tweets_h_avg', 'topic_name', 'category']

# Temporarily adjust pandas display options so all seven columns are rendered
# in a single, unwrapped table; the options revert when the block exits.
with pd.option_context('display.max_columns', 7,
                       'display.max_colwidth', 50,
                       'expand_frame_repr', False):
    display(topics_h_avg_df[dis_lst])









    







  
    
      
      topic_ind
      news_num
      news_h_avg
      tweets_num
      tweets_h_avg
      topic_name
      category
    
  
  
    
      0
      0
      228
      5.912027
      367618
      5.969572
      Hillary_Clinton_email_controversy
      politics
    
    
      1
      1
      406
      5.542578
      1238107
      5.812523
      Iran_nuclear_deal
      politics
    
    
      2
      2
      101
      5.649552
      310280
      5.962166
      ISIS_Jihadi_John_identity_reveal
      politics
    
    
      3
      3
      84
      5.353950
      233999
      5.414554
      Ukraine_cease_fire
      politics
    
    
      4
      4
      50
      5.463536
      39845
      5.832747
      Egypt_free_Al_Jazeera_journalist
      politics
    
    
      5
      5
      55
      5.709798
      37041
      5.456286
      Keystone_XL_Pipeline_bill
      politics
    
    
      6
      6
      41
      5.549009
      84081
      4.892545
      CIA_Torture_Report
      politics
    
    
      7
      7
      73
      5.845357
      249609
      5.855015
      Obama_cybersecurity_plan
      politics
    
    
      8
      8
      45
      5.518097
      66855
      5.790961
      DHS_funding_issue
      politics
    
    
      9
      9
      235
      5.871696
      746329
      5.986725
      US_Cuba_relationship
      politics
    
    
      10
      10
      68
      5.861402
      111755
      5.897799
      2015_CPAC
      politics
    
    
      11
      11
      94
      5.090966
      224805
      5.619050
      Iraq_free_ISIS_Tikrit
      politics
    
    
      12
      12
      243
      5.183748
      187452
      4.953919
      Nigeria_Boko_Haram_terrorists
      politics
    
    
      13
      13
      611
      5.332389
      1616426
      5.669085
      Ferguson_unrest
      social
    
    
      14
      14
      157
      5.311911
      110839
      5.640524
      Hong_Kong_protest
      social
    
    
      15
      15
      275
      5.771321
      902546
      5.994466
      Sony_cyberattack
      social
    
    
      16
      16
      168
      5.662904
      241487
      5.224992
      Bill_Cosby_sexual_assault_allegation
      social
    
    
      17
      17
      86
      6.042245
      159027
      6.146797
      SpaceX_rocket_landing
      social
    
    
      18
      18
      69
      5.626322
      131549
      5.299628
      Brian_Williams_fake_story
      social
    
    
      19
      19
      28
      5.637849
      39947
      5.859146
      HSBC_tax_scandal
      social
    
    
      20
      20
      36
      5.897026
      23416
      5.601713
      David_Carr_death
      social
    
    
      21
      21
      44
      5.837253
      159463
      5.864361
      Patriots_Deflategate
      social
    
    
      22
      22
      36
      5.369789
      199832
      5.538784
      Delhi_Uber_driver_rape
      social
    
    
      23
      23
      41
      5.188100
      159846
      5.821115
      Superbug_spread
      social
    
    
      24
      24
      50
      5.881939
      195681
      5.776534
      Rudy_Giuliani_Obama_critique
      social
    
    
      25
      25
      241
      6.213340
      993397
      6.156303
      Oscar
      entertainment
    
    
      26
      26
      211
      6.061495
      947507
      6.128209
      Super_Bowl
      entertainment
    
    
      27
      27
      99
      6.248144
      380804
      6.159709
      Grammy
      entertainment
    
    
      28
      28
      79
      6.303521
      413222
      6.244079
      Golden_Globe
      entertainment
    
    
      29
      29
      79
      6.402999
      246179
      6.161410
      500_million_Powerball
      entertainment
    
    
      30
      30
      150
      6.109283
      1402625
      6.546168
      Thanksgiving
      entertainment
    
    
      31
      31
      121
      6.079777
      995610
      6.165598
      Black_Friday_and_Cyber_Monday
      entertainment
    
    
      32
      32
      237
      6.040478
      2123840
      6.692437
      Christmas
      entertainment
    
    
      33
      33
      69
      5.746321
      698497
      6.623734
      New_Year
      entertainment
    
    
      34
      34
      73
      6.323466
      260718
      6.395007
      Apple_Watch
      entertainment
    
    
      35
      35
      41
      6.008305
      23084
      6.391506
      Yosemite_historic_climb
      entertainment
    
    
      36
      36
      35
      5.947272
      66622
      5.630606
      Jon_Stewart_Daily_Show
      entertainment
    
    
      37
      37
      155
      5.429828
      402621
      5.877318
      success_of_American_Sniper
      entertainment
    
    
      38
      38
      173
      5.469454
      453159
      5.715878
      Ebola_virus_spread
      tragedy
    
    
      39
      39
      258
      5.651412
      454324
      5.898451
      Indonesia_AirAsia_Flight_QZ8501_crash
      tragedy
    
    
      40
      40
      225
      5.389137
      684566
      5.756385
      Paris_attacks
      tragedy
    
    
      41
      41
      89
      5.653128
      81207
      5.765197
      Vanuatu_Cyclone_Pam
      tragedy
    
    
      42
      42
      58
      5.719108
      237927
      5.917334
      Malaysia_Airlines_Flight_MH370_crash
      tragedy
    
    
      43
      43
      38
      5.188676
      128994
      5.391193
      Colorado_NAACP_bombing
      tragedy
    
    
      44
      44
      39
      5.623797
      106232
      5.686700
      FSU_shooting
      tragedy
    
    
      45
      45
      37
      5.230333
      38829
      4.828974
      Chapel_Hill_shooting
      tragedy
    
    
      46
      46
      49
      5.798599
      54124
      5.906122
      Bobbi_Kristina_Brown_death
      tragedy
    
    
      47
      47
      80
      5.060923
      114761
      5.139857
      Taliban_Pakistan_school_massacre
      tragedy
    
    
      48
      48
      38
      5.745052
      21529
      5.368661
      American_ISIS_Hostage_Kayla_Mueller
      tragedy
    
    
      49
      49
      56
      5.455274
      77302
      5.402689
      TransAsia_Airways_Flight_GE235_crash
      tragedy
    
    
      50
      50
      71
      5.441900
      225951
      5.784150
      Germanwings_Flight_9525_crash
      tragedy



In [5]:

    
"""
Plot distribution of news_h_avg and tweets_h_avg
"""
# Manual toggle kept from the original cell (always runs as written).
if 1 == 1:
    # One box per platform: news first, tweets second.
    data = [topics_h_avg_df['news_h_avg'], topics_h_avg_df['tweets_h_avg']]

    fig, ax = plt.subplots(figsize=(9, 6))

    # Draw on the axes just created (it is also pyplot's current axes).
    bp = ax.boxplot(data, notch=False, sym='k+', vert=True)

    # Black boxes and whiskers, small red '+' markers for outliers.
    for part in ('boxes', 'whiskers'):
        plt.setp(bp[part], color='black')
    plt.setp(bp['fliers'], color='red', marker='+', markersize=5)

    ax.set_xticklabels(['News', 'Tweets'])

    title_fontdict = {'weight': 'bold', 'size': 'x-large'}
    label_fontdict = {'size': 'large'}
    ax.set_title('Distribution of news and tweets h_avg scores', fontdict=title_fontdict)
    ax.set_xlabel('Platform', fontdict=label_fontdict)
    ax.set_ylabel('Happiness scores', fontdict=label_fontdict)

    # Light horizontal grid, drawn beneath the boxes.
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.set_axisbelow(True)

    # Annotate each median line with its value, anchored at the line's
    # right-hand end point.
    for median_line in bp['medians']:
        median_x, median_y = median_line.get_xydata()[1]
        ax.text(median_x, median_y, ' {:.4}'.format(median_y),
                ha='left', va='center')



In [6]:

    
"""
Plot distribution of news_h_avg and tweets_h_avg by category
"""
# Manual toggle kept from the original cell (always runs as written).
if 1 == 1:
    '''
    Prepare data
    '''
    # Two boxes per category (news, then tweets). Tick labels are built in the
    # same loop as the data so the two can never get out of sync.
    categories = ['politics', 'social', 'entertainment', 'tragedy']
    data = []
    tick_labels = []
    for cat in categories:
        cat_df = topics_h_avg_df[topics_h_avg_df['category'] == cat]
        data.extend([cat_df['news_h_avg'], cat_df['tweets_h_avg']])
        cat_label = cat.capitalize()
        tick_labels.extend(['{} News'.format(cat_label), '{} Tweets'.format(cat_label)])

    '''
    Plot
    '''
    fig, ax = plt.subplots(figsize=(15, 6))

    bp = plt.boxplot(data, notch=False, sym='k+', vert=True)
    plt.setp(bp['boxes'], color='black')
    plt.setp(bp['whiskers'], color='black')
    plt.setp(bp['fliers'], color='red', marker='+', markersize=5)

    ax.set_xticklabels(tick_labels)

    # Title and x-label say explicitly that this is the per-category breakdown,
    # so the figure is distinguishable from the overall news-vs-tweets plot.
    title_fontdict = {'weight': 'bold', 'size': 'x-large'}
    ax.set_title('Distribution of news and tweets h_avg scores by category', fontdict=title_fontdict)

    label_fontdict = {'size': 'large'}
    ax.set_xlabel('Category and platform', fontdict=label_fontdict)
    ax.set_ylabel('Happiness scores', fontdict=label_fontdict)

    # add a horizontal grid to the plot, but make it very light in color
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.set_axisbelow(True)

    # add median value text to the median line
    for line in bp['medians']:
        # get position data for median line
        x, y = line.get_xydata()[1]  # right point of median line
        # overlay median value
        ax.text(x, y, ' {:.4}'.format(y), horizontalalignment='left', verticalalignment='center')

Check correlations



In [7]:

    
"""
Load data
"""
# Manual toggle kept from the original cell (always runs as written).
if 1 == 1:
    # Two versions of the summary table: the original one and the '.updated'
    # one. NOTE: this rebinds topics_h_avg_csv_file / topics_h_avg_df to the
    # ORIGINAL table, replacing the '.updated' bindings made by the earlier
    # load cell.
    topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.csv')
    topics_h_avg_updated_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.updated.csv')

    read_opts = {'sep': '\t', 'quoting': csv.QUOTE_MINIMAL}
    topics_h_avg_df = pd.read_csv(topics_h_avg_csv_file, **read_opts)
    topics_h_avg_updated_df = pd.read_csv(topics_h_avg_updated_csv_file, **read_opts)

    # Attach the category column to both tables; alignment is by the default
    # integer index, so both CSVs are assumed to list topics in config order.
    topic_lst_df = pd.DataFrame(config.MANUALLY_SELECTED_TOPICS_LST)
    category_col = topic_lst_df['category']
    topics_h_avg_df['category'] = category_col
    topics_h_avg_updated_df['category'] = category_col



In [8]:

    
"""
Pearson corr
"""
# Pearson correlation between news and tweets h_avg, shown for the original
# table first and then for the updated table.
for heading, scores_df in (('Original:', topics_h_avg_df),
                           ('After removal duplicate tweets:', topics_h_avg_updated_df)):
    print(heading)
    display(scores_df[['news_h_avg', 'tweets_h_avg']].corr(method='pearson'))









    



Original:






    







  
    
      
      news_h_avg
      tweets_h_avg
    
  
  
    
      news_h_avg
      1.000000
      0.790032
    
    
      tweets_h_avg
      0.790032
      1.000000
    
  








    



After removal duplicate tweets:






    







  
    
      
      news_h_avg
      tweets_h_avg
    
  
  
    
      news_h_avg
      1.000000
      0.687085
    
    
      tweets_h_avg
      0.687085
      1.000000



In [9]:

    
"""
Spearman corr
"""
# Spearman rank correlation between news and tweets h_avg, shown for the
# original table first and then for the updated table.
for heading, scores_df in (('Original:', topics_h_avg_df),
                           ('After removal duplicate tweets:', topics_h_avg_updated_df)):
    print(heading)
    display(scores_df[['news_h_avg', 'tweets_h_avg']].corr(method='spearman'))









    



Original:






    







  
    
      
      news_h_avg
      tweets_h_avg
    
  
  
    
      news_h_avg
      1.000000
      0.814932
    
    
      tweets_h_avg
      0.814932
      1.000000
    
  








    



After removal duplicate tweets:






    







  
    
      
      news_h_avg
      tweets_h_avg
    
  
  
    
      news_h_avg
      1.000000
      0.729321
    
    
      tweets_h_avg
      0.729321
      1.000000



In [10]:

    
"""
Pearson corr by category
"""
# Manual toggle kept from the original cell (always runs as written).
if 1 == 1:
    categories = ['politics', 'social', 'entertainment', 'tragedy']
    score_cols = ['news_h_avg', 'tweets_h_avg']

    print('Pearson corrs')
    # Report the original table first, then the updated one.
    for heading, scores_df in [('Original:', topics_h_avg_df),
                               ('After removal duplicate tweets:', topics_h_avg_updated_df)]:
        print(heading)
        for cat in categories:
            # Filter each frame with ITS OWN category column. Masking the
            # updated frame with a mask built from the original frame only
            # works while both share the same index and row order.
            cat_df = scores_df[scores_df['category'] == cat]
            cat_df_corr = cat_df[score_cols].corr(method='pearson')
            print('Topics category: {}'.format(cat))
            print(cat_df_corr)
            print('')









    



Pearson corrs
Original:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.681319
tweets_h_avg    0.681319      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.570608
tweets_h_avg    0.570608      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.422557
tweets_h_avg    0.422557      1.000000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg       1.00000       0.72168
tweets_h_avg     0.72168       1.00000

After removal duplicate tweets:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.560147
tweets_h_avg    0.560147      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.319429
tweets_h_avg    0.319429      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.174124
tweets_h_avg    0.174124      1.000000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.690493
tweets_h_avg    0.690493      1.000000



In [11]:

    
"""
Spearman corr by category
"""
# Manual toggle kept from the original cell (always runs as written).
if 1 == 1:
    categories = ['politics', 'social', 'entertainment', 'tragedy']
    score_cols = ['news_h_avg', 'tweets_h_avg']

    print('Spearman corrs')
    # Report the original table first, then the updated one.
    for heading, scores_df in [('Original:', topics_h_avg_df),
                               ('After removal duplicate tweets:', topics_h_avg_updated_df)]:
        print(heading)
        for cat in categories:
            # Filter each frame with ITS OWN category column. Masking the
            # updated frame with a mask built from the original frame only
            # works while both share the same index and row order.
            cat_df = scores_df[scores_df['category'] == cat]
            cat_df_corr = cat_df[score_cols].corr(method='spearman')
            print('Topics category: {}'.format(cat))
            print(cat_df_corr)
            print('')









    



Spearman corrs
Original:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.741758
tweets_h_avg    0.741758      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.524476
tweets_h_avg    0.524476      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg       1.00000       0.17033
tweets_h_avg     0.17033       1.00000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.593407
tweets_h_avg    0.593407      1.000000

After removal duplicate tweets:
Topics category: politics
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.703297
tweets_h_avg    0.703297      1.000000

Topics category: social
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.335664
tweets_h_avg    0.335664      1.000000

Topics category: entertainment
              news_h_avg  tweets_h_avg
news_h_avg       1.00000       0.10989
tweets_h_avg     0.10989       1.00000

Topics category: tragedy
              news_h_avg  tweets_h_avg
news_h_avg      1.000000      0.598901
tweets_h_avg    0.598901      1.000000

	topic_ind	news_num	news_h_avg	tweets_num	tweets_h_avg	topic_name	category
0	0	228	5.912027	367618	5.969572	Hillary_Clinton_email_controversy	politics
1	1	406	5.542578	1238107	5.812523	Iran_nuclear_deal	politics
2	2	101	5.649552	310280	5.962166	ISIS_Jihadi_John_identity_reveal	politics
3	3	84	5.353950	233999	5.414554	Ukraine_cease_fire	politics
4	4	50	5.463536	39845	5.832747	Egypt_free_Al_Jazeera_journalist	politics
5	5	55	5.709798	37041	5.456286	Keystone_XL_Pipeline_bill	politics
6	6	41	5.549009	84081	4.892545	CIA_Torture_Report	politics
7	7	73	5.845357	249609	5.855015	Obama_cybersecurity_plan	politics
8	8	45	5.518097	66855	5.790961	DHS_funding_issue	politics
9	9	235	5.871696	746329	5.986725	US_Cuba_relationship	politics
10	10	68	5.861402	111755	5.897799	2015_CPAC	politics
11	11	94	5.090966	224805	5.619050	Iraq_free_ISIS_Tikrit	politics
12	12	243	5.183748	187452	4.953919	Nigeria_Boko_Haram_terrorists	politics
13	13	611	5.332389	1616426	5.669085	Ferguson_unrest	social
14	14	157	5.311911	110839	5.640524	Hong_Kong_protest	social
15	15	275	5.771321	902546	5.994466	Sony_cyberattack	social
16	16	168	5.662904	241487	5.224992	Bill_Cosby_sexual_assault_allegation	social
17	17	86	6.042245	159027	6.146797	SpaceX_rocket_landing	social
18	18	69	5.626322	131549	5.299628	Brian_Williams_fake_story	social
19	19	28	5.637849	39947	5.859146	HSBC_tax_scandal	social
20	20	36	5.897026	23416	5.601713	David_Carr_death	social
21	21	44	5.837253	159463	5.864361	Patriots_Deflategate	social
22	22	36	5.369789	199832	5.538784	Delhi_Uber_driver_rape	social
23	23	41	5.188100	159846	5.821115	Superbug_spread	social
24	24	50	5.881939	195681	5.776534	Rudy_Giuliani_Obama_critique	social
25	25	241	6.213340	993397	6.156303	Oscar	entertainment
26	26	211	6.061495	947507	6.128209	Super_Bowl	entertainment
27	27	99	6.248144	380804	6.159709	Grammy	entertainment
28	28	79	6.303521	413222	6.244079	Golden_Globe	entertainment
29	29	79	6.402999	246179	6.161410	500_million_Powerball	entertainment
30	30	150	6.109283	1402625	6.546168	Thanksgiving	entertainment
31	31	121	6.079777	995610	6.165598	Black_Friday_and_Cyber_Monday	entertainment
32	32	237	6.040478	2123840	6.692437	Christmas	entertainment
33	33	69	5.746321	698497	6.623734	New_Year	entertainment
34	34	73	6.323466	260718	6.395007	Apple_Watch	entertainment
35	35	41	6.008305	23084	6.391506	Yosemite_historic_climb	entertainment
36	36	35	5.947272	66622	5.630606	Jon_Stewart_Daily_Show	entertainment
37	37	155	5.429828	402621	5.877318	success_of_American_Sniper	entertainment
38	38	173	5.469454	453159	5.715878	Ebola_virus_spread	tragedy
39	39	258	5.651412	454324	5.898451	Indonesia_AirAsia_Flight_QZ8501_crash	tragedy
40	40	225	5.389137	684566	5.756385	Paris_attacks	tragedy
41	41	89	5.653128	81207	5.765197	Vanuatu_Cyclone_Pam	tragedy
42	42	58	5.719108	237927	5.917334	Malaysia_Airlines_Flight_MH370_crash	tragedy
43	43	38	5.188676	128994	5.391193	Colorado_NAACP_bombing	tragedy
44	44	39	5.623797	106232	5.686700	FSU_shooting	tragedy
45	45	37	5.230333	38829	4.828974	Chapel_Hill_shooting	tragedy
46	46	49	5.798599	54124	5.906122	Bobbi_Kristina_Brown_death	tragedy
47	47	80	5.060923	114761	5.139857	Taliban_Pakistan_school_massacre	tragedy
48	48	38	5.745052	21529	5.368661	American_ISIS_Hostage_Kayla_Mueller	tragedy
49	49	56	5.455274	77302	5.402689	TransAsia_Airways_Flight_GE235_crash	tragedy
50	50	71	5.441900	225951	5.784150	Germanwings_Flight_9525_crash	tragedy