Clean Hedonometer happiness words data

Last modified: 2017-10-22

Roadmap

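  1. Download Hedonometer happiness words and upload into config.HED_DATA_DIR
  2. Read in the raw JSON file and parse it into a standard df
  3. Filter out near-neutral words (those whose happs score lies within 1 of the neutral score 5)
  4. Make word-index and index-word mapping dict pickles
  5. Make index-happs mapping dict pickle
  6. Write out csv file to config.HR_DIR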

Steps


In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import json
import csv
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Name tag for this notebook; presumably the notebook's file stem (TODO confirm).
# It is not referenced by any of the cells visible here.
nb_name = '20171017-daheng-clean_hed_words_data'

Download Hedonometer happiness words and upload into config.HED_DATA_DIR

Read in the raw JSON file and parse it into a standard df


In [2]:
"""
Register
    HED_WORDS_DF_PKL = os.path.join(DATA_DIR, 'hed_words.df.pkl')
in config
"""
if 0 == 1:
    with open(config.HED_WORDS_JSON_FILE, 'r') as f:
        hed_words_json = json.load(f)
    
    # words are complied as list of dict inside 'objects' field
    hed_words_df = pd.DataFrame(hed_words_json['objects'])
    
    hed_words_df = hed_words_df[['rank', 'word', 'text', 'happs', 'stdDev']]
    
    hed_words_df.to_pickle(config.HED_WORDS_DF_PKL)

In [5]:
"""
Test recover df pkl and check basic statistics
"""
if 0 == 1:
    hed_words_df = pd.read_pickle(config.HED_WORDS_DF_PKL)
    
    print(hed_words_df)


        rank             word             text  happs  stdDev
0          1         laughter         laughter   8.50  0.9313
1          2        happiness        happiness   8.44  0.9723
2          3             love             love   8.42  1.1082
3          4            happy            happy   8.30  0.9949
4          5          laughed          laughed   8.26  1.1572
5          6            laugh            laugh   8.22  1.3746
6          7         laughing         laughing   8.20  1.1066
7          8        excellent        excellent   8.18  1.1008
8          9           laughs           laughs   8.18  1.1551
9         10              joy              joy   8.16  1.0568
10        11       successful       successful   8.16  1.0759
11        12              win              win   8.12  1.0812
12        13          rainbow          rainbow   8.10  0.9949
13        14            smile            smile   8.10  1.0152
14        15              won              won   8.10  1.2164
15        16         pleasure         pleasure   8.08  0.9655
16        17           smiled           smiled   8.08  1.0660
17        18         rainbows         rainbows   8.06  1.3603
18        19          winning          winning   8.04  1.0490
19        20      celebration      celebration   8.02  1.5318
20        21          enjoyed          enjoyed   8.02  1.5318
21        22          healthy          healthy   8.02  1.0593
22        23            music            music   8.02  1.1156
23        24      celebrating      celebrating   8.00  1.1429
24        25  congratulations  congratulations   8.00  1.6288
25        26          weekend          weekend   8.00  1.2936
26        27        celebrate        celebrate   7.98  1.1516
27        28           comedy           comedy   7.98  1.1516
28        29            jokes            jokes   7.98  0.9792
29        30             rich             rich   7.98  1.3169
...      ...              ...              ...    ...     ...
10192  10193         violence         violence   1.86  1.0500
10193  10194            cruel            cruel   1.84  1.1493
10194  10195              cry              cry   1.84  1.2835
10195  10196           failed           failed   1.84  0.9971
10196  10197         sickness         sickness   1.84  1.1843
10197  10198           abused           abused   1.83  1.3101
10198  10199         tortured         tortured   1.82  1.4241
10199  10200            fatal            fatal   1.80  1.5253
10200  10201         killings         killings   1.80  1.5386
10201  10202         murdered         murdered   1.80  1.6288
10202  10203              war              war   1.80  1.4142
10203  10204            kills            kills   1.78  1.2337
10204  10205             jail             jail   1.76  1.0214
10205  10206           terror           terror   1.76  1.0012
10206  10207              die              die   1.74  1.1920
10207  10208          killing          killing   1.70  1.3590
10208  10209         arrested         arrested   1.64  1.0053
10209  10210           deaths           deaths   1.64  1.1386
10210  10211            raped            raped   1.64  1.4251
10211  10212          torture          torture   1.58  1.0515
10212  10213             died             died   1.56  1.1980
10213  10214             kill             kill   1.56  1.0529
10214  10215           killed           killed   1.56  1.2316
10215  10216           cancer           cancer   1.54  1.0730
10216  10217            death            death   1.54  1.2811
10217  10218           murder           murder   1.48  1.0150
10218  10219        terrorism        terrorism   1.48  0.9089
10219  10220             rape             rape   1.44  0.7866
10220  10221          suicide          suicide   1.30  0.8391
10221  10222        terrorist        terrorist   1.30  0.9091

[10222 rows x 5 columns]

In [7]:
# Summary statistics of the numeric columns.
# hed_words_df is not loaded here; it must already exist from an earlier cell.
if False:
    hed_words_summary = hed_words_df.describe()
    print(hed_words_summary)


              rank         happs        stdDev
count  10222.00000  10222.000000  10222.000000
mean    5111.50000      5.375240      1.380979
std     2950.98156      1.084905      0.294834
min        1.00000      1.300000      0.395900
25%     2556.25000      4.900000      1.178700
50%     5111.50000      5.440000      1.361400
75%     7666.75000      6.020000      1.562400
max    10222.00000      8.500000      2.926000

Filter out near-neutral words (those whose happs score lies within 1 of the neutral score 5)


In [8]:
"""
Follow the recommendation by original author, filter out only words with happs inside [1, 4] and [6, 9]

Ref:
 - 'Here, we exclude words whose average happiness h_avg lies within del_h_avg of the neutral score of 5, i.e. 5-del_h_avg<h_avg<5+del_h_avg.'
 - 'For del_h_avg=1, 3,686 unique words of the original 10,222 remain.'

NOTE:
    The number of selected words is slightly different from reported in the paper.
    We get 3731 words instead of 3,686 words.
"""
if 0 == 1:
    hed_words_df = pd.read_pickle(config.HED_WORDS_DF_PKL)
    
    select_cond = (hed_words_df['happs'] <= 4) | (hed_words_df['happs'] >= 6)
    
    selected_hed_words_df = hed_words_df[select_cond]
    
    # reset index for selected words
    selected_hed_words_df = selected_hed_words_df.reset_index(drop=True)
    
    selected_hed_words_df.to_pickle(config.SHED_WORDS_DF_PKL)

In [2]:
# Reload the selected-words df from its pickle and print it for inspection
if True:
    shed_words_df = pd.read_pickle(config.SHED_WORDS_DF_PKL)

    print(shed_words_df)


       rank             word             text  happs  stdDev
0         1         laughter         laughter   8.50  0.9313
1         2        happiness        happiness   8.44  0.9723
2         3             love             love   8.42  1.1082
3         4            happy            happy   8.30  0.9949
4         5          laughed          laughed   8.26  1.1572
5         6            laugh            laugh   8.22  1.3746
6         7         laughing         laughing   8.20  1.1066
7         8        excellent        excellent   8.18  1.1008
8         9           laughs           laughs   8.18  1.1551
9        10              joy              joy   8.16  1.0568
10       11       successful       successful   8.16  1.0759
11       12              win              win   8.12  1.0812
12       13          rainbow          rainbow   8.10  0.9949
13       14            smile            smile   8.10  1.0152
14       15              won              won   8.10  1.2164
15       16         pleasure         pleasure   8.08  0.9655
16       17           smiled           smiled   8.08  1.0660
17       18         rainbows         rainbows   8.06  1.3603
18       19          winning          winning   8.04  1.0490
19       20      celebration      celebration   8.02  1.5318
20       21          enjoyed          enjoyed   8.02  1.5318
21       22          healthy          healthy   8.02  1.0593
22       23            music            music   8.02  1.1156
23       24      celebrating      celebrating   8.00  1.1429
24       25  congratulations  congratulations   8.00  1.6288
25       26          weekend          weekend   8.00  1.2936
26       27        celebrate        celebrate   7.98  1.1516
27       28           comedy           comedy   7.98  1.1516
28       29            jokes            jokes   7.98  0.9792
29       30             rich             rich   7.98  1.3169
...     ...              ...              ...    ...     ...
3701  10193         violence         violence   1.86  1.0500
3702  10194            cruel            cruel   1.84  1.1493
3703  10195              cry              cry   1.84  1.2835
3704  10196           failed           failed   1.84  0.9971
3705  10197         sickness         sickness   1.84  1.1843
3706  10198           abused           abused   1.83  1.3101
3707  10199         tortured         tortured   1.82  1.4241
3708  10200            fatal            fatal   1.80  1.5253
3709  10201         killings         killings   1.80  1.5386
3710  10202         murdered         murdered   1.80  1.6288
3711  10203              war              war   1.80  1.4142
3712  10204            kills            kills   1.78  1.2337
3713  10205             jail             jail   1.76  1.0214
3714  10206           terror           terror   1.76  1.0012
3715  10207              die              die   1.74  1.1920
3716  10208          killing          killing   1.70  1.3590
3717  10209         arrested         arrested   1.64  1.0053
3718  10210           deaths           deaths   1.64  1.1386
3719  10211            raped            raped   1.64  1.4251
3720  10212          torture          torture   1.58  1.0515
3721  10213             died             died   1.56  1.1980
3722  10214             kill             kill   1.56  1.0529
3723  10215           killed           killed   1.56  1.2316
3724  10216           cancer           cancer   1.54  1.0730
3725  10217            death            death   1.54  1.2811
3726  10218           murder           murder   1.48  1.0150
3727  10219        terrorism        terrorism   1.48  0.9089
3728  10220             rape             rape   1.44  0.7866
3729  10221          suicide          suicide   1.30  0.8391
3730  10222        terrorist        terrorist   1.30  0.9091

[3731 rows x 5 columns]

Make word-index and index-word mapping dict pickle


In [4]:
"""
Register
    SHED_WORD_IND_DICT_PKL = (DATA_DIR, 'shed_word-ind.dict.pkl')
    IND_SHED_WORD_DICT_PKL = (DATA_DIR, 'ind-shed_word.dict.pkl')
in config
"""
if 0 == 1:
    '''
    Make pkl for word to index mapping
    '''
    ind_shed_word_dict = shed_words_df['word'].to_dict()
    
    '''
    Make pkl for index to word mapping
    '''
    shed_word_ind_dict = {shed_word: ind for ind, shed_word in ind_shed_word_dict.items()}
    
    with open(config.IND_SHED_WORD_DICT_PKL, 'wb') as f:
        pickle.dump(ind_shed_word_dict, f)
        
    with open(config.SHED_WORD_IND_DICT_PKL, 'wb') as f:
        pickle.dump(shed_word_ind_dict, f)

In [2]:
"""
Test recover df pkls and check correct
"""
if 1 == 1:
    with open(config.IND_SHED_WORD_DICT_PKL, 'rb') as f:
        ind_shed_word_dict = pickle.load(f)
        
    with open(config.SHED_WORD_IND_DICT_PKL, 'rb') as f:
        shed_word_ind_dict = pickle.load(f)
    
    random_ind = 688
    print('word: {}'.format(ind_shed_word_dict[random_ind]))
    print('ind: {}'.format(shed_word_ind_dict[ind_shed_word_dict[random_ind]]))


word: necklace
ind: 688

Make index-happs mapping dict pickle


In [3]:
"""
Register
    IND_HAPPS_DICT_PKL = (DATA_DIR, 'ind-happs.dict.pkl')
in config
"""
if 0 == 1:
    '''
    Make pkl for index to happs score mapping
    '''
    ind_happs_dict = shed_words_df['happs'].to_dict()
    
    with open(config.IND_HAPPS_DICT_PKL, 'wb') as f:
        pickle.dump(ind_happs_dict, f)

In [4]:
"""
Test recover df pkls and check correct
"""
if 1 == 1:
    with open(config.IND_HAPPS_DICT_PKL, 'rb') as f:
        ind_happs_dict = pickle.load(f)
    
    random_ind = 3725
    print(len(list(ind_happs_dict.keys())))
    print('happs: {}'.format(ind_happs_dict[random_ind]))


3731
happs: 1.54

Write out csv file to config.HR_DIR


In [3]:
"""
Write out to csv file for manually inspection
"""
shed_words_csv_file = os.path.join(config.HR_DIR, 'shed_words.csv')

if 0 == 1:
    shed_words_df = pd.read_pickle(config.SHED_WORDS_DF_PKL)
    
    shed_words_df.to_csv(path_or_buf=shed_words_csv_file,
                         columns=['rank', 'word', 'text', 'happs', 'stdDev'],
                         sep='\t',
                         quoting=csv.QUOTE_MINIMAL,
                         header=True,
                         index=True)
    print('Done')


Done