In [1]:
"""
Initialization
"""
'''
Standard modules
'''
import os
import pickle
import json
import csv
from pprint import pprint
'''
Analysis modules
'''
import pandas as pd
'''
Custom modules
'''
import config
import utilities
'''
Misc
'''
nb_name = '20171017-daheng-clean_hed_words_data'
In [2]:
"""
Register
HED_WORDS_DF_PKL = os.path.join(DATA_DIR, 'hed_words.df.pkl')
in config
"""
if 0 == 1:
with open(config.HED_WORDS_JSON_FILE, 'r') as f:
hed_words_json = json.load(f)
# words are complied as list of dict inside 'objects' field
hed_words_df = pd.DataFrame(hed_words_json['objects'])
hed_words_df = hed_words_df[['rank', 'word', 'text', 'happs', 'stdDev']]
hed_words_df.to_pickle(config.HED_WORDS_DF_PKL)
In [5]:
"""
Test recover df pkl and check basic statistics
"""
if 0 == 1:
hed_words_df = pd.read_pickle(config.HED_WORDS_DF_PKL)
print(hed_words_df)
In [7]:
if 0 == 1:
    # Load from pickle so this cell is self-contained under Restart & Run All
    # (previously it relied on `hed_words_df` leaking from an earlier,
    # guarded-off cell; sibling cells re-load the frame the same way).
    hed_words_df = pd.read_pickle(config.HED_WORDS_DF_PKL)
    # Summary statistics (count/mean/std/quartiles) of the numeric columns.
    print(hed_words_df.describe())
In [8]:
"""
Follow the recommendation by original author, filter out only words with happs inside [1, 4] and [6, 9]
Ref:
- 'Here, we exclude words whose average happiness h_avg lies within del_h_avg of the neutral score of 5, i.e. 5-del_h_avg<h_avg<5+del_h_avg.'
- 'For del_h_avg=1, 3,686 unique words of the original 10,222 remain.'
NOTE:
The number of selected words is slightly different from reported in the paper.
We get 3731 words instead of 3,686 words.
"""
if 0 == 1:
hed_words_df = pd.read_pickle(config.HED_WORDS_DF_PKL)
select_cond = (hed_words_df['happs'] <= 4) | (hed_words_df['happs'] >= 6)
selected_hed_words_df = hed_words_df[select_cond]
# reset index for selected words
selected_hed_words_df = selected_hed_words_df.reset_index(drop=True)
selected_hed_words_df.to_pickle(config.SHED_WORDS_DF_PKL)
In [2]:
if 1 == 1:
    # Reload the selected (polarized) words frame and eyeball it.
    shed_words_df = pd.read_pickle(config.SHED_WORDS_DF_PKL)
    print(shed_words_df)
In [4]:
"""
Register
SHED_WORD_IND_DICT_PKL = (DATA_DIR, 'shed_word-ind.dict.pkl')
IND_SHED_WORD_DICT_PKL = (DATA_DIR, 'ind-shed_word.dict.pkl')
in config
"""
if 0 == 1:
'''
Make pkl for word to index mapping
'''
ind_shed_word_dict = shed_words_df['word'].to_dict()
'''
Make pkl for index to word mapping
'''
shed_word_ind_dict = {shed_word: ind for ind, shed_word in ind_shed_word_dict.items()}
with open(config.IND_SHED_WORD_DICT_PKL, 'wb') as f:
pickle.dump(ind_shed_word_dict, f)
with open(config.SHED_WORD_IND_DICT_PKL, 'wb') as f:
pickle.dump(shed_word_ind_dict, f)
In [2]:
"""
Test recover df pkls and check correct
"""
if 1 == 1:
with open(config.IND_SHED_WORD_DICT_PKL, 'rb') as f:
ind_shed_word_dict = pickle.load(f)
with open(config.SHED_WORD_IND_DICT_PKL, 'rb') as f:
shed_word_ind_dict = pickle.load(f)
random_ind = 688
print('word: {}'.format(ind_shed_word_dict[random_ind]))
print('ind: {}'.format(shed_word_ind_dict[ind_shed_word_dict[random_ind]]))
In [3]:
"""
Register
IND_HAPPS_DICT_PKL = (DATA_DIR, 'ind-happs.dict.pkl')
in config
"""
if 0 == 1:
'''
Make pkl for index to happs score mapping
'''
ind_happs_dict = shed_words_df['happs'].to_dict()
with open(config.IND_HAPPS_DICT_PKL, 'wb') as f:
pickle.dump(ind_happs_dict, f)
In [4]:
"""
Test recover df pkls and check correct
"""
if 1 == 1:
with open(config.IND_HAPPS_DICT_PKL, 'rb') as f:
ind_happs_dict = pickle.load(f)
random_ind = 3725
print(len(list(ind_happs_dict.keys())))
print('happs: {}'.format(ind_happs_dict[random_ind]))
In [3]:
"""
Write out to csv file for manually inspection
"""
shed_words_csv_file = os.path.join(config.HR_DIR, 'shed_words.csv')
if 0 == 1:
shed_words_df = pd.read_pickle(config.SHED_WORDS_DF_PKL)
shed_words_df.to_csv(path_or_buf=shed_words_csv_file,
columns=['rank', 'word', 'text', 'happs', 'stdDev'],
sep='\t',
quoting=csv.QUOTE_MINIMAL,
header=True,
index=True)
print('Done')