In [5]:
# Project-local helper modules are first imported as modules so that they can
# be reloaded below, picking up edits made to them while the kernel is running.
import clean_extracted_text, get_LIWC_counts
# NOTE(review): bare `reload` is only a builtin on Python 2; on Python 3 this
# would have to be `importlib.reload`.
reload (clean_extracted_text)
reload (get_LIWC_counts)
# Order matters: the next line rebinds the name `get_LIWC_counts` from the
# module to the function of the same name, so the reloads must happen first.
from get_LIWC_counts import get_LIWC_counts
from clean_extracted_text import clean_text
import os, glob
from nltk.tokenize import WordPunctTokenizer
from collections import Counter
import json

In [2]:
# --- Configuration ----------------------------------------------------------
# Directory holding one subtitle TSV per episode.
dataDir = "../data/subtitles/subtitlesInTSV/"
# Directory holding the LIWC lexicons; every directory entry is treated as one
# category whose name is the entry's file name.
# NOTE(review): this is an absolute, machine-specific path.
LIWC_dir = "/hg191/corpora/LIWC/resources/liwc_lexicons/"

# Sorted so that episodes are always processed in the same order.
files = sorted (glob.glob (os.path.join (dataDir, "*.tsv")))
categories = os.listdir (LIWC_dir)
tokenizer = WordPunctTokenizer()


def _load_lexicon_patterns (lexicon_path):
    """Return the entries of one lexicon file as a list of "^entry$" strings.

    Each line of the file is stripped of surrounding whitespace and wrapped
    in "^" ... "$" (presumably so that get_LIWC_counts matches whole tokens
    only -- confirm against that helper).

    The file is read inside a ``with`` block so its handle is closed as soon
    as the list is built, instead of being left open until garbage collection.
    """
    with open (lexicon_path, 'r') as lexicon_file:
        return ["^" + entry.strip() + "$" for entry in lexicon_file]


# Maps category name -> list of anchored patterns read from that category's file.
LIWC_words = {category: _load_lexicon_patterns (os.path.join (LIWC_dir, category)) for category in categories}

def _count_episode_categories (tsv_path):
    """Tally LIWC matches per category over every dialogue line of one TSV file.

    Returns a dict mapping each name in ``categories`` to a Counter holding
    whatever get_LIWC_counts reported for that category, accumulated over the
    whole file.
    """
    tallies = dict ((label, Counter()) for label in categories)
    with open (tsv_path) as subtitle_file:
        for row in subtitle_file:
            # Any row mentioning "frameNo" is skipped (presumably the header
            # row of the TSV -- confirm against the data).
            if "frameNo" in row:
                continue
            # The dialogue is the last tab-separated field; "$" characters are
            # blanked out before cleaning (reason not visible here).
            text = row.strip().split ("\t")[-1].replace ("$", " ").strip()
            tokens = tokenizer.tokenize (clean_text (text))
            for label in categories:
                hits = get_LIWC_counts (tokens, LIWC_words=LIWC_words[label])
                if len (hits) > 0:
                    tallies[label].update (hits)
    return tallies


# One record per episode: the file's base name under "name", plus one entry
# per LIWC category holding that category's counts.
jsonList = []
for tsv_path in files:
    record = {"name": os.path.basename (tsv_path)}
    record.update (_count_episode_categories (tsv_path))
    jsonList.append (record)

In [3]:
# Persist the per-episode category counts as a single JSON array.
episode_stats_path = "../data/text_category_stats/episodeLevel.json"
with open (episode_stats_path, "w") as stats_file:
    stats_file.write (json.dumps (jsonList))


{'play': 1, 'good': 1, 'love': 1, 'pretty': 1, 'relax': 1, 'please': 2, 'share': 1, 'accept': 1, 'certain': 1, 'ready': 1, 'clever': 1, 'likes': 1, 'dear': 1, 'friend': 1, 'trust': 1, 'fine': 1, 'grace': 1, 'like': 1, 'perfect': 1, 'loved': 1, 'lovely': 1, 'talent': 1, 'thank': 1, 'giving': 1, 'true': 1, 'well': 1, 'honor': 1, 'better': 1, 'favor': 1, 'loyal': 1, 'won': 1, 'hope': 1, 'easy': 1, 'win': 1, 'care': 1, 'sweet': 1, 'agree': 1, 'sure': 1, 'happy': 1}