Preprocess raw comments from Tripadvisor

Input: a text file containing multiple lines, where each line is a JSON object.

Each JSON object has several keys; the value of the 'text' key is the comment.


In [1]:
import json
import codecs
import re
import string

In [2]:
def load_file(path):
    """
        Load raw comments from a JSON-lines file and normalise them.

        Every non-blank line of the file is parsed as a JSON object. Its
        'text' value is lower-cased, newlines and tabs are replaced by
        spaces, leading/trailing ASCII whitespace is stripped, and all
        ASCII punctuation (string.punctuation) is removed.

        itype: path - str, path to a UTF-8 encoded file holding one JSON
               object per line.
        rtype: list(str) - one cleaned comment per JSON line, in file order.
               Items are the native ``str`` type: UTF-8 encoded bytes on
               Python 2, text on Python 3.
        raises: ValueError if a non-blank line is not valid JSON;
                KeyError if a JSON object has no 'text' key.
    """
    # Code point -> None table, used to delete ASCII punctuation from text.
    # The two-argument str.translate(None, chars) form only exists for
    # Python 2 byte strings and raises TypeError on Python 3, so the
    # punctuation is removed while the comment is still text.
    drop_punct = dict.fromkeys(map(ord, string.punctuation))
    res = []
    with codecs.open(path, encoding="utf-8") as file:
        # Iterate line by line rather than readlines() so the raw file is
        # never held in memory in full.
        for line in file:
            if not line.strip():
                # Blank lines (e.g. an empty last line) carry no JSON
                # object; json.loads would raise on them.
                continue
            text = json.loads(line)['text'].lower()
            text = re.sub(r'\n|\t', ' ', text)
            # Strip ASCII whitespace only, then drop punctuation: same
            # order and same character set as the byte-level processing
            # this replaces.
            text = text.strip(u' \t\n\r\x0b\x0c').translate(drop_punct)
            if not isinstance(text, str):
                # Python 2: convert unicode to the native UTF-8 byte str.
                text = text.encode('utf-8')
            res.append(text)
    return res

In [3]:
def write_file(file , path):
    """
        Write processed comments to the file at path, one comment per line.

        itype: file - list[str] , intermediate result from load_file(path)
               path - str   , path to store the processed file; an existing
                      file is overwritten.
        rtype: None
    """
    # The with-block closes the handle even if a write raises part-way
    # through, which a manual open()/close() pair would not.
    with codecs.open(path , "w" ) as f:
        for l in file:
            f.write(l)
            f.write("\n")

In [4]:
# Raw reviews to read: one JSON object per line.
path = "./dummy/reviews.txt"
# Destination for the cleaned comments: one comment per line.
path_write = "./dummy/processed_reviews.txt"
# Clean every comment, then write the results out.
comments = load_file(path)
write_file(comments, path_write)

In [5]:
# Parenthesised single-argument form: prints the same on Python 2 and is
# also valid on Python 3, where the bare print statement is a syntax error.
print("Processed %d comments." % len(comments))


Processed 10000 comments.

In [ ]: