Preprocess raw comments from TripAdvisor.
Input: a text file containing multiple lines of JSON, where each line is one JSON object.
Each JSON object has several keys, and the value of 'text' is the comment.
In [1]:
import json
import codecs
import re
import string
In [2]:
def load_file(path):
    """
    Load raw comments from a JSON-lines file and normalise them.

    Every non-blank line is parsed as a JSON object. Its 'text' value is
    lower-cased, newlines and tabs are replaced by spaces, surrounding
    whitespace is stripped and ASCII punctuation (string.punctuation) is
    removed.

    itype: path - str , path to a UTF-8 encoded file with one JSON object per line.
    rtype: list(str) - one cleaned comment per JSON object, in file order.
           On Python 2 each item is a UTF-8 encoded byte string; on
           Python 3 each item is text.
    raises: ValueError if a non-blank line is not valid JSON,
            KeyError if an object has no 'text' key.
    """
    # An ordinal -> None table deletes characters from decoded text on both
    # Python 2 and 3; the translate(None, chars) form only exists for
    # Python 2 byte strings.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    res = []
    with codecs.open(path, encoding="utf-8") as handle:
        # Iterate lazily instead of materialising every line up front.
        for raw in handle:
            if not raw.strip():
                # Blank lines (e.g. a trailing newline) hold no JSON object.
                continue
            text = json.loads(raw)['text'].lower()
            text = re.sub(r'\n|\t', ' ', text)
            text = text.strip().translate(drop_punctuation)
            if str is bytes:
                # Python 2: keep handing back UTF-8 byte strings, which is
                # what existing callers of this function receive.
                text = text.encode('utf-8')
            res.append(text)
    return res
In [3]:
def write_file(file, path):
    """
    Write processed comments to the file at path, one comment per line.

    The output is always UTF-8: text items are encoded as UTF-8 and byte
    strings are written unchanged. Every item is followed by a single
    "\\n" on all platforms. An existing file at path is overwritten.

    itype: file - list[str] , intermediate result from load_file(path)
           path - str , path to store processed file.
    rtype: None
    """
    # Binary mode plus explicit encoding keeps the output independent of the
    # locale; the with-block closes the handle even if a write fails.
    with open(path, "wb") as out:
        for l in file:
            if not isinstance(l, bytes):
                l = l.encode("utf-8")
            out.write(l)
            out.write(b"\n")
In [4]:
# Input file: raw reviews, read by load_file as one JSON object per line.
path = "./dummy/reviews.txt"
# Output file: receives the cleaned comments, one per line.
path_write = "./dummy/processed_reviews.txt"
# Clean every comment, then write the cleaned list to path_write.
comments = load_file(path)
write_file(comments, path_write)
In [5]:
# Report how many comments were processed. The parenthesised single-argument
# form prints the same text on Python 2 and Python 3.
print("Processed %d comments." % len(comments))
In [ ]: