In [10]:
import io
import json
import os
import re

import pandas as pd
In [2]:
ls
In [3]:
# Once RAM usage passes 8 GB the kernel restarts, even though free memory is still available.
# df = pd.read_json('HNCommentsAll.1perline.json')
In [14]:
# text preprocessing utils

# Anything that looks like an HTML tag, e.g. <p> or <a href="...">.
html_tags = re.compile(r'<.*?>')

# (entity, replacement) pairs that are decoded rather than blanked out.
# NOTE(review): the entity literal was mangled in the notebook export (it had
# been decoded to a bare apostrophe); '&#x27;' is the reconstructed value -- confirm.
to_replace = [('&#x27;', "'")]

# A well-formed character reference: named (&amp;), decimal (&#39;) or
# hexadecimal (&#x2F;). The stricter pattern avoids swallowing ordinary text
# that merely sits between an ampersand and a later semicolon.
hex_tags = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')


def clean_comment(comment):
    """Strip HTML markup from a raw comment string.

    Steps, in order:
      1. every HTML tag is replaced by a single space;
      2. the entities listed in ``to_replace`` are decoded to their character;
      3. every remaining character reference is replaced by a single space.

    Args:
        comment: the raw comment as text, as UTF-8 encoded bytes, or None.

    Returns:
        The cleaned comment as text. None yields an empty string.
    """
    if comment is None:
        return ''
    # Work on text, not bytes: calling str() on encoded bytes yields the
    # "b'...'" repr on Python 3 instead of the comment itself.
    if isinstance(comment, bytes):
        c = comment.decode('utf-8', 'replace')
    else:
        c = comment
    c = html_tags.sub(' ', c)
    for tag, char in to_replace:
        c = c.replace(tag, char)
    c = hex_tags.sub(' ', c)
    return c
def text_generator(data_path):
    """Return the first comment record of a one-JSON-object-per-line file.

    NOTE(review): the notebook export lost all indentation. The ``return`` is
    assumed to sit inside the loop, so only the first record is read -- confirm
    against the original notebook. Replacing ``return`` with ``yield`` would
    turn this into a generator over every record, but the caller would then
    have to iterate instead of unpacking a single pair.

    Args:
        data_path: path to the JSON-lines file; each record must have a
            "comment_text" key.

    Returns:
        A ``(comment_data, comment_text)`` tuple: the parsed JSON object and
        its "comment_text" value after ``clean_comment``.

    Raises:
        ValueError: if the file contains no records.
    """
    # io.open decodes as UTF-8 on both Python 2 and Python 3 instead of
    # depending on the platform's default encoding.
    with io.open(data_path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Blank lines are not valid JSON; skip them rather than crash.
            if not line.strip():
                continue
            comment_data = json.loads(line)
            comment_text = clean_comment(comment_data["comment_text"])
            if i % 10000 == 0:
                print(i)
            return comment_data, comment_text
    raise ValueError('no comment records found in %r' % (data_path,))
In [15]:
# Pull one (record, cleaned text) pair from the dump to sanity-check the preprocessing.
d, t = text_generator('HNCommentsAll.1perline.json')
In [18]:
# Show the sampled record's field names, then its cleaned comment text.
# Call-style print works on both Python 2 and Python 3.
print(d.keys())
print(t)
In [ ]: