In [10]:
import pandas as pd
import os, re, json

In [2]:
ls


anaconda/                                  LSTM-amazon.ipynb
Anaconda-2.2.0-Linux-x86_64.sh*            src/
AWS-skipgram_word_embeddings.ipynb         submission.csv
AWS-skipgram_word_embeddings-pandas.ipynb  SVM-amazon.ipynb
HNCommentsAll.1perline.json                test.csv
HNCommentsAll.1perline.json.bz2            train.csv

In [3]:
# Loading the full file kills the session: once RAM use passes 8 GB the kernel
# restarts even though free memory remains, so the one-shot read below is disabled.
# A chunked alternative is sketched after this cell.
# df = pd.read_json('HNCommentsAll.1perline.json')
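If the installed pandas supports lines= and chunksize= for read_json (an assumption; this session's pandas version is unknown), the newline-delimited file can be streamed as an iterator of DataFrames instead of being materialized at once. A minimal sketch:

In [ ]:
# Sketch, assuming a pandas build with lines=/chunksize= support:
# chunksize makes read_json return an iterator rather than a DataFrame
reader = pd.read_json('HNCommentsAll.1perline.json', lines=True, chunksize=100000)
for chunk in reader:
    # each chunk is a DataFrame; e.g. pull the comment bodies
    _ = chunk['comment_text']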

In [14]:
# text preprocessing utils
html_tags = re.compile(r'<.*?>')   # HTML tags like <p> ... </p>
to_replace = [('&#x27;', "'")]     # entities to decode rather than drop
hex_tags = re.compile(r'&.*?;')    # any remaining HTML entity

def clean_comment(comment):
    # work on UTF-8 bytes so the Python 2 regex/replace steps
    # never raise a UnicodeEncodeError
    c = comment.encode("utf-8")
    c = html_tags.sub(' ', c)
    for tag, char in to_replace:
        c = c.replace(tag, char)
    c = hex_tags.sub(' ', c)
    return c

def text_generator(data_path):
    # stream (record, cleaned text) pairs one JSON line at a time,
    # yielding so the whole file never has to fit in memory
    with open(data_path) as f:
        for i, l in enumerate(f):
            comment_data = json.loads(l)
            comment_text = clean_comment(comment_data["comment_text"])
            if i % 10000 == 0:
                print(i)   # progress marker every 10k comments
            yield comment_data, comment_text
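
A quick sanity check of clean_comment on a hand-made string (the HTML fragment below is invented for illustration, not taken from the dataset):

In [ ]:
# hypothetical input to show the tag/entity handling
sample = u'I <i>really</i> liked it &#x27;a lot&#x27; &amp; more'
print(clean_comment(sample))
# tags become spaces, &#x27; decodes to an apostrophe,
# and any other entity (&amp;) is blanked out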

In [15]:
# pull a single (record, cleaned text) pair for inspection
d, t = next(text_generator('HNCommentsAll.1perline.json'))


0
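
The single next() above only pulls the first record. To stream the corpus for real (e.g. to build a text list for the skipgram notebooks), iterate the generator; islice caps the run here purely for illustration:

In [ ]:
from itertools import islice

# collect the first 50,000 cleaned comment bodies (cap is arbitrary)
texts = [t for _, t in islice(text_generator('HNCommentsAll.1perline.json'), 50000)]
len(texts)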

In [18]:
print(d.keys())
print(t)


[u'comment_text', u'story_text', u'objectID', u'author', u'url', u'num_comments', u'created_at', u'title', u'_tags', u'parent_id', u'_highlightResult', u'points', u'story_id', u'created_at_i', u'story_url', u'story_title']
Because you don't have to rely on a political apparatus to spend the money wisely. By the way, nobody has to wait for billionaires anywhere, if you want to help out in education, get up and do it.

In [ ]: