In [2]:
import tensorflow as tf
import numpy as np
import csv
import itertools
import operator
import nltk
import sys
from datetime import datetime
from utils import *
import matplotlib.pyplot as plt
%matplotlib inline
import os
import collections
from six.moves import cPickle

In [3]:
save_dir = "data/"

sentence_start_token = "START"
sentence_end_token = "END"

# ratings_train.txt is tab-separated: id \t document \t label, with a header row.
with open('data/ratings_train.txt', 'r', encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]
# Skip the header row and keep only the document column.
reader = [line.split('\t')[1] for line in lines[1:]]
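
A quick sanity check (not in the original run) that the document column parsed cleanly:

In [ ]:
# Each entry of reader should now be one raw review string.
print(len(reader))
print(reader[:3])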

In [4]:
# Wrap each review with START/END markers; use only the first 10,000 reviews.
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in reader[:10000]]

In [5]:
from konlpy.tag import Twitter  # renamed Okt in newer konlpy releases

pos_tagger = Twitter()

def tokenize(doc):
    # Tag each morpheme and join surface form and POS tag as "surface/POS".
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

tokenized_sentences = [tokenize(row) for row in sentences]
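
To verify the tagger's output format, it helps to peek at one tokenized review; each token should come back as "surface/POS":

In [ ]:
# e.g. a noun appears as "<surface>/Noun", the START marker as "START/Alpha".
print(tokenized_sentences[0])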

In [6]:
# Keep nouns, verbs, adjectives, and the Alpha-tagged START (5 letters)
# and END (3 letters) markers.
Verb_Noun_Adjective_Alpha_in_text = []
for text in tokenized_sentences:
    kept = []
    for word in text:
        surface, pos = word.rsplit("/", 1)  # the POS tag follows the last slash
        if pos in ("Noun", "Verb", "Adjective"):
            kept.append(surface)
        elif pos == "Alpha" and len(surface) in (3, 5):
            kept.append(surface)
    Verb_Noun_Adjective_Alpha_in_text.append(kept)
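
Another quick check: START and END should survive the filter alongside the nouns, verbs, and adjectives:

In [ ]:
print(Verb_Noun_Adjective_Alpha_in_text[0])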

In [7]:
# Flatten the per-review token lists into one long token stream.
Verb_Noun_Adjective_Alpha_in_text_tokens = [t for d in Verb_Noun_Adjective_Alpha_in_text for t in d]

In [8]:
counter = collections.Counter(Verb_Noun_Adjective_Alpha_in_text_tokens)

In [9]:
count_pairs = sorted(counter.items(), key=lambda x: -x[1])

In [10]:
chars, counts = zip(*count_pairs)

In [11]:
vocab = dict(zip(chars, range(len(chars))))
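
Since count_pairs is sorted by descending frequency, the lowest ids belong to the most common tokens; a short look-up confirms the mapping:

In [ ]:
# The five most frequent tokens, their ids, and their counts.
for token in chars[:5]:
    print(token, vocab[token], counter[token])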

In [12]:
save_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((chars, vocab), fsave)

In [13]:
load_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(load_name, 'rb') as fload:
    chars2, vocab2 = cPickle.load(fload)
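
The reload above doubles as a round-trip test; an explicit assertion (added here) makes it fail loudly if the pickle is stale or corrupted:

In [ ]:
assert chars2 == chars and vocab2 == vocab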

In [14]:
# Encode the whole token stream as an array of integer ids.
corpus = np.array(list(map(vocab.get, Verb_Noun_Adjective_Alpha_in_text_tokens)))

In [15]:
save_name = os.path.join(save_dir, 'corpus_data.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((corpus, Verb_Noun_Adjective_Alpha_in_text_tokens), fsave)
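
For the training step that follows, an id array like this is typically cut into (input, target) minibatches where the target is the input shifted one step left. A minimal sketch, assuming hypothetical batch_size and seq_length hyperparameters (this is not the original notebook's batching code):

In [ ]:
batch_size, seq_length = 50, 25          # hypothetical hyperparameters
num_batches = len(corpus) // (batch_size * seq_length)
data = corpus[:num_batches * batch_size * seq_length]  # trim the remainder
xdata = data
ydata = np.copy(data)
ydata[:-1] = xdata[1:]                   # target = input shifted left by one
ydata[-1] = xdata[0]                     # wrap around at the end
x_batches = np.split(xdata.reshape(batch_size, -1), num_batches, axis=1)
y_batches = np.split(ydata.reshape(batch_size, -1), num_batches, axis=1)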