In [2]:
import tensorflow as tf
import numpy as np
import csv
import itertools
import operator
import nltk
import sys
import os
import collections
from datetime import datetime
from six.moves import cPickle
from utils import *
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
save_dir = "data/"
sentence_start_token = "START"
sentence_end_token = "END"
# Read the NSMC ratings file (tab-separated: id, document, label)
# and collect the review text from the document column.
with open('data/ratings_train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
lines = lines[1:]  # skip the "id\tdocument\tlabel" header row
for i in range(len(lines)):
    lines[i] = lines[i].replace("\n", "")
reader = []
for line in lines:
    line_document = line.split("\t")[1]
    reader.append(line_document)
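A quick sanity check that the file parsed as expected (the exact review text depends on your local copy of ratings_train.txt):
In [ ]:
print(len(reader))  # number of reviews loaded
print(reader[0])    # first review text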
In [4]:
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in reader[:10000]]
In [5]:
from konlpy.tag import Twitter  # renamed to Okt in newer KoNLPy releases
pos_tagger = Twitter()

def tokenize(doc):
    # POS-tag each morpheme and join token and tag, e.g. "영화/Noun".
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

tokenized_sentences = [tokenize(row) for row in sentences]
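To see what the tagger emits, inspect one tokenized sentence; the tokens are "morpheme/POS" strings, and the exact output depends on your KoNLPy version and dictionaries:
In [ ]:
print(tokenized_sentences[0][:10])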
In [6]:
# Keep nouns, verbs, and adjectives, plus the Alpha-tagged sentence-boundary
# tokens START (5 characters) and END (3 characters).
Verb_Noun_Adjective_Alpha_in_text = []
index = 0
for text in tokenized_sentences:
    Verb_Noun_Adjective_Alpha_in_text.append([])
    for word in text:
        parts_of_speech = word.split("/")
        if parts_of_speech[1] in ["Noun", "Verb", "Adjective"]:
            Verb_Noun_Adjective_Alpha_in_text[index].append(parts_of_speech[0])
        elif parts_of_speech[1] == "Alpha" and len(parts_of_speech[0]) in (3, 5):
            Verb_Noun_Adjective_Alpha_in_text[index].append(parts_of_speech[0])
    index += 1
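A brief check that the boundary tokens survived the filter (the actual tokens vary with the tagger output):
In [ ]:
print(Verb_Noun_Adjective_Alpha_in_text[0])  # should begin with START and end with END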
In [7]:
# Flatten the per-sentence token lists into one long token stream.
Verb_Noun_Adjective_Alpha_in_text_tokens = [t for d in Verb_Noun_Adjective_Alpha_in_text for t in d]
In [8]:
counter = collections.Counter(Verb_Noun_Adjective_Alpha_in_text_tokens)
In [9]:
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
In [10]:
chars, counts = zip(*count_pairs)  # "chars" are word tokens here, ordered by descending frequency
In [11]:
vocab = dict(zip(chars, range(len(chars))))  # token -> integer id (0 = most frequent)
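Sanity-check the vocabulary; the size and the most frequent tokens depend on the data:
In [ ]:
print(len(vocab))  # vocabulary size
print(chars[:5])   # five most frequent tokens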
In [12]:
save_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((chars, vocab), fsave)
In [13]:
load_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(load_name, 'rb') as fload:
    chars2, vocab2 = cPickle.load(fload)
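Verify that the pickle round trip restored the same objects:
In [ ]:
assert chars2 == chars and vocab2 == vocab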
In [14]:
# Map every token to its integer id; all tokens are in vocab by construction.
corpus = np.array(list(map(vocab.get, Verb_Noun_Adjective_Alpha_in_text_tokens)))
In [15]:
save_name = os.path.join(save_dir, 'corpus_data.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((corpus, Verb_Noun_Adjective_Alpha_in_text_tokens), fsave)
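A minimal sketch of reading the corpus back and decoding ids into tokens, using the file names saved above; `id_to_token` is an illustrative helper built as the inverse of `vocab`:
In [ ]:
with open(os.path.join(save_dir, 'corpus_data.pkl'), 'rb') as fload:
    corpus2, tokens2 = cPickle.load(fload)
id_to_token = dict(enumerate(chars))  # inverse mapping of vocab
print([id_to_token[i] for i in corpus2[:5]])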