In [2]:
import tensorflow as tf
import numpy as np
import csv
import itertools
import operator
import nltk
import sys
import os
import collections
from datetime import datetime
from six.moves import cPickle
from utils import *
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
save_dir = "data/"
sentence_start_token = "START"
sentence_end_token = "END"
# Read the NSMC ratings file (tab-separated: id, document, label)
# and collect the review text from the document column.
with open('data/ratings_train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
lines = lines[1:]  # skip the "id\tdocument\tlabel" header row
for i in range(len(lines)):
    lines[i] = lines[i].replace("\n", "")
reader = []
for line in lines:
    line_document = line.split("\t")[1]
    reader.append(line_document)
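A quick sanity check that the file parsed as expected (the exact review text depends on your local copy of ratings_train.txt):
In [ ]:
print(len(reader))  # number of reviews loaded
print(reader[0])    # first review text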
In [4]:
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in reader[:10000]]
In [5]:
from konlpy.tag import Twitter  # renamed to Okt in newer KoNLPy releases
pos_tagger = Twitter()

def tokenize(doc):
    # POS-tag each morpheme and join token and tag, e.g. "영화/Noun".
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

tokenized_sentences = [tokenize(row) for row in sentences]
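To see what the tagger emits, inspect one tokenized sentence; the tokens are "morpheme/POS" strings, and the exact output depends on your KoNLPy version and dictionaries:
In [ ]:
print(tokenized_sentences[0][:10])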
In [6]:
# Keep nouns, verbs, and adjectives, plus the Alpha-tagged sentence-boundary
# tokens START (5 characters) and END (3 characters).
Verb_Noun_Adjective_Alpha_in_text = []
index = 0
for text in tokenized_sentences:
    Verb_Noun_Adjective_Alpha_in_text.append([])
    for word in text:
        parts_of_speech = word.split("/")
        if parts_of_speech[1] in ["Noun", "Verb", "Adjective"]:
            Verb_Noun_Adjective_Alpha_in_text[index].append(parts_of_speech[0])
        elif parts_of_speech[1] == "Alpha" and len(parts_of_speech[0]) in (3, 5):
            Verb_Noun_Adjective_Alpha_in_text[index].append(parts_of_speech[0])
    index += 1
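A brief check that the boundary tokens survived the filter (the actual tokens vary with the tagger output):
In [ ]:
print(Verb_Noun_Adjective_Alpha_in_text[0])  # should begin with START and end with END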
In [7]:
# Flatten the per-sentence token lists into one long token stream.
Verb_Noun_Adjective_Alpha_in_text_tokens = [t for d in Verb_Noun_Adjective_Alpha_in_text for t in d]
In [8]:
counter = collections.Counter(Verb_Noun_Adjective_Alpha_in_text_tokens)
In [9]:
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
In [10]:
chars, counts = zip(*count_pairs)  # "chars" are word tokens here, ordered by descending frequency
In [11]:
vocab = dict(zip(chars, range(len(chars))))  # token -> integer id (0 = most frequent)
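Sanity-check the vocabulary; the size and the most frequent tokens depend on the data:
In [ ]:
print(len(vocab))  # vocabulary size
print(chars[:5])   # five most frequent tokens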
In [12]:
save_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((chars, vocab), fsave)
In [13]:
load_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(load_name, 'rb') as fload:
    chars2, vocab2 = cPickle.load(fload)
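Verify that the pickle round trip restored the same objects:
In [ ]:
assert chars2 == chars and vocab2 == vocab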
In [14]:
# Map every token to its integer id; all tokens are in vocab by construction.
corpus = np.array(list(map(vocab.get, Verb_Noun_Adjective_Alpha_in_text_tokens)))
In [15]:
save_name = os.path.join(save_dir, 'corpus_data.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((corpus, Verb_Noun_Adjective_Alpha_in_text_tokens), fsave)
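A minimal sketch of reading the corpus back and decoding ids into tokens, using the file names saved above; `id_to_token` is an illustrative helper built as the inverse of `vocab`:
In [ ]:
with open(os.path.join(save_dir, 'corpus_data.pkl'), 'rb') as fload:
    corpus2, tokens2 = cPickle.load(fload)
id_to_token = dict(enumerate(chars))  # inverse mapping of vocab
print([id_to_token[i] for i in corpus2[:5]])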