In [5]:
import argparse
import os
import shutil
import gzip
import torch
import torch.nn as nn
from torch.autograd import Variable
import spacy
In [4]:
# Paths to the raw beer-review corpus.
# NOTE(review): hardcoded absolute local paths — consider a configurable DATA_DIR.
train_path = '/Users/Max/data/beer_reviews/reviews.all.train.txt.gz'
val_path = '/Users/Max/data/beer_reviews/reviews.all.heldout.txt.gz'
# NOTE(review): the 'en' shortcut was removed in spaCy v3 — presumably this
# targets spaCy v2 / an installed 'en' shortcut link; confirm the environment.
nlp = spacy.load('en')
In [ ]:
In [7]:
# Sanity check: load the aspect-1 training file and report how many lines it has.
import gzip

check_path = '/Users/Max/data/full_beer/aspects/reviews.aspect1.train.txt.gz'
with gzip.open(check_path, mode='rt') as handle:
    lines = list(handle)
print(len(lines))
In [14]:
import tqdm

# Encode the first 1000 reviews at three granularities (single words, full
# sentences, dependency-subtree "chunks") and append them to three gzipped
# output files.  Record format: <target>\D<unit>\T<unit>... with the tokens
# inside a unit joined by \W.
# NOTE: '\D', '\T', '\W' are not recognized escape sequences, so Python keeps
# them as literal backslash + letter — they act as two-character delimiters
# (raw strings r'\D' would make this intent explicit).
words_path = '/Users/Max/data/beer_reviews/reviews.short.train.words.txt.gz'
chunks_path = '/Users/Max/data/beer_reviews/reviews.short.train.chunks.txt.gz'
sents_path = '/Users/Max/data/beer_reviews/reviews.short.train.sents.txt.gz'

# BUG FIXES vs. original:
#  * the three output files were re-opened in append mode on every loop
#    iteration — open each of them exactly once instead;
#  * enc_words and enc_sents were written without a trailing '\n' (only
#    enc_chunks had one), so those files could not be split back into one
#    record per line — every record now ends with '\n'.
with gzip.open(words_path, 'at') as f_words, \
     gzip.open(chunks_path, 'at') as f_chunks, \
     gzip.open(sents_path, 'at') as f_sents:
    for line in tqdm.tqdm(lines[:1000]):
        target, _, review = line.partition("\t")
        doc = nlp(review)
        # parsing in spacy style
        words = [tuple([token.text]) for token in doc]
        sents = [tuple([token.text for token in sent]) for sent in doc.sents]
        chunks = [tuple([word.text for word in token.subtree
                         if word.text != '\n' and word.text != '\t'])
                  for token in doc]
        # creating encodings — one '\n'-terminated record per review
        enc_words = target + '\D' + '\T'.join(['\W'.join(tup) for tup in words]) + '\n'
        enc_sents = target + '\D' + '\T'.join(['\W'.join(tup) for tup in sents]) + '\n'
        enc_chunks = target + '\D' + '\T'.join(['\W'.join(tup) for tup in chunks]) + '\n'
        f_words.write(enc_words)
        f_chunks.write(enc_chunks)
        f_sents.write(enc_sents)
In [13]:
len(lines)
Out[13]:
In [ ]:
# Reload the previously written chunk-encoded training file for inspection.
with gzip.open('/Users/Max/data/beer_reviews/reviews.all.train.chunks.txt.gz', mode='rt') as handle:
    my_lines = list(handle)
In [ ]:
# Decode one stored record back into its target and list of chunk tuples.
obj = my_lines[128383]
# maxsplit=1 guards against a literal '\D' occurring inside the review text
# itself — the original unbounded split would raise ValueError in that case.
target, review = obj.split('\D', 1)
[tuple(chunk.split('\W')) for chunk in review.split('\T')]
In [ ]:
my_lines[129322]
In [ ]:
# Scratch re-run of the parsing/encoding steps on the current `doc` and
# `target` (defined in an earlier cell — this fails on a fresh kernel).
# parsing in spacy style
words = [tuple([token.text]) for token in doc]
sents = [tuple([token.text for token in sent]) for sent in doc.sents]
# NOTE(review): unlike the pipeline cell above, this chunk list does NOT
# filter out '\n' / '\t' tokens, and no trailing '\n' is appended to the
# encodings below — the two versions are inconsistent.
chunks = [tuple([word.text for word in token.subtree]) for token in doc]
# creating encodings
enc_words = target + '\D' + '\T'.join(['\W'.join(tup) for tup in words])
enc_sents = target + '\D' + '\T'.join(['\W'.join(tup) for tup in sents])
enc_chunks = target + '\D' + '\T'.join(['\W'.join(tup) for tup in chunks])
In [ ]:
lines[1]
In [ ]:
target, review = enc_chunks.split('\D')
In [ ]:
target
In [ ]:
review.split('\T')[0].split('\W')
In [ ]:
enc_chunks[]
In [ ]:
import json
import gzip


def read_rationales(path):
    """Load the rationale annotations file (one JSON object per line).

    Transparently handles both plain-text and gzip-compressed files
    (selected by the ".gz" suffix).  Returns a list of dicts, one per
    annotated review.
    """
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path) as fin:
        return [json.loads(line) for line in fin]
In [ ]:
# Load spaCy and the sentence-level rationale annotations.
# NOTE(review): spacy was already imported and `nlp` loaded in the first
# cells — repeating it here lets this section run standalone, but the
# imports belong in one top cell.
import spacy
import ast
nlp = spacy.load('en')
anno = '/Users/Max/data/beer_reviews/annotations.json'
annotations = read_rationales(anno)
In [ ]:
# For each annotated span we find the spaCy sentence that matches it best;
# this function scores one candidate sentence against one span.
def map_sentence(sen, tup):
    """Return the fraction of `sen`'s unique lowercased tokens that also
    appear in the annotated word set `tup[0]`.

    sen : iterable of spaCy tokens (anything whose str() is the word form)
    tup : (word_set, label) pair built from the annotation spans
    """
    sen_vocab = {str(token).lower() for token in sen}
    shared = sen_vocab & tup[0]
    # Denominator is the sentence vocabulary size only — deliberately not a
    # Jaccard union (the union variant is commented out in the original).
    return len(shared) / len(sen_vocab)
# Decide which spaCy sentences of one review deserve which aspect labels.
ix = 2
review = annotations[ix]
doc = nlp(ast.literal_eval(annotations[ix]['raw'])['review/text'])
all_words = review['x']

# Collect (word-set, label) pairs from the annotated (start, end) spans.
label_sens = []
for label in ['0', '1', '2']:
    label_sens.extend((set(all_words[s:e]), label) for s, e in review[label])

# Attach each annotated span's label to its best-matching sentence in doc.
sentences = [(sen, set()) for sen in doc.sents]
for span in label_sens:
    scores = [map_sentence(sen, span) for sen, _ in sentences]
    best = scores.index(max(scores))
    sentences[best][1].add(span[1])

# Flatten the labelled sentences into (unit, label-set) pairs at word and
# dependency-subtree ("chunk") granularity.
words = []
chunks = []
for sent_doc, labels in sentences:
    words.extend(((token.text,), labels) for token in sent_doc)
    chunks.extend(
        (tuple(w.text for w in token.subtree if w.text not in ('\n', '\t')), labels)
        for token in sent_doc
    )
sents = sentences
In [ ]:
words
In [ ]:
doc
In [ ]:
doc
In [ ]:
[review[label] for label in ['0','1','2']]
In [8]:
In [ ]: