In [5]:
import argparse
import os
import shutil
import gzip

import torch
import torch.nn as nn
from torch.autograd import Variable

import spacy

In [4]:
train_path = '/Users/Max/data/beer_reviews/reviews.all.train.txt.gz'
val_path = '/Users/Max/data/beer_reviews/reviews.all.heldout.txt.gz'

nlp = spacy.load('en')  # spaCy 1.x/2.x model shortcut; newer releases load e.g. 'en_core_web_sm'

In [7]:
import gzip
check_path = '/Users/Max/data/full_beer/aspects/reviews.aspect1.train.txt.gz'

with gzip.open(check_path, 'rt') as f:
    lines = f.readlines()
print(len(lines))


70000

In [14]:
import tqdm

for line in tqdm.tqdm(lines[:1000]):
    target, _, review = line.partition("\t")
    doc = nlp(review)

    # Parse with spaCy: single tokens, full sentences, and dependency subtrees ("chunks").
    words = [(token.text,) for token in doc]
    sents = [tuple(token.text for token in sent) for sent in doc.sents]
    chunks = [tuple(word.text for word in token.subtree
                    if word.text != '\n' and word.text != '\t') for token in doc]

    # Encode one review per line: '\D' separates the target from the review, '\T' separates
    # tokens/sentences/chunks, and '\W' separates words inside a chunk. These are literal
    # two-character delimiters, not escape sequences.
    enc_words  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in words]) + '\n'
    enc_sents  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in sents]) + '\n'
    enc_chunks = target + '\D' + '\T'.join(['\W'.join(tup) for tup in chunks]) + '\n'

    # Append mode: re-running this cell keeps appending lines to the files.
    with gzip.open('/Users/Max/data/beer_reviews/reviews.short.train.words.txt.gz', 'at') as f:
        f.write(enc_words)

    with gzip.open('/Users/Max/data/beer_reviews/reviews.short.train.chunks.txt.gz', 'at') as f:
        f.write(enc_chunks)

    with gzip.open('/Users/Max/data/beer_reviews/reviews.short.train.sents.txt.gz', 'at') as f:
        f.write(enc_sents)


100%|██████████| 1000/1000 [00:22<00:00, 56.87it/s]
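
A quick sanity check: the chunks file should now hold one line per processed review (a sketch; append mode means re-running the loop keeps adding lines):

In [ ]:
# Hedged check: count lines in one of the files written above.
with gzip.open('/Users/Max/data/beer_reviews/reviews.short.train.chunks.txt.gz', 'rt') as f:
    print(sum(1 for _ in f))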

In [13]:
len(lines)


Out[13]:
210000

In [ ]:
with gzip.open('/Users/Max/data/beer_reviews/reviews.all.train.chunks.txt.gz', 'rt') as f:
    my_lines = f.readlines()

In [ ]:
# Decode
obj = my_lines[128383]
target, review = obj.split('\D')
[tuple(chunk.split('\W')) for chunk in review.split('\T')]
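
The same decode logic as a small reusable helper (a sketch, assuming the '\D' / '\T' / '\W' delimiters used when the files were written):

In [ ]:
# Hedged sketch: reverse the encoding from the writing loop above.
def decode_review(encoded):
    # '\D' separates the target from the review, '\T' separates chunks, '\W' separates words.
    target, _, review = encoded.rstrip('\n').partition('\D')
    chunks = [tuple(chunk.split('\W')) for chunk in review.split('\T')]
    return target, chunks

decode_review(my_lines[128383])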

In [ ]:
my_lines[129322]

In [ ]:
# parsing in spacy style
words = [tuple([token.text]) for token in doc]
sents = [tuple([token.text for token in sent]) for sent in doc.sents]
chunks = [tuple([word.text for word in token.subtree]) for token in doc]

# creating encodings
enc_words  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in words])
enc_sents  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in sents])
enc_chunks = target + '\D' + '\T'.join(['\W'.join(tup) for tup in chunks])

In [ ]:
lines[1]

In [ ]:
target, review = enc_chunks.split('\D')

In [ ]:
target

In [ ]:
review.split('\T')[0].split('\W')

In [ ]:
enc_chunks[:100]  # preview the start of the encoded string

In [ ]:
import json
import gzip

def read_rationales(path):
    """
    Read the annotations.json file (optionally gzipped).
    Returns a list of dictionaries, one per review, covering the 994 reviews
    for which sentence-level rationale annotations are available.
    """
    data = []
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path, 'rt') as fin:
        for line in fin:
            data.append(json.loads(line))
    return data

In [ ]:
import spacy
import ast
nlp = spacy.load('en')
anno = '/Users/Max/data/beer_reviews/annotations.json'
annotations = read_rationales(anno)
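
A quick look at one annotation item (a sketch; only the keys the cells below rely on are shown):

In [ ]:
# Hedged sketch: peek at one annotation item.
# The cells below use item['x'] (the tokenized review), item['raw'] (the original record
# as a string), and item['0'] / item['1'] / item['2'] (rationale spans per aspect).
item = annotations[0]
print(sorted(item.keys()))
print([' '.join(item['x'][s:e]) for s, e in item['0']])  # text of the aspect-0 spans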

In [ ]:
# For each annotated span, find the spaCy sentence in doc that overlaps it most
# and give that sentence the span's aspect label.

def map_sentence(sen, tup):
    """Fraction of the sentence's distinct, lowercased tokens that appear in the annotated span's word set."""
    sen = set(str(token).lower() for token in sen)
    return len(sen & tup[0]) / len(sen)

# Which sentences deserve a label, and which label?
ix = 2
review = annotations[ix]
doc = nlp(ast.literal_eval(annotations[ix]['raw'])['review/text'])
all_words = review['x']
label_sens = []
for label in ['0', '1', '2']:
    label_sens.extend([(set(all_words[s:e]), label) for s, e in review[label]])

# Label the sentences in doc: each annotated span labels its best-matching sentence.
sentences = [(sen, set()) for sen in doc.sents]
for tup in label_sens:
    scores = [map_sentence(sen, tup) for sen, _ in sentences]
    sentences[scores.index(max(scores))][1].add(tup[1])

# Turn the labeled sentences into (tokens, labels) pairs at word and chunk granularity.
words = []
chunks = []

for sen, labels in sentences:
    words1 = [((token.text,), labels) for token in sen]
    chunks1 = [(tuple(word.text for word in token.subtree
                      if word.text != '\n' and word.text != '\t'), labels) for token in sen]
    words.extend(words1)
    chunks.extend(chunks1)
sents = sentences
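
The labeled chunks could be serialized in the same style as the training files; the '\L' label separator below is an assumption for illustration, not part of the original format:

In [ ]:
# Hedged sketch: encode the labeled chunks with the same '\T' / '\W' delimiters as above;
# '\L' is invented here to attach the comma-separated label set to each chunk.
enc_labeled_chunks = '\T'.join(
    '\W'.join(chunk) + '\L' + ','.join(sorted(labels))
    for chunk, labels in chunks
) + '\n'
enc_labeled_chunks[:200]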

In [ ]:
words

In [ ]:
doc

In [ ]:
[review[label] for label in ['0','1','2']]
