Aspect-Oriented Sentiment Analysis

Resources

Stemmer

Lemmatiser


In [1]:
import xml.etree.ElementTree as ET
import numpy as np

Comment Structure Representation

The comments in the dataset are represented as XML. To model them, three classes are used: TaggedText (a review document), TaggedSentence and TaggedWord, together with a small Dependency class for dependency relations.


TaggedWord

The class represents a word with associated metadata: its index in the sentence, lemma, stem, POS tag and so on.

TaggedSentence

A representation of a sentence with associated metadata. Each sentence has a list of TaggedWord objects representing the words that make it up.

TaggedText

A representation of a review document and its metadata (text, rating, source URL), holding the sentences it contains as a list of TaggedSentence objects.


In [6]:
class TaggedText(object):
    """Object representation of a review document tagged with sentence metadata, source and ratings."""

    def __init__(self):
        self.sentences = []
        self.rating = 0
        self.source = ''
        self.text = ''
        self.__pairs = None

    def add_sentence(self, sentence):
        self.sentences.append(sentence)

    def get_sentences(self):
        return self.sentences
        
    def set_filename(self, fn):
        self.filename = fn
        
    def get_filename(self):
        return self.filename

    def set_source_url(self, url):
        self.source = url

    def get_source_url(self):
        return self.source

    def set_rating(self, rating):
        self.rating = rating

    def get_rating(self):
        return self.rating

    def set_text(self, text):
        self.text = text

    def get_text(self):
        return self.text
    
    def get_length(self):
        return len(self.text)
    
    def get_annotated_pairs(self):
        if self.__pairs:
            return self.__pairs
        
        self.__pairs = []
        
        for sentence in self.sentences:
            # Keep only the aspect-clue pairs annotated as linked (link == 1)
            self.__pairs += [ (a,s) for (a,s,l,r) in sentence.get_annotated_pairs() if l == 1]
            
        return self.__pairs

    def get_clues(self):
        return [a for (a, b) in self.get_annotated_pairs()]
    
    def get_aspects(self):
        return [b for (a, b) in self.get_annotated_pairs()]
    
class TaggedSentence(object):

    def __init__(self):
        self.words = []
        self.dependencies = []
        self.text = ''
        self.start_position = 0
        self.length = 0
        self.annotated_pairs = []

    def add_annotated_pair(self, aspect, anchor, link, sentiment):
        r_link = transform_annotation(link)
        r_sentiment = transform_annotation(sentiment)
        self.annotated_pairs.append((aspect, anchor, r_link, r_sentiment))
        
    def get_annotated_pairs(self):
        return self.annotated_pairs
    
    def get_annotated_pair(self, aspect, anchor):
        aspect_word = aspect.get_word()
        anchor_word = anchor.get_word()
        
        for (r_asp, r_anch, link, sentiment) in self.get_annotated_pairs():
            if aspect_word == r_asp and anchor_word == r_anch:
                return np.array([link, sentiment])
            
        return np.array([0, 0])
        
    def add_word(self, word):
        self.words.append(word)

    def get_words(self):
        return self.words
    
    def get_word(self, i):
        return self.words[i]

    def add_dependency(self, dependency):
        self.dependencies.append(dependency)

    def get_dependencies(self):
        return self.dependencies

    def set_text(self, text):
        self.text = text

    def get_text(self):
        return self.text

    def set_start_position(self, pos):
        self.start_position = pos

    def get_start_position(self):
        return self.start_position

    def get_length(self):
        return len(self.text)
    
class Dependency(object):
    def __init__(self, governor, dependent, relation):
        self.governor = governor
        self.dependent = dependent
        self.relation = relation
        
    def get_governor_index(self):
        return self.governor
    
    def get_dependent_index(self):
        return self.dependent
    
    def get_relation(self):
        return self.relation
    
    def __str__(self):
        return '({0} as {1} to {2})'.format(self.governor, self.relation, self.dependent)


class TaggedWord(object):

    def __init__(self):
        self.word = ''
        self.lemma = ''
        self.molex_lemmas = []
        self.POS = None
        self.stem = ''
        self.MSDs = []
        self.position = 0
        self.index = 0

    def set_word(self, word):
        self.word = word

    def get_word(self):
        return self.word

    def set_stem(self, stem):
        self.stem = stem

    def get_stem(self):
        return self.stem

    def set_lemma(self, lemma):
        self.lemma = lemma

    def get_lemma(self):
        return self.lemma

    def add_molex_lemma(self, m_lemma):
        self.molex_lemmas.append(m_lemma)

    def get_molex_lemmas(self):
        return self.molex_lemmas

    def set_POS_tag(self, POS):
        self.POS = POS

    def get_POS_tag(self):
        return self.POS

    def add_MSD(self, msd):
        self.MSDs.append(msd)

    def get_MSDs(self):
        return self.MSDs

    def set_position(self, pos):
        self.position = pos

    def get_position(self):
        return self.position

    def set_index(self, index):
        self.index = index

    def get_index(self):
        return self.index
    
    def __eq__(self, other):
        '''
        If the word is the same and the index is the same, assume it's also
        the same sentence. Good enough for practical use, but not necessarily
        correct.
        '''
        return self.word == other.word and self.index == other.index
    
def transform_annotation(annotation):
    """Transforms a string link/sentiment annotation into a binary label (1 for '+' or '-', 0 otherwise)"""
    # Compare against a tuple: `annotation in "+-"` would also match the empty
    # string and raise a TypeError for None.
    if annotation in ('+', '-'):
        return 1
    return 0
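
A quick usage sketch of how these classes fit together; the word and sentence values below are made up purely for illustration and are not part of the dataset.


In [ ]:
# Hypothetical example: build a one-sentence document by hand.
word = TaggedWord()
word.set_word(u'odlican')   # illustrative token
word.set_lemma(u'odlican')
word.set_stem(u'odlicn')
word.set_POS_tag('A')
word.set_index(0)

sentence = TaggedSentence()
sentence.set_text(u'Odlican hotel')
sentence.add_word(word)

document = TaggedText()
document.set_filename('example')
document.set_text(sentence.get_text())
document.add_sentence(sentence)

print document.get_length(), len(document.get_sentences())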

Parsing documents

A parsing step turns the XML into the object representation described above. The function read_document handles a single document; read_documents_in applies it to every file in a directory.


In [3]:
from os import listdir
from os.path import isfile, join
import re

def files_in(directory):
    """
    List all files in directory. Excludes directories.
    Args:
        directory (str): String representation of path
    Returns:
        str[]: List of file names, not prefixed with directory name
    """
    return [f for f in listdir(directory) if isfile(join(directory,f))]

def read_documents_in(directory):
    """
    Parses all XML documents in a given directory to create a list of object
    document representations.
    
    Args:
        directory (str) : The directory path
    
    Returns:
        TaggedText[] : List of parsed documents as objects
    """
    files = files_in(directory)
    documents = []
    
    for f in files:
        documents.append(read_document(join(directory, f)))
        
    return documents

def read_document(doc):
    """
    Parses an XML document and returns its tagged document representation
    Args:
        doc (str): The path to the document
    Returns:
        TaggedText: Parsed XML as an object
    """
    tree = ET.parse(doc)
    root = tree.getroot()
    
    filename = doc.split('/')[-1].split('.')[0]
    
    comment = TaggedText()
    comment.set_filename(filename) # Name the document
    
    text = root.find('Text').text
    rating = float(root.find('Rating').text)
    source = root.find('Source').text

    # Setting the basic properties
    comment.set_text(text)
    comment.set_rating(rating)
    comment.set_source_url(source)

    # Iterating through all sentences in the text
    sentences = root.find('Sentences')

    for sentence in sentences.findall('SentenceInfo'):
        tagged_sentence = TaggedSentence()

        text = sentence.find('Text').text
        start = int(sentence.find('StartPosition').text)

        tagged_sentence.set_text(text)
        tagged_sentence.set_start_position(start)

        words = sentence.find('TaggedWords')

        for word in words.findall('CROTaggedWord'):
            tagged_word = TaggedWord()

            word_text = word.find('Word').text
            word_lemma = word.find('Lemma').text
            molex_lemmas = [molex.text for molex in word.find('MolexLemmas').findall('string')]
            POS = word.find('POSTag').text
            stem = word.find('BasicStem').text
            MSDs = [ msd.text for msd in word.find('MSDs').findall('string')]
            position = int(word.find('Position').text)
            index = int(word.find('SentenceIndex').text)

            tagged_word.set_word(word_text)
            tagged_word.set_lemma(word_lemma)
            for molex in molex_lemmas: tagged_word.add_molex_lemma(molex)
            tagged_word.set_stem(stem)
            for msd in MSDs: tagged_word.add_MSD(msd)
            tagged_word.set_position(position)
            tagged_word.set_index(index)
            tagged_word.set_POS_tag(POS)

            tagged_sentence.add_word(tagged_word)
            
        dependencies = sentence.find('DependencyRelations')
        
        for dependency in dependencies.findall('DependencyRelation'):
            governor = int(dependency.find('Governor').find('SentenceIndex').text)
            dependent = int(dependency.find('Dependent').find('SentenceIndex').text)
            relation = dependency.find('Relation').text
            tagged_sentence.add_dependency(Dependency(governor, dependent, relation))
            

        comment.add_sentence(tagged_sentence)

    return comment
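
A small usage sketch, assuming the dataset layout used later in this notebook; the file name itself is illustrative.


In [ ]:
# Hypothetical usage: parse a single review and inspect it.
doc = read_document('CropinionDataset/reviews_new/train2/review.xml')  # illustrative file name
print doc.get_rating(), doc.get_source_url()
for sentence in doc.get_sentences():
    print sentence.get_text()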

Aspects and Clues

Before processing, we need to read in the aspect dictionary as well as the positive and negative clue dictionaries (in both their stemmed and lemmatised forms).

tf-idf

We also define the tf-idf calculations used for the bag-of-words features.
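
Written out, the weighting implemented by get_tf and get_idf below is the augmented term frequency combined with a smoothed inverse document frequency, where $N$ is the number of training documents and $\mathrm{df}(t)$ the number of documents containing term $t$:

$$\mathrm{tf}(t,d) = 0.5 + 0.5\,\frac{f_{t,d}}{\max_{t'} f_{t',d}}, \qquad \mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)+1}, \qquad \text{tf-idf}(t,d) = \mathrm{tf}(t,d)\cdot\mathrm{idf}(t)$$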


In [88]:
from math import log
import codecs
import json

def get_all_lemmas(word):
    """Locates all lemmas and all molex lemmas"""
    return word.get_molex_lemmas() + [word.get_lemma()]

def get_all_roots(word):
    """Locates all lemmas, molex lemmas and stems"""
    return word.get_molex_lemmas() + [word.get_lemma(), word.get_stem()]

def read_word_roots(document):
    """
        Reads word roots from a file with one "word<TAB>root" pair per line,
        the format produced by the stemmer/lemmatiser output.
    """
    stems = []
    with codecs.open(document, 'r', 'utf-8') as fp:
        for line in fp:
            stems.append(line.split()[1]) # Stems/lemmas are in the second tab-delimited column
    
    return set(stems)

lemma_aspects = read_word_roots('CropinionDataset/dictionary_lemmatised/aspects.txt')
lemma_pos_clues = read_word_roots('CropinionDataset/dictionary_lemmatised/positive_clues.txt')
lemma_neg_clues = read_word_roots('CropinionDataset/dictionary_lemmatised/negative_clues.txt')

stem_aspects = read_word_roots('CropinionDataset/dictionary_stemmed/aspects.txt')
stem_pos_clues = read_word_roots('CropinionDataset/dictionary_stemmed/positive_clues.txt')
stem_neg_clues = read_word_roots('CropinionDataset/dictionary_stemmed/negative_clues.txt')

aspects = lemma_aspects | stem_aspects
pos_clues = lemma_pos_clues | stem_pos_clues
neg_clues = lemma_neg_clues | stem_neg_clues 

all_clues = pos_clues | neg_clues

# We use only aspects and clues as words (at this point)
all_words = aspects | all_clues

# Alternative processor: TaggedWord.get_lemma (uses only the lemma)

def get_word_list(docs, processor=get_all_roots):
    word_list = []
    
    # Record occurrences
    for doc in docs:
        doc_name = doc.get_filename()
        
        for sentence in doc.get_sentences():
            for word in sentence.get_words():
                root = processor(word)
                if isinstance(root, list):
                    word_list += root
                else:
                    word_list.append(root)
    
    return set(word_list)
    

def get_idf(docs, processor=get_all_roots, word_list=set()):
    N = len(docs) # Document count
    idfs = {}
    
    # Record occurrences
    for doc in docs:
        doc_name = doc.get_filename()
        
        for sentence in doc.get_sentences():
            for word in sentence.get_words():
                root = processor(word)
                if isinstance(root, list):
                    if not any([w in word_list for w in root]):
                        continue
                elif root not in word_list:
                    continue
                    
                if not isinstance(root, list):
                    if root in idfs:
                        idfs[root][doc_name] = 1
                    else:
                        idfs[root] = { doc_name : 1 }
                        
                else:
                    for r in root:
                        if r in idfs:
                            idfs[r][doc_name] = 1
                        else:
                            idfs[r] = { doc_name : 1 }
                            
    # Actual idfs:
    for root in idfs:
        root_doc_count = len(idfs[root]) + 1.0 # Smoothing
        idfs[root] = log(N / root_doc_count)
    
    return idfs

def get_tf(docs, processor=get_all_roots, word_list=set()):
    tfs = {}
    
    # Record occurrences
    for doc in docs:
        doc_name = doc.get_filename()
        tfs[doc_name] = {}
        
        for sentence in doc.get_sentences():
            for word in sentence.get_words():
                root = processor(word)
                if isinstance(root, list):
                    if not any([w in word_list for w in root]):
                        continue
                elif root not in word_list:
                    continue
                    
                if not isinstance(root, list):
                    if root in tfs[doc_name]:
                        tfs[doc_name][root] += 1
                    else:
                        tfs[doc_name][root] = 1
                        
                else:
                    for r in root:
                        if r in tfs[doc_name]:
                            tfs[doc_name][r] += 1
                        else:
                            tfs[doc_name][r] = 1
    
    for doc_name in tfs:
        # Skip documents without any recognised lemmas
        if not tfs[doc_name].keys():
            continue
        max_freq = max(tfs[doc_name].values())
        
        for root in tfs[doc_name]:
            tfs[doc_name][root] = 0.5 + 0.5 * tfs[doc_name][root] / max_freq
 
    return tfs

def tf_idf_vector(document_name, tfs, idfs, word_list, only_words=None):
    '''
    Given a document with path :document_name, loads it and calculates its tf-idf
    vector. TFS are drawn from the :tfs dictionary, while the IDFS are drawn from
    the :idfs dictionary. Words are drawn from the :word_list iterable. If the
    :only_words variable is set to an iterable, all words NOT in the word list
    are ignored.
    '''
    tf_idf = []
    
    for word in word_list:
        if only_words is not None and word not in only_words:
            tf_idf.append(0.0)
        elif document_name in tfs and word in tfs[document_name] and word in idfs:
            tf_idf.append(tfs[document_name][word] * idfs[word])
        else:
            tf_idf.append(0.0)
        
    return np.array(tf_idf)

train_docs = read_documents_in('CropinionDataset/reviews_new/train2/')
test_docs = read_documents_in('CropinionDataset/reviews_new/test2/')

# We must use train idfs in both cases to prevent information leakage
word_list = sorted(list(get_word_list(train_docs))) # all_words
all_idfs = get_idf(train_docs, word_list=word_list)
train_tfs = get_tf(train_docs, word_list=word_list) # all_words
test_tfs = get_tf(test_docs, word_list=word_list)

all_tfs = dict(test_tfs, **train_tfs)

Finding potential pairs

We need to go through the text and find all potential aspect-clue pairs, and then tag them with their gold annotations.


In [14]:
from os import remove
from os.path import exists, join

def aspects_in(document, aspects=aspects, processor=get_all_roots):
    found = set()
    
    for sentence in document.get_sentences():
        for word in sentence.get_words():
            root = processor(word)
            
            if isinstance(root, list):
                if any([w in aspects for w in root]):
                    found |= set(root) # Add all roots
            else:
                if root in aspects:
                    found.add(root)
    
    return found

def clues_in(document, clues=all_clues, processor=get_all_roots):
    found = set()
    
    for sentence in document.get_sentences():
        for word in sentence.get_words():
            root = processor(word)
            
            if isinstance(root, list):
                if any([w in clues for w in root]):
                    found |= set(root) # Add all roots
            else:
                if root in clues:
                    found.add(root)
    
    return found

def find_pairs(sentence, aspects, clues, processor=get_all_roots):
    """
    Finds all potential clue-aspect pairs in the sentence. Pairs are identified by comparing word stems
    in the sentence to stems given in the aspect and clue dictionaries (sets).
    
    Args:
        sentence (TaggedSentence) : Object representation of a sentence
        aspects (set(str)) : Set of aspect stems
        clues (set(str)) : Set of clue stems
        
    Returns:
        ([(TaggedWord, TaggedWord)]) : List of TaggedWord candidate pairs
    """
    sent_aspects = []
    sent_clues = []
        
    for word in sentence.get_words():
        root_word = processor(word)
        if isinstance(root_word, list):
            if any([w in aspects for w in root_word]):
                sent_aspects.append(word)
            if any([w in clues for w in root_word]):
                sent_clues.append(word)
        else:
            if root_word in aspects:
                sent_aspects.append(word)
            if root_word in clues:
                sent_clues.append(word)
        
    return [(aspect, clue) for aspect in sent_aspects for clue in sent_clues]

def get_annotated_pairs_in_document(document):
    tree = ET.parse(document)
    root = tree.getroot()

    pairs = []
    for sentence in root.iter('sentence'):
        sent_pairs = []
        
        for pair in sentence.iter('pair'):
            if pair.get('link') in ('+', '-'):
                sent_pairs.append((pair.get('aspect'), pair.get('anchor')))
        pairs.append(sent_pairs)
            
    return pairs

def get_unlocated_sentence_pair_count(found_pairs, real_pairs):
    unlocated = 0
    
    for (aspect, clue) in real_pairs:
        found = False
        for (w_a, w_c) in found_pairs:
            w_a, w_c = w_a.get_word(), w_c.get_word()
            if (w_a == aspect and w_c == clue) or (w_c == aspect and w_a == clue) :
                found = True
                break
        
        if not found:
            unlocated += 1
            
    return unlocated

def get_unlocated_document_pair_count(document, ann_document, aspects, clues):
    try:
        ann_pairs = get_annotated_pairs_in_document(ann_document)
    except (IOError, ET.ParseError):
        #print ann_document, 'does not exist'
        return 0 # No annotation file, so nothing was missed
                
    parsed_doc = read_document(document)
    
    count = 0
    
    N = len(ann_pairs)
    
    for i in xrange(N):
        my_pairs = find_pairs(parsed_doc.get_sentences()[i], aspects, clues)
        ap_pairs = ann_pairs[i]
        count += get_unlocated_sentence_pair_count(my_pairs, ap_pairs)
        
    return count
 
def get_dataset_unlocated_pair_count(dataset_dir, annotated_pairs_dir, aspects, clues):
    count = 0
    files = files_in(dataset_dir)
    
    for doc in files:
        my_doc = join(dataset_dir, doc)
        ap_doc = join(annotated_pairs_dir, doc)
        
        count += get_unlocated_document_pair_count(my_doc, ap_doc, aspects, clues)
        
    return count
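
A usage sketch for find_pairs, assuming the train_docs list from the tf-idf cell above has already been built.


In [ ]:
# Hypothetical usage: list the candidate aspect-clue pairs in the first document.
for sentence in train_docs[0].get_sentences():
    for (aspect, clue) in find_pairs(sentence, aspects, all_clues):
        print aspect.get_word(), '<->', clue.get_word()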

Extracting actual pairs and their annotations

The located candidate pairs are matched against the manual annotations, and the resulting labels are recorded in a separate file for ease of use.


In [174]:
import os.path
from os import remove
from os.path import exists  

def doc_load_annotated_pairs(document, src_dir):
    """
    Given a TaggedDocument representation of an xml document, loads the manually annotated pairs and their
    sentiments from a provided source directory. The pair are stored on the document - more precisely, on
    each TaggedSentence within the document.
    """
    filename = join(src_dir, document.get_filename() + '.xml')
    
    if not os.path.isfile(filename):
        return False # No pairs
        
    doc_sentences = document.get_sentences()
    
    ap_tree = ET.parse(filename)
    ap_root = ap_tree.getroot()
    
    ap_sentences = list(ap_root.iter('sentence'))
    
    sent_count = len(ap_sentences)
    
    for i in xrange(sent_count):
        ap_sent = ap_sentences[i]
        doc_sent = doc_sentences[i]
        
        # Find all pairs and append them to the doc
        for pair in ap_sent.iter('pair'):
            doc_sent.add_annotated_pair(pair.get('aspect'), pair.get('anchor'), pair.get('link'), pair.get('sent'))
    
    return True
    

def doc_annotate_located_pairs(document, aspects, clues):
    marks = []
    
    for sentence in document.get_sentences():
        pairs = find_pairs(sentence, aspects, clues)
        for pair in pairs:
            marks.append(sentence.get_annotated_pair(pair[0], pair[1]))
            
    
    return np.array(marks)

def annotate_set_pairs(src_dir, dest_file, annotations_dir, aspects, clues):
    docs = files_in(src_dir)
    fp = codecs.open(dest_file, 'a', 'utf-8')
    status = False
    
    for doc in docs:
        document = read_document(join(src_dir, doc))
        doc_load_annotated_pairs(document, annotations_dir)
        np.savetxt(fp, doc_annotate_located_pairs(document, aspects, clues))
        
    fp.close()
    

def annotate_regression(src_dir, dest_file):
    docs = files_in(src_dir)
    fp = codecs.open(dest_file, 'a', 'utf-8')
    status = False
    scores = []
    for doc in docs:
        document = read_document(join(src_dir, doc))
        doc_score = document.get_rating()
        
        positive = 1 if doc_score >= 4 else 0
        
        score = np.array([doc_score, positive])
        scores.append(score)
    
    np.savetxt(fp, np.array(scores))
        
    fp.close()

def annotate_sets():
    #if(exists('features/train/y.txt')): remove('features/train/y.txt')
    #if(exists('features/test/y.txt')): remove('features/test/y.txt')  
        
    if(exists('features/train/z.txt')): remove('features/train/z.txt')
    if(exists('features/test/z.txt')): remove('features/test/z.txt')  
    
    
    #annotate_set_pairs('CropinionDataset/reviews_new/train2', 'features/train/y.txt', 'CropinionDataset/annotated_pairs/all', aspects, all_clues)
    #annotate_set_pairs('CropinionDataset/reviews_new/test2', 'features/test/y.txt', 'CropinionDataset/annotated_pairs/all', aspects, all_clues)
    
    annotate_regression('CropinionDataset/reviews_new/Train', 'features/train/z.txt')
    annotate_regression('CropinionDataset/reviews_new/Test', 'features/test/z.txt')
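
The same pipeline can be sketched for a single document (the file name below is illustrative):


In [ ]:
# Hypothetical single-document run: load the gold pairs, then label every located candidate.
document = read_document('CropinionDataset/reviews_new/train2/review.xml')  # illustrative file name
if doc_load_annotated_pairs(document, 'CropinionDataset/annotated_pairs/all'):
    labels = doc_annotate_located_pairs(document, aspects, all_clues)
    print labels.shape  # one (link, sentiment) row per candidate pair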

Feature extraction

This section computes the feature vectors for the pair classification task and for review-score prediction.

Classification features

The features used in the initial classification (finding pairs) runs are:

  • Absolute difference of pair word indices in sentence
  • Absolute difference of pair word beginnings in string
  • The length of the sentence (in characters and in tokens)
  • Whether a dependency relation exists between them
  • Whether their number matches
  • Whether their gender matches
  • Whether their cases match
  • Their POS tags as one-hot vectors
  • The number of positive and negative clues in the sentence
  • Whether there is negation in a 3-wide window from each word

Regression features

The features (to be) used in the initial regression run:

  • tf-idf (still somewhat broken)

In [194]:
def extract_set_features(src_dir, dest_file, aspects, pos_clues, neg_clues):
    docs = files_in(src_dir)
    fp = codecs.open(dest_file, 'a', 'utf-8')
    
    for doc in docs:
        features = extract_document_features(join(src_dir, doc), aspects, pos_clues, neg_clues)
        
        np.savetxt(fp, features)
    
    fp.close()

def extract_document_features(document_path, aspects, pos_clues, neg_clues):
    document = read_document(document_path)
    clues = pos_clues | neg_clues
    
    features = []
    for sentence in document.get_sentences():
        pairs = find_pairs(sentence, aspects, clues)
        for pair in pairs:
            features.append(compute_feature_vector(pair, sentence, pos_clues, neg_clues))
            
    return np.array(features)

def compute_feature_vector(pair, sentence, pos_clues, neg_clues):
    features = []
    
    features.append(pair_distance_index(pair))
    features.append(pair_init_distances(pair))
    features.append(np.array([len(sentence.get_text())]))
    features.append(sentence_length(sentence))
    features.append(govern_relations_exist(pair, sentence))
    features.append(match(pair, extract_plurality))
    features.append(match(pair, extract_genders))
    features.append(POS_vector(pair[0]))
    features.append(POS_vector(pair[1]))
    features.append(clues_counts(sentence, pos_clues, neg_clues))
    features.append(negation_present(pair[0], sentence))
    features.append(negation_present(pair[1], sentence))
    
    # BOW between the pair candidates
    #features.append(get_tfidf_vec_between(pair, sentence))
    
    return np.hstack(features)


def words_between(pair, sentence):
    # Find which words to take
    i, j = pair[0].get_index(), pair[1].get_index()
    
    if i > j:
        i, j = j, i
                
    word_bag = []
    
    # All words inbetween, if any
    for index in xrange(i + 1, j):
        word_bag.append(sentence.get_word(index))
        
    return word_bag
        
    
def get_tf_between(pair, sentence):
    word_bag = words_between(pair, sentence)
    dummy_sent = TaggedSentence()
    
    for word in word_bag:
        dummy_sent.add_word(word)
        
    dummy_doc = TaggedText()
    dummy_doc.set_filename('dummy')
    dummy_doc.add_sentence(dummy_sent)
    
    return get_tf([dummy_doc], word_list=word_list)
    
def get_tfidf_vec_between(pair, sentence, idfs=all_idfs, words=all_words):
    dummy_tfs = get_tf_between(pair, sentence)
    return tf_idf_vector('dummy', dummy_tfs, idfs, words)


def clues_counts(sentence, pos_clues, neg_clues, processor = get_all_lemmas):
    counts = [0, 0]
    for word in sentence.get_words():
        root = processor(word)
        if isinstance(root, list):
            if any([r in pos_clues for r in root]):
                counts[0] += 1
            if any([r in neg_clues for r in root]):
                counts[1] += 1
        else:
            if root in pos_clues:
                counts[0] += 1
            if root in neg_clues:
                counts[1] += 1
    
    return np.array(counts)        

    
def pair_distance_index(pair):
    return np.array([abs(pair[0].get_index() - pair[1].get_index())])

def pair_init_distances(pair):
    return np.array([abs(pair[0].get_position() - pair[1].get_position())])

def sentence_length(sentence):
    count = len(sentence.get_words())
    return np.array([count])

def govern_relations_exist(pair, sentence):
    rels = [0, 0] # Aspect governs clue, clue governs aspect
    index_pair = pair[0].get_index(), pair[1].get_index()
    
    for dependency in sentence.get_dependencies():
        (a, b) = dependency.get_governor_index(), dependency.get_dependent_index()
        if (a,b) == index_pair:
            rels[0] = 1
        elif (b, a) == index_pair:
            rels[1] = 1
    
    return np.array(rels)

def match(pair, f):
    options_a = f(pair[0])
    options_b = f(pair[1])
    
    for a_g in options_a:
        for b_g in options_b:
            if a_g == b_g:
                return np.array([1.0])
    return np.array([0.0])

def POS_vector(word):
    tags = "ACIMNPQRSVYZ" # POS tags in the tag set
    one_hot = np.zeros(len(tags))
    
    # str.find returns -1 for unknown or missing tags (str.index would raise a ValueError)
    pos = word.get_POS_tag()
    index = tags.find(pos) if pos else -1
    
    if index != -1:
        one_hot[index] = 1
        
    return one_hot

def extract_plurality(word):
    pluralities = set()
    
    msds = word.get_MSDs()
    for msd in msds:
        if msd[0] == 'A':
            pluralities.add(msd[4])
        elif msd[0] == 'N':
            pluralities.add(msd[3])
        elif msd[0] == 'V':
            pluralities.add(msd[-1]) # verb number is the last character of the MSD
            
    return list(pluralities)

def extract_genders(word):
    genders = set()
    
    msds = word.get_MSDs()
    for msd in msds:
        if msd[0] == 'A':
            genders.add(msd[3])
        elif msd[0] == 'N':
            genders.add(msd[2])
            
    return list(genders)

def negation_present(word, sentence, k = 3):
    negations = [u'ne', u'nije', u'nimalo', u'nipošto', u'nisam', u'nisu', u'nismo', u'nemojte', u'nikako', u'neće']
    word_index = word.get_index()
    negation_present = 0
    
    words = sentence.get_words()
    N = len(words)
    
    start = max(0, word_index - k)
    end   = min(N - 1, word_index + k)
    
    while start <= end:
        if start == word_index:
            start += 1
            continue
        
        if words[start].get_word().lower() in negations:
            negation_present = 1
            break # Just looking for presence, not count
            
        start += 1
    
    return np.array([negation_present])

def get_BOW(document):
    return tf_idf_vector(document.get_filename(), all_tfs, all_idfs, all_words)

def get_token_count(document):
    return np.array([sum([len(s.get_words()) for s in document.get_sentences()])])

def get_uppercase_percentage(document):
    """Computes the percentage of uppercase letters in document"""
    text = document.get_text()
    upper_length = float(len([c for c in text if c.isupper()]))
    length = float(len(text))
    
    return np.array([upper_length / length])

def count_substrings(document, substrings):
    count = 0
    text = document.get_text().lower()
    
    for substring in substrings:
        count += text.count(substring.lower())
        
    return np.array([count])

def count_good_smileys(document):
    """Naive good emoticon count"""
    return count_substrings(document, [':)', ':D', ': )', ':-)', ':-D'])

def count_bad_smileys(document):
    """Naive bad emoticon count"""
    return count_substrings(document, [':(', ':-(', ': ('])

def count_exclamations(document):
    return count_substrings(document, ['!'])

def count_questionmarks(document):
    return count_substrings(document, ['?'])

def aspect_bow(document):
    my_aspects = aspects_in(document)
    return tf_idf_vector(document.get_filename(), all_tfs, all_idfs, aspects, my_aspects)

def clue_bow(document):
    clues = clues_in(document)
    return tf_idf_vector(document.get_filename(), all_tfs, all_idfs, all_clues, clues)

def count_aspects(document):
    return len(aspects_in(document, processor=TaggedWord.get_lemma))

def count_pos_clues(document):
    return len(clues_in(document, processor=TaggedWord.get_lemma, clues=pos_clues))

def count_neg_clues(document):
    return len(clues_in(document, processor=TaggedWord.get_lemma, clues=neg_clues))

def count_plus(document):
    return count_substrings(document, ['+', 'plus'])

def count_minus(document):
    return count_substrings(document, ['-', 'minus'])

def extract_regression_features(src_dir, dest_file):
    docs = files_in(src_dir)
    fp = codecs.open(dest_file, 'a', 'utf-8')
    
    all_features = []

    
    for doc in docs:
        document = read_document(join(src_dir, doc))
        features = []
        
        features.append(get_BOW(document))
        features.append(np.array([document.get_length()]))
        features.append(get_token_count(document))
        # Does nothing
        #features.append(get_uppercase_percentage(document)) Useless
        
        # Symbol features (not very effective)
        #features.append(count_good_smileys(document))
        #features.append(count_bad_smileys(document))
        #features.append(count_plus(document))
        #features.append(count_minus(document))
        
        # Do nothing
        #features.append(count_exclamations(document))
        #features.append(count_questionmarks(document))
        
        features.append(count_aspects(document))
        #features.append(count_pos_clues(document))
        #features.append(count_neg_clues(document))
        features.append(aspect_bow(document))
        #features.append(clue_bow(document))
        
        
        features = np.hstack(features)
        all_features.append(features)
        
    np.savetxt(fp, np.array(all_features))
    
    fp.close()
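
A sketch of what the classification feature extractor produces for a single review; the file name is illustrative.


In [ ]:
# Hypothetical: feature matrix for one document, one row per candidate aspect-clue pair.
features = extract_document_features('CropinionDataset/reviews_new/train2/review.xml',  # illustrative
                                      aspects, pos_clues, neg_clues)
print features.shape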

In [195]:
from os import remove
from os.path import exists

def store_feature_vectors():
    """
    Calculate the feature vectors for the entire train and test sets and store the resulting numpy vectors
    into files for easy access. Should be run every time the way features are calculated has changed.
    """
    reg_name = 'regression+A.txt'
    #Remove old ones
    #if(exists('features/train/classification.txt')): remove('features/train/classification.txt')
    #if(exists('features/test/classification.txt')): remove('features/test/classification.txt')
        
    if(exists('features/train/'+reg_name)): remove('features/train/'+reg_name)
    if(exists('features/test/'+reg_name)): remove('features/test/'+reg_name)
        
    #extract_set_features('CropinionDataset/reviews_new/train2', 'features/train/classification.txt', aspects, pos_clues, neg_clues)
    #extract_set_features('CropinionDataset/reviews_new/test2', 'features/test/classification.txt', aspects, pos_clues, neg_clues)
    
    
    extract_regression_features('CropinionDataset/reviews_new/Train', 'features/train/'+reg_name)
    extract_regression_features('CropinionDataset/reviews_new/Test', 'features/test/'+reg_name)

# Prepare both features and true annotations
store_feature_vectors()
#annotate_sets()

Training - Classification

A set of grid searches is performed, using 10-fold cross-validation to locate the parameters that maximise precision, recall and F1-score. The code lives in grid.py; a sketch of an equivalent search is given after the parameter lists below.

Best parameters when using stems are:

  • kernel = RBF
  • C = 1000
  • Gamma = 0.001

While the best for lemmas are:

  • kernel = RBF
  • C = 50
  • Gamma = 0.001

Best for lemmas and molex lemmas combined:

  • kernel = RBF
  • C = 500
  • Gamma = 0.0001

Best for all possible roots (lemmas, stems and molex lemmas) combined (the one we're using):

  • kernel = RBF
  • C = 750
  • Gamma = 0.001

Also with BoW for words in between:

  • kernel = RBF
  • C = 100
  • Gamma = 0.0005
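
grid.py itself is not shown in this notebook. As a rough equivalent, assuming scikit-learn's GridSearchCV (imported from sklearn.model_selection in recent versions, sklearn.grid_search in older ones), a search over the values reported above would look roughly like this:


In [ ]:
# Sketch of a 10-fold grid search over the RBF-SVM hyperparameters (not the actual grid.py).
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in older versions
from sklearn.svm import SVC
from sklearn.preprocessing import scale

X = scale(np.loadtxt('features/train/classification.txt'))
y = np.loadtxt('features/train/y.txt')[:, 0]

param_grid = {'C': [50, 100, 500, 750, 1000], 'gamma': [0.0001, 0.0005, 0.001]}
search = GridSearchCV(SVC(kernel='rbf'), param_grid, scoring='f1', cv=10)
search.fit(X, y)

print search.best_params_, search.best_score_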

Testing - Classification

This cell performs testing using the developed features and the parameters found by cross-validation. The micro and macro scores (P, R and F1) are computed and displayed. All data is scaled.

These are the initial results. Improvements using additional features, as well as an analysis of which features are most important, will be attempted.

The recorded results are:

Scores with stems


P = 0.88

R = 0.61

F1 = 0.72

The recall is obviously terrible (stems failed to locate a lot of the pairs). The same experiment must be tried with lemmas, which may improve recall performance greatly.

Scores with lemmas


P = 0.99

R = 0.33

F1 = 0.50

Precision is almost perfect, but recall collapsed; this result was unexpected.

Scores with lemmas AND molex lemmas


P = 0.99

R = 0.47

F1 = 0.64

I have no idea what happened ~ Luka

Scores with all of those combined - This is what we are using

P = 0.90

R = 0.75

F1 = 0.82

All Roots + BoW

P = 0.90

R = 0.73

F1 = 0.81

The results achieved with a combination of all three are almost as good as those in the initial paper. Now we only have to improve them by introducing additional features. However, adding BoW information reduced the recall and F1. We will not use it in the report.


In [103]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import scale

X_train = np.loadtxt('features/train/classification.txt')
y_train = np.loadtxt('features/train/y.txt')[:,0]

X_test = np.loadtxt('features/test/classification.txt')
y_test = np.loadtxt('features/test/y.txt')[:,0]

X_train = scale(X_train)
X_test = scale(X_test)

N = X_train.shape[0]

model = SVC(C=750, gamma=0.001, kernel='rbf')
model.fit(X_train, y_train)

y_guess = model.predict(X_test)

# ------------- PADDING -------------- #
# Some values were probably not located. We must add them to the count.
# We do so by padding our vector with 0s and the true vector with 1s

cnt = get_dataset_unlocated_pair_count(
      'CropinionDataset/reviews_new/test2', 
      'CropinionDataset/annotated_pairs/all', 
      aspects, 
      all_clues)

print "Missed all of", cnt, "pairs completely while I at at least tagged", y_guess.shape[0]

y_guess = np.lib.pad(y_guess, (cnt,), 'constant', constant_values=(0,))
y_test = np.lib.pad(y_test, (cnt,), 'constant', constant_values=(1,))

print "P  = {0:.3f}".format(precision_score(y_test, y_guess))
print "R  = {0:.3f}".format(recall_score(y_test, y_guess))
print "F1 = {0:.3f}".format(f1_score(y_test, y_guess))


Missed 23 pairs completely; tagged 539 candidate pairs
P  = 0.898
R  = 0.748
F1 = 0.817

Score Prediction

Classification - Positive/Negative

Predicting whether the scores are positive or negative, where scores of 4 or more are positive and scores of 2.5 or less are negative. Scores in between are ambiguous and are not considered here.

A grid search (c_grid.py) is used to find the optimal parameters, with 5-fold cross-validation.

Basic = "BoW" + Length

The best found parameters when using only BoW + document length are:

  • file = regression.txt
  • kernel = rbf
  • C = 10,000
  • gamma = 0.5

The scores are:

Positive

F1 = 0.84

Negative

F1 = 0.11

Average

F1 = 0.48

Basic + Symbols (Emoticons, +, -)

  • file = regression+E.txt
  • kernel = rbf
  • C = 10,000
  • gamma = 0.1

Positive

F1 = 0.85

Negative

F1 = 0.36

Average

F1 = 0.61

Basic + Aspects (BoW + count)

  • kernel = rbf
  • C = 5,000
  • gamma = 0.1

F1 (positive / negative / average) = 0.84 / 0.11 / 0.48

Basic + Clues (BoW + positive count + negative count)

  • kernel = rbf
  • C = 100
  • gamma = 0.0001

F1 (positive / negative / average) = 0.90 / 0.65 / 0.77

B + A + C + E (Basic + Aspects + Clues + Symbols)

  • kernel = rbf
  • C = 750
  • gamma = 0.00001

F1 (positive / negative / average) = 0.91 / 0.74 / 0.82


In [240]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import scale

X_train = np.loadtxt('features/train/regression+E.txt')
y_train = np.loadtxt('features/train/z.txt')[:,1]

X_test = np.loadtxt('features/test/regression+E.txt')
y_test = np.loadtxt('features/test/z.txt')[:,1]

X_train = scale(X_train)
X_test = scale(X_test)

model = SVC(C=10000, gamma=0.1, kernel='rbf')
model.fit(X_train, y_train)

y_guess = model.predict(X_test)

p_p = precision_score(y_test, y_guess, pos_label=1)
r_p = recall_score(y_test, y_guess, pos_label=1)
f1_p = f1_score(y_test, y_guess, pos_label=1)

p_n = precision_score(y_test, y_guess, pos_label=0)
r_n = recall_score(y_test, y_guess, pos_label=0)
f1_n = f1_score(y_test, y_guess, pos_label=0)

print "POSITIVE\n------------------------"
print "P  = {0:.3f}".format(p_p),
print "R  = {0:.3f}".format(r_p),
print "F1 = {0:.3f}".format(f1_p)
print "NEGATIVE\n------------------------"
print "P  = {0:.3f}".format(p_n),
print "R  = {0:.3f}".format(r_n),
print "F1 = {0:.3f}".format(f1_n)
print "AVERAGE\n------------------------"
print "P  = {0:.3f}".format((p_p + p_n) / 2),
print "R  = {0:.3f}".format((r_p + r_n) / 2),
print "F1 = {0:.3f}".format((f1_p + f1_n) / 2)


POSITIVE
------------------------
P  = 0.773 R  = 0.952 F1 = 0.854
NEGATIVE
------------------------
P  = 0.660 R  = 0.250 F1 = 0.363
AVERAGE
------------------------
P  = 0.717 R  = 0.601 F1 = 0.608

Regression

Trying to learn the mapping from review to score.

We are using Support Vector Regression (SVR) with parameters tuned via grid search.

BOW + Length = Basic

Best found parameters using only BoW and length are:

  • kernel = 'rbf'
  • C = 50
  • gamma = 0.0001

The corresponding scores are:

MAE = 1.513

r = 0.24 (Pearson)

Basic + Symbols

Best params:

  • kernel = 'rbf'
  • C = 100
  • gamma = 0.0005

Scores:

MAE = 1.316

r = 0.37

Basic + Aspects

Best params:

  • kernel = 'rbf'
  • C = 50
  • gamma = 0.0001

Scores:

MAE = 1.381

r = 0.25

Basic + Clues

Best params:

  • kernel = 'rbf'
  • C = 100
  • gamma = 0.005

Scores:

MAE = 1.00

r = 0.68

B + A + C + E (Basic + Aspects + Clues + Symbols)

Best params:

  • kernel = 'rbf'
  • C = 10
  • gamma = 0.05

Scores:

MAE = 0.98

r = 0.70


In [246]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
from sklearn.preprocessing import scale


def pearson(y_true, y_pred):
    ret_score = pearsonr(y_true, y_pred)[0]
    return ret_score if not np.isnan(ret_score) else 0.0

X_train = np.loadtxt('features/train/regression+A+C+E.txt')
y_train = np.loadtxt('features/train/z.txt')[:,0]

X_test = np.loadtxt('features/test/regression+A+C+E.txt')
y_test = np.loadtxt('features/test/z.txt')[:,0]

X_train = scale(X_train)
X_test = scale(X_test)

N = X_train.shape[0]

model = SVR(C=10, kernel='rbf', gamma=0.05)
model.fit(X_train, y_train)

y_guess = model.predict(X_test)

print 'MAE = {0:.3f}'.format(mean_absolute_error(y_test, y_guess))
print 'r = {0:.3f}'.format(pearson(y_test, y_guess))


MAE = 0.972
r = 0.706

Helpers

One-off utility code, e.g. for dumping all words for the stemmer/lemmatiser and for writing the resulting stems/lemmas back into the XML files.

Only run this if the data needs to be regenerated.


In [25]:
def store_all_words(directory, word_file):
    wf = codecs.open(word_file, 'w', 'utf-8')
    
    files = files_in(directory)
    
    i = 1
    N = len(files)
    
    for document in files:
        if i % 50 == 0:
            print '{}/{}'.format(i, N)
        i += 1
        
        tree = ET.parse(join(directory, document))
        root = tree.getroot()
        words = root.iter('Word')
        
        for word in words:
            wf.write(word.text + '\n')
            
    wf.close()
    
def write_new_stems(lemma_file, src_dir, dest_dir):
    docs = files_in(src_dir)
    with codecs.open(lemma_file, 'r', 'utf-8') as lemmas_list:
        i = 1
        N = len(docs)
        
        for doc in docs:
            if i % 50 == 0:
                print '{}/{}'.format(i, N)
                
            i += 1
            
            tree = ET.parse(join(src_dir, doc))
            root = tree.getroot()
            
            for word in root.iter('BasicStem'): #'Lemma'
                word.text = lemmas_list.readline().split()[1].strip()
            
            tree.write(join(dest_dir, doc), encoding='utf-8')

def print_comment(comment):
    text = []

    for sentence in comment.get_sentences():
        for word in sentence.get_words():
            text.append(word.get_word())

    print '\n'.join(text)
    
def print_dependencies(comment):
    dependencies = []
    
    for sentence in comment.get_sentences():
        subdep = []
        for dependency in sentence.get_dependencies():
            subdep.append(str(dependency))
        dependencies.append(','.join(subdep))
        
    print '\n'.join(dependencies)
    
#store_all_words('CropinionDataset/reviews_new/train', 'train-words.txt')
#write_new_stems('test-lemmas.txt', 'CropinionDataset/reviews_new/test', 'CropinionDataset/reviews_new/test2')
