In [39]:
from keras.models import Sequential
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.recurrent import LSTM
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer


from ttp import ttp
import re

In [2]:
train_file = "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"
test_file = "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
df_train = pd.read_csv(train_file, header=None, sep="\t")
df_test = pd.read_csv(test_file, header= None, sep="\t")
NULL_TEXT = "Not Available"
df_text_train = df_train[df_train[2] != NULL_TEXT][[1,2]]
df_text_test = df_test[df_test[2] != NULL_TEXT][[1,2]]

In [3]:
df_text_train.head()


Out[3]:
1 2
0 negative dear @Microsoft the newOoffice for Mac is grea...
1 negative @Microsoft how about you make a system that do...
4 neutral If I make a game as a #windows10 Universal App...
5 positive Microsoft, I may not prefer your gaming branch...
6 negative @MikeWolf1980 @Microsoft I will be downgrading...

In [4]:
p = ttp.Parser(include_spans=True)

In [5]:
USER_TAG = "_USER"
HASH_TAG = "_HASHTAG"
URL_TAG = "_URL"
LISTS_TAG = "_LIST"


def preprocess(item):
    txt = item[2]
    cols = ["label", "text", "norm_text", "c_url", "c_user", "c_tag", "c_list", "is_reply","t_len"]
    result = p.parse(txt)
    counts = [len(result.urls), len(result.users), len(result.tags), len(result.lists), result.reply is not None, len(txt)]
    processed_txt = ""
    last = 0
    all_results = map(lambda x: (x[0], USER_TAG, x[1]), result.users) +\
        map(lambda x: (x[0], HASH_TAG, x[1]), result.tags) + map(lambda x: (x[0], URL_TAG, x[1]), result.urls) +\
        map(lambda x: (x[0], LISTS_TAG, x[1]), result.lists)
    all_results = sorted(all_results, key=lambda x: x[2][1])
    for k, t, v in all_results:
        processed_txt += txt[last:v[0]] + t
        last = v[1]
    #return all_results, counts, processed_txt
    return pd.Series([item[1], item[2], processed_txt] + counts, index=cols)

In [6]:
df_text_train.head().apply(preprocess, axis=1)


Out[6]:
label text norm_text c_url c_user c_tag c_list is_reply t_len
0 negative dear @Microsoft the newOoffice for Mac is grea... dear _USER 0 1 0 0 False 83
1 negative @Microsoft how about you make a system that do... _USER 0 1 0 0 True 136
4 neutral If I make a game as a #windows10 Universal App... If I make a game as a _HASHTAG Universal App. ... 0 2 2 0 False 137
5 positive Microsoft, I may not prefer your gaming branch... Microsoft, I may not prefer your gaming branch... 0 1 1 0 False 128
6 negative @MikeWolf1980 @Microsoft I will be downgrading... _USER _USER I will be downgrading and let #Win... 0 2 2 0 True 129

In [8]:
df_text_train_proc = df_text_train.apply(preprocess, axis=1)
df_text_test_proc = df_text_test.apply(preprocess, axis=1)


print df_text_train_proc.shape, df_text_test_proc.shape


 (5366, 9) (1798, 9)

In [92]:
cols = df_text_train_proc.columns.values

class ColumnFeatures(TransformerMixin):
    def __init__(self, colname, to_df=True):
        print "Initialized extractor for column %s" % colname
        self.colname = colname
        self.to_df = to_df
    def get_feature_names(self):
        return [self.colname]
    def transform(self, X, **transform_params):
        print "Extracting column [%s], to_df = %s" % (self.colname, self.to_df)
        if self.to_df:
            return pd.DataFrame(X[self.colname])
        return X[self.colname]
    def fit(self, X, y=None, **fit_params):
        return self



class DenseTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "New shape: ",  X.todense().shape
        return X.todense()
    def fit(self, X, y=None, **fit_params):
        return self
    

class IdentityTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "X processed by parent. Output shape: %s" % (X.shape, )
        return X
    def fit(self, X, y=None, **fit_params):
        return self
    
class MultiColumnExtractor(TransformerMixin):
    def __init__(self, colnames, to_series=False):
        print "Initialized extractor for column %s" % colnames
        self.colnames = colnames
        self.to_series = to_series
    def get_feature_names(self):
        return self.colnames
    def transform(self, X, **transform_params):
        print "Extracting columns [%s] from X with shape: %s" % (self.colnames,X.shape)
        if self.to_series:
            if len(self.colnames) != 1:
                raise Exception("When to_series is True, len(colnames) should be 1")
            return X[self.colnames[0]]
        return pd.DataFrame(X[self.colnames])
    def fit(self, X, y=None, **fit_params):
        return self

    
class DictScoreExtractor(TransformerMixin):
    def __init__(self, colname, senti_dict, pattern=r'[^a-zA-Z]+'):
        print "Initialized extractor for column %s with pattern %s" % (colname, pattern)
        self.colname = colname
        self.pattern = re.compile(pattern)
        self.feature_names = ["tot_pos", "tot_neg"]
        self.senti_dict = senti_dict
    def get_feature_names(self):
        return self.feature_names
    def _extract_features(self,x):
        tokens = re.split(self.pattern, x.strip())
        pos_score, neg_score = 0.0, 0.0
        for t in tokens:
            p, n = self.senti_dict.get(t, [0.,0.])
            pos_score += p
            neg_score += n
        return pd.Series([pos_score, neg_score])
    def transform(self, X, **transform_params):
        print "Extracting columns [%s] from X with shape: %s" % (self.colname,X.shape)
        X_sc = X[self.colname].apply(self._extract_features)
        return pd.DataFrame(X_sc)
    def fit(self, X, y=None, **fit_params):
        return self

In [93]:
print "Dict lookup", swn_dict.get("")

d = DictScoreExtractor("norm_text", swn_dict)


d.fit_transform(df_text_test_proc.head())


Dict lookup [ 0.  0.]
Initialized extractor for column norm_text with pattern [^a-zA-Z]+
Extracting columns [norm_text] from X with shape: (5, 9)
Out[93]:
0 1
0 0.000000 0.00000
1 0.733333 0.81250
2 0.364583 0.21875
4 0.000000 0.00000
5 0.000000 0.00000

In [98]:
pipeline = Pipeline([
        ("features", Pipeline([
                    ("union_feature", FeatureUnion([
                                #("text", Pipeline([
                                #            ("norm_txt", MultiColumnExtractor(["norm_text"], to_series=True)),
                                #            ("count_feature", CountVectorizer(min_df = 3, stop_words="english",\
                                #                                              ngram_range=(1,1), binary=True)),
                                #        ])),
                                ("dict_score", Pipeline([
                                            ("dict_scores", DictScoreExtractor("norm_text", swn_dict)),
                                            ("std_d", StandardScaler()),
                                        ])),
                                ("meta_feature", Pipeline([
                                            ("meta_f", MultiColumnExtractor(cols[3:])),
                                            ("std_f", StandardScaler()),
                                        ]))
                            ]))
                ]))
    ])


Initialized extractor for column norm_text with pattern [^a-zA-Z]+
Initialized extractor for column ['c_url' 'c_user' 'c_tag' 'c_list' 'is_reply' 't_len']

In [99]:
df_all = pd.concat((df_text_test_proc, df_text_train_proc))
X_all = pipeline.fit_transform(df_all)

X_train = X_all[:df_text_train_proc.shape[0]]
y_train = df_text_train_proc["label"].values
X_test = X_all[df_text_train_proc.shape[0]:]
y_test = df_text_test_proc["label"].values

print X_train.shape, y_train.shape, X_test.shape, y_test.shape


Extracting columns [norm_text] from X with shape: (7164, 9)
Extracting columns [['c_url' 'c_user' 'c_tag' 'c_list' 'is_reply' 't_len']] from X with shape: (7164, 9)
(5366, 8) (5366,) (1798, 8) (1798,)

In [102]:
svc = LinearSVC(multi_class="crammer_singer", C=0.5)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_train)
print classification_report(y_pred, y_train)
print precision_recall_fscore_support(y_pred, y_train, average="micro")
print confusion_matrix(y_pred, y_train)

y_pred = svc.predict(X_test)
print classification_report(y_pred, y_test)
print precision_recall_fscore_support(y_pred, y_test, average="micro")
print confusion_matrix(y_pred, y_test)


             precision    recall  f1-score   support

   negative       0.00      0.14      0.01        14
    neutral       0.00      0.20      0.00         5
   positive       1.00      0.51      0.68      5347

avg / total       0.99      0.51      0.68      5366

(0.51304509877003357, 0.51304509877003357, 0.51304509877003357, None)
[[   2    8    4]
 [   1    1    3]
 [ 762 1835 2750]]
             precision    recall  f1-score   support

   negative       0.00      0.33      0.01         3
    neutral       0.00      0.00      0.00         1
   positive       1.00      0.42      0.60      1794

avg / total       1.00      0.42      0.59      1798

(0.42380422691879865, 0.42380422691879865, 0.42380422691879871, None)
[[  1   1   1]
 [  0   0   1]
 [352 681 761]]

In [13]:
df_sentiwn = pd.read_csv("data/SentiWordNet_3.0.0_20130122.txt", sep="\t", skiprows=26, usecols=[0,1,2,3,4])

In [14]:
df_sentiwn.head()


Out[14]:
# POS ID PosScore NegScore SynsetTerms
0 a 1740 0.125 0.00 able#1
1 a 2098 0.000 0.75 unable#1
2 a 2312 0.000 0.00 dorsal#2 abaxial#1
3 a 2527 0.000 0.00 ventral#2 adaxial#1
4 a 2730 0.000 0.00 acroscopic#1

In [15]:
df_sentiwn.shape


Out[15]:
(117660, 5)

In [26]:
senti_wn_scores = dict()
non_unigram_c = 0
for row in df_sentiwn.iterrows():
    terms = row[1]["SynsetTerms"].split()
    pos_score, neg_score = row[1][["PosScore", "NegScore"]]
    #print terms, pos_score, neg_score
    for t in terms:
        t = t.split("#")[0]
        if len(t.split("_")) > 1:
            non_unigram_c += 1
            continue # Consider only unigram words
        if t not in senti_wn_scores:
            senti_wn_scores[t] = [0,0,0,0,0]
        senti_wn_scores[t][0] += pos_score # Total pos score
        senti_wn_scores[t][1] += neg_score # Total neg score
        senti_wn_scores[t][2] += (pos_score > 0) # Total pos counts
        senti_wn_scores[t][3] += (neg_score > 0) # Total neg counts
        senti_wn_scores[t][4] += 1 # Total count
print "Created dictionary with %s unigrams. Skipped %s non unigrams" % (len(senti_wn_scores), non_unigram_c)


Created dictionary with 83119 unigrams. Skipped 68082 non unigrams

In [27]:
df_swn = pd.DataFrame(senti_wn_scores.values(), index=senti_wn_scores.keys(), columns=["pos_score", "neg_score", "pos_c", "neg_c", "tot_c"])

In [89]:
df_swn[["avg_pos_score", "avg_neg_score"]] = df_swn[["pos_score", "neg_score"]].div(df_swn["tot_c"], axis=0).fillna(0)
df_swn.head(20)


Out[89]:
pos_score neg_score pos_c neg_c tot_c avg_pos_score avg_neg_score
fawn 0.250 0.125 2 1 5 0.050 0.025
unattackable 0.125 0.000 1 0 1 0.125 0.000
homomorphism 0.000 0.000 0 0 1 0.000 0.000
underneath 0.000 0.000 0 0 2 0.000 0.000
melosa 0.000 0.000 0 0 1 0.000 0.000
nunnery 0.000 0.000 0 0 1 0.000 0.000
deferment 0.000 0.000 0 0 1 0.000 0.000
chthonic 0.000 0.000 0 0 1 0.000 0.000
utnapishtim 0.375 0.125 1 1 1 0.375 0.125
bioko 0.000 0.000 0 0 1 0.000 0.000
circuitry 0.000 0.000 0 0 1 0.000 0.000
clotted 0.000 0.000 0 0 1 0.000 0.000
nonleaded 0.250 0.000 1 0 1 0.250 0.000
acetylate 0.000 0.000 0 0 2 0.000 0.000
hanging 0.000 0.000 0 0 3 0.000 0.000
woody 0.000 0.000 0 0 3 0.000 0.000
rhinitis 0.000 0.500 0 1 1 0.000 0.500
hastily 0.125 0.000 1 0 1 0.125 0.000
self-reliant 0.250 0.000 1 0 1 0.250 0.000
localized 0.000 0.000 0 0 2 0.000 0.000

In [31]:
f = pipeline.named_steps["features"].named_steps["union_feature"]

In [34]:
cvec = f.transformer_list[0][1].named_steps["count_feature"]

In [35]:
analyzer = cvec.build_analyzer()

In [36]:
analyzer("This is a really good book")


Out[36]:
[u'really', u'good', u'book', u'really good', u'good book']

In [38]:
cvec


Out[38]:
CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [90]:
swn_dict = dict(zip(df_swn.index.values, df_swn[["avg_pos_score", "avg_neg_score"]].values))

In [62]:
swn_dict["woody"]


Out[62]:
array([ 0.,  0.])

In [ ]: