notebook.community

Edit and run



In [39]:

    
from keras.models import Sequential
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.recurrent import LSTM
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer


from ttp import ttp
import re



In [2]:

    
train_file = "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"
test_file = "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
df_train = pd.read_csv(train_file, header=None, sep="\t")
df_test = pd.read_csv(test_file, header= None, sep="\t")
NULL_TEXT = "Not Available"
df_text_train = df_train[df_train[2] != NULL_TEXT][[1,2]]
df_text_test = df_test[df_test[2] != NULL_TEXT][[1,2]]



In [3]:

    
df_text_train.head()









    Out[3]:






  
    
      
      1
      2
    
  
  
    
      0
      negative
      dear @Microsoft the newOoffice for Mac is grea...
    
    
      1
      negative
      @Microsoft how about you make a system that do...
    
    
      4
      neutral
      If I make a game as a #windows10 Universal App...
    
    
      5
      positive
      Microsoft, I may not prefer your gaming branch...
    
    
      6
      negative
      @MikeWolf1980 @Microsoft I will be downgrading...



In [4]:

    
p = ttp.Parser(include_spans=True)



In [5]:

    
USER_TAG = "_USER"
HASH_TAG = "_HASHTAG"
URL_TAG = "_URL"
LISTS_TAG = "_LIST"


def preprocess(item):
    txt = item[2]
    cols = ["label", "text", "norm_text", "c_url", "c_user", "c_tag", "c_list", "is_reply","t_len"]
    result = p.parse(txt)
    counts = [len(result.urls), len(result.users), len(result.tags), len(result.lists), result.reply is not None, len(txt)]
    processed_txt = ""
    last = 0
    all_results = map(lambda x: (x[0], USER_TAG, x[1]), result.users) +\
        map(lambda x: (x[0], HASH_TAG, x[1]), result.tags) + map(lambda x: (x[0], URL_TAG, x[1]), result.urls) +\
        map(lambda x: (x[0], LISTS_TAG, x[1]), result.lists)
    all_results = sorted(all_results, key=lambda x: x[2][1])
    for k, t, v in all_results:
        processed_txt += txt[last:v[0]] + t
        last = v[1]
    #return all_results, counts, processed_txt
    return pd.Series([item[1], item[2], processed_txt] + counts, index=cols)



In [6]:

    
df_text_train.head().apply(preprocess, axis=1)









    Out[6]:






  
    
      
      label
      text
      norm_text
      c_url
      c_user
      c_tag
      c_list
      is_reply
      t_len
    
  
  
    
      0
      negative
      dear @Microsoft the newOoffice for Mac is grea...
      dear _USER
      0
      1
      0
      0
      False
      83
    
    
      1
      negative
      @Microsoft how about you make a system that do...
      _USER
      0
      1
      0
      0
      True
      136
    
    
      4
      neutral
      If I make a game as a #windows10 Universal App...
      If I make a game as a _HASHTAG Universal App. ...
      0
      2
      2
      0
      False
      137
    
    
      5
      positive
      Microsoft, I may not prefer your gaming branch...
      Microsoft, I may not prefer your gaming branch...
      0
      1
      1
      0
      False
      128
    
    
      6
      negative
      @MikeWolf1980 @Microsoft I will be downgrading...
      _USER _USER I will be downgrading and let #Win...
      0
      2
      2
      0
      True
      129



In [8]:

    
df_text_train_proc = df_text_train.apply(preprocess, axis=1)
df_text_test_proc = df_text_test.apply(preprocess, axis=1)


print df_text_train_proc.shape, df_text_test_proc.shape









    



 (5366, 9) (1798, 9)



In [92]:

    
cols = df_text_train_proc.columns.values

class ColumnFeatures(TransformerMixin):
    def __init__(self, colname, to_df=True):
        print "Initialized extractor for column %s" % colname
        self.colname = colname
        self.to_df = to_df
    def get_feature_names(self):
        return [self.colname]
    def transform(self, X, **transform_params):
        print "Extracting column [%s], to_df = %s" % (self.colname, self.to_df)
        if self.to_df:
            return pd.DataFrame(X[self.colname])
        return X[self.colname]
    def fit(self, X, y=None, **fit_params):
        return self



class DenseTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "New shape: ",  X.todense().shape
        return X.todense()
    def fit(self, X, y=None, **fit_params):
        return self
    

class IdentityTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "X processed by parent. Output shape: %s" % (X.shape, )
        return X
    def fit(self, X, y=None, **fit_params):
        return self
    
class MultiColumnExtractor(TransformerMixin):
    def __init__(self, colnames, to_series=False):
        print "Initialized extractor for column %s" % colnames
        self.colnames = colnames
        self.to_series = to_series
    def get_feature_names(self):
        return self.colnames
    def transform(self, X, **transform_params):
        print "Extracting columns [%s] from X with shape: %s" % (self.colnames,X.shape)
        if self.to_series:
            if len(self.colnames) != 1:
                raise Exception("When to_series is True, len(colnames) should be 1")
            return X[self.colnames[0]]
        return pd.DataFrame(X[self.colnames])
    def fit(self, X, y=None, **fit_params):
        return self

    
class DictScoreExtractor(TransformerMixin):
    def __init__(self, colname, senti_dict, pattern=r'[^a-zA-Z]+'):
        print "Initialized extractor for column %s with pattern %s" % (colname, pattern)
        self.colname = colname
        self.pattern = re.compile(pattern)
        self.feature_names = ["tot_pos", "tot_neg"]
        self.senti_dict = senti_dict
    def get_feature_names(self):
        return self.feature_names
    def _extract_features(self,x):
        tokens = re.split(self.pattern, x.strip())
        pos_score, neg_score = 0.0, 0.0
        for t in tokens:
            p, n = self.senti_dict.get(t, [0.,0.])
            pos_score += p
            neg_score += n
        return pd.Series([pos_score, neg_score])
    def transform(self, X, **transform_params):
        print "Extracting columns [%s] from X with shape: %s" % (self.colname,X.shape)
        X_sc = X[self.colname].apply(self._extract_features)
        return pd.DataFrame(X_sc)
    def fit(self, X, y=None, **fit_params):
        return self



In [93]:

    
print "Dict lookup", swn_dict.get("")

d = DictScoreExtractor("norm_text", swn_dict)


d.fit_transform(df_text_test_proc.head())









    



Dict lookup [ 0.  0.]
Initialized extractor for column norm_text with pattern [^a-zA-Z]+
Extracting columns [norm_text] from X with shape: (5, 9)






    Out[93]:






  
    
      
      0
      1
    
  
  
    
      0
      0.000000
      0.00000
    
    
      1
      0.733333
      0.81250
    
    
      2
      0.364583
      0.21875
    
    
      4
      0.000000
      0.00000
    
    
      5
      0.000000
      0.00000



In [98]:

    
pipeline = Pipeline([
        ("features", Pipeline([
                    ("union_feature", FeatureUnion([
                                #("text", Pipeline([
                                #            ("norm_txt", MultiColumnExtractor(["norm_text"], to_series=True)),
                                #            ("count_feature", CountVectorizer(min_df = 3, stop_words="english",\
                                #                                              ngram_range=(1,1), binary=True)),
                                #        ])),
                                ("dict_score", Pipeline([
                                            ("dict_scores", DictScoreExtractor("norm_text", swn_dict)),
                                            ("std_d", StandardScaler()),
                                        ])),
                                ("meta_feature", Pipeline([
                                            ("meta_f", MultiColumnExtractor(cols[3:])),
                                            ("std_f", StandardScaler()),
                                        ]))
                            ]))
                ]))
    ])









    



Initialized extractor for column norm_text with pattern [^a-zA-Z]+
Initialized extractor for column ['c_url' 'c_user' 'c_tag' 'c_list' 'is_reply' 't_len']



In [99]:

    
df_all = pd.concat((df_text_test_proc, df_text_train_proc))
X_all = pipeline.fit_transform(df_all)

X_train = X_all[:df_text_train_proc.shape[0]]
y_train = df_text_train_proc["label"].values
X_test = X_all[df_text_train_proc.shape[0]:]
y_test = df_text_test_proc["label"].values

print X_train.shape, y_train.shape, X_test.shape, y_test.shape









    



Extracting columns [norm_text] from X with shape: (7164, 9)
Extracting columns [['c_url' 'c_user' 'c_tag' 'c_list' 'is_reply' 't_len']] from X with shape: (7164, 9)
(5366, 8) (5366,) (1798, 8) (1798,)



In [102]:

    
svc = LinearSVC(multi_class="crammer_singer", C=0.5)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_train)
print classification_report(y_pred, y_train)
print precision_recall_fscore_support(y_pred, y_train, average="micro")
print confusion_matrix(y_pred, y_train)

y_pred = svc.predict(X_test)
print classification_report(y_pred, y_test)
print precision_recall_fscore_support(y_pred, y_test, average="micro")
print confusion_matrix(y_pred, y_test)









    



             precision    recall  f1-score   support

   negative       0.00      0.14      0.01        14
    neutral       0.00      0.20      0.00         5
   positive       1.00      0.51      0.68      5347

avg / total       0.99      0.51      0.68      5366

(0.51304509877003357, 0.51304509877003357, 0.51304509877003357, None)
[[   2    8    4]
 [   1    1    3]
 [ 762 1835 2750]]
             precision    recall  f1-score   support

   negative       0.00      0.33      0.01         3
    neutral       0.00      0.00      0.00         1
   positive       1.00      0.42      0.60      1794

avg / total       1.00      0.42      0.59      1798

(0.42380422691879865, 0.42380422691879865, 0.42380422691879871, None)
[[  1   1   1]
 [  0   0   1]
 [352 681 761]]



In [13]:

    
df_sentiwn = pd.read_csv("data/SentiWordNet_3.0.0_20130122.txt", sep="\t", skiprows=26, usecols=[0,1,2,3,4])



In [14]:

    
df_sentiwn.head()









    Out[14]:






  
    
      
      # POS
      ID
      PosScore
      NegScore
      SynsetTerms
    
  
  
    
      0
      a
      1740
      0.125
      0.00
      able#1
    
    
      1
      a
      2098
      0.000
      0.75
      unable#1
    
    
      2
      a
      2312
      0.000
      0.00
      dorsal#2 abaxial#1
    
    
      3
      a
      2527
      0.000
      0.00
      ventral#2 adaxial#1
    
    
      4
      a
      2730
      0.000
      0.00
      acroscopic#1



In [15]:

    
df_sentiwn.shape









    Out[15]:





(117660, 5)



In [26]:

    
senti_wn_scores = dict()
non_unigram_c = 0
for row in df_sentiwn.iterrows():
    terms = row[1]["SynsetTerms"].split()
    pos_score, neg_score = row[1][["PosScore", "NegScore"]]
    #print terms, pos_score, neg_score
    for t in terms:
        t = t.split("#")[0]
        if len(t.split("_")) > 1:
            non_unigram_c += 1
            continue # Consider only unigram words
        if t not in senti_wn_scores:
            senti_wn_scores[t] = [0,0,0,0,0]
        senti_wn_scores[t][0] += pos_score # Total pos score
        senti_wn_scores[t][1] += neg_score # Total neg score
        senti_wn_scores[t][2] += (pos_score > 0) # Total pos counts
        senti_wn_scores[t][3] += (neg_score > 0) # Total neg counts
        senti_wn_scores[t][4] += 1 # Total count
print "Created dictionary with %s unigrams. Skipped %s non unigrams" % (len(senti_wn_scores), non_unigram_c)









    



Created dictionary with 83119 unigrams. Skipped 68082 non unigrams



In [27]:

    
df_swn = pd.DataFrame(senti_wn_scores.values(), index=senti_wn_scores.keys(), columns=["pos_score", "neg_score", "pos_c", "neg_c", "tot_c"])



In [89]:

    
df_swn[["avg_pos_score", "avg_neg_score"]] = df_swn[["pos_score", "neg_score"]].div(df_swn["tot_c"], axis=0).fillna(0)
df_swn.head(20)









    Out[89]:






  
    
      
      pos_score
      neg_score
      pos_c
      neg_c
      tot_c
      avg_pos_score
      avg_neg_score
    
  
  
    
      fawn
      0.250
      0.125
      2
      1
      5
      0.050
      0.025
    
    
      unattackable
      0.125
      0.000
      1
      0
      1
      0.125
      0.000
    
    
      homomorphism
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      underneath
      0.000
      0.000
      0
      0
      2
      0.000
      0.000
    
    
      melosa
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      nunnery
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      deferment
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      chthonic
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      utnapishtim
      0.375
      0.125
      1
      1
      1
      0.375
      0.125
    
    
      bioko
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      circuitry
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      clotted
      0.000
      0.000
      0
      0
      1
      0.000
      0.000
    
    
      nonleaded
      0.250
      0.000
      1
      0
      1
      0.250
      0.000
    
    
      acetylate
      0.000
      0.000
      0
      0
      2
      0.000
      0.000
    
    
      hanging
      0.000
      0.000
      0
      0
      3
      0.000
      0.000
    
    
      woody
      0.000
      0.000
      0
      0
      3
      0.000
      0.000
    
    
      rhinitis
      0.000
      0.500
      0
      1
      1
      0.000
      0.500
    
    
      hastily
      0.125
      0.000
      1
      0
      1
      0.125
      0.000
    
    
      self-reliant
      0.250
      0.000
      1
      0
      1
      0.250
      0.000
    
    
      localized
      0.000
      0.000
      0
      0
      2
      0.000
      0.000



In [31]:

    
f = pipeline.named_steps["features"].named_steps["union_feature"]



In [34]:

    
cvec = f.transformer_list[0][1].named_steps["count_feature"]



In [35]:

    
analyzer = cvec.build_analyzer()



In [36]:

    
analyzer("This is a really good book")









    Out[36]:





[u'really', u'good', u'book', u'really good', u'good book']



In [38]:

    
cvec









    Out[38]:





CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)



In [90]:

    
swn_dict = dict(zip(df_swn.index.values, df_swn[["avg_pos_score", "avg_neg_score"]].values))



In [62]:

    
swn_dict["woody"]









    Out[62]:





array([ 0.,  0.])



In [ ]:

	1	2
0	negative	dear @Microsoft the newOoffice for Mac is grea...
1	negative	@Microsoft how about you make a system that do...
4	neutral	If I make a game as a #windows10 Universal App...
5	positive	Microsoft, I may not prefer your gaming branch...
6	negative	@MikeWolf1980 @Microsoft I will be downgrading...

	label	text	norm_text	c_user	c_tag	is_reply	t_len
0	negative	dear @Microsoft the newOoffice for Mac is grea...	dear _USER	1	0	False	83
1	negative	@Microsoft how about you make a system that do...	_USER	1	0	True	136
4	neutral	If I make a game as a #windows10 Universal App...	If I make a game as a _HASHTAG Universal App. ...	2	2	False	137
5	positive	Microsoft, I may not prefer your gaming branch...	Microsoft, I may not prefer your gaming branch...	1	1	False	128
6	negative	@MikeWolf1980 @Microsoft I will be downgrading...	_USER _USER I will be downgrading and let #Win...	2	2	True	129

	0	1
0	0.000000	0.00000
1	0.733333	0.81250
2	0.364583	0.21875
4	0.000000	0.00000
5	0.000000	0.00000

	# POS	ID	PosScore	NegScore	SynsetTerms
0	a	1740	0.125	0.00	able#1
1	a	2098	0.000	0.75	unable#1
2	a	2312	0.000	0.00	dorsal#2 abaxial#1
3	a	2527	0.000	0.00	ventral#2 adaxial#1
4	a	2730	0.000	0.00	acroscopic#1

	pos_score	neg_score	pos_c	neg_c	tot_c	avg_pos_score	avg_neg_score
fawn	0.250	0.125	2	1	5	0.050	0.025
unattackable	0.125	0.000	1	0	1	0.125	0.000
homomorphism	0.000	0.000	0	0	1	0.000	0.000
underneath	0.000	0.000	0	0	2	0.000	0.000
melosa	0.000	0.000	0	0	1	0.000	0.000
nunnery	0.000	0.000	0	0	1	0.000	0.000
deferment	0.000	0.000	0	0	1	0.000	0.000
chthonic	0.000	0.000	0	0	1	0.000	0.000
utnapishtim	0.375	0.125	1	1	1	0.375	0.125
bioko	0.000	0.000	0	0	1	0.000	0.000
circuitry	0.000	0.000	0	0	1	0.000	0.000
clotted	0.000	0.000	0	0	1	0.000	0.000
nonleaded	0.250	0.000	1	0	1	0.250	0.000
acetylate	0.000	0.000	0	0	2	0.000	0.000
hanging	0.000	0.000	0	0	3	0.000	0.000
woody	0.000	0.000	0	0	3	0.000	0.000
rhinitis	0.000	0.500	0	1	1	0.000	0.500
hastily	0.125	0.000	1	0	1	0.125	0.000
self-reliant	0.250	0.000	1	0	1	0.250	0.000
localized	0.000	0.000	0	0	2	0.000	0.000