In [39]:
from keras.models import Sequential
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.recurrent import LSTM
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from ttp import ttp
import re
In [2]:
train_file = "data/processed/gold/train/100_topics_100_tweets.sentence-three-point.subtask-A.train.gold.txt"
test_file = "data/processed/gold/dev/100_topics_100_tweets.sentence-three-point.subtask-A.dev.gold.txt"
df_train = pd.read_csv(train_file, header=None, sep="\t")
df_test = pd.read_csv(test_file, header=None, sep="\t")
# Columns: 0 = tweet id, 1 = sentiment label, 2 = tweet text
NULL_TEXT = "Not Available"
df_text_train = df_train[df_train[2] != NULL_TEXT][[1, 2]]
df_text_test = df_test[df_test[2] != NULL_TEXT][[1, 2]]
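As a quick sanity check (not part of the original run), the class balance of the training set can be inspected directly; column 1 holds the three-point label, presumably positive / neutral / negative for this subtask:

# Hypothetical peek at the label distribution in the training data.
print df_text_train[1].value_counts()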
In [3]:
df_text_train.head()
Out[3]:
In [4]:
p = ttp.Parser(include_spans=True)
In [5]:
USER_TAG = "_USER"
HASH_TAG = "_HASHTAG"
URL_TAG = "_URL"
LISTS_TAG = "_LIST"
def preprocess(item):
    txt = item[2]
    cols = ["label", "text", "norm_text", "c_url", "c_user", "c_tag", "c_list", "is_reply", "t_len"]
    result = p.parse(txt)
    counts = [len(result.urls), len(result.users), len(result.tags), len(result.lists),
              result.reply is not None, len(txt)]
    # Collect every entity with its placeholder tag and character span.
    all_results = map(lambda x: (x[0], USER_TAG, x[1]), result.users) + \
        map(lambda x: (x[0], HASH_TAG, x[1]), result.tags) + \
        map(lambda x: (x[0], URL_TAG, x[1]), result.urls) + \
        map(lambda x: (x[0], LISTS_TAG, x[1]), result.lists)
    all_results = sorted(all_results, key=lambda x: x[2][0])  # sort by span start
    # Rewrite the tweet left to right, replacing each entity with its tag.
    processed_txt = ""
    last = 0
    for k, t, v in all_results:
        processed_txt += txt[last:v[0]] + t
        last = v[1]
    processed_txt += txt[last:]  # keep any trailing text after the final entity
    return pd.Series([item[1], item[2], processed_txt] + counts, index=cols)
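A minimal sketch of what preprocess produces, assuming ttp's usual span conventions; the tweet below is made up for illustration and does not appear in the data:

# Hypothetical example row: index 1 = label, index 2 = tweet text.
sample = pd.Series({1: "positive", 2: "@alice loves #python http://t.co/abc123"})
print preprocess(sample)["norm_text"]
# Expected output along the lines of: "_USER loves _HASHTAG _URL"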
In [6]:
df_text_train.head().apply(preprocess, axis=1)
Out[6]:
In [8]:
df_text_train_proc = df_text_train.apply(preprocess, axis=1)
df_text_test_proc = df_text_test.apply(preprocess, axis=1)
print df_text_train_proc.shape, df_text_test_proc.shape
In [92]:
cols = df_text_train_proc.columns.values
class ColumnFeatures(TransformerMixin):
    def __init__(self, colname, to_df=True):
        print "Initialized extractor for column %s" % colname
        self.colname = colname
        self.to_df = to_df

    def get_feature_names(self):
        return [self.colname]

    def transform(self, X, **transform_params):
        print "Extracting column [%s], to_df = %s" % (self.colname, self.to_df)
        if self.to_df:
            return pd.DataFrame(X[self.colname])
        return X[self.colname]

    def fit(self, X, y=None, **fit_params):
        return self

class DenseTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "New shape: ", X.todense().shape
        return X.todense()

    def fit(self, X, y=None, **fit_params):
        return self

class IdentityTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "X processed by parent. Output shape: %s" % (X.shape, )
        return X

    def fit(self, X, y=None, **fit_params):
        return self

class MultiColumnExtractor(TransformerMixin):
    def __init__(self, colnames, to_series=False):
        print "Initialized extractor for columns %s" % colnames
        self.colnames = colnames
        self.to_series = to_series

    def get_feature_names(self):
        return self.colnames

    def transform(self, X, **transform_params):
        print "Extracting columns [%s] from X with shape: %s" % (self.colnames, X.shape)
        if self.to_series:
            if len(self.colnames) != 1:
                raise ValueError("When to_series is True, len(colnames) should be 1")
            return X[self.colnames[0]]
        return pd.DataFrame(X[self.colnames])

    def fit(self, X, y=None, **fit_params):
        return self

class DictScoreExtractor(TransformerMixin):
    def __init__(self, colname, senti_dict, pattern=r'[^a-zA-Z]+'):
        print "Initialized extractor for column %s with pattern %s" % (colname, pattern)
        self.colname = colname
        self.pattern = re.compile(pattern)
        self.feature_names = ["tot_pos", "tot_neg"]
        self.senti_dict = senti_dict

    def get_feature_names(self):
        return self.feature_names

    def _extract_features(self, x):
        # Sum the positive/negative dictionary scores over all tokens.
        tokens = self.pattern.split(x.strip())
        pos_score, neg_score = 0.0, 0.0
        for t in tokens:
            p, n = self.senti_dict.get(t, [0., 0.])
            pos_score += p
            neg_score += n
        return pd.Series([pos_score, neg_score])

    def transform(self, X, **transform_params):
        print "Extracting column [%s] from X with shape: %s" % (self.colname, X.shape)
        X_sc = X[self.colname].apply(self._extract_features)
        return pd.DataFrame(X_sc)

    def fit(self, X, y=None, **fit_params):
        return self
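For symmetry with the DictScoreExtractor check below, here is a hypothetical spot check of MultiColumnExtractor on the numeric meta-features (not part of the original run):

# Should print the extraction banner and return a (5, 6) DataFrame
# covering c_url, c_user, c_tag, c_list, is_reply and t_len.
MultiColumnExtractor(cols[3:]).fit_transform(df_text_train_proc.head())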
In [93]:
print "Dict lookup", swn_dict.get("")
d = DictScoreExtractor("norm_text", swn_dict)
d.fit_transform(df_text_test_proc.head())
Out[93]:
In [98]:
pipeline = Pipeline([
    ("features", Pipeline([
        ("union_feature", FeatureUnion([
            #("text", Pipeline([
            #    ("norm_txt", MultiColumnExtractor(["norm_text"], to_series=True)),
            #    ("count_feature", CountVectorizer(min_df=3, stop_words="english",
            #                                      ngram_range=(1, 1), binary=True)),
            #])),
            ("dict_score", Pipeline([
                ("dict_scores", DictScoreExtractor("norm_text", swn_dict)),
                ("std_d", StandardScaler()),
            ])),
            ("meta_feature", Pipeline([
                ("meta_f", MultiColumnExtractor(cols[3:])),
                ("std_f", StandardScaler()),
            ])),
        ])),
    ])),
])
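One way to double-check which branches actually made it into the union (a debugging sketch, not from the original run):

# Lists the active FeatureUnion branches; with the "text" branch commented
# out, this should show only dict_score and meta_feature.
union = pipeline.named_steps["features"].named_steps["union_feature"]
print [name for name, _ in union.transformer_list]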
In [99]:
df_all = pd.concat((df_text_test_proc, df_text_train_proc))
X_all = pipeline.fit_transform(df_all)
X_train = X_all[:df_text_train_proc.shape[0]]
y_train = df_text_train_proc["label"].values
X_test = X_all[df_text_train_proc.shape[0]:]
y_test = df_text_test_proc["label"].values
print X_train.shape, y_train.shape, X_test.shape, y_test.shape
In [102]:
svc = LinearSVC(multi_class="crammer_singer", C=0.5)
svc.fit(X_train, y_train)
# Training-set performance (note: sklearn metrics take y_true first).
y_pred = svc.predict(X_train)
print classification_report(y_train, y_pred)
print precision_recall_fscore_support(y_train, y_pred, average="micro")
print confusion_matrix(y_train, y_pred)
# Held-out dev-set performance.
y_pred = svc.predict(X_test)
print classification_report(y_test, y_pred)
print precision_recall_fscore_support(y_test, y_pred, average="micro")
print confusion_matrix(y_test, y_pred)
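A comparable baseline using the LogisticRegression import from the top of the notebook would look like the sketch below; C=1.0 is an assumed default, not a tuned value, and no such run appears in the original output:

# Hypothetical linear baseline on the same features, for comparison.
lr = LogisticRegression(C=1.0)
lr.fit(X_train, y_train)
print classification_report(y_test, lr.predict(X_test))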
In [13]:
df_sentiwn = pd.read_csv("data/SentiWordNet_3.0.0_20130122.txt", sep="\t", skiprows=26, usecols=[0,1,2,3,4])
In [14]:
df_sentiwn.head()
Out[14]:
In [15]:
df_sentiwn.shape
Out[15]:
In [26]:
senti_wn_scores = dict()
non_unigram_c = 0
for _, row in df_sentiwn.iterrows():
    terms = row["SynsetTerms"].split()
    pos_score, neg_score = row[["PosScore", "NegScore"]]
    for t in terms:
        t = t.split("#")[0]  # strip the sense number, e.g. "good#1" -> "good"
        if len(t.split("_")) > 1:
            non_unigram_c += 1
            continue  # consider only unigram words
        if t not in senti_wn_scores:
            senti_wn_scores[t] = [0, 0, 0, 0, 0]
        senti_wn_scores[t][0] += pos_score        # total pos score
        senti_wn_scores[t][1] += neg_score        # total neg score
        senti_wn_scores[t][2] += (pos_score > 0)  # number of positive senses
        senti_wn_scores[t][3] += (neg_score > 0)  # number of negative senses
        senti_wn_scores[t][4] += 1                # total sense count
print "Created dictionary with %s unigrams. Skipped %s non-unigrams" % (len(senti_wn_scores), non_unigram_c)
In [27]:
df_swn = pd.DataFrame(senti_wn_scores.values(), index=senti_wn_scores.keys(), columns=["pos_score", "neg_score", "pos_c", "neg_c", "tot_c"])
In [89]:
df_swn[["avg_pos_score", "avg_neg_score"]] = df_swn[["pos_score", "neg_score"]].div(df_swn["tot_c"], axis=0).fillna(0)
df_swn.head(20)
Out[89]:
In [31]:
f = pipeline.named_steps["features"].named_steps["union_feature"]
In [34]:
# This refers to the earlier pipeline run (In [31]-In [38] predate In [98])
# in which the commented-out "text" CountVectorizer branch was still enabled.
cvec = f.transformer_list[0][1].named_steps["count_feature"]
In [35]:
analyzer = cvec.build_analyzer()
In [36]:
analyzer("This is a really good book")
Out[36]:
In [38]:
cvec
Out[38]:
In [90]:
swn_dict = dict(zip(df_swn.index.values, df_swn[["avg_pos_score", "avg_neg_score"]].values))
In [62]:
swn_dict["woody"]
Out[62]:
In [ ]: