In [18]:
import numpy as np
from utils import load_data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
In [2]:
# Load the train and test splits through the project helper in utils.
# Presumably both are pandas DataFrames with 'text' and 'label' columns
# (those keys are indexed further down) — confirm against utils.load_data.
df_train, df_test = load_data()
In [3]:
import pandas as pd
import re
import preprocessor as p
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem.snowball import EnglishStemmer
# Configure the preprocessor module with the URL option only; this setting is
# global to `p` and therefore affects every later p.clean() call.
# NOTE(review): presumably this limits p.clean() to stripping URLs — confirm
# against the preprocessor package documentation.
p.set_options(p.OPT.URL)
# Snowball English stemmer instance. No active (uncommented) code visible in
# this notebook section uses it.
stemmer = EnglishStemmer()
def text_feature_clean_fun(s):
    """Normalise one raw text string before vectorisation.

    Steps, in order:
      1. ``p.clean`` is applied using whatever options were configured on
         ``p`` at module level.
      2. Every run of characters outside ``a-z`` / ``A-Z`` (digits,
         punctuation, whitespace, non-ASCII characters) is replaced by a
         single space.
      3. The result is lower-cased.

    No stemming or stop-word filtering is performed.

    :param s: raw text string.
    :return: the cleaned, lower-cased string.
    """
    precleaned = p.clean(s)
    # The pattern keeps ASCII letters only; it removes far more than digits.
    letters_only = re.sub("[^a-zA-Z]+", ' ', precleaned)
    return letters_only.lower()
def feature_transform(raw_feature_str_list, vectorizer, tfidf_transformer):
    """Turn raw text strings into features using already-fitted transformers.

    Each string is cleaned with ``text_feature_clean_fun``, then passed
    through ``vectorizer.transform`` followed by
    ``tfidf_transformer.transform``. Only ``transform`` is called, so both
    objects must have been fitted beforehand.

    :param raw_feature_str_list: iterable of raw (uncleaned) text strings.
    :param vectorizer: fitted object exposing ``transform`` on a list of strings.
    :param tfidf_transformer: fitted object exposing ``transform`` on the
        vectorizer's output.
    :return: whatever ``tfidf_transformer.transform`` returns for the input.
    """
    cleaned_docs = list(map(text_feature_clean_fun, raw_feature_str_list))
    term_counts = vectorizer.transform(cleaned_docs)
    return tfidf_transformer.transform(term_counts)
In [4]:
# Clean the training text in place. This overwrites the raw 'text' column, so
# the uncleaned text is no longer available from df_train after this cell.
# The test text is not cleaned here; feature_transform() cleans it on the fly.
df_train['text'] = df_train['text'].apply(text_feature_clean_fun)
In [5]:
# Sanity check: preview the first rows of the training frame after cleaning.
df_train.head()
Out[5]:
In [35]:
# Count unigrams and bigrams; min_df=1 keeps every term that occurs in at
# least one training document.
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2))
# TF-IDF re-weighting of the raw counts, with idf smoothing switched off.
tfidf_transformer = TfidfTransformer(smooth_idf=False)

# Fit both stages on the (already cleaned) training text only.
X_train = tfidf_transformer.fit_transform(
    vectorizer.fit_transform(df_train['text'])
)
# The test text is cleaned and pushed through the fitted stages without refitting.
X_test = feature_transform(df_test['text'], vectorizer, tfidf_transformer)

print('Train X shape: {}'.format(X_train.shape))
print('Test X shape: {}'.format(X_test.shape))
In [38]:
# Logistic-regression classifier. C is scikit-learn's inverse regularisation
# strength; the newton-cg solver is used, n_jobs=-1 requests all available
# cores, and verbose=True turns on the solver's progress output.
model = linear_model.LogisticRegression(C=10, solver='newton-cg', n_jobs=-1, verbose=True)
In [39]:
# Fit the classifier on the TF-IDF training matrix and the training labels.
model.fit(X_train, df_train['label'])
Out[39]:
In [40]:
# Mean accuracy on the training data, i.e. the same rows the model was fit on,
# so this is an optimistic figure.
model.score(X_train, df_train['label'])
Out[40]:
In [41]:
# Mean accuracy on the test split.
model.score(X_test, df_test['label'])
Out[41]:
In [42]:
# Micro- and macro-averaged precision / recall / F1 on the test split.
# Predictions are computed once and reused for both averages, instead of
# calling model.predict(X_test) twice on identical input.
test_labels = df_test['label']
test_predictions = model.predict(X_test)

# The fourth return value (support) is not needed here and is discarded.
p_micro, r_micro, f1_micro, _ = precision_recall_fscore_support(
    test_labels, test_predictions, average='micro')
p_macro, r_macro, f1_macro, _ = precision_recall_fscore_support(
    test_labels, test_predictions, average='macro')

print('\n======> Micro scores ==> P: {0:.4f}, R: {1:.4f}, F1: {2:.4f}'.format(p_micro, r_micro, f1_micro))
print('\n======> Macro scores ==> P: {0:.4f}, R: {1:.4f}, F1: {2:.4f}'.format(p_macro, r_macro, f1_macro))
In [43]:
# Per-class precision / recall / F1 / support table for the test split.
# print() is needed because classification_report returns a formatted string.
print(classification_report(df_test['label'], model.predict(X_test)))
In [44]:
# Predict the test split and write the predictions to res.txt for the
# external scorer script.
res = model.predict(X_test)
# fmt='%d' assumes the predicted labels are integers — TODO confirm against
# what load_data() returns in the 'label' column.
np.savetxt('res.txt', res, fmt='%d')
# Run the scorer script (IPython magic) with the gold-label file and the
# predictions just written.
# NOTE(review): presumably ./us_trial.labels lines up row-for-row with df_test — verify.
%run ./scorer_semeval18.py ./us_trial.labels res.txt
In [ ]: