In [1]:
from utils import load_data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
In [2]:
# Load the train and test splits through the project-local utils.load_data helper.
# Later cells read the 'text' and 'label' columns of both frames.
df_train, df_test = load_data()
In [3]:
import pandas as pd
import re
import preprocessor as p
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem.snowball import EnglishStemmer
# Shared Snowball stemmer instance reused for every token.
stemmer = EnglishStemmer()
# Translation table that maps every ASCII punctuation character to None (i.e. deletes it).
translator = str.maketrans(dict.fromkeys(punctuation))
# Restrict tweet-preprocessor's clean() to URL handling only.
p.set_options(p.OPT.URL)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def text_feature_clean_fun(s):
    """Normalise one raw text example for bag-of-words feature extraction.

    Steps, in order:
      1. strip URLs with tweet-preprocessor (``p`` is configured with ``p.OPT.URL``),
      2. drop English stop words (case-sensitive match on whitespace-separated words),
      3. delete ASCII punctuation and tokenise with NLTK's ``word_tokenize``,
      4. stem every token with the Snowball English stemmer.

    Args:
        s: raw text of a single example.

    Returns:
        The cleaned text as one space-joined string.
    """
    # URLs have to go first: once punctuation is stripped and tokens are stemmed,
    # "http://..." no longer looks like a URL and p.clean() cannot detect it.
    text = p.clean(s)
    # Set lookup instead of re-reading the stop-word list for every single word.
    stop_words = _english_stopwords()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Delete punctuation, then tokenise.
    tokens = word_tokenize(text.translate(translator))
    # Return the processed text (the previous version returned the untouched input).
    return ' '.join(stemmer.stem(token) for token in tokens)
In [4]:
# Run the cleaning function over the 'text' column of both splits, overwriting it in place.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(text_feature_clean_fun)
In [5]:
# Preview the first rows of the training frame after the cleaning function has been applied.
df_train.head()
Out[5]:
In [6]:
# NOTE: despite its name, `tfidf_transformer` is a TfidfVectorizer (raw text -> TF-IDF matrix),
# not a TfidfTransformer. It builds features from unigrams and bigrams.
tfidf_transformer = TfidfVectorizer(ngram_range=(1,2), min_df=1)
# Learn the vocabulary and IDF weights on the training text only ...
X_train = tfidf_transformer.fit_transform(df_train['text'])
# ... and reuse that fitted vocabulary for the test text so both matrices share columns.
X_test = tfidf_transformer.transform(df_test['text'])
# Sanity check: both matrices should have the same number of feature columns.
print('Train X shape: {}'.format(X_train.shape))
print('Test X shape: {}'.format(X_test.shape))
In [7]:
# Logistic-regression classifier. In scikit-learn, C is the inverse regularisation strength
# and n_jobs=-1 requests all available CPU cores.
# NOTE(review): C=10 and solver='newton-cg' are hard-coded; the notebook does not show how they were chosen.
model = linear_model.LogisticRegression(C=10, solver='newton-cg', n_jobs=-1)
In [8]:
# Fit the classifier on the TF-IDF training matrix against the 'label' column.
model.fit(X_train, df_train['label'])
Out[8]:
In [9]:
# Mean accuracy on the training split (what LogisticRegression.score reports).
model.score(X_train, df_train['label'])
Out[9]:
In [10]:
# Mean accuracy on the held-out test split; compare with the training score above to gauge overfitting.
model.score(X_test, df_test['label'])
Out[10]:
In [11]:
# Predict once and reuse the result for both averaging modes; the previous version
# ran model.predict(X_test) twice on identical input.
y_true = df_test['label']
y_pred = model.predict(X_test)
# Micro average: metrics computed globally over all individual predictions.
p_micro, r_micro, f1_micro, _ = \
    precision_recall_fscore_support(y_true, y_pred, average='micro')
# Macro average: unweighted mean of the per-class metrics.
p_macro, r_macro, f1_macro, _ = \
    precision_recall_fscore_support(y_true, y_pred, average='macro')
print('\n======> Micro scores ==> P: {0:.4f}, R: {1:.4f}, F1: {2:.4f}'.format(p_micro, r_micro, f1_micro))
print('\n======> Macro scores ==> P: {0:.4f}, R: {1:.4f}, F1: {2:.4f}'.format(p_macro, r_macro, f1_macro))
In [12]:
# Per-class precision / recall / F1 on the test split.
# Note: this runs model.predict(X_test) again rather than reusing earlier predictions.
print(classification_report(df_test['label'], model.predict(X_test)))
In [13]:
# NOTE(review): this import sits mid-notebook; it would normally live in the first import cell.
import numpy as np
# Predicted labels for the test split.
res = model.predict(X_test)
# Write one prediction per line; fmt='%d' assumes the labels are integers -- TODO confirm.
np.savetxt('res.txt', res, fmt='%d')
# Run the external scorer script on the label file and the predictions just written.
# Presumably ./us_trial.labels holds the gold labels for the test split -- verify against load_data.
%run ./scorer_semeval18.py ./us_trial.labels res.txt
In [ ]: