In [1]:
from utils import load_data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [2]:
df_train, df_test = load_data()


count 374503
count \n 374627
374503

In [3]:
import pandas as pd
import re

import preprocessor as p
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from string import punctuation
from nltk.stem.snowball import EnglishStemmer

p.set_options(p.OPT.URL)
translator = str.maketrans("", "", punctuation)
stemmer = EnglishStemmer()

def text_feature_clean_fun(s):
    
    
    # s = re.sub("[^a-zA-Z]+", ' ', s)   # remove numbers
    # s = s.lower()
    
    text = ' '.join([word for word in s.split() if word not in (stopwords.words('english'))])
    
    # delete punctuation
    text = word_tokenize(text.translate(translator))
    
    # stemming
    text = [stemmer.stem(w) for w in text]
    # preprocessing as tweet
    text = p.clean(' '.join(text))
    
    return s

In [4]:
df_train['text'] = df_train['text'].apply(text_feature_clean_fun)
df_test['text'] = df_test['text'].apply(text_feature_clean_fun)

In [5]:
df_train.head()


Out[5]:
label text
0 2 LoL @ West Covina, California
1 0 Step out and explore. # ️ @ Ellis Island Cafe
2 1 My baby bear @ Bubby's
3 5 RuPaul's Drag Race bingo fun. Drag Queens be S...
4 2 Black History like a Mufffffaaaaaka #blacchyna...

In [6]:
tfidf_transformer = TfidfVectorizer(ngram_range=(1,2), min_df=1)

X_train = tfidf_transformer.fit_transform(df_train['text'])

X_test = tfidf_transformer.transform(df_test['text'])

print('Train X shape: {}'.format(X_train.shape))
print('Test X shape:  {}'.format(X_test.shape))


Train X shape: (374503, 1571174)
Test X shape:  (50000, 1571174)

In [7]:
model = linear_model.LogisticRegression(C=10, solver='newton-cg', n_jobs=-1)

In [8]:
model.fit(X_train, df_train['label'])


Out[8]:
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
model.score(X_train, df_train['label'])


Out[9]:
0.9690523173379119

In [10]:
model.score(X_test, df_test['label'])


Out[10]:
0.35474

In [11]:
p_micro, r_micro, f1_micro, _ = \
    precision_recall_fscore_support(df_test['label'], model.predict(X_test), average='micro')
p_macro, r_macro, f1_macro, _ = \
    precision_recall_fscore_support(df_test['label'], model.predict(X_test), average='macro')
print('\n======> Micro scores ==> P: {0:.4f},  R: {1:.4f}, F1: {2:.4f}'.format(p_micro, r_micro, f1_micro))
print('\n======> Macro scores ==> P: {0:.4f},  R: {1:.4f}, F1: {2:.4f}'.format(p_macro, r_macro, f1_macro))


======> Micro scores ==> P: 0.3547,  R: 0.3547, F1: 0.3547

======> Macro scores ==> P: 0.1106,  R: 0.1449, F1: 0.1223
/Users/fuyangliu/Workspace/deep_learning_tutorial/p3ml-venv/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [12]:
print(classification_report(df_test['label'], model.predict(X_test)))


             precision    recall  f1-score   support

          0       0.45      0.80      0.57     10760
          1       0.49      0.62      0.55      5279
          2       0.51      0.80      0.62      5241
          3       0.57      0.46      0.51      2885
          4       0.03      0.02      0.02      2517
          5       0.01      0.01      0.01      2317
          6       0.00      0.00      0.00      2049
          7       0.02      0.01      0.01      1894
          8       0.00      0.00      0.00      1796
          9       0.00      0.00      0.00      1671
         10       0.04      0.07      0.05      1544
         11       0.04      0.04      0.04      1528
         12       0.02      0.01      0.01      1462
         13       0.00      0.00      0.00      1346
         14       0.00      0.00      0.00      1377
         15       0.03      0.05      0.03      1249
         16       0.00      0.00      0.00      1306
         17       0.00      0.00      0.00      1279
         18       0.02      0.01      0.02      1286
         19       0.00      0.00      0.00      1214

avg / total       0.24      0.35      0.28     50000

/Users/fuyangliu/Workspace/deep_learning_tutorial/p3ml-venv/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [13]:
import numpy as np

res = model.predict(X_test)

np.savetxt('res.txt', res, fmt='%d')

%run ./scorer_semeval18.py ./us_trial.labels res.txt


Macro F-Score (official): 12.23
-----
Micro F-Score: 35.474
Precision: 35.474
Recall: 35.474

In [ ]: