In [18]:
import numpy as np

from utils import load_data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [2]:
df_train, df_test = load_data()


count 374503
count \n 374627
374503

In [3]:
import pandas as pd
import re

import preprocessor as p
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from string import punctuation
from nltk.stem.snowball import EnglishStemmer

p.set_options(p.OPT.URL)
stemmer = EnglishStemmer()

def text_feature_clean_fun(s):
    s = p.clean(s)
    
    s = re.sub("[^a-zA-Z]+", ' ', s)   # keep letters only (drop digits and punctuation)
    s = s.lower()
    
    # s = ' '.join([stemmer.stem(word) for word in s.split() if word not in (stopwords.words('english'))])
    
    return s


def feature_transform(raw_feature_str_list, vectorizer, tfidf_transformer):
    X = [text_feature_clean_fun(row) for row in raw_feature_str_list]
    
    X = vectorizer.transform(X)
    X = tfidf_transformer.transform(X)
    
    return X
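
The commented-out stemming line above would rebuild the English stopword list for every single token, which is very slow over 374k tweets. If stemming and stopword removal are re-enabled, a sketch that precomputes the stopword set once (assuming the NLTK stopwords corpus has been downloaded):

STOPWORDS = set(stopwords.words('english'))   # build the lookup set once, not per token

def stem_and_filter(s):
    # drop stopwords, then stem the remaining tokens
    return ' '.join(stemmer.stem(w) for w in s.split() if w not in STOPWORDS)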

In [4]:
df_train['text'] = df_train['text'].apply(text_feature_clean_fun)

In [5]:
df_train.head()


Out[5]:
   label                                               text
0      2                         lol west covina california
1      0             step out and explore ellis island cafe
2      1                               my baby bear bubby s
3      5  rupaul s drag race bingo fun drag queens be se...
4      2  black history like a mufffffaaaaaka blacchyna ...

In [35]:
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
tfidf_transformer = TfidfTransformer(smooth_idf=False)

X_train = vectorizer.fit_transform(df_train['text'])
X_train = tfidf_transformer.fit_transform(X_train)

X_test = feature_transform(df_test['text'], vectorizer, tfidf_transformer)

print('Train X shape: {}'.format(X_train.shape))
print('Test X shape:  {}'.format(X_test.shape))


Train X shape: (374503, 1533651)
Test X shape:  (50000, 1533651)
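
With min_df=1 every unigram and bigram that occurs even once in the training tweets enters the vocabulary, which is why the design matrix has roughly 1.5 million columns for 374k rows. Raising min_df prunes those rare n-grams; a minimal sketch, where the cutoff of 5 is only an illustration, not a tuned value:

pruned_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=5)
X_pruned = pruned_vectorizer.fit_transform(df_train['text'])
print('Vocabulary size with min_df=5: {}'.format(len(pruned_vectorizer.vocabulary_)))

TfidfVectorizer from sklearn.feature_extraction.text would also collapse the CountVectorizer + TfidfTransformer pair into a single step.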

In [38]:
model = linear_model.LogisticRegression(C=10, solver='newton-cg', n_jobs=-1, verbose=True)

In [39]:
model.fit(X_train, df_train['label'])


[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:  4.1min finished
Out[39]:
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=True, warm_start=False)
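
The repr shows multi_class='ovr', i.e. twenty independent one-vs-rest binary classifiers. The newton-cg solver also supports a true multinomial (softmax) objective, which is sometimes a better fit when there are many classes; a hedged alternative to try, keeping everything else the same:

multinomial_model = linear_model.LogisticRegression(
    C=10, solver='newton-cg', multi_class='multinomial', n_jobs=-1)
multinomial_model.fit(X_train, df_train['label'])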

In [40]:
model.score(X_train, df_train['label'])


Out[40]:
0.969767932433118

In [41]:
model.score(X_test, df_test['label'])


Out[41]:
0.35506
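
The gap between training accuracy (~0.97) and test accuracy (~0.36) points to heavy overfitting; C=10 means very weak L2 regularization on a 1.5M-dimensional feature space. One standard remedy is to cross-validate C; a minimal sketch, where the grid values are illustrative rather than tuned:

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10]}
search = GridSearchCV(linear_model.LogisticRegression(solver='newton-cg'),
                      param_grid, cv=3, scoring='f1_macro', n_jobs=-1)
search.fit(X_train, df_train['label'])
print(search.best_params_, search.best_score_)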

In [42]:
y_pred = model.predict(X_test)

p_micro, r_micro, f1_micro, _ = \
    precision_recall_fscore_support(df_test['label'], y_pred, average='micro')
p_macro, r_macro, f1_macro, _ = \
    precision_recall_fscore_support(df_test['label'], y_pred, average='macro')
print('\n======> Micro scores ==> P: {0:.4f},  R: {1:.4f}, F1: {2:.4f}'.format(p_micro, r_micro, f1_micro))
print('\n======> Macro scores ==> P: {0:.4f},  R: {1:.4f}, F1: {2:.4f}'.format(p_macro, r_macro, f1_macro))


======> Micro scores ==> P: 0.3551,  R: 0.3551, F1: 0.3551

======> Macro scores ==> P: 0.1104,  R: 0.1450, F1: 0.1223
/Users/fuyangliu/Workspace/deep_learning_tutorial/p3ml-venv/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
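
The UndefinedMetricWarning means some labels are never predicted at all, so their precision defaults to 0.0. On scikit-learn 0.22 or newer (the version used here predates it) this behaviour can be made explicit with the zero_division argument; a hedged sketch:

# requires scikit-learn >= 0.22; the installed version does not accept this argument
precision_recall_fscore_support(df_test['label'], y_pred, average='macro', zero_division=0)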

In [43]:
print(classification_report(df_test['label'], model.predict(X_test)))


             precision    recall  f1-score   support

          0       0.45      0.80      0.57     10760
          1       0.49      0.62      0.55      5279
          2       0.50      0.80      0.62      5241
          3       0.57      0.46      0.51      2885
          4       0.03      0.03      0.03      2517
          5       0.02      0.01      0.01      2317
          6       0.00      0.00      0.00      2049
          7       0.02      0.01      0.01      1894
          8       0.00      0.00      0.00      1796
          9       0.00      0.00      0.00      1671
         10       0.04      0.07      0.05      1544
         11       0.04      0.03      0.03      1528
         12       0.01      0.01      0.01      1462
         13       0.00      0.00      0.00      1346
         14       0.00      0.00      0.00      1377
         15       0.03      0.05      0.03      1249
         16       0.00      0.00      0.00      1306
         17       0.00      0.00      0.00      1279
         18       0.02      0.01      0.02      1286
         19       0.00      0.00      0.00      1214

avg / total       0.24      0.36      0.28     50000

/Users/fuyangliu/Workspace/deep_learning_tutorial/p3ml-venv/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
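
The per-class report makes the imbalance explicit: label 0 has 10,760 test examples while the rarest labels have around 1,200, and the classifier concentrates on the four most frequent classes. A common mitigation is to reweight classes inversely to their frequency; a sketch, not a tuned configuration:

balanced_model = linear_model.LogisticRegression(
    C=10, solver='newton-cg', class_weight='balanced', n_jobs=-1)
balanced_model.fit(X_train, df_train['label'])
print(classification_report(df_test['label'], balanced_model.predict(X_test)))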

In [44]:
res = model.predict(X_test)

np.savetxt('res.txt', res, fmt='%d')

%run ./scorer_semeval18.py ./us_trial.labels res.txt


Macro F-Score (official): 12.235
-----
Micro F-Score: 35.506
Precision: 35.506
Recall: 35.506
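
The official scorer agrees with the scikit-learn metrics computed above (micro F ~35.5, macro F ~12.2). As a sanity check, the same numbers can be reproduced without the external script; a minimal sketch reading the saved predictions back in:

from sklearn.metrics import f1_score

pred = np.loadtxt('res.txt', dtype=int)
print('Macro F1: {:.3f}'.format(100 * f1_score(df_test['label'], pred, average='macro')))
print('Micro F1: {:.3f}'.format(100 * f1_score(df_test['label'], pred, average='micro')))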

In [ ]: