In [17]:
# -*- coding: utf-8 -*-
import json
import os
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

DATA_DIR = os.path.join(os.getcwd(), 'data/processed')

In [18]:
def get_stop_words(docs, n=100, min_freq=1):
    """Treat the n most frequent tokens and tokens seen at most min_freq times as stop words."""
    fdist = Counter()
    for doc in docs:
        for word in doc:
            fdist[word] += 1
    common_words = {word for word, freq in fdist.most_common(n)}
    rare_words = {word for word, freq in fdist.items() if freq <= min_freq}
    stopwords = common_words.union(rare_words)
    # report how many of the vocabulary terms were filtered out
    print('{}/{}'.format(len(stopwords), len(fdist)))
    return stopwords
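
# Hypothetical toy call (not from the project data) to illustrate the filter above:
#   get_stop_words([['a', 'b', 'a'], ['a', 'c']], n=1, min_freq=1)
# 'a' is the single most common token and 'b'/'c' occur only min_freq times,
# so all three land in the stop-word set and the function prints '3/3'.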


def load_data(file_path):
    with open(file_path) as f:
        items = json.load(f)
    return items
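
# Structure of posts.json as the code below expects it (field contents are illustrative,
# inferred from reduce_annotation/get_stop_words rather than taken from the file):
# {
#   "data":   [["token", "token", ...], ...],                   # one token list per post
#   "labels": [[{"quality": "0"}, {"quality": "1"}, ...], ...]  # one annotation list per post
# }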


def reduce_annotation(items):
    """Collapse the per-post annotations to a single label by majority vote (ties go to '1')."""
    labels = []
    for annotations in items['labels']:
        qualities = [annotation['quality'] for annotation in annotations]
        label = '0' if qualities.count('0') > qualities.count('1') else '1'
        labels.append(label)
    items['labels'] = labels
    print('Label Percentage:')
    print('  0: {}'.format(labels.count('0') / len(labels)))
    print('  1: {}'.format(labels.count('1') / len(labels)))
    return items


def build_pipeline(stopwords):
    """Bag-of-words -> tf-idf -> random forest, with the forest tuned via grid search."""
    # 'auto' is an alias for 'sqrt' on RandomForestClassifier in older scikit-learn versions
    parameters = {'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
                  'max_features': ['auto', 'sqrt', 'log2', None]}
    text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', GridSearchCV(RandomForestClassifier(), parameters,
                                              cv=2, scoring='accuracy', n_jobs=-1)),
                         ])
    return text_clf
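
# A minimal alternative sketch (assumption, not used in this notebook): wrap GridSearchCV
# around the whole pipeline instead of nesting it as the final step, so the vectorizer and
# tf-idf transform are refit on every CV fold; parameters then use the step-name prefix.
def build_searched_pipeline(stopwords):
    pipeline = Pipeline([('vect', CountVectorizer(stop_words=stopwords)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])
    param_grid = {'clf__n_estimators': [10, 50, 100],
                  'clf__max_features': ['sqrt', 'log2', None]}
    return GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', n_jobs=-1)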

In [20]:
items = load_data(os.path.join(DATA_DIR, 'posts.json'))
items = reduce_annotation(items)
stopwords = get_stop_words(items['data'], n=100, min_freq=5)
# join each token list back into a whitespace-separated string for CountVectorizer
items['data'] = [' '.join(doc) for doc in items['data']]
# note: no fixed random_state or stratification, so the split varies between runs
X_train, X_test, y_train, y_test = train_test_split(items['data'], items['labels'], test_size=0.4)
text_clf = build_pipeline(stopwords=stopwords)
text_clf = text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)

# Evaluation
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))


Label Percentage:
  0: 0.78
  1: 0.22
6217/7783
Accuracy: 0.8
             precision    recall  f1-score   support

          0       0.82      0.97      0.89        32
          1       0.50      0.12      0.20         8

avg / total       0.75      0.80      0.75        40
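
Because the grid search is the final pipeline step, the fitted pipeline can be asked which forest configuration it selected. A minimal follow-up sketch, assuming only the text_clf pipeline fitted above:

In [ ]:
# read back the hyperparameters chosen by the nested grid search
grid = text_clf.named_steps['clf']  # the fitted GridSearchCV step
print('Best params: {}'.format(grid.best_params_))
print('Best CV accuracy: {}'.format(grid.best_score_))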


In [ ]: