In [7]:
# -*- coding: utf-8 -*-
import json
import os

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

DATA_DIR = os.path.join(os.getcwd(), 'data/processed')

In [10]:
def load_data(file_path):
    with open(file_path) as f:
        items = json.load(f)
    return items


def reduce_annotation(items):
    labels = []
    for annotations in items['labels']:
        qualities = [annotation['quality'] for annotation in annotations]
        label = '0' if qualities.count('0') > qualities.count('1') else '1'
        labels.append(label)
    items['labels'] = labels
    print('Label Percentage:')
    print('  0: {}'.format(labels.count('0') / len(labels)))
    print('  1: {}'.format(labels.count('1') / len(labels)))
    return items


def predict(X, th=10):
    return ['1' if like >= th else '0' for like in X]

In [40]:
items = load_data(os.path.join(DATA_DIR, 'likes.json'))
items['data'] = [[like] for like in items['data']]
items = reduce_annotation(items)
X_train, X_test, y_train, y_test = train_test_split(items['data'], items['labels'], test_size=0.4)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluation
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))


Label Percentage:
  0: 0.78
  1: 0.22
Accuracy: 0.8
             precision    recall  f1-score   support

          0       0.82      0.97      0.89        33
          1       0.00      0.00      0.00         7

avg / total       0.68      0.80      0.73        40


In [ ]: