In [1]:
# Make the project's "anna" package importable from this notebook.
import os
import sys

# Resolve the package directory (given relative to the working directory)
# to an absolute path.
module_path = os.path.abspath("../../../anna")

# Register it only when absent, so re-running this cell never adds a
# duplicate entry to sys.path.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Directory passed to the dataset parser when loading the documents.
DATA_DIR = "../../../data"

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import dataset.rcv1.parser as data

%matplotlib inline

In [3]:
# Parse the corpus via the project's RCV1 parser, pointing it at DATA_DIR.
# The call hands back three document collections; the third is presumably
# the documents left out of both splits -- confirm against the parser.
# NOTE(review): the recorded output reports 781265 train / 23149 test docs,
# which looks like the reverse of the canonical RCV1-v2 split sizes --
# confirm the ordering is intentional.
(train_docs, test_docs, unused_docs) = data.fetch_and_parse(DATA_DIR)

In [4]:
# Label statistics over the train and test documents combined.
train_count = len(train_docs)
test_count = len(test_docs)

# Flatten every document's labels into one list. Duplicates are kept, so the
# list length is the total number of label assignments.
labels = [label for doc in (train_docs + test_docs) for label in doc.labels]
labels_count = len(set(labels))  # number of distinct labels
labels_per_doc = len(labels) / (train_count + test_count)  # mean per document

print("# Train docs: ", train_count, sep="")
print("# Test docs: ", test_count, sep="")
print("# Labels: ", labels_count, sep="")
print("# Labels per doc: ", labels_per_doc, sep="")


# Train docs: 781265
# Test docs: 23149
# Labels: 103
# Labels per doc: 3.2407131153858586

In [5]:
# Histogram of how many labels each test document carries.
pre, ax = plt.subplots(figsize=[12, 6])

ax.set_title('Labels per document (test docs)')
ax.set_xlabel('# Labels')
ax.set_ylabel('# Instances')

label_counts = [len(doc.labels) for doc in test_docs]

# The counts are integers, so use unit-width bins centred on each integer.
# The default 10 equal-width bins would not line up with the integer values
# and could merge unequal numbers of them into one bar.
if label_counts:
    bin_edges = np.arange(min(label_counts), max(label_counts) + 2) - 0.5
else:
    # Nothing to align to; None lets matplotlib use its default binning.
    bin_edges = None

n, bins, patches = ax.hist(label_counts, bins=bin_edges)



In [ ]: