In [1]:
# Add anna to the path
import os
import sys
# Absolute path of the "anna" directory three levels up. abspath resolves the
# relative path against the current working directory, so the notebook has to
# be run from its own folder for this to point at the right place.
module_path = os.path.abspath("../../../anna")
# Append only when missing so re-running this cell does not pile up duplicate
# sys.path entries.
if module_path not in sys.path:
    sys.path.append(module_path)
# Data directory handed to the dataset parser; kept relative, like the original.
DATA_DIR = "../../../data"
In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import dataset.rcv1.parser as data
%matplotlib inline
In [3]:
# Load data: the parser is called once with DATA_DIR and its result is
# unpacked into exactly three collections (train, test and "unused").
rcv1_splits = data.fetch_and_parse(DATA_DIR)
train_docs, test_docs, unused_docs = rcv1_splits
In [4]:
# Label statistics over the train and test documents taken together.
# Flatten every document's labels into one list (duplicates kept, so its
# length is the total number of label assignments).
labels = []
for doc in train_docs + test_docs:
    labels.extend(doc.labels)

# Distinct labels, and the mean number of labels attached to a document.
labels_count = len(set(labels))
doc_total = len(train_docs) + len(test_docs)
labels_per_doc = len(labels) / doc_total

# Print one "caption + value" line per statistic.
summary_rows = [
    ("# Train docs: ", len(train_docs)),
    ("# Test docs: ", len(test_docs)),
    ("# Labels: ", labels_count),
    ("# Labels per doc: ", labels_per_doc),
]
for caption, value in summary_rows:
    print(caption + str(value))
In [5]:
# Histogram of how many labels each test document carries.
label_counts = [len(d.labels) for d in test_docs]

# The values are integers, so use unit-width bins with edges at k - 0.5 so
# that every bar is centred on exactly one count. matplotlib's default of 10
# equal-width bins does not line up with integers and can merge an uneven
# number of distinct counts into a bar (or leave gaps between bars).
# For an empty split min()/max() would raise, so fall back to the default.
if label_counts:
    bin_edges = np.arange(min(label_counts) - 0.5, max(label_counts) + 1.5)
else:
    bin_edges = 10

pre, ax = plt.subplots(figsize=[12, 6])
# A title lets the figure stand alone when the notebook is skimmed.
ax.set_title('Labels per document (test set)')
ax.set_xlabel('# Labels')
ax.set_ylabel('# Instances')
# Assigning the return value keeps the cell from echoing the hist() repr.
n, bins, patches = ax.hist(label_counts, bins=bin_edges)
In [ ]: