In [10]:
from tokenizer import SimpleGermanTokenizer
import matplotlib.pyplot as plt
import os
import random
%matplotlib inline

t = SimpleGermanTokenizer()
def load_documents(path):
    """Read every file in the top level of `path` and return their contents shuffled.

    Parameters
    ----------
    path : str
        Directory containing one document per file. Subdirectories are
        deliberately ignored (see the `break` below).

    Returns
    -------
    list[str]
        The file contents, in random order.
    """
    documents = []
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Explicit encoding: without it open() uses the platform default,
            # which can mis-decode German umlauts on e.g. Windows.
            with open(os.path.join(dirpath, filename), 'r', encoding='utf-8') as file:
                documents.append(file.read())
        break  # only the top-level directory — do not recurse into subfolders
    # NOTE(review): unseeded shuffle makes every run non-reproducible; seed
    # `random` in the setup cell if stable ordering matters.
    random.shuffle(documents)
    return documents
        
def token_number_histogram(documents, t):
    """Plot a histogram of per-document token counts (bin width 10).

    Parameters
    ----------
    documents : list[str]
        Raw document texts.
    t
        Tokenizer exposing ``tokenize(str) -> list`` (e.g. SimpleGermanTokenizer).
    """
    lengths = [len(t.tokenize(doc)) for doc in documents]
    # Stop edge is max(lengths) + bin width: range() excludes its stop value,
    # so the original range(min, max, 10) silently dropped the longest
    # document(s) from the plot and produced an invalid single-edge bins
    # sequence whenever max - min < 10.
    plt.hist(lengths, bins=range(min(lengths), max(lengths) + 10, 10))
    plt.show()

In [11]:
# Token-count distribution of the positive (labeled) documents.
pos = load_documents('data/input/positive')
token_number_histogram(pos, t)



In [8]:
len(t.tokenize(pos[233]))


Out[8]:
5010

In [12]:
# Same token-count distribution for the unlabeled corpus, for comparison.
unl = load_documents('data/input/unlabeled')
token_number_histogram(unl, t)



In [13]:
# Character lengths (not token counts) of 20 documents from each corpus.
# NOTE(review): each load_documents() call re-reads and re-shuffles the
# corpus, so this is a fresh random sample, not the first 20 of `pos`/`unl`.
# NOTE(review): `numpy` is imported here but unused in this cell — confirm
# it is needed elsewhere before removing.
import numpy
print([len(doc) for doc in load_documents('data/input/positive')[:20]])
print([len(doc) for doc in load_documents('data/input/unlabeled')[:20]])


[11399, 18215, 4464, 24019, 30195, 41649, 54119, 11577, 49063, 13693, 47702, 34243, 48014, 23458, 87326, 100837, 70389, 34739, 84724, 40386]
[1883, 9085, 6700, 15610, 1827, 42564, 32913, 10179, 2558, 11513, 6674, 26530, 5568, 48341, 7221, 23418, 5421, 24952, 39296, 2111]

In [ ]: