In [10]:
from tokenizer import SimpleGermanTokenizer
import matplotlib.pyplot as plt
import os
import random

%matplotlib inline

t = SimpleGermanTokenizer()

def load_documents(path):
    """Read every file in the top level of `path` and return the contents in random order."""
    documents = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            with open(os.path.join(dirpath, filename), 'r') as file:
                documents.append(file.read())
        break  # only the top-level directory; skip any subdirectories
    random.shuffle(documents)
    return documents

def token_number_histogram(documents, t):
    """Plot a histogram of per-document token counts with a bin width of 10."""
    tokenized_documents = [t.tokenize(doc) for doc in documents]
    lengths = [len(doc) for doc in tokenized_documents]
    # Extend the upper edge by one bin width so the longest document is not dropped.
    plt.hist(lengths, bins=range(min(lengths), max(lengths) + 10, 10))
    plt.show()
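The tokenizer module is local to this repository and not shown here. As a minimal sketch of the interface the notebook assumes (lowercasing plus splitting on non-letter characters; the real class may do more), something like:

# Hypothetical stand-in for tokenizer.SimpleGermanTokenizer — an assumption,
# not the project's actual implementation.
import re

class SimpleGermanTokenizer:
    def tokenize(self, text):
        # Lowercase, then split on any run of characters that is not a
        # letter (including the German umlauts and ß); drop empty strings.
        return [tok for tok in re.split(r'[^a-zäöüß]+', text.lower()) if tok]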
In [11]:
pos = load_documents('data/input/positive')
token_number_histogram(pos, t)
In [8]:
len(t.tokenize(pos[233]))  # spot-check: token count of one (arbitrary, since shuffled) positive document
Out[8]:
In [12]:
unl = load_documents('data/input/unlabeled')
token_number_histogram(unl, t)
In [13]:
import numpy

# Character lengths (not token counts) of 20 documents from each set; since
# load_documents shuffles, the sample differs on every run.
print([len(doc) for doc in load_documents('data/input/positive')[:20]])
print([len(doc) for doc in load_documents('data/input/unlabeled')[:20]])
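numpy is imported above but otherwise unused; one way to put it to work is summarising the character-length distributions instead of eyeballing 20 raw values — a sketch, assuming the same directory layout:

def length_stats(path):
    # Mean, median, and max character length over all documents in `path`.
    lengths = [len(doc) for doc in load_documents(path)]
    return numpy.mean(lengths), numpy.median(lengths), numpy.max(lengths)

print(length_stats('data/input/positive'))
print(length_stats('data/input/unlabeled'))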