In [1]:
import spacy
from sklearn import datasets

# Fetch the raw message strings of the 20 Newsgroups training split
# (downloads on first call, then served from the scikit-learn cache).
texts = datasets.fetch_20newsgroups(subset='train').data
# NOTE(review): 'en' is the spaCy 1.x/2.x shortcut-link name; spaCy 3+
# requires an explicit model package (e.g. 'en_core_web_sm') — confirm
# the installed spaCy version before re-running.
nlp = spacy.load('en')

In [2]:
import texture

# Benchmark texture.document_matrix over the whole corpus, using only
# spaCy's tokenizer (no tagger/parser overhead). Presumably this builds a
# document-term matrix analogous to CountVectorizer below — verify in the
# texture package; the next cell times sklearn's equivalent as a baseline.
%timeit texture.document_matrix(texts, nlp.tokenizer)


1 loop, best of 3: 9.02 s per loop

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()

%timeit vec.fit_transform(texts)


1 loop, best of 3: 3.61 s per loop

In [ ]:
%timeit next(nlp.pipe(texts, batch_size=10000, n_threads=5))

In [ ]: