In [1]:
# Setup: load the 20 Newsgroups training corpus and a spaCy pipeline so the
# tokenization benchmarks in the cells below all run over the same texts.
import spacy
from sklearn import datasets

# ~11k raw newsgroup posts; fetch_20newsgroups downloads and caches the
# dataset on first use, so the first run of this cell is slow.
texts = datasets.fetch_20newsgroups(subset='train').data

# The 'en' shortcut link was removed in spaCy v3; use the explicit package
# name (which is what 'en' resolved to under spaCy v2 as well).
nlp = spacy.load('en_core_web_sm')
In [2]:
# Benchmark texture's document-term matrix construction over the full corpus,
# using only spaCy's tokenizer (no tagger/parser) for the token stream.
# NOTE(review): `texture` is a project-local package; `document_matrix` is
# presumed to build a bag-of-words matrix comparable to CountVectorizer's
# fit_transform in the next cell — confirm against its implementation.
import texture
%timeit texture.document_matrix(texts, nlp.tokenizer)
In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
%timeit vec.fit_transform(texts)
In [ ]:
%timeit next(nlp.pipe(texts, batch_size=10000, n_threads=5))
In [ ]: