Сначала возьмем выборку отзывов на фильмы из NLTK:
In [2]:
# Load the NLTK movie_reviews corpus and collect the file ids for each class.
from nltk.corpus import movie_reviews
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
# Sanity check: show a few negative-review file ids (Python 2 print statement).
print negids[:5]
Приготовим список текстов и классов как обучающую выборку:
In [3]:
# One document per review: join each review's token stream back into a single string.
negfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in negids]
posfeats = [" ".join(movie_reviews.words(fileids=[f])) for f in posids]
# Training data: negative reviews first (label 0), then positive (label 1).
texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)
In [4]:
# Peek at the first document (a negative review) to verify the join above.
print texts[0]
Импортируем нужные нам модули
In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; on modern versions import cross_val_score from
# sklearn.model_selection instead.
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
In [6]:
def text_classifier(vectorizer, transformer, classifier):
    """Assemble a 3-stage text-classification Pipeline.

    Stages are named "vectorizer" -> "transformer" -> "classifier" so their
    hyperparameters stay addressable (e.g. classifier__C) in grid searches.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("transformer", transformer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)
In [7]:
for clf in [LogisticRegression, LinearSVC, SGDClassifier]:
print clf
print cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf()), texts, labels).mean()
print "\n"
In [8]:
clf_pipeline = Pipeline(
[("vectorizer", TfidfVectorizer()),
("classifier", LinearSVC())]
)
clf_pipeline.fit(texts, labels)
print clf_pipeline
In [9]:
# Smoke test on two unseen reviews; presumably expects [1 0]
# (positive then negative, per the label encoding above) — confirm by running.
print clf_pipeline.predict(["Amazing film! I will advice it to all my friends. Genious",
"Awful film! The man who advised me to watch it is really crazy idiot."])
In [23]:
%%time
from sklearn.decomposition import NMF, TruncatedSVD
# Dimensionality-reduction demo: fit a count vectorizer, then project the
# document-term matrix onto 10 latent components with truncated SVD.
v = CountVectorizer()
mx = v.fit_transform(texts)
mf = TruncatedSVD(10)
u = mf.fit_transform(mx)  # dense document embedding with 10 columns
In [22]:
for transform in [TruncatedSVD, NMF]:
print transform
print cross_val_score(text_classifier(CountVectorizer(), transform(n_components=10), LinearSVC()), texts, labels).mean()
print "\n"
In [ ]:
Если задать n_components=1000:
In [12]:
%%time
# 1000 SVD components over tf-idf features: far slower, richer representation.
print cross_val_score(text_classifier(TfidfVectorizer(), TruncatedSVD(n_components=1000), LinearSVC()),
texts,
labels
).mean()
In [14]:
# Tree ensembles, to compare with the linear models above.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
In [15]:
%%time
# Random forest (100 trees) on a 100-component SVD of raw term counts.
# NOTE: unlike the other cells, this prints the per-fold scores (no .mean()).
print cross_val_score(
Pipeline([
("vectorizer", CountVectorizer()),
("transformer", TruncatedSVD(100)),
("classifier", RandomForestClassifier(100))
]),
texts,
labels
)
Больше компонент и больше деревьев:
In [19]:
%%time
# Scale up: 1000 SVD components and 1000 trees, still on raw count features.
print cross_val_score(text_classifier(CountVectorizer(), TruncatedSVD(n_components=1000), RandomForestClassifier(1000)),
texts,
labels
).mean()
Tf*Idf вместо частот слов:
In [18]:
%%time
# Same 1000-component / 1000-tree setup, but with tf-idf weights instead of counts.
print cross_val_score(text_classifier(TfidfVectorizer(), TruncatedSVD(n_components=1000), RandomForestClassifier(1000)),
texts,
labels
).mean()
In [16]:
from sklearn.pipeline import FeatureUnion
# Stack tf-idf weights side by side with a 1-component SVD projection,
# producing one combined feature matrix for the classifier below.
feature_parts = [('tfidf', TfidfTransformer()), ('svd', TruncatedSVD(1))]
combined = FeatureUnion(feature_parts)
In [17]:
%%time
# Linear SVM over the combined (tf-idf + SVD) feature union defined above;
# prints the per-fold cross-validation scores.
print cross_val_score(
Pipeline([
("vectorizer", CountVectorizer()),
("transformer", combined),
("classifier", LinearSVC())
]),
texts,
labels
)
In [ ]: