In [1]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
In [2]:
# Example input: a small 2x2 integer matrix.
X = np.array([[0, 1], [2, 3]])
# Wrap NumPy's log1p so it can be used through the transformer interface.
transformer = FunctionTransformer(func=np.log1p)
# Apply the wrapped function to X; the result is displayed as the cell output.
transformer.transform(X)
Out[2]:
管道(Pipeline)通常用于将 向量化器(vectorizer) => 转换器(transformer) => 分类器(classifier) 这一系列步骤封装为一个连贯的流程.
例:使用 fetch_20newsgroups 数据集构建朴素贝叶斯分类器模型
In [4]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
In [5]:
# The four newsgroups this example is restricted to.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Load the training split for those groups only; shuffling uses a fixed
# random_state so repeated runs see the documents in the same order.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
In [6]:
# Chain vectorizer => transformer => classifier into a single estimator.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),    # tokenize the text and build count vectors
    ('tfidf', TfidfTransformer()),  # re-weight the counts with tf-idf
    ('clf', MultinomialNB()),       # classifier
])
In [7]:
# Fit the whole pipeline on the training documents and their labels;
# the fitted pipeline is returned and shown as the cell output.
text_clf.fit(X=twenty_train.data, y=twenty_train.target)
Out[7]:
In [8]:
import numpy as np

# Load the test split with the same categories and shuffle seed as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
# Run the fitted pipeline on the raw test documents.
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label equals the true label.
np.mean(np.equal(predicted, twenty_test.target))
Out[8]: