Import


In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from commonml.text import CustomDictVectorizer
from commonml import es
import numpy as np

Definition


In [ ]:
# Location of the analyzer, given as an es:// URL (host, index, analyzer name).
analyzer_url = 'es://localhost:9200/.analyzer/kuromoji_neologd_analyzer'

# Presumably a callable that tokenizes text through the Elasticsearch analyzer
# above -- confirm against commonml.es.build_analyzer.
es_analyzer = es.build_analyzer(analyzer_url)

# One rule per record field. Every field gets its own CountVectorizer with the
# same settings:
#   max_df=0.8 -> ignore terms found in more than 80% of the documents
#   min_df=1   -> keep terms that occur in at least one document
#   dtype      -> emit float32 counts
vect = CustomDictVectorizer(vect_rules=[
    {
        'name': field,
        'vectorizer': CountVectorizer(
            tokenizer=es_analyzer,
            max_df=0.8,
            min_df=1,
            dtype=np.float32,
        ),
    }
    for field in ('title', 'description')
])

Data


In [ ]:
# Two minimal sample records, each carrying the 'title' and 'description'
# fields that the vectorizer rules are keyed on.
data_dict = [
    {'title': title, 'description': description}
    for title, description in (
        ('Test 1', 'Aaa'),
        ('Test 2', 'Bbb'),
    )
]

Fit


In [ ]:
# Fit the combined vectorizer on the sample records. Presumably each per-field
# CountVectorizer learns its vocabulary from the matching record key -- confirm
# against commonml.text.CustomDictVectorizer.
vect.fit(data_dict)

Transform


In [ ]:
# Vectorize the same records that were used for fitting.
X = vect.transform(data_dict)
# Print each item yielded by iterating over X.
# NOTE(review): assumes X iterates row by row (e.g. a SciPy sparse matrix) -- confirm.
for data in X:
    print(data)