In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from commonml.text import CustomDictVectorizer
from commonml import es
import numpy as np
In [ ]:
# Analyzer endpoint; the URL names a `kuromoji_neologd_analyzer` on a local
# Elasticsearch instance (port 9200).
analyzer_url = 'es://localhost:9200/.analyzer/kuromoji_neologd_analyzer'

# The object returned here is handed to CountVectorizer as its `tokenizer`.
# NOTE(review): presumably a callable mapping a string to a list of tokens,
# as scikit-learn requires for `tokenizer` -- confirm against commonml.es.
es_analyzer = es.build_analyzer(analyzer_url)

# One rule per text field. Every field gets its own CountVectorizer instance
# with identical settings:
#   max_df=0.8 -> a float is read by scikit-learn as a proportion of documents;
#                 terms whose document frequency is strictly higher are dropped
#   min_df=1   -> an integer is an absolute document count (1 keeps all terms)
#   dtype      -> count matrices are produced as float32
vect = CustomDictVectorizer(
    vect_rules=[
        {
            'name': field_name,
            'vectorizer': CountVectorizer(
                tokenizer=es_analyzer,
                max_df=0.8,
                min_df=1,
                dtype=np.float32,
            ),
        }
        for field_name in ('title', 'description')
    ],
)
In [ ]:
# Two sample records. Each one carries a 'title' and a 'description' entry,
# the same field names used by the vectorizer rules configured earlier.
data_dict = [
    {'title': title, 'description': description}
    for title, description in (
        ('Test 1', 'Aaa'),
        ('Test 2', 'Bbb'),
    )
]
In [ ]:
# Fit the dict vectorizer on the sample records. As the last expression of the
# cell, whatever `fit` returns is what the notebook displays.
vect.fit(data_dict)
In [ ]:
# Vectorize the same records that were used for fitting.
X = vect.transform(data_dict)

# Print each item yielded by iterating over the transformed result.
# NOTE(review): presumably one entry per input record (e.g. a sparse row) --
# confirm against CustomDictVectorizer.transform.
for data in X:
    print(data)