In [1]:
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
In [2]:
t = Tokenizer()
In [3]:
s = '<div>PythonとPYTHONとパイソンとパイソン</div>'
In [4]:
for token in t.tokenize(s):
    print(token)
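Each Token carries more than the line print(token) shows: attributes such as surface, part_of_speech, and base_form can be read directly. A minimal sketch using the same tokenizer (not part of the original session):

for token in t.tokenize(s):
    # surface form, full POS string, and dictionary form of each token
    print(token.surface, token.part_of_speech, token.base_form)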
In [5]:
char_filters = [UnicodeNormalizeCharFilter(),
                RegexReplaceCharFilter('<.*?>', '')]
In [6]:
token_filters = [POSKeepFilter(['名詞']),
                 LowerCaseFilter(),
                 ExtractAttributeFilter('surface')]
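Note that ExtractAttributeFilter replaces each Token with the plain value of the given attribute, so it must come last in the filter chain.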
In [7]:
a = Analyzer(char_filters=char_filters, token_filters=token_filters)
In [8]:
for token in a.analyze(s):
    print(token)
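POSStopFilter is the complement of POSKeepFilter: it drops the listed parts of speech and keeps everything else. A sketch reusing the filters above (not part of the original session):

a_stop = Analyzer(char_filters=char_filters,
                  token_filters=[POSStopFilter(['助詞']),
                                 ExtractAttributeFilter('surface')])
for token in a_stop.analyze(s):
    print(token)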
In [9]:
s = '自然言語処理による日本国憲法の形態素解析'
In [10]:
for token in t.tokenize(s):
    print(token)
In [11]:
a = Analyzer(token_filters=[CompoundNounFilter()])
In [12]:
for token in a.analyze(s):
    print(token)
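Tokens merged by CompoundNounFilter are tagged with the part of speech 名詞,複合, which can be checked by printing that attribute alongside the surface (a sketch):

for token in a.analyze(s):
    print(token.surface, token.part_of_speech)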
In [13]:
s = '人民の人民による人民のための政治'
In [14]:
a = Analyzer(token_filters=[POSKeepFilter(['名詞']), TokenCountFilter()])
In [15]:
g_count = a.analyze(s)
print(type(g_count))
In [16]:
for i in g_count:
    print(i)
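With TokenCountFilter in the chain, analyze() yields (word, count) pairs, and because the result is a generator it can be consumed only once; that is why each of the following cells calls a.analyze(s) afresh:

for i in g_count:
    print(i)  # prints nothing: the generator was exhausted above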
In [17]:
l_count = list(a.analyze(s))
print(type(l_count))
In [18]:
print(l_count)
In [19]:
d_count = dict(a.analyze(s))
print(type(d_count))
In [20]:
print(d_count)
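In recent janome versions TokenCountFilter also accepts a sorted flag (an assumption about the installed version; check the changelog) that orders the pairs by descending count:

a_sorted = Analyzer(token_filters=[POSKeepFilter(['名詞']),
                                   TokenCountFilter(sorted=True)])
print(list(a_sorted.analyze(s)))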
In [21]:
print(d_count['人民'])
In [22]:
# print(d_count['国民'])
# KeyError: '国民'
In [23]:
print(d_count.get('国民', 0))
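For top-N queries it can be convenient to wrap the counts in collections.Counter, which adds most_common; a sketch, not part of the original session:

from collections import Counter

c_count = Counter(dict(a.analyze(s)))
print(c_count.most_common(2))  # the two most frequent nouns with their counts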
In [24]:
s = '走れと言われたので走ると言った'
In [25]:
a = Analyzer(token_filters=[TokenCountFilter()])
In [26]:
print(list(a.analyze(s)))
In [27]:
a = Analyzer(token_filters=[TokenCountFilter(att='base_form')])
In [28]:
print(list(a.analyze(s)))
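Counting by base_form folds conjugated forms back onto their dictionary form, so 走れ and 走る are counted together as 走る, and 言わ and 言っ as 言う. The two attributes can be compared side by side (a sketch):

for att in ('surface', 'base_form'):
    a_att = Analyzer(token_filters=[TokenCountFilter(att=att)])
    print(att, dict(a_att.analyze(s)))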
In [29]:
a = Analyzer(token_filters=[TokenCountFilter(att='part_of_speech')])
In [30]:
print(list(a.analyze(s)))
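part_of_speech is the full comma-separated POS string (e.g. 動詞,自立,*,*), so these counts group by the complete POS path. To aggregate by the top-level category only, split the string first (a sketch):

from collections import Counter

a_pos = Analyzer(token_filters=[TokenCountFilter(att='part_of_speech')])
top_level = Counter()
for pos, n in a_pos.analyze(s):
    top_level[pos.split(',')[0]] += n  # keep only the first POS field
print(top_level)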
In [31]:
s = '吾輩は猫である'
In [32]:
a = Analyzer(token_filters=[POSKeepFilter('助動詞')])  # note: a bare string, not a list (see the note at the end)
In [33]:
for token in a.analyze(s):
    print(token)
In [34]:
a = Analyzer(token_filters=[POSKeepFilter(['助動詞'])])
In [35]:
for token in a.analyze(s):
    print(token)
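The two results differ because POSKeepFilter iterates over whatever it receives. A bare string is iterated character by character, so '助動詞' behaves like ['助', '動', '詞'] and also keeps every token whose part of speech starts with 助 or 動; with the default dictionary the string version should therefore let the particle は through as well, while the list form in In [34] matches only the prefix 助動詞 and should keep just で and ある. Always pass a list of POS prefixes.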