# Word-frequency analysis of Japanese text with the Janome morphological
# analyzer and collections.Counter.
#
# This file is a Jupyter notebook session exported to plain text; the
# original "In [n]:" cell markers are preserved below as comments so the
# file is valid, runnable Python.

# In [1]:
from janome.tokenizer import Tokenizer
import collections

# In [2]:
t = Tokenizer()

# In [3]:
s = '人民の人民による人民のための政治'

# In [4]:
# Print the full morphological analysis (surface, POS, readings, ...) per token.
for token in t.tokenize(s):
    print(token)

# In [5]:
# wakati=True makes tokenize() yield surface strings only, which Counter
# tallies into word frequencies.
c = collections.Counter(t.tokenize(s, wakati=True))

# In [6]:
print(type(c))

# In [7]:
print(c)

# In [8]:
print(c['人民'])

# In [9]:
# Counter returns 0 for a missing key instead of raising KeyError.
print(c['国民'])

# In [10]:
# most_common() lists (word, count) pairs in descending count order.
mc = c.most_common()
print(mc)

# In [11]:
print(mc[0][0])

# In [12]:
print(mc[0][1])

# In [13]:
# Unzip the (word, count) pairs into two parallel tuples.
words, counts = zip(*c.most_common())

# In [14]:
print(words)

# In [15]:
print(counts)

# In [16]:
s = '走れと言われたので走ると言った'

# In [17]:
# Counting surface forms: inflected variants (走れ / 走る) are tallied separately.
print(collections.Counter(t.tokenize(s, wakati=True)))

# In [18]:
# Counting base (dictionary) forms merges inflected variants of the same word.
print(collections.Counter(token.base_form for token in t.tokenize(s)))

# In [19]:
# This is a generator expression, not a list — so the type is <class 'generator'>.
print(type(token.base_form for token in t.tokenize(s)))

# In [20]:
# Restrict the count to independent verbs (POS prefix '動詞,自立').
print(collections.Counter(token.base_form for token in t.tokenize(s)
                          if token.part_of_speech.startswith('動詞,自立')))

# In [21]:
# Tally tokens by coarse part-of-speech category (the first comma-separated field).
print(collections.Counter(token.part_of_speech.split(',')[0] for token in t.tokenize(s)))