In [ ]:
import MeCab

mecab = MeCab.Tagger("")
mecab.parse('')  # work around a known mecab-python quirk where the first parse can return garbage
# Read the whole text and write the MeCab analysis to neko.txt.mecab
# (later cells read 'neko.txt.mecab', so keep the filename consistent).
with open('neko.txt', 'r') as src:
    neko = src.read()
with open('neko.txt.mecab', 'w') as neko_mecab:
    neko_mecab.write(mecab.parse(neko))
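In [ ]:
# Quick sanity check: look at the first few lines of neko.txt.mecab to confirm the
# expected "surface<TAB>features" format, assuming the cell above has been run.
with open('neko.txt.mecab', 'r') as f:
    for _ in range(5):
        print(f.readline().rstrip('\n'))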
In [ ]:
import pickle

def analyze_morph(filename):
    """Parse MeCab output into a list of sentences, each a list of morpheme dicts."""
    sentences = []
    morphs = []
    for line in open(filename, 'r'):
        line = line.rstrip('\n')
        if line == 'EOS' or line == '':
            continue
        surface, features = line.split('\t')
        features = features.split(',')
        morphs.append({'surface': surface, 'base': features[6],
                       'pos': features[0], 'pos1': features[1]})
        if surface == '。':  # treat the full stop as a sentence boundary
            sentences.append(morphs)
            morphs = []
    return sentences

neko_sample = analyze_morph('neko.txt.mecab')
with open('morph_neko.pickle', 'wb') as f:
    pickle.dump(neko_sample, f, protocol=pickle.HIGHEST_PROTOCOL)
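In [ ]:
# Quick check of the pickled structure: a list of sentences, each a list of morpheme
# dicts with 'surface', 'base', 'pos' and 'pos1' keys (assumes the cell above has run).
import pickle
with open('morph_neko.pickle', 'rb') as f:
    parsed = pickle.load(f)
print(len(parsed))     # number of sentences
print(parsed[0][:5])   # first few morphemes of the first sentence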
In [ ]:
import pickle

def extract_verb_surface(morph):
    """Collect the surface forms of all verbs."""
    verb_list = []
    for sentence in morph:
        for word in sentence:
            if word['pos'] == '動詞':
                verb_list.append(word['surface'])
    return verb_list

with open('morph_neko.pickle', 'rb') as f:
    neko_sample = pickle.load(f)
verbs = extract_verb_surface(neko_sample)
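In [ ]:
# Usage check: number of verb tokens found and a sample of their surface forms
# (assumes the cell above has run).
print(len(verbs))
print(verbs[:10])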
In [ ]:
import pickle

def extract_verb_base(morph):
    """Collect the base (dictionary) forms of all verbs."""
    return [word['base'] for sentence in morph for word in sentence if word['pos'] == '動詞']

with open('morph_neko.pickle', 'rb') as f:
    neko_sample = pickle.load(f)
base_forms = extract_verb_base(neko_sample)
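In [ ]:
# Usage check: compare the number of verb tokens with the number of distinct
# base forms (assumes the cell above has run).
print(len(base_forms), len(set(base_forms)))
print(base_forms[:10])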
In [ ]:
with open('morph_neko.pickle', 'rb') as f:
    neko_sample = pickle.load(f)
# Base forms of サ変接続 nouns, skipping entries without a dictionary form ('*').
sahen_nouns = [word['base'] for sentence in neko_sample for word in sentence
               if word['pos1'] == 'サ変接続' and word['base'] != '*']
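In [ ]:
# Usage check: number of distinct サ変接続 nouns and a sample
# (assumes the cell above has run).
print(len(set(sahen_nouns)))
print(sahen_nouns[:10])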
In [ ]:
import pickle

with open('morph_neko.pickle', 'rb') as f:
    neko_samples = pickle.load(f)
# Phrases of the form "AのB": two nouns joined by the particle の.
noun_phrase = [sent[j-1]['surface'] + sent[j]['surface'] + sent[j+1]['surface']
               for sent in neko_samples for j in range(1, len(sent) - 1)
               if sent[j]['base'] == 'の'
               and sent[j-1]['pos'] == '名詞' and sent[j+1]['pos'] == '名詞']
noun_phrase
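In [ ]:
# Usage check: deduplicate the "AのB" phrases and show a sample
# (assumes the cell above has run).
unique_phrases = sorted(set(noun_phrase))
print(len(unique_phrases))
print(unique_phrases[:10])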
In [ ]:
import pickle

with open('morph_neko.pickle', 'rb') as f:
    neko_samples = pickle.load(f)

# Longest run of consecutive nouns, keyed by run length (a later run of the same
# length overwrites an earlier one, which is enough to report one longest run).
len_dict = {}
for sentence_list in neko_samples:
    noun_list = []
    for dict_morpho in sentence_list:
        if dict_morpho['pos'] == '名詞':
            noun_list.append(dict_morpho['surface'])
        else:
            if noun_list:
                len_dict[len(noun_list)] = noun_list
            noun_list = []
    if noun_list:  # flush a run that ends at the sentence boundary
        len_dict[len(noun_list)] = noun_list

print("".join(len_dict[max(len_dict.keys())]))
In [54]:
import pickle
from operator import itemgetter
from collections import defaultdict

with open('morph_neko.pickle', 'rb') as f:
    neko_samples = pickle.load(f)

# Count base-form frequencies over the whole text, ignoring punctuation,
# and sort in descending order of frequency.
words = [word['base'] for sentence in neko_samples for word in sentence]
word_count = defaultdict(int)
for w in words:
    if w not in ('。', '、'):
        word_count[w] += 1
word_count = sorted(word_count.items(), key=itemgetter(1), reverse=True)
with open('count_list.pickle', 'wb') as f:
    pickle.dump(word_count, f)
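In [ ]:
# Usage check: print the ten most frequent base forms and their counts
# (assumes the cell above has run).
for word, count in word_count[:10]:
    print(word, count)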
In [45]:
%matplotlib inline
import matplotlib.pyplot as plt
import prettyplotlib as ppl

# Bar chart of the 10 most frequent words (uses word_count from the cell above).
word_list = [i[0] for i in word_count[:10]]
count_list = [i[1] for i in word_count[:10]]
plt.xlim(-0.5, 10)
ppl.bar(range(10), count_list, align='center', alpha=0.8)
plt.xticks(range(10), word_list)
plt.show()
In [42]:
%matplotlib inline
import matplotlib.pyplot as plt
import pickle

with open('count_list.pickle', 'rb') as f:
    count_list = pickle.load(f)

# Histogram of word occurrence frequencies: x-axis is the frequency,
# y-axis is the number of word types occurring with that frequency.
counts = [i[1] for i in count_list]
plt.figure(figsize=(12, 9))
plt.hist(counts, color='c', bins=239)
plt.show()
In [59]:
%matplotlib inline
import prettyplotlib as ppl
import matplotlib.pyplot as plt
import pickle

with open('count_list.pickle', 'rb') as f:
    word_count = pickle.load(f)

# Zipf's law: word frequency against frequency rank on a log-log scale.
count_list = [i[1] for i in word_count]
ranking = range(1, len(count_list) + 1)
plt.xscale('log')
plt.yscale('log')
ppl.plot(ranking, count_list)
plt.show()