In [25]:
import pickle
In [49]:
def separate_sentences(filename):
    # Split each line into sentences at ". ", "; ", "? ", "! " when the
    # character right after the separator is uppercase.
    checks = ['. ', '; ', '? ', '! ']
    for sentences in open(filename, 'r'):
        sentences = sentences.strip()
        sep_flag = False
        sep_index = 0
        check_sign = ''
        for i in checks:
            if i in sentences:
                check_sign = i
                sep_index = sentences.find(i)
                sep_flag = sentences[sep_index + 2].isupper()
        if sep_flag:
            split_sentences = sentences.split(check_sign)
            for s in split_sentences:
                # Re-append the separator character that split() removed.
                if s[-1] != check_sign[0]:
                    print(s + check_sign[0] + '\n')
                else:
                    print(s)
        else:
            print(sentences)
separate_sentences('nlp.txt')
In [74]:
l_sentences = []
def separate_sentences(filename):
    # Same splitting logic as above, but collect the sentences
    # (one per line) instead of printing them.
    checks = ['. ', '; ', '? ', '! ']
    for sentences in open(filename, 'r'):
        sentences = sentences.strip()
        sep_flag = False
        sep_index = 0
        check_sign = ''
        for i in checks:
            if i in sentences:
                check_sign = i
                sep_index = sentences.find(i)
                sep_flag = sentences[sep_index + 2].isupper()
        if sep_flag:
            split_sentences = sentences.split(check_sign)
            for s in split_sentences:
                if s[-1] != check_sign[0]:
                    l_sentences.append(s + check_sign[0] + '\n')
                else:
                    l_sentences.append(s + '\n')
        else:
            l_sentences.append(sentences + '\n')
separate_sentences('nlp.txt')
with open('nlp.pickle', 'wb') as f:
    pickle.dump("".join(l_sentences), f)
In [47]:
with open('nlp.pickle', 'rb') as f:
    sentences = pickle.load(f)
words = sentences.split()  # split on any whitespace, including the newlines added above
with open('nlp_words.pickle', 'wb') as f:
    pickle.dump(words, f)
with open('nlp_words.txt', 'w') as f:
    for word in words:
        f.write(word.rstrip() + '\n')
Take the output of problem 51 as input, apply Porter's stemming algorithm, and output each word together with its stem in tab-separated format.
In Python, the stemming module can be used as an implementation of Porter's stemming algorithm.
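As a point of comparison, here is a minimal sketch of the hinted approach, assuming the PyPI stemming package is installed and provides stemming.porter2.stem (a Porter2 variant of the algorithm); it reads the nlp_words.txt file produced above. The cells below use NLTK's PorterStemmer instead.

# Minimal sketch, assuming the PyPI "stemming" package exposes stemming.porter2.stem.
from stemming.porter2 import stem

with open('nlp_words.txt') as f:
    for word in f:
        word = word.strip()
        if word:
            # word and its stem, tab-separated
            print(word + '\t' + stem(word))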
### Stemming
When words that refer to the same topic should be counted as a single feature,
stemming is the task of treating derived forms and the like as that one feature.
Example: "run", "runs", "ran", and also "runner".
### Porter's stemmer
A stemming method for English built from a large number of rules.
Example rules:
"operational" and "operate" can both be treated as the same stem "oper",
but the rules can also over-stem (hundred → hundr)
or change the meaning (international → intern).
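The example rules above can be checked directly with NLTK's PorterStemmer (the same class used in the next cell); the stems noted in the comment are what the notes above lead us to expect.

from nltk.stem import PorterStemmer

porter = PorterStemmer()
for w in ('operational', 'hundred', 'international'):
    # expected (per the notes above): oper, hundr, intern
    print(w, porter.stem(w), sep='\t')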
In [40]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()
with open('nlp_words.pickle', 'rb') as f:
    words = pickle.load(f)
stem_words = []
for word in words:
    stem = porter.stem(word)
    # Word and its stem, tab-separated, one pair per line.
    stem_words.append(word + '\t' + stem + '\n')
with open('nlp_stems.pickle', 'wb') as f:
    pickle.dump(stem_words, f)
with open('nlp_stems.txt', 'w') as f:
    f.write("".join(stem_words))
In [41]:
%%bash
cd stanford-corenlp-full-2015-04-20/
sh corenlp.sh -file ../nlp.txt
cp nlp.txt.xml ../
In [29]:
import xml.etree.ElementTree as etree

tree = etree.parse('nlp.txt.xml')
root = tree.getroot()
temp = []
for tokens in root.iter('token'):
    temp.append(tokens.find('word').text + '\t' +
                tokens.find('lemma').text + '\t' +
                tokens.find('POS').text + '\n')
with open('nlp_tag.pickle', 'wb') as f:
    pickle.dump("".join(temp), f)
with open('nlp_tag.txt', 'w') as f:
    f.write("".join(temp))
In [38]:
with open('nlp_tag.pickle', 'rb') as f:
    tokens = pickle.load(f)
for token in tokens.split('\n'):
    split_token = token.split('\t')
    # Print the surface form of every proper noun (POS tag NNP).
    if split_token[-1] == 'NNP':
        print(split_token[0])
In [80]:
import xml.etree.ElementTree as etree

tree = etree.parse('nlp.txt.xml')
root = tree.getroot()
temp = []
representative = ''
for mentions in root.iter('mention'):
    # A mention carrying attributes (representative="true") opens a new
    # coreference chain; later mentions in the chain map onto that mention.
    if mentions.attrib != {}:
        representative = mentions.find('text').text
    else:
        temp.append(mentions.find('text').text + '->' + representative)
print('\n'.join(temp))
In [1]:
import xml.etree.ElementTree as etree
import pygraphviz as pgv

tree = etree.parse('nlp.txt.xml')
root = tree.getroot()
# Collect (governor, dependent) word pairs per sentence from the collapsed dependencies.
collapse = []
collapse_list = []
for collapses in root.iter('dependencies'):
    if collapses.attrib['type'] == 'collapsed-dependencies':
        for dep in collapses:
            collapse.append((dep.find('governor').text, dep.find('dependent').text))
        collapse_list.append(collapse)
        collapse = []
for i, sentence in enumerate(collapse_list):
    # Draw one dependency graph per sentence.
    g = pgv.AGraph(overlap='false')
    for node in sentence:
        if node[0] == 'ROOT':
            g.add_node(node[1])
        else:
            g.add_edge(node, spline='true')
    g.layout()
    g.draw('./Untitled Folder/' + str(i + 1) + '.png')
    del g
In [4]:
from collections import defaultdict


def choice_svo(xml_name):
    # Scan the raw XML line by line, keep only the collapsed dependencies,
    # and collect the nsubj/dobj relations of each sentence in dep_dict.
    tag_collapsed = "<dependencies type=\"collapsed-dependencies\">"
    tag_collapsed_end = "</dependencies>"
    collapsed_flag = False
    tag_governor = "</governor>"
    tag_dependent = "</dependent>"
    tag_dep = "<dep type=\""
    dep_type_flag = False
    dep_type = ''
    dep_dict = defaultdict(lambda: defaultdict(list))
    for line in open(xml_name):
        if tag_collapsed in line:
            collapsed_flag = True
            dep_dict = defaultdict(lambda: defaultdict(list))
        if collapsed_flag:
            if tag_collapsed_end in line:
                # End of the collapsed dependencies of one sentence.
                make_svo(dep_dict)
                collapsed_flag = False
            if tag_dep in line:
                dep_type = line.replace(tag_dep, '').strip().split("\"")[0]
                dep_type_flag = dep_type in ("dobj", "nsubj")
            if dep_type_flag:
                if tag_governor in line:
                    governor = line.replace(tag_governor, '').strip().split(">")[1]
                elif tag_dependent in line:
                    dependent = line.replace(tag_dependent, '').strip().split(">")[1]
                    dep_dict[dep_type][governor].append(dependent)


def make_svo(dep_dict):
    # A governor that has both an nsubj and a dobj dependent is a predicate;
    # print "subjects <tab> predicate <tab> objects".
    predicate_list = []
    for governor_word in dep_dict["nsubj"]:
        if governor_word in dep_dict["dobj"]:
            predicate_list.append(governor_word)
    for predicate in predicate_list:
        print("\t".join((",".join(dep_dict["nsubj"][predicate]),
                         predicate,
                         ",".join(dep_dict["dobj"][predicate]))))


if __name__ == "__main__":
    xml_name = "nlp.txt.xml"
    choice_svo(xml_name)