In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
In [3]:
# Earlier inputs, kept for reference.
# NOTE(review): the extraction cell further down still reads `tree` and
# `tree_2013`, and these two disabled lines are the only place in the visible
# cells that would define them -- re-enable them (or repoint that cell) before
# a fresh Restart & Run All.
# tree = ET.ElementTree(file='/home/jeffmxh/NLPCC2014微博情绪分析样例数据.xml')
# tree_2013 = ET.ElementTree(file = '/home/jeffmxh/微博情绪样例数据V5-13.xml')

# Parse the SogouCA file into an ElementTree.
new_tree = ET.parse('/home/da/Downloads/SogouCA/news.allsites.850805.utf8.converted')
In [22]:
# root = tree.getroot()
# root.tag, root.attrib
Out[22]:
In [23]:
# print(root[0].attrib)
# print(root[0].text)
In [24]:
# print('root[0]:',root[0].attrib, '\n')
# for child_of_root in root[0]:
# print(child_of_root.text)
# print(child_of_root.attrib)
In [3]:
def _collect_labelled_sentences(source_tree, flag_attr):
    """Collect text and emotion labels for every <sentence> element of a tree.

    Parameters
    ----------
    source_tree : object exposing ``iter(tag=...)`` (e.g. an ElementTree)
        The parsed document to walk.
    flag_attr : str
        Name of the attribute whose value ``'Y'`` marks a sentence as carrying
        emotion labels.

    Returns
    -------
    tuple of three lists ``(texts, primary, secondary)`` of equal length:
    the sentence text, the ``emotion-1-type`` value and the ``emotion-2-type``
    value. Sentences whose flag is not ``'Y'`` get ``'none'`` for both labels.

    Raises
    ------
    KeyError
        If a sentence lacks ``flag_attr``, or is flagged ``'Y'`` but lacks one
        of the two emotion attributes.
    """
    texts, primary, secondary = [], [], []
    for elem in source_tree.iter(tag='sentence'):
        # Read every label before appending anything, so a missing attribute
        # can never leave the three lists with different lengths.
        if elem.attrib[flag_attr] == 'Y':
            first = elem.attrib['emotion-1-type']
            second = elem.attrib['emotion-2-type']
        else:
            first = second = 'none'
        texts.append(elem.text)
        primary.append(first)
        secondary.append(second)
        # keysent.append(elem.attrib['keyexpression1'])
        # print(elem.tag, elem.text, elem.attrib)
    return texts, primary, secondary


# Parallel lists: one entry per sentence, filled from both trees in order.
sentence_list = list()
emotion1 = list()
emotion2 = list()
# keysent = list()

# The two trees differ only in the attribute used as the "has emotion" flag.
for _source_tree, _flag_attr in ((tree, 'opinionated'), (tree_2013, 'emotion_tag')):
    _texts, _primary, _secondary = _collect_labelled_sentences(_source_tree, _flag_attr)
    sentence_list.extend(_texts)
    emotion1.extend(_primary)
    emotion2.extend(_secondary)
In [4]:
# Bare expression in the middle of the cell: it is evaluated, but only the
# cell's last expression is rendered as output.
len(sentence_list)

# Assemble the parallel lists into one table, one row per sentence.
# Column labels (including the 'emotion_1' / 'emotion2' spelling) are kept
# exactly as before because they end up in the exported spreadsheet.
sentence_frame = pd.DataFrame(
    {
        'sentence': sentence_list,
        # 'keysent': keysent,
        'emotion_1': emotion1,
        'emotion2': emotion2,
    }
)
sentence_frame
Out[4]:
In [25]:
# Print every <sentence> whose second emotion label is "none".
# The path starts with './/' so that sentences at any depth are matched; a bare
# 'sentence[...]' path only looks at direct children of the root element,
# whereas the extraction cell walks all descendants with tree.iter().
for elem in tree.iterfind('.//sentence[@emotion-2-type="none"]'):
    print(elem.tag, elem.text, elem.attrib)

# Cell output: all <sentence> elements explicitly marked opinionated="N".
list(tree.iterfind('.//sentence[@opinionated="N"]'))
Out[25]:
In [60]:
# Parse the scratch XML file used by the two query cells that follow.
tree1 = ET.parse('/home/jeffmxh/temp.xml')
In [61]:
# Walk the whole document and show the tag and attributes of each <branch>.
for branch in tree1.iter('branch'):
    print(branch.tag, branch.attrib)
In [83]:
# Show only the <branch> children of the root whose hash attribute matches.
for branch in tree1.iterfind('branch[@hash="1cdf045c"]'):
    print(branch.tag, branch.attrib)
In [5]:
# Export the sentence table to sentiment_data.xlsx (sheet "sheet1", no index
# column). The writer is used as a context manager so the workbook is saved and
# the file handle closed even if to_excel raises; ExcelWriter.save() and the
# `encoding` argument of to_excel no longer exist in pandas 2.x.
with pd.ExcelWriter("sentiment_data.xlsx") as writer:
    sentence_frame.to_excel(writer, sheet_name='sheet1', index=False)