In [5]:
%%time
# Copy EFWritingData.xml to EFWritingData_new.xml line by line, deleting a fixed
# set of inline markup tags from every line that lies inside a <text> element.
# Lines outside <text> elements are written through unchanged.

# Tags to delete (exact, case-sensitive substring matches).
text_to_remove = {'<code>', '</code>', '<code/>', '<br>', '<br/>', '</br>'}
# True while the line being processed falls inside an open <text> element.
in_text = False
with open('EFWritingData.xml') as f_in, open('EFWritingData_new.xml', 'w') as f_out:
    for line in f_in:
        if '<text>' in line:
            # A second opening tag before the previous one was closed is not supported.
            assert not in_text, 'Found nested text tag!'
            in_text = True
        if in_text:
            # str.replace returns the line unchanged when the tag is absent,
            # so no membership pre-check is needed.
            for markup in text_to_remove:
                line = line.replace(markup, '')
            # Leave "inside text" state once the closing tag appears on this line.
            in_text = '</text>' not in line
        f_out.write(line)
In [6]:
%%time
import xml.etree.ElementTree as ET
# Parse the tag-stripped copy of the data (the file written as 'EFWritingData_new.xml').
# NOTE: ET.parse returns an ElementTree wrapper rather than the root Element itself;
# the later root.findall(...) call is ElementTree.findall.
root = ET.parse('EFWritingData_new.xml')
In [15]:
def xml_to_record(node):
    """Flatten one <writing> element into a plain dict.

    Args:
        node: An ElementTree element with integer-valued ``id``, ``level`` and
            ``unit`` attributes, and ``topic`` (carrying an integer ``id``
            attribute), ``grade`` and ``text`` child elements.

    Returns:
        dict with keys ``article_id``, ``level``, ``unit``, ``topic_id``,
        ``grade`` (ints), ``topic_text`` (the topic's text, or None when the
        element is empty) and ``text`` (the essay text, whitespace-stripped;
        an empty string when the <text> element has no content).

    Raises:
        KeyError: a required attribute is missing.
        AttributeError: a required child element is missing.
        ValueError / TypeError: an attribute or the grade is not an integer.
    """
    article_id, level, unit = (int(node.attrib[key]) for key in ('id', 'level', 'unit'))
    topic = node.find('topic')
    topic_id, topic_text = int(topic.attrib['id']), topic.text
    grade = int(node.find('grade').text)
    # Join itertext() instead of reading .text directly:
    #  * .text is None for an empty <text> element (which stripping tags from a
    #    tag-only essay produces), so .text.strip() would raise AttributeError;
    #  * .text stops at the first child element, so any inline tag that survived
    #    the clean-up (e.g. '<br />') would silently truncate the essay.
    text = ''.join(node.find('text').itertext()).strip()
    return {
        'article_id': article_id,
        'level': level,
        'unit': unit,
        'topic_id': topic_id,
        'topic_text': topic_text,
        'grade': grade,
        'text': text,
    }
# Gather every <writing> element found anywhere below the parsed tree,
# then flatten each one into a dict record.
content_nodes = root.findall('.//writing')
input_recs = [xml_to_record(writing) for writing in content_nodes]
In [19]:
import pandas as pd
# Assemble the flat records into a table: one row per record, one column per dict key.
input_df = pd.DataFrame(input_recs)
# Preview the first five rows.
input_df.head(n=5)
Out[19]:
In [23]:
%%time
# Serialize the DataFrame to 'input.pkl' (path is relative to the working directory),
# presumably so later work can reload it without re-parsing the XML.
# NOTE: pickle files can execute arbitrary code on load; only unpickle files you trust.
input_df.to_pickle('input.pkl')
In [ ]: