Pre-processing

The XML file has nested <code> and <br> tags within the <text> tag, so ElementTree cannot parse the file as-is. The pre-processing below removes these nested tags from within the texts.


In [5]:
%%time

# Markup that may show up nested inside a <text> element and must be stripped
text_to_remove = {'<code>', '</code>', '<code/>', '<br>', '<br/>', '</br>'}

# True while the line being processed belongs to a still-open <text> element
in_text = False

with open('EFWritingData.xml') as src, open('EFWritingData_new.xml', 'w') as dst:
    for line in src:
        opens_text = '<text>' in line
        if opens_text:
            # A second <text> before the previous one closed cannot be handled, so fail loudly.
            assert not in_text, 'Found nested text tag!'
            in_text = True

        if in_text:
            # str.replace is a no-op when the tag is absent, so no membership test is needed.
            for tag in text_to_remove:
                line = line.replace(tag, '')

        # Checked after the clean-up so a line holding both <text> and </text> is still cleaned.
        closes_text = '</text>' in line
        if closes_text:
            in_text = False

        dst.write(line)


Wall time: 46.7 s

Extract Records from XML


In [6]:
%%time

import xml.etree.ElementTree as ET
# Parse the cleaned file produced by the pre-processing step.
# NOTE: ET.parse returns an ElementTree wrapping the whole document, not the root
# Element; the later `root.findall(...)` works because ElementTree also provides findall.
root = ET.parse('EFWritingData_new.xml')


Wall time: 28.4 s

In [15]:
def xml_to_record(node):
    """Given one <writing> node, return a flat record with all the relevant sub-elements and attributes.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        A <writing> element carrying integer-valued ``id``, ``level`` and ``unit``
        attributes and containing <topic> (with an ``id`` attribute), <grade> and
        <text> children.

    Returns
    -------
    dict
        Keys ``article_id``, ``level``, ``unit``, ``topic_id``, ``topic_text``,
        ``grade`` and ``text``. ``text`` is whitespace-stripped and is the empty
        string when the <text> element has no content.

    Raises
    ------
    KeyError
        If a required attribute is missing.
    AttributeError
        If a required child element (<topic>, <grade>, <text>) is missing.
    ValueError
        If an attribute or the grade is not a valid integer.
    """

    article_id, level, unit = [int(node.attrib[x]) for x in ('id', 'level', 'unit')]
    topic = node.find('topic')
    topic_id, topic_text = int(topic.attrib['id']), topic.text
    grade = int(node.find('grade').text)
    # Element.text is None for an empty element (e.g. an essay that consisted only of
    # tags stripped during pre-processing), so fall back to '' before calling strip().
    text = (node.find('text').text or '').strip()

    return {
        'article_id': article_id,
        'level'     : level,
        'unit'      : unit,
        'topic_id'  : topic_id,
        'topic_text': topic_text,
        'grade'     : grade,
        'text'      : text,
    }
    
# Gather every <writing> element at any depth, then flatten each one into a record dict.
content_nodes = root.findall('.//writing')
input_recs = [xml_to_record(writing) for writing in content_nodes]

Save the Records as a Pandas DataFrame


In [19]:
import pandas as pd

# Build one row per extracted record; the dict keys become the column names.
input_df = pd.DataFrame.from_records(data=input_recs)

# Preview the first five rows as the cell's output.
input_df.head(n=5)


Out[19]:
article_id grade level text topic_id topic_text unit
0 1 90 6 After some time, the affection between them is... 41 Writing a movie plot 1
1 2 90 6 An e-ticket is a provement that you bought the... 42 Filling in an arrival card 2
2 3 86 6 From:xxx@1234.com To:Herman xxx@1234.com Date:... 43 Creating an office dress code 3
3 4 87 1 Hi Anna ,How are you . My name's Jayla . My te... 1 Introducing yourself by email 1
4 5 95 1 Dear Ms Thomas, There are thirty pens and fift... 2 Taking inventory in the office 2

In [23]:
%%time

# Persist the extracted records to 'input.pkl' so the XML does not have to be re-parsed.
# Caution: pickle files should only be loaded back from trusted sources.
input_df.to_pickle('input.pkl')


Wall time: 8.88 s

In [ ]: