Pre-processing

The XML file has nested <code> and <br> tags within the <text> tag, so ElementTree would not parse the file. The pre-processing below removes the nested tags within the texts.



In [5]:

    
%%time

# Markup that has to be stripped out of <text> bodies before the file can be parsed.
text_to_remove = {'<code>', '</code>', '<code/>', '<br>', '<br/>', '</br>'}

# State carried from one line to the next: True while we are between <text> and </text>.
in_text = False

with open('EFWritingData.xml') as f_in, open('EFWritingData_new.xml', 'w') as f_out:
    for line in f_in:
        if '<text>' in line:
            # A second opening tag before the previous one was closed is not supported.
            assert not in_text, 'Found nested text tag!'
            in_text = True

        if in_text:
            # str.replace leaves the line unchanged when the tag is absent,
            # so no membership test is needed first.
            for invalid_tag in text_to_remove:
                line = line.replace(invalid_tag, '')

        # Evaluated after the stripping above, so a <text>...</text> that opens
        # and closes on the same line is still cleaned before the flag drops.
        in_text = in_text and '</text>' not in line

        f_out.write(line)









    



Wall time: 46.7 s

Extract Records from XML



In [6]:

    
%%time

import xml.etree.ElementTree as ET
# Parse the cleaned copy of the data ('EFWritingData_new.xml'), not the raw export.
# NOTE: ET.parse returns an ElementTree (the whole-document wrapper), so despite
# its name `root` is the tree object rather than the root Element.
root = ET.parse('EFWritingData_new.xml')









    



Wall time: 28.4 s



In [15]:

    
def xml_to_record(node):
    """Given one <writing> node, return a flat record with all the relevant sub-elements and attributes.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        A <writing> element with integer-valued ``id``, ``level`` and ``unit``
        attributes and ``<topic id="...">``, ``<grade>`` and ``<text>`` children.

    Returns
    -------
    dict
        Keys ``article_id``, ``level``, ``unit``, ``topic_id``, ``topic_text``,
        ``grade`` and ``text``. ``text`` is stripped of surrounding whitespace and
        is ``''`` when the <text> element is empty.

    Raises
    ------
    KeyError
        If one of the required attributes is missing.
    AttributeError
        If the <topic>, <grade> or <text> child element is missing.
    ValueError, TypeError
        If an attribute or the grade cannot be converted to ``int``.
    """
    article_id, level, unit = [int(node.attrib[x]) for x in ('id', 'level', 'unit')]
    topic = node.find('topic')
    topic_id, topic_text = int(topic.attrib['id']), topic.text
    grade = int(node.find('grade').text)
    # ElementTree reports the text of an empty element as None, not ''.
    # Fall back to '' so an empty submission does not abort the whole extraction.
    text = (node.find('text').text or '').strip()

    return {
        'article_id': article_id,
        'level'     : level,
        'unit'      : unit,
        'topic_id'  : topic_id,
        'topic_text': topic_text,
        'grade'     : grade,
        'text'      : text
    }
    
# Gather every <writing> element found anywhere below the parsed document ...
content_nodes = root.findall('.//writing')
# ... and flatten each one into a plain dict, keeping document order.
input_recs = [xml_to_record(writing_node) for writing_node in content_nodes]

Save the Records as a Pandas DataFrame



In [19]:

    
import pandas as pd

# Build the frame from the list of record dicts: one row per record,
# with the dict keys as columns.
input_df = pd.DataFrame.from_records(input_recs)
# Last expression in the cell, so the first rows are rendered as the cell output.
input_df.head()









    Out[19]:







  
    
      
      article_id
      grade
      level
      text
      topic_id
      topic_text
      unit
    
  
  
    
      0
      1
      90
      6
      After some time, the affection between them is...
      41
      Writing a movie plot
      1
    
    
      1
      2
      90
      6
      An e-ticket is a provement that you bought the...
      42
      Filling in an arrival card
      2
    
    
      2
      3
      86
      6
      From:xxx@1234.com To:Herman xxx@1234.com Date:...
      43
      Creating an office dress code
      3
    
    
      3
      4
      87
      1
      Hi Anna ,How are you . My name's Jayla . My te...
      1
      Introducing yourself by email
      1
    
    
      4
      5
      95
      1
      Dear Ms Thomas, There are thirty pens and fift...
      2
      Taking inventory in the office
      2



In [23]:

    
%%time

# Persist the extracted records to 'input.pkl' in pickle format — presumably so
# later work can reload them without re-parsing the XML (TODO confirm).
# NOTE(review): pickle files should only ever be loaded from a trusted source.
input_df.to_pickle('input.pkl')









    



Wall time: 8.88 s



In [ ]:

	article_id	grade	level	text	topic_id	topic_text	unit
0	1	90	6	After some time, the affection between them is...	41	Writing a movie plot	1
1	2	90	6	An e-ticket is a provement that you bought the...	42	Filling in an arrival card	2
2	3	86	6	From:xxx@1234.com To:Herman xxx@1234.com Date:...	43	Creating an office dress code	3
3	4	87	1	Hi Anna ,How are you . My name's Jayla . My te...	1	Introducing yourself by email	1
4	5	95	1	Dear Ms Thomas, There are thirty pens and fift...	2	Taking inventory in the office	2