In [2]:
import os, re, random, glob, pickle, collections, gc
from tqdm import tqdm
from random import shuffle
from lxml import etree
from itertools import islice


Using TensorFlow backend.

In [12]:
MAX_HDLN_LEN = 30
NUM_ART_SENTS = 1
MAX_ART_LEN = 50 * NUM_ART_SENTS

In [13]:
pattern = re.compile(r'''
                    # Don't care about matching beginning of string
(                   # Start of Conditional (...|...)
                    ## First conditional
(\(\s)*             # It could start with 1 or 0 '(' followed by possible whitespace
\(+                 # One or More opening parentheses
\-?                 # Possible starting '-' (-[LR]RB-)
[A-Z\.,\'\`:]+      # One or More Capital letters and various punctuations
\-?                 # Possible ending '-' (-(Right|Left) Parenthesis-)
\$?                 # Zero or 1 '$'
\s                  # Must be followed by whitespace

|                   ## Second Conditional
\(\s\$\s            # ( $
|                   ## Third Conditional
\)                  # Also replace ')' with ''
|                   ## Fourth Conditional
\n
|                   ## Fifth Conditional
by\b\w+\b\w+$)      # by Author
''', re.VERBOSE)

In [14]:
def process_data(sentence):
    sentence = pattern.sub('', sentence)
    sentence = re.sub(r'(\`\`)|(\'\')', '"', sentence)
    sentence = re.sub(r'-RRB-', ')', sentence)
    sentence = re.sub(r'-LRB-', '(', sentence)
    sentence = re.sub(r'\.\.\.', ' ellipsis ', sentence)
    sentence = re.sub("([\d\"().,;:/_?!—])", r' \1 ', sentence).replace('-', ' ')
    sentence = re.sub(r'ellipsis', '...', sentence)
    sentence = re.sub(r'  *', ' ', sentence)
    sentence = sentence.lower().split()
    return sentence

In [15]:
def extract_headline(hdln):
    for headline in hdln.itertext():
        headline = process_data(headline)
        return headline

In [16]:
def extract_art_txt(txt, n_sents):
    sentences = []
    for sentence in islice(txt.itertext(), n_sents * 2):
        if sentence is not '\n':
            sentence = process_data(sentence)
            sentences += sentence
    return sentences

In [19]:
def write_tab_seperated(file, headlines, articles, articles_per_file):
    global included, total, overall_total
    # Parse XML Tree
    tree = etree.parse(file)
    root = tree.getroot()
    
    tot_arts = len(root)
    overall_total += tot_arts
    print(tot_arts," articles in this file. Extracting 100 article / headline pairs")
    
    # Using randomized indicies to ensure we don't add two of same article/headline from random sampling
    rand_indicies = list(range(0,len(root))); shuffle(rand_indicies)
    num_processed = 0
    for rand_idx in rand_indicies:
        child = root[rand_idx]
        hdln = child.find('HEADLINE')
        txt = child.find('TEXT')

        if hdln is not None and txt is not None: # Happens more often than you'd think.
            total += 1
            headline = extract_headline(hdln)
            article = extract_art_txt(txt, n_sents = NUM_ART_SENTS)
            if len(headline) > MAX_HDLN_LEN or len(article) > MAX_ART_LEN: continue

            headline = ' '.join(headline)
            article = ' '.join(article)
            del child; gc.collect() # Poor Macbook's memory couldn't keep up with the massive xml files
            num_processed += 1; included += 1
            if num_processed == articles_per_file: break
            headlines.append(headline)
            articles.append(article)
    del tree; gc.collect()

In [21]:
DATA_PATH = '/Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/*'
all_files = glob.glob(DATA_PATH); shuffle(all_files)

articles = []; headlines = []
included = 0
total = 0
overall_total = 0

with open('test.txt', 'w') as f:
    f.write('')

for file in tqdm(all_files):
    print('Parsing file: %s' % file)
    write_tab_seperated(file, headlines, articles, articles_per_file=120)
    percent_procd = included/total
    print('Total articles processed: {0} of {2} ({1:.1f}%)'.format(included, percent_procd * 100, total))


Out[21]:
0
  0%|          | 0/857 [00:00<?, ?it/s]
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199609.xml.gz

2841  articles in this file. Extracting 100 article / headline pairs
  0%|          | 1/857 [01:06<15:46:53, 66.37s/it]
Total articles processed: 120 of 183 (65.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200201.xml.gz
805  articles in this file. Extracting 100 article / headline pairs
  0%|          | 2/857 [01:17<11:49:44, 49.81s/it]
Total articles processed: 240 of 330 (72.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200601.xml.gz
9042  articles in this file. Extracting 100 article / headline pairs
  0%|          | 3/857 [07:29<34:46:00, 146.56s/it]
Total articles processed: 360 of 516 (69.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199904.xml.gz
868 
  0%|          | 4/857 [07:43<25:15:56, 106.63s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 480 of 664 (72.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199602.xml.gz
2780  articles in this file. Extracting 100 article / headline pairs
  1%|          | 5/857 [08:43<21:55:41, 92.65s/it] 
Total articles processed: 600 of 840 (71.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200908.xml.gz
5307  articles in this file. Extracting 100 article / headline pairs
  1%|          | 6/857 [11:25<26:49:59, 113.51s/it]
Total articles processed: 720 of 1007 (71.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200806.xml.gz
20355  articles in this file. Extracting 100 article / headline pairs
  1%|          | 7/857 [16:15<39:16:27, 166.34s/it]
Total articles processed: 840 of 1307 (64.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200909.xml.gz
2866  articles in this file. Extracting 100 article / headline pairs
  1%|          | 8/857 [17:18<31:57:57, 135.54s/it]
Total articles processed: 960 of 1466 (65.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200309.xml.gz
3896  articles in this file. Extracting 100 article / headline pairs
  1%|          | 9/857 [18:59<29:27:37, 125.07s/it]
Total articles processed: 1080 of 1657 (65.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199710.xml.gz
8334  articles in this file. Extracting 100 article / headline pairs
  1%|          | 10/857 [19:47<23:57:56, 101.86s/it]
Total articles processed: 1200 of 1794 (66.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199709.xml.gz
3062  articles in this file. Extracting 100 article / headline pairs
  1%|▏         | 11/857 [20:59<21:50:15, 92.93s/it] 
Total articles processed: 1320 of 1954 (67.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200407.xml.gz
4069  articles in this file. Extracting 100 article / headline pairs
  1%|▏         | 12/857 [22:42<22:33:02, 96.07s/it]
Total articles processed: 1440 of 2142 (67.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200406.xml.gz
17077  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 13/857 [26:01<29:45:19, 126.92s/it]
Total articles processed: 1560 of 2270 (68.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200311.xml.gz
9155  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 14/857 [27:00<24:55:59, 106.48s/it]
Total articles processed: 1680 of 2421 (69.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200701.xml.gz
11462  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 15/857 [28:10<22:23:01, 95.70s/it] 
Total articles processed: 1800 of 2578 (69.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200606.xml.gz
12455  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 16/857 [31:42<30:28:27, 130.45s/it]
Total articles processed: 1920 of 2708 (70.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200704.xml.gz
7661  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 17/857 [36:22<40:53:01, 175.22s/it]
Total articles processed: 2040 of 2850 (71.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199607.xml.gz
12559  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 18/857 [39:11<40:26:04, 173.50s/it]
Total articles processed: 2160 of 2997 (72.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199606.xml.gz
17197  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 19/857 [42:00<40:02:51, 172.04s/it]
Total articles processed: 2280 of 3117 (73.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199502.xml.gz
2890  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 20/857 [43:03<32:23:16, 139.30s/it]
Total articles processed: 2400 of 3306 (72.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199806.xml.gz
8882  articles in this file. Extracting 100 article / headline pairs
  2%|▏         | 21/857 [43:55<26:19:31, 113.36s/it]
Total articles processed: 2520 of 3443 (73.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199712.xml.gz
831  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 22/857 [44:09<19:19:52, 83.34s/it] 
Total articles processed: 2640 of 3592 (73.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200106.xml.gz
768  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 23/857 [44:20<14:16:09, 61.59s/it]
Total articles processed: 2760 of 3733 (73.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201008.xml.gz
16138  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 24/857 [48:54<29:01:53, 125.47s/it]
Total articles processed: 2880 of 3861 (74.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200405.xml.gz
927  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 25/857 [49:05<21:04:30, 91.19s/it] 
Total articles processed: 3000 of 4050 (74.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199512.xml.gz
13098  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 26/857 [51:10<23:21:12, 101.17s/it]
Total articles processed: 3120 of 4172 (74.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200810.xml.gz
6884  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 27/857 [55:20<33:39:04, 145.96s/it]
Total articles processed: 3240 of 4350 (74.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199510.xml.gz
3181  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 28/857 [56:35<28:42:04, 124.64s/it]
Total articles processed: 3360 of 4536 (74.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201002.xml.gz
970  articles in this file. Extracting 100 article / headline pairs
  3%|▎         | 29/857 [56:48<20:56:52, 91.08s/it] 
Total articles processed: 3480 of 4678 (74.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200804.xml.gz
1395  articles in this file. Extracting 100 article / headline pairs
  4%|▎         | 30/857 [57:03<15:42:15, 68.36s/it]
Total articles processed: 3600 of 4835 (74.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199610.xml.gz
21021  articles in this file. Extracting 100 article / headline pairs
  4%|▎         | 31/857 [1:00:43<26:04:42, 113.66s/it]
Total articles processed: 3720 of 4956 (75.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199606.xml.gz
8032  articles in this file. Extracting 100 article / headline pairs
  4%|▎         | 32/857 [1:01:27<21:16:52, 92.86s/it] 
Total articles processed: 3840 of 5090 (75.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199511.xml.gz
14639  articles in this file. Extracting 100 article / headline pairs
  4%|▍         | 33/857 [1:03:48<24:32:48, 107.24s/it]
Total articles processed: 3960 of 5212 (76.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200610.xml.gz
8936  articles in this file. Extracting 100 article / headline pairs
  4%|▍         | 34/857 [1:09:32<40:48:16, 178.49s/it]
Total articles processed: 4080 of 5368 (76.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200602.xml.gz
3662  articles in this file. Extracting 100 article / headline pairs
  4%|▍         | 35/857 [1:10:56<34:15:44, 150.05s/it]
Total articles processed: 4200 of 5540 (75.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200008.xml.gz
5034  articles in this file. Extracting 100 article / headline pairs
  4%|▍         | 36/857 [1:12:17<29:27:32, 129.17s/it]
Total articles processed: 4320 of 5669 (76.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200902.xml.gz
3076  articles in this file. Extracting 100 article / headline pairs
  4%|▍         | 37/857 [1:13:25<25:14:48, 110.84s/it]
Total articles processed: 4440 of 5856 (75.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200503.xml.gz
10134  articles in this file. Extracting 100 article / headline pairs
  4%|▍         | 38/857 [1:14:37<22:34:42, 99.25s/it] 
Total articles processed: 4560 of 6009 (75.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200502.xml.gz
807  articles in this file. Extracting 100 article / headline pairs
  5%|▍         | 39/857 [1:14:48<16:32:27, 72.80s/it]
Total articles processed: 4680 of 6188 (75.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201011.xml.gz
12530  articles in this file. Extracting 100 article / headline pairs
  5%|▍         | 40/857 [1:16:30<18:30:04, 81.52s/it]
Total articles processed: 4800 of 6330 (75.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200406.xml.gz
867  articles in this file. Extracting 100 article / headline pairs
  5%|▍         | 41/857 [1:16:41<13:41:50, 60.43s/it]
Total articles processed: 4920 of 6510 (75.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199810.xml.gz
15803  articles in this file. Extracting 100 article / headline pairs
  5%|▍         | 42/857 [1:20:36<25:33:29, 112.90s/it]
Total articles processed: 5040 of 6643 (75.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200504.xml.gz
8929  articles in this file. Extracting 100 article / headline pairs
  5%|▌         | 43/857 [1:21:39<22:04:59, 97.66s/it] 
Total articles processed: 5160 of 6792 (76.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200112.xml.gz
17995  articles in this file. Extracting 100 article / headline pairs
  5%|▌         | 44/857 [1:27:11<37:56:31, 168.01s/it]
Total articles processed: 5280 of 6938 (76.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199811.xml.gz
8649  articles in this file. Extracting 100 article / headline pairs
  5%|▌         | 45/857 [1:28:09<30:27:11, 135.01s/it]
Total articles processed: 5400 of 7081 (76.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199711.xml.gz
8509  articles in this file. Extracting 100 article / headline pairs
  5%|▌         | 46/857 [1:29:05<25:06:37, 111.46s/it]
Total articles processed: 5520 of 7230 (76.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200901.xml.gz
22315  articles in this file. Extracting 100 article / headline pairs
  5%|▌         | 47/857 [1:34:40<40:09:12, 178.46s/it]
Total articles processed: 5640 of 7358 (76.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200705.xml.gz
1341  articles in this file. Extracting 100 article / headline pairs
  6%|▌         | 48/857 [1:34:58<29:16:45, 130.29s/it]
Total articles processed: 5760 of 7531 (76.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200510.xml.gz
17292  articles in this file. Extracting 100 article / headline pairs
  6%|▌         | 49/857 [1:39:35<39:08:50, 174.42s/it]
Total articles processed: 5880 of 7671 (76.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199501.xml.gz
3110  articles in this file. Extracting 100 article / headline pairs
  6%|▌         | 50/857 [1:40:59<33:00:54, 147.28s/it]
Total articles processed: 6000 of 7835 (76.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201010.xml.gz
12125  articles in this file. Extracting 100 article / headline pairs
  6%|▌         | 51/857 [1:42:52<30:40:08, 136.98s/it]
Total articles processed: 6120 of 7973 (76.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200504.xml.gz
9095  articles in this file. Extracting 100 article / headline pairs
  6%|▌         | 52/857 [1:44:56<29:43:22, 132.92s/it]
Total articles processed: 6240 of 8107 (77.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200903.xml.gz
17823  articles in this file. Extracting 100 article / headline pairs
  6%|▌         | 53/857 [1:50:42<44:01:03, 197.09s/it]
Total articles processed: 6360 of 8230 (77.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199605.xml.gz
13574  articles in this file. Extracting 100 article / headline pairs
  6%|▋         | 54/857 [1:53:59<43:55:58, 196.96s/it]
Total articles processed: 6480 of 8379 (77.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200811.xml.gz
20929  articles in this file. Extracting 100 article / headline pairs
  6%|▋         | 55/857 [1:58:58<50:42:58, 227.65s/it]
Total articles processed: 6600 of 8508 (77.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200706.xml.gz
3691  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 56/857 [2:00:38<42:04:27, 189.10s/it]
Total articles processed: 6720 of 8671 (77.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200512.xml.gz
10001  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 57/857 [2:01:58<34:46:54, 156.52s/it]
Total articles processed: 6840 of 8813 (77.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199504.xml.gz
10099  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 58/857 [2:08:03<48:35:50, 218.96s/it]
Total articles processed: 6960 of 8981 (77.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200404.xml.gz
16453  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 59/857 [2:12:00<49:46:21, 224.54s/it]
Total articles processed: 7080 of 9114 (77.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201008.xml.gz
11374  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 60/857 [2:13:45<41:46:14, 188.68s/it]
Total articles processed: 7200 of 9257 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200011.xml.gz
8841  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 61/857 [2:14:49<33:25:28, 151.17s/it]
Total articles processed: 7320 of 9406 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201006.xml.gz
5911  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 62/857 [2:18:23<37:32:05, 169.97s/it]
Total articles processed: 7440 of 9574 (77.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200604.xml.gz
5894  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 63/857 [2:19:03<28:54:23, 131.06s/it]
Total articles processed: 7560 of 9729 (77.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200704.xml.gz
17664  articles in this file. Extracting 100 article / headline pairs
  7%|▋         | 64/857 [2:23:15<36:51:34, 167.33s/it]
Total articles processed: 7680 of 9861 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200501.xml.gz
3713  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 65/857 [2:25:03<32:54:30, 149.58s/it]
Total articles processed: 7800 of 10027 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199708.xml.gz
2645  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 66/857 [2:26:09<27:21:09, 124.49s/it]
Total articles processed: 7920 of 10211 (77.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199610.xml.gz
8060  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 67/857 [2:27:02<22:36:50, 103.05s/it]
Total articles processed: 8040 of 10345 (77.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199406.xml.gz
11552  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 68/857 [2:28:59<23:30:12, 107.24s/it]
Total articles processed: 8160 of 10469 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199408.xml.gz
10939  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 69/857 [2:30:44<23:18:23, 106.48s/it]
Total articles processed: 8280 of 10597 (78.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200812.xml.gz
18618  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 70/857 [2:35:06<33:28:13, 153.10s/it]
Total articles processed: 8400 of 10721 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200804.xml.gz
3370  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 71/857 [2:36:32<29:02:40, 133.03s/it]
Total articles processed: 8520 of 10899 (78.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200101.xml.gz
7880  articles in this file. Extracting 100 article / headline pairs
  8%|▊         | 72/857 [2:37:17<23:14:49, 106.61s/it]
Total articles processed: 8640 of 11051 (78.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200207.xml.gz
4908  articles in this file. Extracting 100 article / headline pairs
  9%|▊         | 73/857 [2:40:15<27:52:27, 127.99s/it]
Total articles processed: 8760 of 11213 (78.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200904.xml.gz
5697  articles in this file. Extracting 100 article / headline pairs
  9%|▊         | 74/857 [2:43:17<31:24:15, 144.39s/it]
Total articles processed: 8880 of 11409 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201012.xml.gz
1272  articles in this file. Extracting 100 article / headline pairs
  9%|▉         | 75/857 [2:43:33<22:59:09, 105.82s/it]
Total articles processed: 9000 of 11552 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200411.xml.gz
3516  articles in this file. Extracting 100 article / headline pairs
  9%|▉         | 76/857 [2:45:02<21:49:03, 100.57s/it]
Total articles processed: 9120 of 11727 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200902.xml.gz
10049  articles in this file. Extracting 100 article / headline pairs
  9%|▉         | 77/857 [2:46:17<20:08:36, 92.97s/it] 
Total articles processed: 9240 of 11870 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200907.xml.gz
3145  articles in this file. Extracting 100 article / headline pairs
  9%|▉         | 78/857 [2:47:29<18:46:09, 86.74s/it]
Total articles processed: 9360 of 12048 (77.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200707.xml.gz
7404  articles in this file. Extracting 100 article / headline pairs
  9%|▉         | 79/857 [2:52:15<31:38:44, 146.43s/it]
Total articles processed: 9480 of 12205 (77.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199702.xml.gz
6928  articles in this file. Extracting 100 article / headline pairs
  9%|▉         | 80/857 [2:52:55<24:44:48, 114.66s/it]
Total articles processed: 9600 of 12345 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200810.xml.gz
8489  articles in this file. Extracting 100 article / headline pairs
  9%|▉         | 81/857 [2:53:50<20:49:42, 96.63s/it] 
Total articles processed: 9720 of 12495 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200104.xml.gz
845  articles in this file. Extracting 100 article / headline pairs
 10%|▉         | 82/857 [2:54:02<15:20:31, 71.27s/it]
Total articles processed: 9840 of 12644 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200007.xml.gz
9100  articles in this file. Extracting 100 article / headline pairs
 10%|▉         | 83/857 [2:54:53<14:02:42, 65.33s/it]
Total articles processed: 9960 of 12787 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200901.xml.gz
1135  articles in this file. Extracting 100 article / headline pairs
 10%|▉         | 84/857 [2:55:08<10:45:31, 50.11s/it]
Total articles processed: 10080 of 12942 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200405.xml.gz
372  articles in this file. Extracting 100 article / headline pairs
 10%|▉         | 85/857 [2:55:20<8:16:48, 38.61s/it] 
Total articles processed: 10200 of 13129 (77.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200003.xml.gz
6333  articles in this file. Extracting 100 article / headline pairs
 10%|█         | 86/857 [2:57:10<12:52:11, 60.09s/it]
Total articles processed: 10320 of 13258 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201001.xml.gz
2020  articles in this file. Extracting 100 article / headline pairs
 10%|█         | 87/857 [2:57:50<11:35:17, 54.18s/it]
Total articles processed: 10440 of 13420 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200904.xml.gz
22630  articles in this file. Extracting 100 article / headline pairs
 10%|█         | 88/857 [3:02:39<26:35:21, 124.47s/it]
Total articles processed: 10560 of 13549 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199611.xml.gz
2855  articles in this file. Extracting 100 article / headline pairs
 10%|█         | 89/857 [3:03:40<22:30:19, 105.49s/it]
Total articles processed: 10680 of 13721 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200912.xml.gz
1340  articles in this file. Extracting 100 article / headline pairs
 11%|█         | 90/857 [3:03:57<16:48:10, 78.87s/it] 
Total articles processed: 10800 of 13877 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199703.xml.gz
14521  articles in this file. Extracting 100 article / headline pairs
 11%|█         | 91/857 [3:07:32<25:28:02, 119.69s/it]
Total articles processed: 10920 of 14013 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200711.xml.gz
7142  articles in this file. Extracting 100 article / headline pairs
 11%|█         | 92/857 [3:11:50<34:16:14, 161.27s/it]
Total articles processed: 11040 of 14174 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201006.xml.gz
2228  articles in this file. Extracting 100 article / headline pairs
 11%|█         | 93/857 [3:12:35<26:51:20, 126.55s/it]
Total articles processed: 11160 of 14344 (77.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199602.xml.gz
6696  articles in this file. Extracting 100 article / headline pairs
 11%|█         | 94/857 [3:13:12<21:04:28, 99.43s/it] 
Total articles processed: 11280 of 14489 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200608.xml.gz
10356  articles in this file. Extracting 100 article / headline pairs
 11%|█         | 95/857 [3:14:21<19:06:20, 90.26s/it]
Total articles processed: 11400 of 14643 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200911.xml.gz
2768  articles in this file. Extracting 100 article / headline pairs
 11%|█         | 96/857 [3:15:20<17:08:55, 81.12s/it]
Total articles processed: 11520 of 14784 (77.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200306.xml.gz
22132  articles in this file. Extracting 100 article / headline pairs
 11%|█▏        | 97/857 [3:20:56<33:14:34, 157.47s/it]
Total articles processed: 11640 of 14912 (78.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200506.xml.gz
12347  articles in this file. Extracting 100 article / headline pairs
 11%|█▏        | 98/857 [3:24:11<35:32:51, 168.61s/it]
Total articles processed: 11760 of 15044 (78.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199601.xml.gz
2953  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 99/857 [3:25:13<28:46:29, 136.66s/it]
Total articles processed: 11880 of 15208 (78.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200507.xml.gz
8226  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 100/857 [3:27:08<27:21:51, 130.13s/it]
Total articles processed: 12000 of 15339 (78.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199908.xml.gz
873  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 101/857 [3:27:19<19:52:38, 94.65s/it] 
Total articles processed: 12120 of 15488 (78.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199506.xml.gz
13381  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 102/857 [3:29:34<22:21:15, 106.59s/it]
Total articles processed: 12240 of 15610 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200307.xml.gz
6963  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 103/857 [3:31:22<22:24:22, 106.98s/it]
Total articles processed: 12360 of 15757 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200712.xml.gz
16311  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 104/857 [3:36:56<36:38:09, 175.15s/it]
Total articles processed: 12480 of 15887 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199405.xml.gz
6629  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 105/857 [3:37:45<28:39:20, 137.18s/it]
Total articles processed: 12600 of 16009 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199604.xml.gz
7788  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 106/857 [3:38:26<22:38:54, 108.57s/it]
Total articles processed: 12720 of 16155 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200503.xml.gz
3717  articles in this file. Extracting 100 article / headline pairs
 12%|█▏        | 107/857 [3:39:56<21:27:14, 102.98s/it]
Total articles processed: 12840 of 16318 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200608.xml.gz
10983  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 108/857 [3:42:49<25:45:07, 123.77s/it]
Total articles processed: 12960 of 16444 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200305.xml.gz
23340  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 109/857 [3:48:39<39:49:04, 191.64s/it]
Total articles processed: 13080 of 16571 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199704.xml.gz
18619  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 110/857 [3:52:12<41:05:30, 198.03s/it]
Total articles processed: 13200 of 16694 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200809.xml.gz
12962  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 111/857 [3:54:43<38:07:10, 183.96s/it]
Total articles processed: 13320 of 16824 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200306.xml.gz
15874  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 112/857 [4:00:09<46:55:19, 226.74s/it]
Total articles processed: 13440 of 16959 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201009.xml.gz
6008  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 113/857 [4:03:25<44:57:20, 217.53s/it]
Total articles processed: 13560 of 17131 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200402.xml.gz
20208  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 114/857 [4:08:03<48:37:12, 235.58s/it]
Total articles processed: 13680 of 17262 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199605.xml.gz
2896  articles in this file. Extracting 100 article / headline pairs
 13%|█▎        | 115/857 [4:09:05<37:50:06, 183.57s/it]
Total articles processed: 13800 of 17423 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199502.xml.gz
11595  articles in this file. Extracting 100 article / headline pairs
 14%|█▎        | 116/857 [4:10:53<33:06:46, 160.87s/it]
Total articles processed: 13920 of 17546 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200408.xml.gz
21106  articles in this file. Extracting 100 article / headline pairs
 14%|█▎        | 117/857 [4:15:02<38:29:11, 187.23s/it]
Total articles processed: 14040 of 17681 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200406.xml.gz
4705  articles in this file. Extracting 100 article / headline pairs
 14%|█▍        | 118/857 [4:15:59<30:27:31, 148.38s/it]
Total articles processed: 14160 of 17807 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200912.xml.gz
13112  articles in this file. Extracting 100 article / headline pairs
 14%|█▍        | 119/857 [4:19:39<34:47:36, 169.72s/it]
Total articles processed: 14280 of 17930 (79.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200807.xml.gz
6563  articles in this file. Extracting 100 article / headline pairs
 14%|█▍        | 120/857 [4:23:25<38:11:19, 186.54s/it]
Total articles processed: 14400 of 18086 (79.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200608.xml.gz
11266  articles in this file. Extracting 100 article / headline pairs
 14%|█▍        | 121/857 [4:25:44<35:13:29, 172.29s/it]
Total articles processed: 14520 of 18220 (79.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199909.xml.gz
9175  articles in this file. Extracting 100 article / headline pairs
 14%|█▍        | 122/857 [4:26:37<27:51:46, 136.47s/it]
Total articles processed: 14640 of 18375 (79.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200410.xml.gz
1140  articles in this file. Extracting 100 article / headline pairs
 14%|█▍        | 123/857 [4:26:50<20:15:45, 99.38s/it] 
Total articles processed: 14760 of 18549 (79.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200409.xml.gz
9493  articles in this file. Extracting 100 article / headline pairs
 14%|█▍        | 124/857 [4:32:59<36:43:37, 180.38s/it]
Total articles processed: 14880 of 18718 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200308.xml.gz
1678  articles in this file. Extracting 100 article / headline pairs
 15%|█▍        | 125/857 [4:33:30<27:34:21, 135.60s/it]
Total articles processed: 15000 of 18894 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200712.xml.gz
16333  articles in this file. Extracting 100 article / headline pairs
 15%|█▍        | 126/857 [4:37:03<32:15:07, 158.83s/it]
Total articles processed: 15120 of 19051 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200007.xml.gz
5268  articles in this file. Extracting 100 article / headline pairs
 15%|█▍        | 127/857 [4:38:26<27:34:26, 135.98s/it]
Total articles processed: 15240 of 19179 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200501.xml.gz
9482  articles in this file. Extracting 100 article / headline pairs
 15%|█▍        | 128/857 [4:40:40<27:25:36, 135.44s/it]
Total articles processed: 15360 of 19315 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200911.xml.gz
11002  articles in this file. Extracting 100 article / headline pairs
 15%|█▌        | 129/857 [4:42:04<24:17:41, 120.14s/it]
Total articles processed: 15480 of 19463 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200805.xml.gz
6813  articles in this file. Extracting 100 article / headline pairs
 15%|█▌        | 130/857 [4:46:04<31:29:42, 155.96s/it]
Total articles processed: 15600 of 19622 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199710.xml.gz
921  articles in this file. Extracting 100 article / headline pairs
 15%|█▌        | 131/857 [4:46:16<22:46:21, 112.92s/it]
Total articles processed: 15720 of 19766 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201009.xml.gz
17406  articles in this file. Extracting 100 article / headline pairs
 15%|█▌        | 132/857 [4:51:22<34:22:13, 170.67s/it]
Total articles processed: 15840 of 19891 (79.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200311.xml.gz
3763  articles in this file. Extracting 100 article / headline pairs
 16%|█▌        | 133/857 [4:52:57<29:47:28, 148.13s/it]
Total articles processed: 15960 of 20054 (79.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199505.xml.gz
12538  articles in this file. Extracting 100 article / headline pairs
 16%|█▌        | 134/857 [4:55:01<28:14:41, 140.64s/it]
Total articles processed: 16080 of 20175 (79.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200401.xml.gz
8126  articles in this file. Extracting 100 article / headline pairs
 16%|█▌        | 135/857 [4:55:49<22:40:51, 113.09s/it]
Total articles processed: 16200 of 20338 (79.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199705.xml.gz
3008  articles in this file. Extracting 100 article / headline pairs
 16%|█▌        | 136/857 [4:56:52<19:35:39, 97.84s/it] 
Total articles processed: 16320 of 20503 (79.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200812.xml.gz
2912  articles in this file. Extracting 100 article / headline pairs
 16%|█▌        | 137/857 [4:57:57<17:36:11, 88.02s/it]
Total articles processed: 16440 of 20707 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200412.xml.gz
997  articles in this file. Extracting 100 article / headline pairs
 16%|█▌        | 138/857 [4:58:09<13:02:41, 65.32s/it]
Total articles processed: 16560 of 20874 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200005.xml.gz
9392  articles in this file. Extracting 100 article / headline pairs
 16%|█▌        | 139/857 [4:59:00<12:08:34, 60.88s/it]
Total articles processed: 16680 of 21020 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199810.xml.gz
932  articles in this file. Extracting 100 article / headline pairs
 16%|█▋        | 140/857 [4:59:12<9:13:47, 46.34s/it] 
Total articles processed: 16800 of 21151 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199908.xml.gz
8978  articles in this file. Extracting 100 article / headline pairs
 16%|█▋        | 141/857 [5:00:02<9:25:30, 47.39s/it]
Total articles processed: 16920 of 21307 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201002.xml.gz
20156  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 142/857 [5:04:34<22:49:04, 114.89s/it]
Total articles processed: 17040 of 21440 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200401.xml.gz
3725  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 143/857 [5:06:06<21:26:03, 108.07s/it]
Total articles processed: 17160 of 21609 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201008.xml.gz
11910  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 144/857 [5:08:29<23:26:40, 118.37s/it]
Total articles processed: 17280 of 21747 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199507.xml.gz
17489  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 145/857 [5:13:28<34:08:18, 172.61s/it]
Total articles processed: 17400 of 21882 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199612.xml.gz
7452  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 146/857 [5:14:10<26:19:51, 133.32s/it]
Total articles processed: 17520 of 22026 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200703.xml.gz
8143  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 147/857 [5:19:08<36:04:30, 182.92s/it]
Total articles processed: 17640 of 22177 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199407.xml.gz
3110  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 148/857 [5:20:15<29:09:13, 148.03s/it]
Total articles processed: 17760 of 22345 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200805.xml.gz
3413  articles in this file. Extracting 100 article / headline pairs
 17%|█▋        | 149/857 [5:21:37<25:12:33, 128.18s/it]
Total articles processed: 17880 of 22526 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200806.xml.gz
6790  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 150/857 [5:25:35<31:40:48, 161.31s/it]
Total articles processed: 18000 of 22693 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201001.xml.gz
1273  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 151/857 [5:25:51<23:05:08, 117.72s/it]
Total articles processed: 18120 of 22844 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199603.xml.gz
17451  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 152/857 [5:28:45<26:18:39, 134.35s/it]
Total articles processed: 18240 of 22966 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199603.xml.gz
2853  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 153/857 [5:29:45<21:57:28, 112.28s/it]
Total articles processed: 18360 of 23157 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200609.xml.gz
1238  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 154/857 [5:29:59<16:08:04, 82.62s/it] 
Total articles processed: 18480 of 23327 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200503.xml.gz
8736  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 155/857 [5:35:29<30:35:55, 156.92s/it]
Total articles processed: 18600 of 23496 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199805.xml.gz
830  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 156/857 [5:35:41<22:04:55, 113.40s/it]
Total articles processed: 18720 of 23629 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200809.xml.gz
2964  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 157/857 [5:36:50<19:27:25, 100.07s/it]
Total articles processed: 18840 of 23817 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200707.xml.gz
11360  articles in this file. Extracting 100 article / headline pairs
 18%|█▊        | 158/857 [5:37:59<17:38:22, 90.85s/it] 
Total articles processed: 18960 of 23963 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200606.xml.gz
3723  articles in this file. Extracting 100 article / headline pairs
 19%|█▊        | 159/857 [5:39:24<17:16:06, 89.06s/it]
Total articles processed: 19080 of 24136 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200304.xml.gz
2209  articles in this file. Extracting 100 article / headline pairs
 19%|█▊        | 160/857 [5:40:03<14:20:28, 74.07s/it]
Total articles processed: 19200 of 24317 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200408.xml.gz
3711  articles in this file. Extracting 100 article / headline pairs
 19%|█▉        | 161/857 [5:41:32<15:10:35, 78.50s/it]
Total articles processed: 19320 of 24484 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199412.xml.gz
7676  articles in this file. Extracting 100 article / headline pairs
 19%|█▉        | 162/857 [5:45:15<23:30:45, 121.79s/it]
Total articles processed: 19440 of 24640 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199406.xml.gz
2792  articles in this file. Extracting 100 article / headline pairs
 19%|█▉        | 163/857 [5:46:16<19:58:21, 103.60s/it]
Total articles processed: 19560 of 24806 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199505.xml.gz
3249  articles in this file. Extracting 100 article / headline pairs
 19%|█▉        | 164/857 [5:47:29<18:09:11, 94.30s/it] 
Total articles processed: 19680 of 24983 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199703.xml.gz
19219  articles in this file. Extracting 100 article / headline pairs
 19%|█▉        | 165/857 [5:51:05<25:11:50, 131.08s/it]
Total articles processed: 19800 of 25106 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200510.xml.gz
3800  articles in this file. Extracting 100 article / headline pairs
 19%|█▉        | 166/857 [5:52:32<22:36:23, 117.78s/it]
Total articles processed: 19920 of 25278 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200507.xml.gz
3664  articles in this file. Extracting 100 article / headline pairs
 19%|█▉        | 167/857 [5:54:02<20:57:42, 109.37s/it]
Total articles processed: 20040 of 25441 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200807.xml.gz
1491  articles in this file. Extracting 100 article / headline pairs
 20%|█▉        | 168/857 [5:54:17<15:32:37, 81.21s/it] 
Total articles processed: 20160 of 25603 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200902.xml.gz
1144  articles in this file. Extracting 100 article / headline pairs
 20%|█▉        | 169/857 [5:54:31<11:40:02, 61.05s/it]
Total articles processed: 20280 of 25765 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200602.xml.gz
9254  articles in this file. Extracting 100 article / headline pairs
 20%|█▉        | 170/857 [6:00:27<28:30:26, 149.38s/it]
Total articles processed: 20400 of 25927 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200408.xml.gz
92  articles in this file. Extracting 100 article / headline pairs
 20%|█▉        | 171/857 [6:00:31<20:10:35, 105.88s/it]
Total articles processed: 20467 of 26019 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200612.xml.gz
8267  articles in this file. Extracting 100 article / headline pairs
 20%|██        | 172/857 [6:05:39<31:41:03, 166.52s/it]
Total articles processed: 20587 of 26180 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200012.xml.gz
719  articles in this file. Extracting 100 article / headline pairs
 20%|██        | 173/857 [6:05:51<22:47:56, 119.99s/it]
Total articles processed: 20707 of 26325 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200805.xml.gz
8385  articles in this file. Extracting 100 article / headline pairs
 20%|██        | 174/857 [6:06:40<18:45:42, 98.89s/it] 
Total articles processed: 20827 of 26469 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200104.xml.gz
8859  articles in this file. Extracting 100 article / headline pairs
 20%|██        | 175/857 [6:07:30<15:54:14, 83.95s/it]
Total articles processed: 20947 of 26625 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199506.xml.gz
3140  articles in this file. Extracting 100 article / headline pairs
 21%|██        | 176/857 [6:08:39<15:04:29, 79.69s/it]
Total articles processed: 21067 of 26783 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200202.xml.gz
18257  articles in this file. Extracting 100 article / headline pairs
 21%|██        | 177/857 [6:12:39<24:08:25, 127.80s/it]
Total articles processed: 21187 of 26903 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199803.xml.gz
8848  articles in this file. Extracting 100 article / headline pairs
 21%|██        | 178/857 [6:13:29<19:41:36, 104.41s/it]
Total articles processed: 21307 of 27038 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200710.xml.gz
14962  articles in this file. Extracting 100 article / headline pairs
 21%|██        | 179/857 [6:16:43<24:41:55, 131.14s/it]
Total articles processed: 21427 of 27168 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199902.xml.gz
7350  articles in this file. Extracting 100 article / headline pairs
 21%|██        | 180/857 [6:17:25<19:38:00, 104.40s/it]
Total articles processed: 21547 of 27317 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199910.xml.gz
8701  articles in this file. Extracting 100 article / headline pairs
 21%|██        | 181/857 [6:19:59<22:26:07, 119.48s/it]
Total articles processed: 21667 of 27438 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200405.xml.gz
10477  articles in this file. Extracting 100 article / headline pairs
 21%|██        | 182/857 [6:21:10<19:38:42, 104.77s/it]
Total articles processed: 21787 of 27581 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200306.xml.gz
1925  articles in this file. Extracting 100 article / headline pairs
 21%|██▏       | 183/857 [6:21:44<15:40:26, 83.72s/it] 
Total articles processed: 21907 of 27759 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200010.xml.gz
8864  articles in this file. Extracting 100 article / headline pairs
 21%|██▏       | 184/857 [6:22:34<13:45:01, 73.55s/it]
Total articles processed: 22027 of 27915 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199704.xml.gz
14110  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 185/857 [6:25:51<20:36:32, 110.41s/it]
Total articles processed: 22147 of 28063 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201009.xml.gz
9589  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 186/857 [6:27:33<20:07:37, 107.98s/it]
Total articles processed: 22267 of 28200 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200105.xml.gz
8924  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 187/857 [6:28:22<16:47:24, 90.22s/it] 
Total articles processed: 22387 of 28355 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200804.xml.gz
19372  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 188/857 [6:32:55<26:59:20, 145.23s/it]
Total articles processed: 22507 of 28716 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199801.xml.gz
685  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 189/857 [6:33:07<19:30:47, 105.16s/it]
Total articles processed: 22627 of 28863 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200608.xml.gz
1437  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 190/857 [6:33:21<14:25:21, 77.84s/it] 
Total articles processed: 22747 of 29031 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200504.xml.gz
8905  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 191/857 [6:35:27<17:03:10, 92.18s/it]
Total articles processed: 22867 of 29162 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199703.xml.gz
2956  articles in this file. Extracting 100 article / headline pairs
 22%|██▏       | 192/857 [6:36:29<15:23:37, 83.33s/it]
Total articles processed: 22987 of 29347 (78.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200505.xml.gz
12029  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 193/857 [6:39:31<20:48:26, 112.81s/it]
Total articles processed: 23107 of 29475 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199901.xml.gz
796  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 194/857 [6:39:42<15:10:41, 82.42s/it] 
Total articles processed: 23227 of 29618 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200112.xml.gz
668  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 195/857 [6:39:53<11:10:48, 60.80s/it]
Total articles processed: 23347 of 29756 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200606.xml.gz
1297  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 196/857 [6:40:06<8:32:17, 46.50s/it] 
Total articles processed: 23467 of 29925 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199510.xml.gz
14309  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 197/857 [6:42:21<13:22:27, 72.95s/it]
Total articles processed: 23587 of 30047 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200211.xml.gz
9030  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 198/857 [6:43:20<12:35:02, 68.74s/it]
Total articles processed: 23707 of 30217 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200208.xml.gz
18806  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 199/857 [6:49:18<28:28:24, 155.78s/it]
Total articles processed: 23827 of 30365 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201010.xml.gz
11046  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 200/857 [6:51:23<26:42:20, 146.33s/it]
Total articles processed: 23947 of 30502 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200805.xml.gz
1530  articles in this file. Extracting 100 article / headline pairs
 23%|██▎       | 201/857 [6:51:40<19:35:22, 107.50s/it]
Total articles processed: 24067 of 30659 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200807.xml.gz
19173  articles in this file. Extracting 100 article / headline pairs
 24%|██▎       | 202/857 [6:57:58<34:20:09, 188.72s/it]
Total articles processed: 24187 of 30784 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200309.xml.gz
313  articles in this file. Extracting 100 article / headline pairs
 24%|██▎       | 203/857 [6:58:06<24:26:45, 134.57s/it]
Total articles processed: 24307 of 30958 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200903.xml.gz
3066  articles in this file. Extracting 100 article / headline pairs
 24%|██▍       | 204/857 [6:59:12<20:42:00, 114.12s/it]
Total articles processed: 24427 of 31133 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200712.xml.gz
3195  articles in this file. Extracting 100 article / headline pairs
 24%|██▍       | 205/857 [7:00:29<18:36:32, 102.75s/it]
Total articles processed: 24547 of 31300 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199903.xml.gz
17977  articles in this file. Extracting 100 article / headline pairs
 24%|██▍       | 206/857 [7:05:27<29:11:44, 161.45s/it]
Total articles processed: 24667 of 31450 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199502.xml.gz
13554  articles in this file. Extracting 100 article / headline pairs
 24%|██▍       | 207/857 [7:08:35<30:35:32, 169.43s/it]
Total articles processed: 24787 of 31589 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200701.xml.gz
1197  articles in this file. Extracting 100 article / headline pairs
 24%|██▍       | 208/857 [7:08:48<22:04:43, 122.47s/it]
Total articles processed: 24907 of 31751 (78.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201011.xml.gz
1324  articles in this file. Extracting 100 article / headline pairs
 24%|██▍       | 209/857 [7:09:04<16:16:06, 90.38s/it] 
Total articles processed: 25027 of 31895 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200709.xml.gz
10997  articles in this file. Extracting 100 article / headline pairs
 25%|██▍       | 210/857 [7:10:10<14:58:39, 83.34s/it]
Total articles processed: 25147 of 32046 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199702.xml.gz
13134  articles in this file. Extracting 100 article / headline pairs
 25%|██▍       | 211/857 [7:13:10<20:07:16, 112.13s/it]
Total articles processed: 25267 of 32188 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200312.xml.gz
897  articles in this file. Extracting 100 article / headline pairs
 25%|██▍       | 212/857 [7:13:21<14:39:37, 81.83s/it] 
Total articles processed: 25387 of 32353 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200001.xml.gz
788  articles in this file. Extracting 100 article / headline pairs
 25%|██▍       | 213/857 [7:13:32<10:50:54, 60.64s/it]
Total articles processed: 25507 of 32492 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201002.xml.gz
10093  articles in this file. Extracting 100 article / headline pairs
 25%|██▍       | 214/857 [7:14:43<11:21:20, 63.58s/it]
Total articles processed: 25627 of 32628 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200802.xml.gz
1142  articles in this file. Extracting 100 article / headline pairs
 25%|██▌       | 215/857 [7:14:57<8:41:02, 48.69s/it] 
Total articles processed: 25747 of 32777 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200302.xml.gz
22065  articles in this file. Extracting 100 article / headline pairs
 25%|██▌       | 216/857 [7:20:25<23:37:16, 132.66s/it]
Total articles processed: 25867 of 32906 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200101.xml.gz
705  articles in this file. Extracting 100 article / headline pairs
 25%|██▌       | 217/857 [7:20:36<17:06:33, 96.24s/it] 
Total articles processed: 25987 of 33049 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200705.xml.gz
3585  articles in this file. Extracting 100 article / headline pairs
 25%|██▌       | 218/857 [7:21:58<16:18:21, 91.86s/it]
Total articles processed: 26107 of 33224 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199612.xml.gz
2820  articles in this file. Extracting 100 article / headline pairs
 26%|██▌       | 219/857 [7:22:57<14:31:47, 81.99s/it]
Total articles processed: 26227 of 33394 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199611.xml.gz
7734  articles in this file. Extracting 100 article / headline pairs
 26%|██▌       | 220/857 [7:23:40<12:25:34, 70.23s/it]
Total articles processed: 26347 of 33526 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200908.xml.gz
19063  articles in this file. Extracting 100 article / headline pairs
 26%|██▌       | 221/857 [7:27:49<21:54:14, 123.98s/it]
Total articles processed: 26467 of 33670 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199909.xml.gz
831  articles in this file. Extracting 100 article / headline pairs
 26%|██▌       | 222/857 [7:28:01<15:56:26, 90.37s/it] 
Total articles processed: 26587 of 33819 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200602.xml.gz
1044  articles in this file. Extracting 100 article / headline pairs
 26%|██▌       | 223/857 [7:28:13<11:47:23, 66.95s/it]
Total articles processed: 26707 of 33989 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200005.xml.gz
9397  articles in this file. Extracting 100 article / headline pairs
 26%|██▌       | 224/857 [7:34:14<27:15:48, 155.05s/it]
Total articles processed: 26827 of 34144 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200601.xml.gz
9314  articles in this file. Extracting 100 article / headline pairs
 26%|██▋       | 225/857 [7:35:13<22:10:11, 126.28s/it]
Total articles processed: 26947 of 34304 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199809.xml.gz
841  articles in this file. Extracting 100 article / headline pairs
 26%|██▋       | 226/857 [7:35:25<16:05:51, 91.84s/it] 
Total articles processed: 27067 of 34437 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199601.xml.gz
7188  articles in this file. Extracting 100 article / headline pairs
 26%|██▋       | 227/857 [7:36:03<13:14:48, 75.70s/it]
Total articles processed: 27187 of 34588 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200808.xml.gz
6804  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 228/857 [7:40:05<21:56:47, 125.61s/it]
Total articles processed: 27307 of 34746 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199802.xml.gz
2905  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 229/857 [7:41:09<18:40:52, 107.09s/it]
Total articles processed: 27427 of 34920 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200811.xml.gz
9292  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 230/857 [7:42:13<16:25:28, 94.30s/it] 
Total articles processed: 27547 of 35056 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200912.xml.gz
4986  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 231/857 [7:44:41<19:10:55, 110.31s/it]
Total articles processed: 27667 of 35207 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200307.xml.gz
13  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 232/857 [7:44:42<13:28:31, 77.62s/it] 
Total articles processed: 27680 of 35220 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201004.xml.gz
5541  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 233/857 [7:47:32<18:16:31, 105.44s/it]
Total articles processed: 27800 of 35394 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199501.xml.gz
6202  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 234/857 [7:48:08<14:35:36, 84.33s/it] 
Total articles processed: 27920 of 35529 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200009.xml.gz
8986  articles in this file. Extracting 100 article / headline pairs
 27%|██▋       | 235/857 [7:54:02<28:34:01, 165.34s/it]
Total articles processed: 28040 of 35701 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199705.xml.gz
7518  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 236/857 [7:55:08<23:22:23, 135.50s/it]
Total articles processed: 28160 of 35821 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200605.xml.gz
1325  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 237/857 [7:55:22<17:03:16, 99.03s/it] 
Total articles processed: 28280 of 35983 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200203.xml.gz
9676  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 238/857 [7:56:22<15:01:33, 87.39s/it]
Total articles processed: 28400 of 36140 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200811.xml.gz
1273  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 239/857 [7:56:38<11:20:39, 66.08s/it]
Total articles processed: 28520 of 36284 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200404.xml.gz
10491  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 240/857 [7:57:50<11:35:51, 67.67s/it]
Total articles processed: 28640 of 36429 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200111.xml.gz
19617  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 241/857 [8:03:44<26:17:27, 153.65s/it]
Total articles processed: 28760 of 36564 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199706.xml.gz
2703  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 242/857 [8:04:42<21:19:43, 124.85s/it]
Total articles processed: 28880 of 36753 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200307.xml.gz
2045  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 243/857 [8:05:18<16:46:08, 98.32s/it] 
Total articles processed: 29000 of 36923 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200502.xml.gz
17542  articles in this file. Extracting 100 article / headline pairs
 28%|██▊       | 244/857 [8:09:11<23:36:21, 138.63s/it]
Total articles processed: 29120 of 37069 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199902.xml.gz
14819  articles in this file. Extracting 100 article / headline pairs
 29%|██▊       | 245/857 [8:12:54<27:54:01, 164.12s/it]
Total articles processed: 29240 of 37206 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200912.xml.gz
18821  articles in this file. Extracting 100 article / headline pairs
 29%|██▊       | 246/857 [8:17:03<32:10:12, 189.55s/it]
Total articles processed: 29360 of 37332 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200411.xml.gz
1114  articles in this file. Extracting 100 article / headline pairs
 29%|██▉       | 247/857 [8:17:16<23:08:33, 136.58s/it]
Total articles processed: 29480 of 37491 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199608.xml.gz
2542  articles in this file. Extracting 100 article / headline pairs
 29%|██▉       | 248/857 [8:18:09<18:52:31, 111.58s/it]
Total articles processed: 29600 of 37671 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199507.xml.gz
2893  articles in this file. Extracting 100 article / headline pairs
 29%|██▉       | 249/857 [8:19:13<16:24:52, 97.19s/it] 
Total articles processed: 29720 of 37862 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200812.xml.gz
6000  articles in this file. Extracting 100 article / headline pairs
 29%|██▉       | 250/857 [8:22:42<22:02:13, 130.70s/it]
Total articles processed: 29840 of 38025 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200602.xml.gz
9448  articles in this file. Extracting 100 article / headline pairs
 29%|██▉       | 251/857 [8:23:40<18:20:17, 108.94s/it]
Total articles processed: 29960 of 38168 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200612.xml.gz
8043  articles in this file. Extracting 100 article / headline pairs
 29%|██▉       | 252/857 [8:25:53<19:29:39, 116.00s/it]
Total articles processed: 30080 of 38303 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200810.xml.gz
20033  articles in this file. Extracting 100 article / headline pairs
 30%|██▉       | 253/857 [8:32:04<32:18:33, 192.57s/it]
Total articles processed: 30200 of 38427 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199709.xml.gz
508 
 30%|██▉       | 254/857 [8:32:17<23:13:13, 138.63s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 30320 of 38563 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199503.xml.gz
10616  articles in this file. Extracting 100 article / headline pairs
 30%|██▉       | 255/857 [8:38:06<33:46:19, 201.96s/it]
Total articles processed: 30440 of 38711 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200909.xml.gz
5616  articles in this file. Extracting 100 article / headline pairs
 30%|██▉       | 256/857 [8:40:52<31:54:31, 191.13s/it]
Total articles processed: 30560 of 38864 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199906.xml.gz
8653 
 30%|██▉       | 257/857 [8:41:41<24:44:37, 148.46s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 30680 of 39013 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199604.xml.gz
2849  articles in this file. Extracting 100 article / headline pairs
 30%|███       | 258/857 [8:42:43<20:24:16, 122.63s/it]
Total articles processed: 30800 of 39187 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199709.xml.gz
8025  articles in this file. Extracting 100 article / headline pairs
 30%|███       | 259/857 [8:43:30<16:34:47, 99.81s/it] 
Total articles processed: 30920 of 39335 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201001.xml.gz
10933  articles in this file. Extracting 100 article / headline pairs
 30%|███       | 260/857 [8:44:50<15:32:55, 93.76s/it]
Total articles processed: 31040 of 39485 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200005.xml.gz
5812  articles in this file. Extracting 100 article / headline pairs
 30%|███       | 261/857 [8:46:33<16:00:35, 96.70s/it]
Total articles processed: 31160 of 39619 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200808.xml.gz
22826  articles in this file. Extracting 100 article / headline pairs
 31%|███       | 262/857 [8:51:49<26:51:43, 162.53s/it]
Total articles processed: 31280 of 39847 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199506.xml.gz
19005  articles in this file. Extracting 100 article / headline pairs
 31%|███       | 263/857 [8:57:21<35:12:21, 213.37s/it]
Total articles processed: 31400 of 39986 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200207.xml.gz
8914  articles in this file. Extracting 100 article / headline pairs
 31%|███       | 264/857 [8:58:15<27:15:55, 165.52s/it]
Total articles processed: 31520 of 40138 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200607.xml.gz
12209  articles in this file. Extracting 100 article / headline pairs
 31%|███       | 265/857 [9:01:34<28:52:37, 175.60s/it]
Total articles processed: 31640 of 40270 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200907.xml.gz
5590  articles in this file. Extracting 100 article / headline pairs
 31%|███       | 266/857 [9:04:24<28:32:48, 173.89s/it]
Total articles processed: 31760 of 40441 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200302.xml.gz
1901  articles in this file. Extracting 100 article / headline pairs
 31%|███       | 267/857 [9:05:00<21:41:14, 132.33s/it]
Total articles processed: 31880 of 40609 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200507.xml.gz
10341  articles in this file. Extracting 100 article / headline pairs
 31%|███▏      | 268/857 [9:06:08<18:31:09, 113.19s/it]
Total articles processed: 32000 of 40751 (78.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199607.xml.gz
8514  articles in this file. Extracting 100 article / headline pairs
 31%|███▏      | 269/857 [9:06:54<15:11:14, 92.98s/it] 
Total articles processed: 32120 of 40890 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201011.xml.gz
10252  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 270/857 [9:08:47<16:08:19, 98.98s/it]
Total articles processed: 32240 of 41026 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200907.xml.gz
1366  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 271/857 [9:09:03<12:04:49, 74.21s/it]
Total articles processed: 32360 of 41169 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199611.xml.gz
20362  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 272/857 [9:12:37<18:51:01, 116.00s/it]
Total articles processed: 32480 of 41289 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200209.xml.gz
1792  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 273/857 [9:12:57<14:08:30, 87.18s/it] 
Total articles processed: 32600 of 41430 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200311.xml.gz
855  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 274/857 [9:13:07<10:23:03, 64.12s/it]
Total articles processed: 32720 of 41613 (78.6%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201003.xml.gz
12132  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 275/857 [9:16:13<16:16:12, 100.64s/it]
Total articles processed: 32840 of 41739 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200510.xml.gz
10032  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 276/857 [9:17:23<14:46:18, 91.53s/it] 
Total articles processed: 32960 of 41883 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200609.xml.gz
10197  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 277/857 [9:20:09<18:20:53, 113.88s/it]
Total articles processed: 33080 of 42008 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200512.xml.gz
8303  articles in this file. Extracting 100 article / headline pairs
 32%|███▏      | 278/857 [9:21:45<17:26:55, 108.49s/it]
Total articles processed: 33200 of 42159 (78.7%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201006.xml.gz
21894  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 279/857 [9:27:02<27:27:03, 170.97s/it]
Total articles processed: 33320 of 42285 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201012.xml.gz
15548  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 280/857 [9:31:21<31:37:51, 197.35s/it]
Total articles processed: 33440 of 42411 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200511.xml.gz
12454  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 281/857 [9:34:33<31:20:18, 195.87s/it]
Total articles processed: 33560 of 42535 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199605.xml.gz
8129  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 282/857 [9:35:18<24:01:26, 150.41s/it]
Total articles processed: 33680 of 42683 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200506.xml.gz
9397  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 283/857 [9:36:21<19:48:24, 124.22s/it]
Total articles processed: 33800 of 42833 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200609.xml.gz
3747  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 284/857 [9:37:48<18:00:32, 113.15s/it]
Total articles processed: 33920 of 43009 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200806.xml.gz
5873  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 285/857 [9:38:26<14:22:53, 90.51s/it] 
Total articles processed: 34040 of 43148 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199408.xml.gz
8404  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 286/857 [9:42:38<22:02:37, 138.98s/it]
Total articles processed: 34160 of 43293 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200512.xml.gz
3641  articles in this file. Extracting 100 article / headline pairs
 33%|███▎      | 287/857 [9:44:04<19:29:47, 123.14s/it]
Total articles processed: 34280 of 43469 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201005.xml.gz
2464  articles in this file. Extracting 100 article / headline pairs
 34%|███▎      | 288/857 [9:44:54<16:01:22, 101.38s/it]
Total articles processed: 34400 of 43618 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200402.xml.gz
3836  articles in this file. Extracting 100 article / headline pairs
 34%|███▎      | 289/857 [9:46:28<15:38:25, 99.13s/it] 
Total articles processed: 34520 of 43785 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200108.xml.gz
904  articles in this file. Extracting 100 article / headline pairs
 34%|███▍      | 290/857 [9:46:41<11:30:27, 73.06s/it]
Total articles processed: 34640 of 43926 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200502.xml.gz
8197  articles in this file. Extracting 100 article / headline pairs
 34%|███▍      | 291/857 [9:47:28<10:16:28, 65.35s/it]
Total articles processed: 34760 of 44072 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200704.xml.gz
9931  articles in this file. Extracting 100 article / headline pairs
 34%|███▍      | 292/857 [9:48:27<9:58:24, 63.55s/it] 
Total articles processed: 34880 of 44217 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199411.xml.gz
7565  articles in this file. Extracting 100 article / headline pairs
 34%|███▍      | 293/857 [9:52:05<17:10:57, 109.68s/it]
Total articles processed: 35000 of 44368 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201005.xml.gz
11463  articles in this file. Extracting 100 article / headline pairs
 34%|███▍      | 294/857 [9:55:12<20:47:21, 132.93s/it]
Total articles processed: 35120 of 44493 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200701.xml.gz
15919  articles in this file. Extracting 100 article / headline pairs
 34%|███▍      | 295/857 [9:58:27<23:38:45, 151.47s/it]
Total articles processed: 35240 of 44627 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200903.xml.gz
1205  articles in this file. Extracting 100 article / headline pairs
 35%|███▍      | 296/857 [9:58:42<17:14:49, 110.68s/it]
Total articles processed: 35360 of 44784 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200210.xml.gz
23286  articles in this file. Extracting 100 article / headline pairs
 35%|███▍      | 297/857 [10:04:46<29:03:06, 186.76s/it]
Total articles processed: 35480 of 44920 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200905.xml.gz
10682  articles in this file. Extracting 100 article / headline pairs
 35%|███▍      | 298/857 [10:06:02<23:50:14, 153.51s/it]
Total articles processed: 35600 of 45065 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200509.xml.gz
3987  articles in this file. Extracting 100 article / headline pairs
 35%|███▍      | 299/857 [10:07:39<21:10:17, 136.59s/it]
Total articles processed: 35720 of 45249 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200501.xml.gz
9633  articles in this file. Extracting 100 article / headline pairs
 35%|███▌      | 300/857 [10:08:43<17:43:45, 114.59s/it]
Total articles processed: 35840 of 45401 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199410.xml.gz
12021  articles in this file. Extracting 100 article / headline pairs
 35%|███▌      | 301/857 [10:10:33<17:29:32, 113.26s/it]
Total articles processed: 35960 of 45523 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200705.xml.gz
14965  articles in this file. Extracting 100 article / headline pairs
 35%|███▌      | 302/857 [10:13:29<20:23:21, 132.26s/it]
Total articles processed: 36080 of 45655 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200509.xml.gz
1023  articles in this file. Extracting 100 article / headline pairs
 35%|███▌      | 303/857 [10:13:42<14:49:29, 96.33s/it] 
Total articles processed: 36200 of 45826 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200907.xml.gz
19756  articles in this file. Extracting 100 article / headline pairs
 35%|███▌      | 304/857 [10:17:58<22:08:30, 144.14s/it]
Total articles processed: 36320 of 45954 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200801.xml.gz
7461  articles in this file. Extracting 100 article / headline pairs
 36%|███▌      | 305/857 [10:22:37<28:19:40, 184.75s/it]
Total articles processed: 36440 of 46122 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200309.xml.gz
2135 
 36%|███▌      | 306/857 [10:23:15<21:31:57, 140.69s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 36560 of 46329 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200910.xml.gz
11203  articles in this file. Extracting 100 article / headline pairs
 36%|███▌      | 307/857 [10:24:39<18:54:40, 123.78s/it]
Total articles processed: 36680 of 46479 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199509.xml.gz
16989  articles in this file. Extracting 100 article / headline pairs
 36%|███▌      | 308/857 [10:29:25<26:18:04, 172.47s/it]
Total articles processed: 36800 of 46621 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199606.xml.gz
11282  articles in this file. Extracting 100 article / headline pairs
 36%|███▌      | 309/857 [10:31:59<25:24:15, 166.89s/it]
Total articles processed: 36920 of 46769 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200702.xml.gz
3222  articles in this file. Extracting 100 article / headline pairs
 36%|███▌      | 310/857 [10:33:10<20:59:32, 138.16s/it]
Total articles processed: 37040 of 46955 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200311.xml.gz
19457  articles in this file. Extracting 100 article / headline pairs
 36%|███▋      | 311/857 [10:37:19<26:00:02, 171.43s/it]
Total articles processed: 37160 of 47086 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200712.xml.gz
1260  articles in this file. Extracting 100 article / headline pairs
 36%|███▋      | 312/857 [10:37:34<18:48:28, 124.24s/it]
Total articles processed: 37280 of 47247 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199704.xml.gz
3002  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 313/857 [10:38:36<15:59:39, 105.84s/it]
Total articles processed: 37400 of 47434 (78.8%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199612.xml.gz
12908  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 314/857 [10:41:36<19:18:26, 128.00s/it]
Total articles processed: 37520 of 47573 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200609.xml.gz
9159  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 315/857 [10:47:33<29:37:13, 196.74s/it]
Total articles processed: 37640 of 47733 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200309.xml.gz
10700  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 316/857 [10:49:33<26:06:15, 173.71s/it]
Total articles processed: 37760 of 47871 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199501.xml.gz
14863  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 317/857 [10:53:21<28:30:12, 190.02s/it]
Total articles processed: 37880 of 48005 (78.9%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200002.xml.gz
7376  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 318/857 [10:55:37<26:00:31, 173.71s/it]
Total articles processed: 38000 of 48131 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200612.xml.gz
16203  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 319/857 [10:59:17<28:01:36, 187.54s/it]
Total articles processed: 38120 of 48265 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200308.xml.gz
2837  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 320/857 [11:00:20<22:23:33, 150.12s/it]
Total articles processed: 38240 of 48424 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201010.xml.gz
17867  articles in this file. Extracting 100 article / headline pairs
 37%|███▋      | 321/857 [11:05:38<29:51:17, 200.52s/it]
Total articles processed: 38360 of 48552 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199601.xml.gz
12647  articles in this file. Extracting 100 article / headline pairs
 38%|███▊      | 322/857 [11:08:37<28:51:11, 194.15s/it]
Total articles processed: 38480 of 48689 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200902.xml.gz
21270  articles in this file. Extracting 100 article / headline pairs
 38%|███▊      | 323/857 [11:13:11<32:22:14, 218.23s/it]
Total articles processed: 38600 of 48821 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201003.xml.gz
2564  articles in this file. Extracting 100 article / headline pairs
 38%|███▊      | 324/857 [11:14:04<24:57:22, 168.56s/it]
Total articles processed: 38720 of 48979 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200605.xml.gz
13795  articles in this file. Extracting 100 article / headline pairs
 38%|███▊      | 325/857 [11:17:53<27:36:21, 186.81s/it]
Total articles processed: 38840 of 49112 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200405.xml.gz
3845 
 38%|███▊      | 326/857 [11:19:31<23:35:39, 159.96s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 38960 of 49284 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200401.xml.gz
694  articles in this file. Extracting 100 article / headline pairs
 38%|███▊      | 327/857 [11:19:40<16:54:41, 114.87s/it]
Total articles processed: 39080 of 49474 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201010.xml.gz
1278  articles in this file. Extracting 100 article / headline pairs
 38%|███▊      | 328/857 [11:19:55<12:28:38, 84.91s/it] 
Total articles processed: 39200 of 49612 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199606.xml.gz
2745  articles in this file. Extracting 100 article / headline pairs
 38%|███▊      | 329/857 [11:20:54<11:17:10, 76.95s/it]
Total articles processed: 39320 of 49790 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199409.xml.gz
11122  articles in this file. Extracting 100 article / headline pairs
 39%|███▊      | 330/857 [11:22:34<12:16:39, 83.87s/it]
Total articles processed: 39440 of 49911 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200511.xml.gz
8914  articles in this file. Extracting 100 article / headline pairs
 39%|███▊      | 331/857 [11:28:10<23:19:20, 159.62s/it]
Total articles processed: 39560 of 50079 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200609.xml.gz
10572  articles in this file. Extracting 100 article / headline pairs
 39%|███▊      | 332/857 [11:29:20<19:20:19, 132.61s/it]
Total articles processed: 39680 of 50228 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200408.xml.gz
11084  articles in this file. Extracting 100 article / headline pairs
 39%|███▉      | 333/857 [11:30:38<16:55:39, 116.30s/it]
Total articles processed: 39800 of 50381 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199805.xml.gz
8965  articles in this file. Extracting 100 article / headline pairs
 39%|███▉      | 334/857 [11:31:29<14:03:34, 96.78s/it] 
Total articles processed: 39920 of 50521 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200312.xml.gz
9878  articles in this file. Extracting 100 article / headline pairs
 39%|███▉      | 335/857 [11:32:38<12:49:04, 88.40s/it]
Total articles processed: 40040 of 50673 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200512.xml.gz
12349  articles in this file. Extracting 100 article / headline pairs
 39%|███▉      | 336/857 [11:35:53<17:25:17, 120.38s/it]
Total articles processed: 40160 of 50805 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200910.xml.gz
3076  articles in this file. Extracting 100 article / headline pairs
 39%|███▉      | 337/857 [11:36:59<15:02:36, 104.15s/it]
Total articles processed: 40280 of 50961 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200710.xml.gz
8181  articles in this file. Extracting 100 article / headline pairs
 39%|███▉      | 338/857 [11:42:04<23:42:14, 164.42s/it]
Total articles processed: 40400 of 51125 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200108.xml.gz
15744  articles in this file. Extracting 100 article / headline pairs
 40%|███▉      | 339/857 [11:46:33<28:08:31, 195.58s/it]
Total articles processed: 40520 of 51264 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199910.xml.gz
779  articles in this file. Extracting 100 article / headline pairs
 40%|███▉      | 340/857 [11:46:44<20:09:57, 140.42s/it]
Total articles processed: 40640 of 51404 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201002.xml.gz
10915  articles in this file. Extracting 100 article / headline pairs
 40%|███▉      | 341/857 [11:49:29<21:10:54, 147.78s/it]
Total articles processed: 40760 of 51532 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200004.xml.gz
725  articles in this file. Extracting 100 article / headline pairs
 40%|███▉      | 342/857 [11:49:41<15:18:22, 107.00s/it]
Total articles processed: 40880 of 51675 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200906.xml.gz
9381  articles in this file. Extracting 100 article / headline pairs
 40%|████      | 343/857 [11:50:47<13:29:56, 94.55s/it] 
Total articles processed: 41000 of 51810 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201010.xml.gz
6050  articles in this file. Extracting 100 article / headline pairs
 40%|████      | 344/857 [11:54:11<18:11:02, 127.61s/it]
Total articles processed: 41120 of 51987 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199808.xml.gz
14302  articles in this file. Extracting 100 article / headline pairs
 40%|████      | 345/857 [11:57:39<21:33:14, 151.55s/it]
Total articles processed: 41240 of 52127 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199810.xml.gz
8539  articles in this file. Extracting 100 article / headline pairs
 40%|████      | 346/857 [11:58:29<17:11:07, 121.07s/it]
Total articles processed: 41360 of 52278 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199912.xml.gz
8788  articles in this file. Extracting 100 article / headline pairs
 40%|████      | 347/857 [11:59:19<14:07:41, 99.73s/it] 
Total articles processed: 41480 of 52435 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200506.xml.gz
20528  articles in this file. Extracting 100 article / headline pairs
 41%|████      | 348/857 [12:04:33<23:13:05, 164.21s/it]
Total articles processed: 41600 of 52567 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200501.xml.gz
9220  articles in this file. Extracting 100 article / headline pairs
 41%|████      | 349/857 [12:10:28<31:14:55, 221.45s/it]
Total articles processed: 41720 of 52748 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200809.xml.gz
8278  articles in this file. Extracting 100 article / headline pairs
 41%|████      | 350/857 [12:11:25<24:13:24, 172.00s/it]
Total articles processed: 41840 of 52892 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201002.xml.gz
1809  articles in this file. Extracting 100 article / headline pairs
 41%|████      | 351/857 [12:12:03<18:30:48, 131.72s/it]
Total articles processed: 41960 of 53051 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200703.xml.gz
3673  articles in this file. Extracting 100 article / headline pairs
 41%|████      | 352/857 [12:13:30<16:35:38, 118.29s/it]
Total articles processed: 42080 of 53227 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200507.xml.gz
1131  articles in this file. Extracting 100 article / headline pairs
 41%|████      | 353/857 [12:13:43<12:08:48, 86.76s/it] 
Total articles processed: 42200 of 53387 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200502.xml.gz
7552  articles in this file. Extracting 100 article / headline pairs
 41%|████▏     | 354/857 [12:18:25<20:17:35, 145.24s/it]
Total articles processed: 42320 of 53552 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200906.xml.gz
2992  articles in this file. Extracting 100 article / headline pairs
 41%|████▏     | 355/857 [12:19:32<17:00:40, 121.99s/it]
Total articles processed: 42440 of 53742 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199905.xml.gz
826  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 356/857 [12:19:44<12:22:03, 88.87s/it] 
Total articles processed: 42560 of 53895 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200212.xml.gz
19861  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 357/857 [12:24:27<20:24:47, 146.97s/it]
Total articles processed: 42680 of 54027 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199508.xml.gz
12525  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 358/857 [12:26:26<19:14:38, 138.84s/it]
Total articles processed: 42800 of 54149 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200301.xml.gz
9246  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 359/857 [12:27:27<15:58:20, 115.46s/it]
Total articles processed: 42920 of 54317 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200808.xml.gz
1257  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 360/857 [12:27:42<11:46:30, 85.29s/it] 
Total articles processed: 43040 of 54473 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200801.xml.gz
20302  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 361/857 [12:32:38<20:26:29, 148.37s/it]
Total articles processed: 43160 of 54604 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200905.xml.gz
21794  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 362/857 [12:37:25<26:08:15, 190.09s/it]
Total articles processed: 43280 of 54732 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200610.xml.gz
15727  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 363/857 [12:41:31<28:23:35, 206.91s/it]
Total articles processed: 43400 of 54866 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200109.xml.gz
8867  articles in this file. Extracting 100 article / headline pairs
 42%|████▏     | 364/857 [12:42:26<22:04:29, 161.20s/it]
Total articles processed: 43520 of 55013 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200804.xml.gz
6847  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 365/857 [12:46:40<25:49:14, 188.93s/it]
Total articles processed: 43640 of 55181 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200609.xml.gz
15073  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 366/857 [12:50:32<27:32:23, 201.92s/it]
Total articles processed: 43760 of 55317 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200408.xml.gz
9919  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 367/857 [12:53:21<26:08:24, 192.05s/it]
Total articles processed: 43880 of 55451 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200908.xml.gz
2987  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 368/857 [12:54:27<20:57:47, 154.33s/it]
Total articles processed: 44000 of 55615 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200410.xml.gz
9367  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 369/857 [12:55:29<17:10:23, 126.69s/it]
Total articles processed: 44120 of 55759 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200802.xml.gz
3435  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 370/857 [12:56:50<15:15:07, 112.75s/it]
Total articles processed: 44240 of 55941 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200603.xml.gz
20040  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 371/857 [13:01:48<22:44:54, 168.51s/it]
Total articles processed: 44360 of 56087 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199610.xml.gz
12788  articles in this file. Extracting 100 article / headline pairs
 43%|████▎     | 372/857 [13:04:39<22:47:02, 169.12s/it]
Total articles processed: 44480 of 56233 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200002.xml.gz
588  articles in this file. Extracting 100 article / headline pairs
 44%|████▎     | 373/857 [13:04:50<16:21:35, 121.69s/it]
Total articles processed: 44600 of 56383 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201008.xml.gz
1266  articles in this file. Extracting 100 article / headline pairs
 44%|████▎     | 374/857 [13:05:05<12:03:05, 89.83s/it] 
Total articles processed: 44720 of 56520 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199806.xml.gz
864  articles in this file. Extracting 100 article / headline pairs
 44%|████▍     | 375/857 [13:05:17<8:53:45, 66.44s/it] 
Total articles processed: 44840 of 56648 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200405.xml.gz
17625  articles in this file. Extracting 100 article / headline pairs
 44%|████▍     | 376/857 [13:08:36<14:10:35, 106.10s/it]
Total articles processed: 44960 of 56783 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199511.xml.gz
12596  articles in this file. Extracting 100 article / headline pairs
 44%|████▍     | 377/857 [13:11:35<17:05:16, 128.16s/it]
Total articles processed: 45080 of 56930 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200912.xml.gz
10495  articles in this file. Extracting 100 article / headline pairs
 44%|████▍     | 378/857 [13:12:54<15:04:30, 113.30s/it]
Total articles processed: 45200 of 57076 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199505.xml.gz
7402  articles in this file. Extracting 100 article / headline pairs
 44%|████▍     | 379/857 [13:13:35<12:09:37, 91.59s/it] 
Total articles processed: 45320 of 57223 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200306.xml.gz
9530  articles in this file. Extracting 100 article / headline pairs
 44%|████▍     | 380/857 [13:14:35<10:54:06, 82.28s/it]
Total articles processed: 45440 of 57371 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200906.xml.gz
21664  articles in this file. Extracting 100 article / headline pairs
 44%|████▍     | 381/857 [13:19:29<19:15:40, 145.67s/it]
Total articles processed: 45560 of 57502 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200702.xml.gz
964  articles in this file. Extracting 100 article / headline pairs
 45%|████▍     | 382/857 [13:19:41<13:56:06, 105.61s/it]
Total articles processed: 45680 of 57667 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200605.xml.gz
3877  articles in this file. Extracting 100 article / headline pairs
 45%|████▍     | 383/857 [13:21:21<13:40:19, 103.84s/it]
Total articles processed: 45800 of 57828 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200701.xml.gz
8583  articles in this file. Extracting 100 article / headline pairs
 45%|████▍     | 384/857 [13:27:09<23:16:32, 177.15s/it]
Total articles processed: 45920 of 57988 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200404.xml.gz
1674  articles in this file. Extracting 100 article / headline pairs
 45%|████▍     | 385/857 [13:27:48<17:46:44, 135.60s/it]
Total articles processed: 46040 of 58163 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200907.xml.gz
11261  articles in this file. Extracting 100 article / headline pairs
 45%|████▌     | 386/857 [13:29:11<15:41:27, 119.93s/it]
Total articles processed: 46160 of 58310 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200412.xml.gz
3173  articles in this file. Extracting 100 article / headline pairs
 45%|████▌     | 387/857 [13:30:31<14:04:20, 107.79s/it]
Total articles processed: 46280 of 58485 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199511.xml.gz
7450  articles in this file. Extracting 100 article / headline pairs
 45%|████▌     | 388/857 [13:31:11<11:24:31, 87.57s/it] 
Total articles processed: 46400 of 58623 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199410.xml.gz
3110  articles in this file. Extracting 100 article / headline pairs
 45%|████▌     | 389/857 [13:32:28<10:57:26, 84.29s/it]
Total articles processed: 46520 of 58786 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199905.xml.gz
8839  articles in this file. Extracting 100 article / headline pairs
 46%|████▌     | 390/857 [13:33:37<10:21:09, 79.81s/it]
Total articles processed: 46640 of 58927 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199911.xml.gz
59  articles in this file. Extracting 100 article / headline pairs
 46%|████▌     | 391/857 [13:33:41<7:24:30, 57.23s/it] 
Total articles processed: 46697 of 58986 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200402.xml.gz
9541  articles in this file. Extracting 100 article / headline pairs
 46%|████▌     | 392/857 [13:35:02<8:17:21, 64.18s/it]
Total articles processed: 46817 of 59140 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200304.xml.gz
24565  articles in this file. Extracting 100 article / headline pairs
 46%|████▌     | 393/857 [13:41:58<21:53:43, 169.88s/it]
Total articles processed: 46937 of 59270 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199512.xml.gz
6672  articles in this file. Extracting 100 article / headline pairs
 46%|████▌     | 394/857 [13:42:40<16:54:42, 131.50s/it]
Total articles processed: 47057 of 59421 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200311.xml.gz
1881  articles in this file. Extracting 100 article / headline pairs
 46%|████▌     | 395/857 [13:43:16<13:11:36, 102.81s/it]
Total articles processed: 47177 of 59605 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199909.xml.gz
7301  articles in this file. Extracting 100 article / headline pairs
 46%|████▌     | 396/857 [13:45:31<14:24:10, 112.47s/it]
Total articles processed: 47297 of 59728 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200412.xml.gz
9721  articles in this file. Extracting 100 article / headline pairs
 46%|████▋     | 397/857 [13:46:43<12:47:35, 100.12s/it]
Total articles processed: 47417 of 59883 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200403.xml.gz
1071  articles in this file. Extracting 100 article / headline pairs
 46%|████▋     | 398/857 [13:46:55<9:25:24, 73.91s/it]  
Total articles processed: 47537 of 60055 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200508.xml.gz
10941  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 399/857 [13:49:53<13:21:49, 105.04s/it]
Total articles processed: 47657 of 60185 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199807.xml.gz
9051  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 400/857 [13:50:48<11:26:55, 90.19s/it] 
Total articles processed: 47777 of 60334 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200709.xml.gz
6833  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 401/857 [13:55:14<18:04:55, 142.75s/it]
Total articles processed: 47897 of 60484 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200611.xml.gz
1322  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 402/857 [13:55:29<13:12:56, 104.56s/it]
Total articles processed: 48017 of 60653 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200808.xml.gz
3058  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 403/857 [13:56:52<12:21:39, 98.02s/it] 
Total articles processed: 48137 of 60865 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199809.xml.gz
8284  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 404/857 [13:57:46<10:40:03, 84.78s/it]
Total articles processed: 48257 of 61007 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200707.xml.gz
3354  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 405/857 [13:59:26<11:13:20, 89.38s/it]
Total articles processed: 48377 of 61171 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199908.xml.gz
10745  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 406/857 [14:02:26<14:36:20, 116.59s/it]
Total articles processed: 48497 of 61313 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200603.xml.gz
13660  articles in this file. Extracting 100 article / headline pairs
 47%|████▋     | 407/857 [14:06:34<19:29:43, 155.96s/it]
Total articles processed: 48617 of 61442 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201010.xml.gz
2083  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 408/857 [14:07:25<15:30:44, 124.37s/it]
Total articles processed: 48737 of 61589 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199705.xml.gz
8240  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 409/857 [14:08:16<12:45:49, 102.57s/it]
Total articles processed: 48857 of 61731 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199508.xml.gz
17493  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 410/857 [14:13:39<20:55:11, 168.48s/it]
Total articles processed: 48977 of 61876 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199509.xml.gz
2995  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 411/857 [14:14:50<17:14:54, 139.23s/it]
Total articles processed: 49097 of 62025 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200702.xml.gz
9674  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 412/857 [14:15:46<14:09:16, 114.51s/it]
Total articles processed: 49217 of 62186 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200707.xml.gz
17164  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 413/857 [14:19:45<18:42:36, 151.70s/it]
Total articles processed: 49337 of 62316 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199710.xml.gz
3207  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 414/857 [14:21:00<15:49:38, 128.62s/it]
Total articles processed: 49457 of 62504 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201007.xml.gz
1223  articles in this file. Extracting 100 article / headline pairs
 48%|████▊     | 415/857 [14:21:16<11:38:25, 94.81s/it] 
Total articles processed: 49577 of 62649 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200607.xml.gz
7409  articles in this file. Extracting 100 article / headline pairs
 49%|████▊     | 416/857 [14:26:07<18:49:35, 153.69s/it]
Total articles processed: 49697 of 62816 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200810.xml.gz
21968  articles in this file. Extracting 100 article / headline pairs
 49%|████▊     | 417/857 [14:31:19<24:35:41, 201.23s/it]
Total articles processed: 49817 of 62946 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200811.xml.gz
2952  articles in this file. Extracting 100 article / headline pairs
 49%|████▉     | 418/857 [14:32:30<19:45:49, 162.07s/it]
Total articles processed: 49937 of 63135 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200902.xml.gz
16341  articles in this file. Extracting 100 article / headline pairs
 49%|████▉     | 419/857 [14:37:36<24:59:59, 205.48s/it]
Total articles processed: 50057 of 63265 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200710.xml.gz
3474  articles in this file. Extracting 100 article / headline pairs
 49%|████▉     | 420/857 [14:39:01<20:33:06, 169.31s/it]
Total articles processed: 50177 of 63459 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200404.xml.gz
3230  articles in this file. Extracting 100 article / headline pairs
 49%|████▉     | 421/857 [14:40:20<17:13:05, 142.17s/it]
Total articles processed: 50297 of 63637 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200804.xml.gz
5696  articles in this file. Extracting 100 article / headline pairs
 49%|████▉     | 422/857 [14:40:57<13:22:25, 110.68s/it]
Total articles processed: 50417 of 63780 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200308.xml.gz
8241  articles in this file. Extracting 100 article / headline pairs
 49%|████▉     | 423/857 [14:41:50<11:15:34, 93.40s/it] 
Total articles processed: 50537 of 63931 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200504.xml.gz
7824  articles in this file. Extracting 100 article / headline pairs
 49%|████▉     | 424/857 [14:46:59<18:59:14, 157.86s/it]
Total articles processed: 50657 of 64091 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199503.xml.gz
13515  articles in this file. Extracting 100 article / headline pairs
 50%|████▉     | 425/857 [14:49:16<18:12:11, 151.69s/it]
Total articles processed: 50777 of 64214 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200706.xml.gz
10556  articles in this file. Extracting 100 article / headline pairs
 50%|████▉     | 426/857 [14:50:26<15:14:35, 127.32s/it]
Total articles processed: 50897 of 64360 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200901.xml.gz
9074  articles in this file. Extracting 100 article / headline pairs
 50%|████▉     | 427/857 [14:51:30<12:55:19, 108.18s/it]
Total articles processed: 51017 of 64510 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200001.xml.gz
8067  articles in this file. Extracting 100 article / headline pairs
 50%|████▉     | 428/857 [14:54:13<14:50:20, 124.52s/it]
Total articles processed: 51137 of 64640 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200809.xml.gz
983  articles in this file. Extracting 100 article / headline pairs
 50%|█████     | 429/857 [14:54:27<10:52:51, 91.52s/it] 
Total articles processed: 51257 of 64796 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201011.xml.gz
18357  articles in this file. Extracting 100 article / headline pairs
 50%|█████     | 430/857 [15:00:17<20:03:18, 169.08s/it]
Total articles processed: 51377 of 64918 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200502.xml.gz
1977  articles in this file. Extracting 100 article / headline pairs
 50%|█████     | 431/857 [15:01:04<15:39:20, 132.30s/it]
Total articles processed: 51497 of 65084 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200603.xml.gz
1228  articles in this file. Extracting 100 article / headline pairs
 50%|█████     | 432/857 [15:01:18<11:26:10, 96.87s/it] 
Total articles processed: 51617 of 65258 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199407.xml.gz
10901  articles in this file. Extracting 100 article / headline pairs
 51%|█████     | 433/857 [15:02:58<11:32:41, 98.02s/it]
Total articles processed: 51737 of 65379 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199910.xml.gz
8854  articles in this file. Extracting 100 article / headline pairs
 51%|█████     | 434/857 [15:03:51<9:55:26, 84.46s/it] 
Total articles processed: 51857 of 65526 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199504.xml.gz
2889  articles in this file. Extracting 100 article / headline pairs
 51%|█████     | 435/857 [15:04:59<9:17:59, 79.33s/it]
Total articles processed: 51977 of 65702 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199502.xml.gz
8437  articles in this file. Extracting 100 article / headline pairs
 51%|█████     | 436/857 [15:09:34<16:08:41, 138.06s/it]
Total articles processed: 52097 of 65859 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199603.xml.gz
8033  articles in this file. Extracting 100 article / headline pairs
 51%|█████     | 437/857 [15:10:20<12:53:03, 110.44s/it]
Total articles processed: 52217 of 65997 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199812.xml.gz
9727  articles in this file. Extracting 100 article / headline pairs
 51%|█████     | 438/857 [15:11:18<11:02:22, 94.85s/it] 
Total articles processed: 52337 of 66139 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201008.xml.gz
2122  articles in this file. Extracting 100 article / headline pairs
 51%|█████     | 439/857 [15:12:05<9:19:54, 80.37s/it] 
Total articles processed: 52457 of 66304 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200810.xml.gz
3313  articles in this file. Extracting 100 article / headline pairs
 51%|█████▏    | 440/857 [15:13:25<9:17:48, 80.26s/it]
Total articles processed: 52577 of 66489 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201009.xml.gz
1177  articles in this file. Extracting 100 article / headline pairs
 51%|█████▏    | 441/857 [15:13:40<7:01:30, 60.80s/it]
Total articles processed: 52697 of 66639 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200208.xml.gz
9123  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 442/857 [15:14:41<7:00:20, 60.77s/it]
Total articles processed: 52817 of 66809 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200509.xml.gz
9276  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 443/857 [15:15:48<7:13:23, 62.81s/it]
Total articles processed: 52937 of 66945 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201004.xml.gz
12642  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 444/857 [15:17:30<8:32:42, 74.49s/it]
Total articles processed: 53057 of 67086 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200403.xml.gz
1834  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 445/857 [15:18:05<7:09:03, 62.48s/it]
Total articles processed: 53177 of 67261 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199508.xml.gz
2963  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 446/857 [15:19:14<7:21:26, 64.44s/it]
Total articles processed: 53297 of 67446 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199705.xml.gz
14546  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 447/857 [15:22:54<12:39:08, 111.09s/it]
Total articles processed: 53417 of 67595 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199709.xml.gz
14515  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 448/857 [15:26:47<16:48:20, 147.92s/it]
Total articles processed: 53537 of 67746 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200212.xml.gz
7412  articles in this file. Extracting 100 article / headline pairs
 52%|█████▏    | 449/857 [15:27:35<13:21:30, 117.87s/it]
Total articles processed: 53657 of 67899 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200712.xml.gz
6842  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 450/857 [15:32:01<18:19:48, 162.13s/it]
Total articles processed: 53777 of 68072 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200703.xml.gz
11599  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 451/857 [15:33:21<15:31:22, 137.64s/it]
Total articles processed: 53897 of 68210 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199804.xml.gz
8148  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 452/857 [15:34:08<12:25:03, 110.38s/it]
Total articles processed: 54017 of 68352 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200509.xml.gz
13170  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 453/857 [15:37:53<16:15:03, 144.81s/it]
Total articles processed: 54137 of 68485 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200602.xml.gz
10323  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 454/857 [15:40:40<16:56:45, 151.38s/it]
Total articles processed: 54257 of 68613 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200008.xml.gz
923  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 455/857 [15:40:53<12:17:27, 110.07s/it]
Total articles processed: 54377 of 68757 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200710.xml.gz
1279  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 456/857 [15:41:08<9:04:05, 81.41s/it]  
Total articles processed: 54497 of 68932 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200501.xml.gz
1069  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 457/857 [15:41:21<6:45:50, 60.88s/it]
Total articles processed: 54617 of 69096 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200604.xml.gz
8989  articles in this file. Extracting 100 article / headline pairs
 53%|█████▎    | 458/857 [15:47:21<16:41:31, 150.61s/it]
Total articles processed: 54737 of 69261 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201012.xml.gz
10351  articles in this file. Extracting 100 article / headline pairs
 54%|█████▎    | 459/857 [15:48:49<14:33:56, 131.75s/it]
Total articles processed: 54857 of 69399 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200708.xml.gz
11141  articles in this file. Extracting 100 article / headline pairs
 54%|█████▎    | 460/857 [15:50:00<12:31:16, 113.54s/it]
Total articles processed: 54977 of 69557 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200911.xml.gz
12868  articles in this file. Extracting 100 article / headline pairs
 54%|█████▍    | 461/857 [15:53:39<15:58:32, 145.23s/it]
Total articles processed: 55097 of 69682 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199602.xml.gz
11883  articles in this file. Extracting 100 article / headline pairs
 54%|█████▍    | 462/857 [15:56:32<16:50:28, 153.49s/it]
Total articles processed: 55217 of 69821 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200706.xml.gz
1218  articles in this file. Extracting 100 article / headline pairs
 54%|█████▍    | 463/857 [15:56:46<12:13:41, 111.73s/it]
Total articles processed: 55337 of 69991 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200203.xml.gz
16913  articles in this file. Extracting 100 article / headline pairs
 54%|█████▍    | 464/857 [16:00:01<14:56:02, 136.80s/it]
Total articles processed: 55457 of 70112 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200811.xml.gz
6090  articles in this file. Extracting 100 article / headline pairs
 54%|█████▍    | 465/857 [16:03:48<17:50:57, 163.92s/it]
Total articles processed: 55577 of 70281 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200506.xml.gz
1143  articles in this file. Extracting 100 article / headline pairs
 54%|█████▍    | 466/857 [16:04:03<12:55:56, 119.07s/it]
Total articles processed: 55697 of 70457 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199707.xml.gz
7658  articles in this file. Extracting 100 article / headline pairs
 54%|█████▍    | 467/857 [16:04:48<10:29:51, 96.90s/it] 
Total articles processed: 55817 of 70593 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199509.xml.gz
6876  articles in this file. Extracting 100 article / headline pairs
 55%|█████▍    | 468/857 [16:05:29<8:40:18, 80.25s/it] 
Total articles processed: 55937 of 70736 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200605.xml.gz
19492  articles in this file. Extracting 100 article / headline pairs
 55%|█████▍    | 469/857 [16:10:45<16:15:41, 150.88s/it]
Total articles processed: 56057 of 70874 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199411.xml.gz
11526  articles in this file. Extracting 100 article / headline pairs
 55%|█████▍    | 470/857 [16:12:44<15:10:48, 141.21s/it]
Total articles processed: 56177 of 70998 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200201.xml.gz
22203  articles in this file. Extracting 100 article / headline pairs
 55%|█████▍    | 471/857 [16:18:23<21:29:52, 200.50s/it]
Total articles processed: 56297 of 71119 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200808.xml.gz
13582  articles in this file. Extracting 100 article / headline pairs
 55%|█████▌    | 472/857 [16:20:32<19:09:33, 179.15s/it]
Total articles processed: 56417 of 71252 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200207.xml.gz
20756  articles in this file. Extracting 100 article / headline pairs
 55%|█████▌    | 473/857 [16:25:51<23:34:11, 220.97s/it]
Total articles processed: 56537 of 71386 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199904.xml.gz
9039  articles in this file. Extracting 100 article / headline pairs
 55%|█████▌    | 474/857 [16:26:59<18:38:55, 175.29s/it]
Total articles processed: 56657 of 71534 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200409.xml.gz
10656  articles in this file. Extracting 100 article / headline pairs
 55%|█████▌    | 475/857 [16:30:04<18:54:09, 178.14s/it]
Total articles processed: 56777 of 71663 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200312.xml.gz
20029  articles in this file. Extracting 100 article / headline pairs
 56%|█████▌    | 476/857 [16:35:10<22:54:04, 216.39s/it]
Total articles processed: 56897 of 71795 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200103.xml.gz
892  articles in this file. Extracting 100 article / headline pairs
 56%|█████▌    | 477/857 [16:35:29<16:36:45, 157.38s/it]
Total articles processed: 57017 of 71930 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199911.xml.gz
787  articles in this file. Extracting 100 article / headline pairs
 56%|█████▌    | 478/857 [16:35:45<12:04:55, 114.76s/it]
Total articles processed: 57137 of 72072 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200503.xml.gz
1192  articles in this file. Extracting 100 article / headline pairs
 56%|█████▌    | 479/857 [16:36:00<8:55:00, 84.92s/it]  
Total articles processed: 57257 of 72236 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200410.xml.gz
4185  articles in this file. Extracting 100 article / headline pairs
 56%|█████▌    | 480/857 [16:38:03<10:06:08, 96.47s/it]
Total articles processed: 57377 of 72412 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199804.xml.gz
2953  articles in this file. Extracting 100 article / headline pairs
 56%|█████▌    | 481/857 [16:39:12<9:13:04, 88.26s/it] 
Total articles processed: 57497 of 72611 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199704.xml.gz
7840  articles in this file. Extracting 100 article / headline pairs
 56%|█████▌    | 482/857 [16:39:58<7:51:19, 75.41s/it]
Total articles processed: 57617 of 72752 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199710.xml.gz
14588  articles in this file. Extracting 100 article / headline pairs
 56%|█████▋    | 483/857 [16:43:37<12:18:15, 118.44s/it]
Total articles processed: 57737 of 72897 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199809.xml.gz
15259  articles in this file. Extracting 100 article / headline pairs
 56%|█████▋    | 484/857 [16:47:32<15:53:58, 153.45s/it]
Total articles processed: 57857 of 73032 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199505.xml.gz
17933  articles in this file. Extracting 100 article / headline pairs
 57%|█████▋    | 485/857 [16:53:01<21:17:52, 206.11s/it]
Total articles processed: 57977 of 73178 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200111.xml.gz
1740  articles in this file. Extracting 100 article / headline pairs
 57%|█████▋    | 486/857 [16:53:19<15:25:45, 149.72s/it]
Total articles processed: 58097 of 73337 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200505.xml.gz
3739  articles in this file. Extracting 100 article / headline pairs
 57%|█████▋    | 487/857 [16:55:06<14:04:18, 136.91s/it]
Total articles processed: 58217 of 73504 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200903.xml.gz
6409  articles in this file. Extracting 100 article / headline pairs
 57%|█████▋    | 488/857 [16:59:03<17:07:10, 167.02s/it]
Total articles processed: 58337 of 73682 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200708.xml.gz
7233  articles in this file. Extracting 100 article / headline pairs
 57%|█████▋    | 489/857 [17:03:43<20:31:42, 200.82s/it]
Total articles processed: 58457 of 73856 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200906.xml.gz
17889  articles in this file. Extracting 100 article / headline pairs
 57%|█████▋    | 490/857 [17:10:33<26:52:52, 263.69s/it]
Total articles processed: 58577 of 73981 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200209.xml.gz
9178  articles in this file. Extracting 100 article / headline pairs
 57%|█████▋    | 491/857 [17:11:46<20:59:31, 206.48s/it]
Total articles processed: 58697 of 74144 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199712.xml.gz
8282 
 57%|█████▋    | 492/857 [17:13:06<17:04:12, 168.36s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 58817 of 74285 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200009.xml.gz
831  articles in this file. Extracting 100 article / headline pairs
 58%|█████▊    | 493/857 [17:13:28<12:35:24, 124.52s/it]
Total articles processed: 58937 of 74429 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199702.xml.gz
2813 
 58%|█████▊    | 494/857 [17:14:50<11:16:54, 111.89s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 59057 of 74603 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200612.xml.gz
11597  articles in this file. Extracting 100 article / headline pairs
 58%|█████▊    | 495/857 [17:16:27<10:47:36, 107.34s/it]
Total articles processed: 59177 of 74762 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200605.xml.gz
9244  articles in this file. Extracting 100 article / headline pairs
 58%|█████▊    | 496/857 [17:23:08<19:36:28, 195.53s/it]
Total articles processed: 59297 of 74916 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200811.xml.gz
15057  articles in this file. Extracting 100 article / headline pairs
 58%|█████▊    | 497/857 [17:28:54<24:02:19, 240.39s/it]
Total articles processed: 59417 of 75043 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200004.xml.gz
8286 
 58%|█████▊    | 498/857 [17:34:21<26:35:13, 266.61s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 59537 of 75210 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200505.xml.gz
1179  articles in this file. Extracting 100 article / headline pairs
 58%|█████▊    | 499/857 [17:34:35<18:58:43, 190.85s/it]
Total articles processed: 59657 of 75374 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200611.xml.gz
20164  articles in this file. Extracting 100 article / headline pairs
 58%|█████▊    | 500/857 [17:40:19<23:27:55, 236.63s/it]
Total articles processed: 59777 of 75517 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200603.xml.gz
4125  articles in this file. Extracting 100 article / headline pairs
 58%|█████▊    | 501/857 [17:42:16<19:51:34, 200.83s/it]
Total articles processed: 59897 of 75697 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199508.xml.gz
7275  articles in this file. Extracting 100 article / headline pairs
 59%|█████▊    | 502/857 [17:43:06<15:20:40, 155.61s/it]
Total articles processed: 60017 of 75841 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200905.xml.gz
3026  articles in this file. Extracting 100 article / headline pairs
 59%|█████▊    | 503/857 [17:44:22<12:56:06, 131.54s/it]
Total articles processed: 60137 of 76008 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200409.xml.gz
20710  articles in this file. Extracting 100 article / headline pairs
 59%|█████▉    | 504/857 [17:49:16<17:40:44, 180.29s/it]
Total articles processed: 60257 of 76138 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200106.xml.gz
8620  articles in this file. Extracting 100 article / headline pairs
 59%|█████▉    | 505/857 [17:50:08<13:52:53, 141.97s/it]
Total articles processed: 60377 of 76283 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200709.xml.gz
3315  articles in this file. Extracting 100 article / headline pairs
 59%|█████▉    | 506/857 [17:51:41<12:24:32, 127.27s/it]
Total articles processed: 60497 of 76472 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200007.xml.gz
840  articles in this file. Extracting 100 article / headline pairs
 59%|█████▉    | 507/857 [17:51:54<9:01:38, 92.85s/it]  
Total articles processed: 60617 of 76607 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200111.xml.gz
877  articles in this file. Extracting 100 article / headline pairs
 59%|█████▉    | 508/857 [17:52:11<6:47:44, 70.10s/it]
Total articles processed: 60737 of 76747 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200202.xml.gz
7897  articles in this file. Extracting 100 article / headline pairs
 59%|█████▉    | 509/857 [17:53:10<6:27:48, 66.86s/it]
Total articles processed: 60857 of 76901 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201009.xml.gz
11673  articles in this file. Extracting 100 article / headline pairs
 60%|█████▉    | 510/857 [17:55:02<7:44:37, 80.34s/it]
Total articles processed: 60977 of 77040 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200806.xml.gz
1437  articles in this file. Extracting 100 article / headline pairs
 60%|█████▉    | 511/857 [17:55:20<5:55:55, 61.72s/it]
Total articles processed: 61097 of 77189 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199405.xml.gz
771  articles in this file. Extracting 100 article / headline pairs
 60%|█████▉    | 512/857 [17:55:41<4:44:49, 49.54s/it]
Total articles processed: 61217 of 77357 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200412.xml.gz
14654  articles in this file. Extracting 100 article / headline pairs
 60%|█████▉    | 513/857 [17:58:49<8:41:21, 90.93s/it]
Total articles processed: 61337 of 77494 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200312.xml.gz
1896  articles in this file. Extracting 100 article / headline pairs
 60%|█████▉    | 514/857 [17:59:25<7:06:37, 74.63s/it]
Total articles processed: 61457 of 77678 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199808.xml.gz
8467  articles in this file. Extracting 100 article / headline pairs
 60%|██████    | 515/857 [18:00:20<6:31:21, 68.66s/it]
Total articles processed: 61577 of 77819 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200611.xml.gz
11264  articles in this file. Extracting 100 article / headline pairs
 60%|██████    | 516/857 [18:01:49<7:04:39, 74.72s/it]
Total articles processed: 61697 of 77974 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200807.xml.gz
3035  articles in this file. Extracting 100 article / headline pairs
 60%|██████    | 517/857 [18:03:18<7:27:42, 79.01s/it]
Total articles processed: 61817 of 78159 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200802.xml.gz
6857  articles in this file. Extracting 100 article / headline pairs
 60%|██████    | 518/857 [18:08:03<13:16:31, 140.98s/it]
Total articles processed: 61937 of 78328 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199609.xml.gz
11972  articles in this file. Extracting 100 article / headline pairs
 61%|██████    | 519/857 [18:11:04<14:21:30, 152.93s/it]
Total articles processed: 62057 of 78473 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200407.xml.gz
9277  articles in this file. Extracting 100 article / headline pairs
 61%|██████    | 520/857 [18:17:38<21:05:04, 225.24s/it]
Total articles processed: 62177 of 78653 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200603.xml.gz
9712  articles in this file. Extracting 100 article / headline pairs
 61%|██████    | 521/857 [18:18:53<16:48:57, 180.17s/it]
Total articles processed: 62297 of 78799 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201011.xml.gz
5940  articles in this file. Extracting 100 article / headline pairs
 61%|██████    | 522/857 [18:22:31<17:48:33, 191.38s/it]
Total articles processed: 62417 of 78972 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200612.xml.gz
3204  articles in this file. Extracting 100 article / headline pairs
 61%|██████    | 523/857 [18:23:57<14:49:08, 159.72s/it]
Total articles processed: 62537 of 79159 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200402.xml.gz
1630  articles in this file. Extracting 100 article / headline pairs
 61%|██████    | 524/857 [18:24:30<11:16:17, 121.85s/it]
Total articles processed: 62657 of 79339 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199411.xml.gz
9990  articles in this file. Extracting 100 article / headline pairs
 61%|██████▏   | 525/857 [18:27:09<12:15:08, 132.86s/it]
Total articles processed: 62777 of 79473 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199501.xml.gz
11370  articles in this file. Extracting 100 article / headline pairs
 61%|██████▏   | 526/857 [18:29:04<11:43:56, 127.60s/it]
Total articles processed: 62897 of 79597 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200407.xml.gz
9637  articles in this file. Extracting 100 article / headline pairs
 61%|██████▏   | 527/857 [18:30:22<10:20:01, 112.73s/it]
Total articles processed: 63017 of 79746 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200507.xml.gz
20308  articles in this file. Extracting 100 article / headline pairs
 62%|██████▏   | 528/857 [18:36:17<16:56:18, 185.35s/it]
Total articles processed: 63137 of 79886 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200810.xml.gz
1154 
 62%|██████▏   | 529/857 [18:36:35<12:18:26, 135.08s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 63257 of 80044 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200708.xml.gz
1168  articles in this file. Extracting 100 article / headline pairs
 62%|██████▏   | 530/857 [18:36:50<8:59:51, 99.06s/it]  
Total articles processed: 63377 of 80204 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200206.xml.gz
10361  articles in this file. Extracting 100 article / headline pairs
 62%|██████▏   | 531/857 [18:38:06<8:21:56, 92.38s/it]
Total articles processed: 63497 of 80348 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200908.xml.gz
9473  articles in this file. Extracting 100 article / headline pairs
 62%|██████▏   | 532/857 [18:39:16<7:42:57, 85.47s/it]
Total articles processed: 63617 of 80505 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199410.xml.gz
8565  articles in this file. Extracting 100 article / headline pairs
 62%|██████▏   | 533/857 [18:43:50<12:47:47, 142.18s/it]
Total articles processed: 63737 of 80656 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199507.xml.gz
12909  articles in this file. Extracting 100 article / headline pairs
 62%|██████▏   | 534/857 [18:46:05<12:33:48, 140.03s/it]
Total articles processed: 63857 of 80781 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200905.xml.gz
19071  articles in this file. Extracting 100 article / headline pairs
 62%|██████▏   | 535/857 [18:52:21<18:51:23, 210.82s/it]
Total articles processed: 63977 of 80903 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199912.xml.gz
739  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 536/857 [18:52:36<13:33:28, 152.05s/it]
Total articles processed: 64097 of 81052 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200505.xml.gz
8118  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 537/857 [18:57:54<17:56:15, 201.80s/it]
Total articles processed: 64217 of 81218 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200501.xml.gz
20378  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 538/857 [19:02:53<20:27:35, 230.89s/it]
Total articles processed: 64337 of 81362 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199409.xml.gz
3021  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 539/857 [19:04:04<16:10:06, 183.04s/it]
Total articles processed: 64457 of 81546 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199711.xml.gz
2770  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 540/857 [19:05:08<12:58:40, 147.38s/it]
Total articles processed: 64577 of 81718 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200406.xml.gz
10029  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 541/857 [19:06:24<11:02:06, 125.72s/it]
Total articles processed: 64697 of 81865 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199702.xml.gz
17601  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 542/857 [19:09:51<13:08:28, 150.19s/it]
Total articles processed: 64817 of 81986 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200105.xml.gz
829  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 543/857 [19:10:05<9:31:45, 109.25s/it] 
Total articles processed: 64937 of 82133 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200110.xml.gz
859  articles in this file. Extracting 100 article / headline pairs
 63%|██████▎   | 544/857 [19:10:17<6:58:51, 80.29s/it] 
Total articles processed: 65057 of 82271 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201005.xml.gz
5952  articles in this file. Extracting 100 article / headline pairs
 64%|██████▎   | 545/857 [19:13:42<10:10:55, 117.49s/it]
Total articles processed: 65177 of 82442 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200506.xml.gz
8965  articles in this file. Extracting 100 article / headline pairs
 64%|██████▎   | 546/857 [19:19:41<16:24:50, 190.00s/it]
Total articles processed: 65297 of 82613 (79.0%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200910.xml.gz
13338  articles in this file. Extracting 100 article / headline pairs
 64%|██████▍   | 547/857 [19:23:28<17:19:29, 201.19s/it]
Total articles processed: 65417 of 82738 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200711.xml.gz
1198  articles in this file. Extracting 100 article / headline pairs
 64%|██████▍   | 548/857 [19:23:43<12:27:38, 145.17s/it]
Total articles processed: 65537 of 82890 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201005.xml.gz
22058  articles in this file. Extracting 100 article / headline pairs
 64%|██████▍   | 549/857 [19:29:08<17:02:55, 199.27s/it]
Total articles processed: 65657 of 83025 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199804.xml.gz
660  articles in this file. Extracting 100 article / headline pairs
 64%|██████▍   | 550/857 [19:29:20<12:12:18, 143.12s/it]
Total articles processed: 65777 of 83165 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201006.xml.gz
12986  articles in this file. Extracting 100 article / headline pairs
 64%|██████▍   | 551/857 [19:31:04<11:10:13, 131.42s/it]
Total articles processed: 65897 of 83301 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200803.xml.gz
6856  articles in this file. Extracting 100 article / headline pairs
 64%|██████▍   | 552/857 [19:31:49<8:55:29, 105.34s/it] 
Total articles processed: 66017 of 83444 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199610.xml.gz
3197  articles in this file. Extracting 100 article / headline pairs
 65%|██████▍   | 553/857 [19:33:03<8:05:47, 95.88s/it] 
Total articles processed: 66137 of 83598 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200307.xml.gz
10330  articles in this file. Extracting 100 article / headline pairs
 65%|██████▍   | 554/857 [19:34:14<7:27:23, 88.59s/it]
Total articles processed: 66257 of 83750 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199811.xml.gz
15355  articles in this file. Extracting 100 article / headline pairs
 65%|██████▍   | 555/857 [19:38:07<11:03:21, 131.79s/it]
Total articles processed: 66377 of 83889 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200906.xml.gz
5357  articles in this file. Extracting 100 article / headline pairs
 65%|██████▍   | 556/857 [19:41:02<12:06:25, 144.80s/it]
Total articles processed: 66497 of 84047 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200503.xml.gz
12320  articles in this file. Extracting 100 article / headline pairs
 65%|██████▍   | 557/857 [19:44:27<13:34:30, 162.90s/it]
Total articles processed: 66617 of 84181 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199706.xml.gz
7603  articles in this file. Extracting 100 article / headline pairs
 65%|██████▌   | 558/857 [19:45:13<10:36:18, 127.69s/it]
Total articles processed: 66737 of 84323 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200703.xml.gz
19406  articles in this file. Extracting 100 article / headline pairs
 65%|██████▌   | 559/857 [19:49:53<14:21:45, 173.51s/it]
Total articles processed: 66857 of 84457 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200204.xml.gz
9282  articles in this file. Extracting 100 article / headline pairs
 65%|██████▌   | 560/857 [19:50:54<11:31:02, 139.60s/it]
Total articles processed: 66977 of 84602 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199605.xml.gz
14390  articles in this file. Extracting 100 article / headline pairs
 65%|██████▌   | 561/857 [19:53:17<11:34:19, 140.74s/it]
Total articles processed: 67097 of 84723 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200803.xml.gz
7144  articles in this file. Extracting 100 article / headline pairs
 66%|██████▌   | 562/857 [19:57:52<14:50:24, 181.10s/it]
Total articles processed: 67217 of 84892 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199701.xml.gz
7794  articles in this file. Extracting 100 article / headline pairs
 66%|██████▌   | 563/857 [19:58:38<11:29:10, 140.65s/it]
Total articles processed: 67337 of 85040 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199512.xml.gz
2791  articles in this file. Extracting 100 article / headline pairs
 66%|██████▌   | 564/857 [19:59:39<9:29:57, 116.71s/it] 
Total articles processed: 67457 of 85225 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199911.xml.gz
8679  articles in this file. Extracting 100 article / headline pairs
 66%|██████▌   | 565/857 [20:00:30<7:51:38, 96.91s/it] 
Total articles processed: 67577 of 85367 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201007.xml.gz
2190  articles in this file. Extracting 100 article / headline pairs
 66%|██████▌   | 566/857 [20:01:18<6:38:36, 82.19s/it]
Total articles processed: 67697 of 85529 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199501.xml.gz
8979  articles in this file. Extracting 100 article / headline pairs
 66%|██████▌   | 567/857 [20:06:18<11:53:35, 147.64s/it]
Total articles processed: 67817 of 85683 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200508.xml.gz
8594  articles in this file. Extracting 100 article / headline pairs
 66%|██████▋   | 568/857 [20:08:07<10:55:25, 136.08s/it]
Total articles processed: 67937 of 85820 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199701.xml.gz
2999  articles in this file. Extracting 100 article / headline pairs
 66%|██████▋   | 569/857 [20:09:16<9:16:42, 115.98s/it] 
Total articles processed: 68057 of 85988 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199603.xml.gz
13283  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 570/857 [20:12:46<11:29:41, 144.19s/it]
Total articles processed: 68177 of 86134 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200303.xml.gz
2093  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 571/857 [20:13:27<8:58:57, 113.07s/it] 
Total articles processed: 68297 of 86305 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200611.xml.gz
3715  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 572/857 [20:15:02<8:31:14, 107.63s/it]
Total articles processed: 68417 of 86462 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200910.xml.gz
1305  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 573/857 [20:15:20<6:21:47, 80.66s/it] 
Total articles processed: 68537 of 86609 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200812.xml.gz
12541  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 574/857 [20:19:08<9:49:18, 124.94s/it]
Total articles processed: 68657 of 86737 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200512.xml.gz
1140  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 575/857 [20:19:22<7:11:21, 91.78s/it] 
Total articles processed: 68777 of 86921 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200809.xml.gz
6427  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 576/857 [20:23:23<10:38:30, 136.34s/it]
Total articles processed: 68897 of 87085 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199412.xml.gz
14497  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 577/857 [20:27:15<12:51:23, 165.30s/it]
Total articles processed: 69017 of 87217 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200909.xml.gz
10598  articles in this file. Extracting 100 article / headline pairs
 67%|██████▋   | 578/857 [20:28:40<10:56:35, 141.20s/it]
Total articles processed: 69137 of 87370 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199901.xml.gz
15834  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 579/857 [20:33:03<13:42:49, 177.59s/it]
Total articles processed: 69257 of 87505 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199807.xml.gz
999  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 580/857 [20:33:17<9:52:55, 128.43s/it] 
Total articles processed: 69377 of 87633 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200403.xml.gz
22006  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 581/857 [20:38:46<14:28:05, 188.72s/it]
Total articles processed: 69497 of 87765 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199802.xml.gz
770  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 582/857 [20:39:01<10:26:18, 136.65s/it]
Total articles processed: 69617 of 87915 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200904.xml.gz
1195  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 583/857 [20:39:17<7:38:41, 100.44s/it] 
Total articles processed: 69737 of 88069 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200705.xml.gz
7670  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 584/857 [20:44:19<12:11:33, 160.78s/it]
Total articles processed: 69857 of 88235 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200010.xml.gz
788  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 585/857 [20:44:32<8:48:07, 116.50s/it] 
Total articles processed: 69977 of 88372 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199703.xml.gz
8525  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 586/857 [20:45:25<7:20:54, 97.62s/it] 
Total articles processed: 70097 of 88523 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200712.xml.gz
10830  articles in this file. Extracting 100 article / headline pairs
 68%|██████▊   | 587/857 [20:46:47<6:58:12, 92.94s/it]
Total articles processed: 70217 of 88661 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200310.xml.gz
1048  articles in this file. Extracting 100 article / headline pairs
 69%|██████▊   | 588/857 [20:47:00<5:08:08, 68.73s/it]
Total articles processed: 70337 of 88836 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200005.xml.gz
848  articles in this file. Extracting 100 article / headline pairs
 69%|██████▊   | 589/857 [20:47:12<3:52:04, 51.96s/it]
Total articles processed: 70457 of 88978 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200310.xml.gz
9831  articles in this file. Extracting 100 article / headline pairs
 69%|██████▉   | 590/857 [20:48:26<4:20:05, 58.45s/it]
Total articles processed: 70577 of 89124 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200504.xml.gz
3005  articles in this file. Extracting 100 article / headline pairs
 69%|██████▉   | 591/857 [20:49:41<4:41:16, 63.45s/it]
Total articles processed: 70697 of 89293 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199510.xml.gz
12770  articles in this file. Extracting 100 article / headline pairs
 69%|██████▉   | 592/857 [20:52:59<7:37:40, 103.63s/it]
Total articles processed: 70817 of 89436 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199801.xml.gz
3128  articles in this file. Extracting 100 article / headline pairs
 69%|██████▉   | 593/857 [20:54:13<6:58:02, 95.01s/it] 
Total articles processed: 70937 of 89619 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200511.xml.gz
1036  articles in this file. Extracting 100 article / headline pairs
 69%|██████▉   | 594/857 [20:54:27<5:09:12, 70.54s/it]
Total articles processed: 71057 of 89773 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200204.xml.gz
15628  articles in this file. Extracting 100 article / headline pairs
 69%|██████▉   | 595/857 [20:57:24<7:28:10, 102.64s/it]
Total articles processed: 71177 of 89898 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200711.xml.gz
11573  articles in this file. Extracting 100 article / headline pairs
 70%|██████▉   | 596/857 [20:58:46<6:59:09, 96.36s/it] 
Total articles processed: 71297 of 90051 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199607.xml.gz
3043  articles in this file. Extracting 100 article / headline pairs
 70%|██████▉   | 597/857 [20:59:55<6:21:21, 88.01s/it]
Total articles processed: 71417 of 90229 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200901.xml.gz
6442  articles in this file. Extracting 100 article / headline pairs
 70%|██████▉   | 598/857 [21:03:50<9:30:27, 132.15s/it]
Total articles processed: 71537 of 90414 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200103.xml.gz
9643  articles in this file. Extracting 100 article / headline pairs
 70%|██████▉   | 599/857 [21:05:01<8:09:37, 113.87s/it]
Total articles processed: 71657 of 90562 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200410.xml.gz
20234  articles in this file. Extracting 100 article / headline pairs
 70%|███████   | 600/857 [21:09:41<11:40:32, 163.55s/it]
Total articles processed: 71777 of 90699 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201004.xml.gz
1249  articles in this file. Extracting 100 article / headline pairs
 70%|███████   | 601/857 [21:09:57<8:30:00, 119.53s/it] 
Total articles processed: 71897 of 90844 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199802.xml.gz
7857  articles in this file. Extracting 100 article / headline pairs
 70%|███████   | 602/857 [21:10:46<6:57:45, 98.30s/it] 
Total articles processed: 72017 of 90983 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200606.xml.gz
8370  articles in this file. Extracting 100 article / headline pairs
 70%|███████   | 603/857 [21:11:46<6:06:58, 86.69s/it]
Total articles processed: 72137 of 91118 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200812.xml.gz
8708  articles in this file. Extracting 100 article / headline pairs
 70%|███████   | 604/857 [21:12:53<5:40:39, 80.79s/it]
Total articles processed: 72257 of 91253 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200003.xml.gz
9717  articles in this file. Extracting 100 article / headline pairs
 71%|███████   | 605/857 [21:13:54<5:14:26, 74.87s/it]
Total articles processed: 72377 of 91407 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199504.xml.gz
6524  articles in this file. Extracting 100 article / headline pairs
 71%|███████   | 606/857 [21:14:34<4:29:40, 64.47s/it]
Total articles processed: 72497 of 91548 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199503.xml.gz
3193  articles in this file. Extracting 100 article / headline pairs
 71%|███████   | 607/857 [21:16:00<4:55:10, 70.84s/it]
Total articles processed: 72617 of 91740 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200908.xml.gz
13080  articles in this file. Extracting 100 article / headline pairs
 71%|███████   | 608/857 [21:20:07<8:33:12, 123.66s/it]
Total articles processed: 72737 of 91865 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200006.xml.gz
9434  articles in this file. Extracting 100 article / headline pairs
 71%|███████   | 609/857 [21:21:06<7:11:58, 104.51s/it]
Total articles processed: 72857 of 92017 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200607.xml.gz
1257  articles in this file. Extracting 100 article / headline pairs
 71%|███████   | 610/857 [21:21:21<5:18:41, 77.41s/it] 
Total articles processed: 72977 of 92174 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200110.xml.gz
9437  articles in this file. Extracting 100 article / headline pairs
 71%|███████▏  | 611/857 [21:22:18<4:52:45, 71.40s/it]
Total articles processed: 73097 of 92331 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201004.xml.gz
11161  articles in this file. Extracting 100 article / headline pairs
 71%|███████▏  | 612/857 [21:25:38<7:29:12, 110.01s/it]
Total articles processed: 73217 of 92460 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200305.xml.gz
1980  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 613/857 [21:26:20<6:04:42, 89.68s/it] 
Total articles processed: 73337 of 92634 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200406.xml.gz
3607  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 614/857 [21:27:59<6:14:14, 92.41s/it]
Total articles processed: 73457 of 92798 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200704.xml.gz
988  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 615/857 [21:28:15<4:40:20, 69.50s/it]
Total articles processed: 73577 of 92971 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200707.xml.gz
1263  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 616/857 [21:28:33<3:36:23, 53.87s/it]
Total articles processed: 73697 of 93128 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200601.xml.gz
16321  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 617/857 [21:32:59<7:50:13, 117.56s/it]
Total articles processed: 73817 of 93264 (79.1%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199608.xml.gz
18045  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 618/857 [21:36:15<9:22:09, 141.13s/it]
Total articles processed: 73937 of 93385 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200906.xml.gz
1211  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 619/857 [21:36:31<6:51:13, 103.67s/it]
Total articles processed: 74057 of 93537 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200601.xml.gz
3553  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 620/857 [21:38:04<6:36:10, 100.30s/it]
Total articles processed: 74177 of 93701 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200607.xml.gz
9030  articles in this file. Extracting 100 article / headline pairs
 72%|███████▏  | 621/857 [21:39:11<5:55:19, 90.34s/it] 
Total articles processed: 74297 of 93847 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200211.xml.gz
22486  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 622/857 [21:45:41<11:46:43, 180.44s/it]
Total articles processed: 74417 of 93979 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200412.xml.gz
3427  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 623/857 [21:46:31<9:11:19, 141.37s/it] 
Total articles processed: 74537 of 94120 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200911.xml.gz
19927  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 624/857 [21:51:32<12:14:22, 189.11s/it]
Total articles processed: 74657 of 94254 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200605.xml.gz
10325  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 625/857 [21:53:00<10:14:21, 158.89s/it]
Total articles processed: 74777 of 94402 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201003.xml.gz
1336  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 626/857 [21:53:19<7:29:28, 116.74s/it] 
Total articles processed: 74897 of 94544 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200411.xml.gz
9249  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 627/857 [21:59:39<12:29:58, 195.64s/it]
Total articles processed: 75017 of 94712 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200805.xml.gz
20450  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 628/857 [22:04:58<14:48:16, 232.74s/it]
Total articles processed: 75137 of 94889 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200903.xml.gz
22827  articles in this file. Extracting 100 article / headline pairs
 73%|███████▎  | 629/857 [22:10:25<16:31:53, 261.03s/it]
Total articles processed: 75257 of 95018 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200404.xml.gz
982  articles in this file. Extracting 100 article / headline pairs
 74%|███████▎  | 630/857 [22:10:37<11:45:38, 186.51s/it]
Total articles processed: 75377 of 95201 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201005.xml.gz
12027  articles in this file. Extracting 100 article / headline pairs
 74%|███████▎  | 631/857 [22:12:15<10:01:26, 159.68s/it]
Total articles processed: 75497 of 95344 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200709.xml.gz
19434  articles in this file. Extracting 100 article / headline pairs
 74%|███████▎  | 632/857 [22:17:12<12:33:13, 200.86s/it]
Total articles processed: 75617 of 95475 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200003.xml.gz
852  articles in this file. Extracting 100 article / headline pairs
 74%|███████▍  | 633/857 [22:17:25<8:59:54, 144.62s/it] 
Total articles processed: 75737 of 95620 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200908.xml.gz
1253  articles in this file. Extracting 100 article / headline pairs
 74%|███████▍  | 634/857 [22:17:41<6:34:11, 106.06s/it]
Total articles processed: 75857 of 95775 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199512.xml.gz
11775  articles in this file. Extracting 100 article / headline pairs
 74%|███████▍  | 635/857 [22:20:38<7:51:08, 127.34s/it]
Total articles processed: 75977 of 95915 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200607.xml.gz
3514 
 74%|███████▍  | 636/857 [22:22:06<7:05:32, 115.53s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 76097 of 96098 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199707.xml.gz
14715  articles in this file. Extracting 100 article / headline pairs
 74%|███████▍  | 637/857 [22:26:01<9:15:15, 151.43s/it]
Total articles processed: 76217 of 96237 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200008.xml.gz
9267  articles in this file. Extracting 100 article / headline pairs
 74%|███████▍  | 638/857 [22:26:59<7:30:42, 123.48s/it]
Total articles processed: 76337 of 96391 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199507.xml.gz
6927  articles in this file. Extracting 100 article / headline pairs
 75%|███████▍  | 639/857 [22:27:39<5:57:12, 98.31s/it] 
Total articles processed: 76457 of 96537 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200610.xml.gz
10708  articles in this file. Extracting 100 article / headline pairs
 75%|███████▍  | 640/857 [22:28:55<5:31:48, 91.74s/it]
Total articles processed: 76577 of 96687 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199504.xml.gz
15524  articles in this file. Extracting 100 article / headline pairs
 75%|███████▍  | 641/857 [22:32:57<8:12:33, 136.82s/it]
Total articles processed: 76697 of 96827 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199608.xml.gz
10032  articles in this file. Extracting 100 article / headline pairs
 75%|███████▍  | 642/857 [22:35:14<8:09:42, 136.66s/it]
Total articles processed: 76817 of 96988 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199408.xml.gz
3150  articles in this file. Extracting 100 article / headline pairs
 75%|███████▌  | 643/857 [22:36:28<7:00:49, 117.99s/it]
Total articles processed: 76937 of 97150 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199407.xml.gz
8045  articles in this file. Extracting 100 article / headline pairs
 75%|███████▌  | 644/857 [22:40:43<9:24:49, 159.10s/it]
Total articles processed: 77057 of 97295 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200203.xml.gz
619  articles in this file. Extracting 100 article / headline pairs
 75%|███████▌  | 645/857 [22:40:54<6:45:13, 114.69s/it]
Total articles processed: 77177 of 97434 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200911.xml.gz
1187  articles in this file. Extracting 100 article / headline pairs
 75%|███████▌  | 646/857 [22:41:10<4:59:21, 85.12s/it] 
Total articles processed: 77297 of 97591 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200310.xml.gz
2144  articles in this file. Extracting 100 article / headline pairs
 75%|███████▌  | 647/857 [22:41:50<4:09:41, 71.34s/it]
Total articles processed: 77417 of 97769 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199902.xml.gz
650  articles in this file. Extracting 100 article / headline pairs
 76%|███████▌  | 648/857 [22:42:01<3:06:05, 53.42s/it]
Total articles processed: 77537 of 97909 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200904.xml.gz
3080  articles in this file. Extracting 100 article / headline pairs
 76%|███████▌  | 649/857 [22:43:12<3:22:59, 58.55s/it]
Total articles processed: 77657 of 98087 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200308.xml.gz
18552  articles in this file. Extracting 100 article / headline pairs
 76%|███████▌  | 650/857 [22:50:14<9:38:38, 167.72s/it]
Total articles processed: 77777 of 98227 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200307.xml.gz
22559  articles in this file. Extracting 100 article / headline pairs
 76%|███████▌  | 651/857 [22:56:17<12:56:43, 226.23s/it]
Total articles processed: 77897 of 98360 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200509.xml.gz
16763  articles in this file. Extracting 100 article / headline pairs
 76%|███████▌  | 652/857 [23:00:37<13:27:43, 236.41s/it]
Total articles processed: 78017 of 98497 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200411.xml.gz
9367  articles in this file. Extracting 100 article / headline pairs
 76%|███████▌  | 653/857 [23:01:41<10:28:03, 184.72s/it]
Total articles processed: 78137 of 98660 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200107.xml.gz
8581  articles in this file. Extracting 100 article / headline pairs
 76%|███████▋  | 654/857 [23:02:33<8:10:34, 145.00s/it] 
Total articles processed: 78257 of 98808 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200802.xml.gz
8829  articles in this file. Extracting 100 article / headline pairs
 76%|███████▋  | 655/857 [23:04:22<7:31:22, 134.07s/it]
Total articles processed: 78377 of 98937 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199412.xml.gz
10930  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 656/857 [23:06:09<7:01:22, 125.79s/it]
Total articles processed: 78497 of 99062 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200612.xml.gz
596  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 657/857 [23:06:19<5:03:42, 91.11s/it] 
Total articles processed: 78617 of 99212 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200807.xml.gz
6887  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 658/857 [23:07:09<4:21:29, 78.84s/it]
Total articles processed: 78737 of 99351 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199411.xml.gz
3039  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 659/857 [23:08:24<4:16:50, 77.83s/it]
Total articles processed: 78857 of 99521 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200511.xml.gz
9231  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 660/857 [23:10:32<5:04:04, 92.61s/it]
Total articles processed: 78977 of 99656 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200702.xml.gz
7502  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 661/857 [23:15:15<8:10:01, 150.01s/it]
Total articles processed: 79097 of 99812 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199601.xml.gz
15604  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 662/857 [23:17:55<8:17:15, 153.00s/it]
Total articles processed: 79217 of 99932 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200508.xml.gz
3801  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 663/857 [23:19:29<7:16:45, 135.08s/it]
Total articles processed: 79337 of 100097 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201003.xml.gz
12849  articles in this file. Extracting 100 article / headline pairs
 77%|███████▋  | 664/857 [23:21:15<6:46:16, 126.31s/it]
Total articles processed: 79457 of 100242 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200910.xml.gz
5464  articles in this file. Extracting 100 article / headline pairs
 78%|███████▊  | 665/857 [23:24:05<7:26:33, 139.55s/it]
Total articles processed: 79577 of 100416 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199510.xml.gz
7286  articles in this file. Extracting 100 article / headline pairs
 78%|███████▊  | 666/857 [23:24:48<5:51:51, 110.53s/it]
Total articles processed: 79697 of 100559 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200706.xml.gz
17365  articles in this file. Extracting 100 article / headline pairs
 78%|███████▊  | 667/857 [23:28:43<7:48:25, 147.92s/it]
Total articles processed: 79817 of 100693 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200401.xml.gz
11422  articles in this file. Extracting 100 article / headline pairs
 78%|███████▊  | 668/857 [23:32:52<9:21:14, 178.17s/it]
Total articles processed: 79937 of 100830 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200812.xml.gz
1343 
 78%|███████▊  | 669/857 [23:33:11<6:49:07, 130.57s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 80057 of 100993 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200601.xml.gz
1146  articles in this file. Extracting 100 article / headline pairs
 78%|███████▊  | 670/857 [23:33:28<5:00:39, 96.47s/it] 
Total articles processed: 80177 of 101164 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200310.xml.gz
4047  articles in this file. Extracting 100 article / headline pairs
 78%|███████▊  | 671/857 [23:35:31<5:24:03, 104.53s/it]
Total articles processed: 80297 of 101339 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200802.xml.gz
9033  articles in this file. Extracting 100 article / headline pairs
 78%|███████▊  | 672/857 [23:36:51<4:59:29, 97.13s/it] 
Total articles processed: 80417 of 101495 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200407.xml.gz
16165  articles in this file. Extracting 100 article / headline pairs
 79%|███████▊  | 673/857 [23:40:21<6:41:10, 130.82s/it]
Total articles processed: 80537 of 101621 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201006.xml.gz
1314  articles in this file. Extracting 100 article / headline pairs
 79%|███████▊  | 674/857 [23:40:38<4:54:56, 96.70s/it] 
Total articles processed: 80657 of 101759 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199502.xml.gz
6010  articles in this file. Extracting 100 article / headline pairs
 79%|███████▉  | 675/857 [23:41:13<3:57:35, 78.33s/it]
Total articles processed: 80777 of 101905 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200412.xml.gz
7943  articles in this file. Extracting 100 article / headline pairs
 79%|███████▉  | 676/857 [23:46:32<7:34:10, 150.55s/it]
Total articles processed: 80897 of 102078 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200302.xml.gz
9318  articles in this file. Extracting 100 article / headline pairs
 79%|███████▉  | 677/857 [23:47:36<6:13:03, 124.35s/it]
Total articles processed: 81017 of 102234 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201012.xml.gz
2081  articles in this file. Extracting 100 article / headline pairs
 79%|███████▉  | 678/857 [23:48:21<5:00:22, 100.68s/it]
Total articles processed: 81137 of 102387 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199812.xml.gz
755  articles in this file. Extracting 100 article / headline pairs
 79%|███████▉  | 679/857 [23:48:33<3:39:48, 74.09s/it] 
Total articles processed: 81257 of 102524 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199711.xml.gz
16586  articles in this file. Extracting 100 article / headline pairs
 79%|███████▉  | 680/857 [23:53:31<6:56:48, 141.29s/it]
Total articles processed: 81377 of 102669 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200006.xml.gz
850  articles in this file. Extracting 100 article / headline pairs
 79%|███████▉  | 681/857 [23:53:44<5:01:44, 102.86s/it]
Total articles processed: 81497 of 102805 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199806.xml.gz
1297  articles in this file. Extracting 100 article / headline pairs
 80%|███████▉  | 682/857 [23:54:15<3:56:35, 81.12s/it] 
Total articles processed: 81617 of 102981 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200909.xml.gz
20285  articles in this file. Extracting 100 article / headline pairs
 80%|███████▉  | 683/857 [23:59:23<7:12:30, 149.14s/it]
Total articles processed: 81737 of 103106 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200903.xml.gz
10667  articles in this file. Extracting 100 article / headline pairs
 80%|███████▉  | 684/857 [24:01:00<6:25:39, 133.75s/it]
Total articles processed: 81857 of 103243 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200002.xml.gz
8051  articles in this file. Extracting 100 article / headline pairs
 80%|███████▉  | 685/857 [24:01:49<5:10:33, 108.33s/it]
Total articles processed: 81977 of 103399 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200009.xml.gz
9571  articles in this file. Extracting 100 article / headline pairs
 80%|████████  | 686/857 [24:02:50<4:27:32, 93.87s/it] 
Total articles processed: 82097 of 103546 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200112.xml.gz
13913  articles in this file. Extracting 100 article / headline pairs
 80%|████████  | 687/857 [24:05:55<5:44:00, 121.41s/it]
Total articles processed: 82217 of 103668 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200611.xml.gz
8877  articles in this file. Extracting 100 article / headline pairs
 80%|████████  | 688/857 [24:11:58<9:05:51, 193.80s/it]
Total articles processed: 82337 of 103840 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200301.xml.gz
19404  articles in this file. Extracting 100 article / headline pairs
 80%|████████  | 689/857 [24:16:49<10:24:36, 223.07s/it]
Total articles processed: 82457 of 103964 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200303.xml.gz
13065  articles in this file. Extracting 100 article / headline pairs
 81%|████████  | 690/857 [24:18:23<8:32:50, 184.25s/it] 
Total articles processed: 82577 of 104103 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200202.xml.gz
19929  articles in this file. Extracting 100 article / headline pairs
 81%|████████  | 691/857 [24:24:50<11:18:16, 245.16s/it]
Total articles processed: 82697 of 104247 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199903.xml.gz
9442  articles in this file. Extracting 100 article / headline pairs
 81%|████████  | 692/857 [24:25:53<8:43:49, 190.48s/it] 
Total articles processed: 82817 of 104396 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200512.xml.gz
8907  articles in this file. Extracting 100 article / headline pairs
 81%|████████  | 693/857 [24:31:54<10:59:56, 241.44s/it]
Total articles processed: 82937 of 104564 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200905.xml.gz
1119  articles in this file. Extracting 100 article / headline pairs
 81%|████████  | 694/857 [24:32:13<7:55:14, 174.94s/it] 
Total articles processed: 83057 of 104715 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200801.xml.gz
1429  articles in this file. Extracting 100 article / headline pairs
 81%|████████  | 695/857 [24:32:29<5:43:28, 127.22s/it]
Total articles processed: 83177 of 104876 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199805.xml.gz
3018  articles in this file. Extracting 100 article / headline pairs
 81%|████████  | 696/857 [24:33:41<4:56:50, 110.62s/it]
Total articles processed: 83297 of 105056 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199503.xml.gz
16271  articles in this file. Extracting 100 article / headline pairs
 81%|████████▏ | 697/857 [24:37:55<6:49:44, 153.66s/it]
Total articles processed: 83417 of 105191 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199708.xml.gz
7825  articles in this file. Extracting 100 article / headline pairs
 81%|████████▏ | 698/857 [24:38:41<5:21:34, 121.35s/it]
Total articles processed: 83537 of 105343 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200607.xml.gz
18293  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 699/857 [24:43:25<7:28:05, 170.16s/it]
Total articles processed: 83657 of 105478 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199706.xml.gz
14010  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 700/857 [24:47:02<8:01:46, 184.11s/it]
Total articles processed: 83777 of 105618 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200409.xml.gz
1128  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 701/857 [24:47:16<5:45:58, 133.07s/it]
Total articles processed: 83897 of 105797 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199412.xml.gz
3015  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 702/857 [24:48:26<4:54:51, 114.14s/it]
Total articles processed: 84017 of 105985 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200608.xml.gz
3703  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 703/857 [24:49:59<4:36:50, 107.86s/it]
Total articles processed: 84137 of 106152 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200201.xml.gz
8936  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 704/857 [24:50:56<3:55:46, 92.46s/it] 
Total articles processed: 84257 of 106306 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199906.xml.gz
17093  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 705/857 [24:56:11<6:43:51, 159.42s/it]
Total articles processed: 84377 of 106441 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200704.xml.gz
3425  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 706/857 [24:57:34<5:43:20, 136.43s/it]
Total articles processed: 84497 of 106618 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200205.xml.gz
20207  articles in this file. Extracting 100 article / headline pairs
 82%|████████▏ | 707/857 [25:02:19<7:32:36, 181.05s/it]
Total articles processed: 84617 of 106739 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200402.xml.gz
897  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 708/857 [25:02:31<5:23:50, 130.41s/it]
Total articles processed: 84737 of 106906 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201003.xml.gz
5261  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 709/857 [25:05:25<5:53:22, 143.26s/it]
Total articles processed: 84857 of 107076 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200502.xml.gz
8234  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 710/857 [25:07:27<5:35:20, 136.87s/it]
Total articles processed: 84977 of 107214 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199807.xml.gz
14534  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 711/857 [25:11:13<6:38:39, 163.83s/it]
Total articles processed: 85097 of 107354 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199707.xml.gz
3183  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 712/857 [25:12:27<5:30:29, 136.76s/it]
Total articles processed: 85217 of 107545 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200710.xml.gz
9826  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 713/857 [25:13:35<4:38:38, 116.10s/it]
Total articles processed: 85337 of 107697 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201002.xml.gz
4816  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 714/857 [25:16:08<5:03:11, 127.22s/it]
Total articles processed: 85457 of 107859 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200006.xml.gz
5204  articles in this file. Extracting 100 article / headline pairs
 83%|████████▎ | 715/857 [25:17:37<4:34:12, 115.86s/it]
Total articles processed: 85577 of 107992 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200801.xml.gz
3413  articles in this file. Extracting 100 article / headline pairs
 84%|████████▎ | 716/857 [25:19:06<4:12:52, 107.60s/it]
Total articles processed: 85697 of 108164 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200210.xml.gz
10581  articles in this file. Extracting 100 article / headline pairs
 84%|████████▎ | 717/857 [25:20:20<3:48:08, 97.77s/it] 
Total articles processed: 85817 of 108308 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200911.xml.gz
5068  articles in this file. Extracting 100 article / headline pairs
 84%|████████▍ | 718/857 [25:22:59<4:28:27, 115.88s/it]
Total articles processed: 85937 of 108464 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199801.xml.gz
8053  articles in this file. Extracting 100 article / headline pairs
 84%|████████▍ | 719/857 [25:23:45<3:38:38, 95.06s/it] 
Total articles processed: 86057 of 108616 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200803.xml.gz
20883  articles in this file. Extracting 100 article / headline pairs
 84%|████████▍ | 720/857 [25:29:01<6:08:34, 161.42s/it]
Total articles processed: 86177 of 108752 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200508.xml.gz
9154  articles in this file. Extracting 100 article / headline pairs
 84%|████████▍ | 721/857 [25:30:05<4:59:10, 131.99s/it]
Total articles processed: 86297 of 108904 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200401.xml.gz
7452  articles in this file. Extracting 100 article / headline pairs
 84%|████████▍ | 722/857 [25:31:17<4:16:53, 114.17s/it]
Total articles processed: 86417 of 109047 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200610.xml.gz
3900  articles in this file. Extracting 100 article / headline pairs
 84%|████████▍ | 723/857 [25:32:58<4:05:42, 110.02s/it]
Total articles processed: 86537 of 109217 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200708.xml.gz
17708  articles in this file. Extracting 100 article / headline pairs
 84%|████████▍ | 724/857 [25:37:17<5:43:25, 154.93s/it]
Total articles processed: 86657 of 109352 (79.2%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200407.xml.gz
11603  articles in this file. Extracting 100 article / headline pairs
 85%|████████▍ | 725/857 [25:40:33<6:08:00, 167.27s/it]
Total articles processed: 86777 of 109485 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199912.xml.gz
9393  articles in this file. Extracting 100 article / headline pairs
 85%|████████▍ | 726/857 [25:46:42<8:16:48, 227.55s/it]
Total articles processed: 86897 of 109643 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200102.xml.gz
8235  articles in this file. Extracting 100 article / headline pairs
 85%|████████▍ | 727/857 [25:47:32<6:17:50, 174.39s/it]
Total articles processed: 87017 of 109792 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201001.xml.gz
20217  articles in this file. Extracting 100 article / headline pairs
 85%|████████▍ | 728/857 [25:52:29<7:33:57, 211.15s/it]
Total articles processed: 87137 of 109925 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200909.xml.gz
1154 
 85%|████████▌ | 729/857 [25:52:46<5:26:20, 152.98s/it]
 articles in this file. Extracting 100 article / headline pairs
Total articles processed: 87257 of 110065 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200708.xml.gz
3539  articles in this file. Extracting 100 article / headline pairs
 85%|████████▌ | 730/857 [25:54:15<4:43:25, 133.90s/it]
Total articles processed: 87377 of 110243 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200801.xml.gz
9927  articles in this file. Extracting 100 article / headline pairs
 85%|████████▌ | 731/857 [25:55:23<3:59:29, 114.04s/it]
Total articles processed: 87497 of 110380 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200608.xml.gz
8284  articles in this file. Extracting 100 article / headline pairs
 85%|████████▌ | 732/857 [26:00:57<6:15:03, 180.03s/it]
Total articles processed: 87617 of 110540 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199907.xml.gz
936  articles in this file. Extracting 100 article / headline pairs
 86%|████████▌ | 733/857 [26:01:13<4:30:02, 130.66s/it]
Total articles processed: 87737 of 110678 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199506.xml.gz
7170  articles in this file. Extracting 100 article / headline pairs
 86%|████████▌ | 734/857 [26:01:55<3:33:40, 104.23s/it]
Total articles processed: 87857 of 110815 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200705.xml.gz
11260  articles in this file. Extracting 100 article / headline pairs
 86%|████████▌ | 735/857 [26:03:10<3:14:14, 95.53s/it] 
Total articles processed: 87977 of 110970 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199608.xml.gz
8004  articles in this file. Extracting 100 article / headline pairs
 86%|████████▌ | 736/857 [26:03:56<2:42:34, 80.62s/it]
Total articles processed: 88097 of 111108 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200510.xml.gz
12674  articles in this file. Extracting 100 article / headline pairs
 86%|████████▌ | 737/857 [26:07:25<3:58:21, 119.18s/it]
Total articles processed: 88217 of 111236 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200610.xml.gz
1242  articles in this file. Extracting 100 article / headline pairs
 86%|████████▌ | 738/857 [26:07:40<2:54:06, 87.78s/it] 
Total articles processed: 88337 of 111402 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201004.xml.gz
2361  articles in this file. Extracting 100 article / headline pairs
 86%|████████▌ | 739/857 [26:08:32<2:31:30, 77.04s/it]
Total articles processed: 88457 of 111559 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200004.xml.gz
5375  articles in this file. Extracting 100 article / headline pairs
 86%|████████▋ | 740/857 [26:10:07<2:40:43, 82.42s/it]
Total articles processed: 88577 of 111687 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201007.xml.gz
18798  articles in this file. Extracting 100 article / headline pairs
 86%|████████▋ | 741/857 [26:14:41<4:30:14, 139.78s/it]
Total articles processed: 88697 of 111822 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201012.xml.gz
5683  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 742/857 [26:18:00<5:02:01, 157.58s/it]
Total articles processed: 88817 of 111982 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199711.xml.gz
906  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 743/857 [26:18:14<3:37:46, 114.61s/it]
Total articles processed: 88937 of 112126 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201004.xml.gz
20300  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 744/857 [26:22:53<5:08:41, 163.91s/it]
Total articles processed: 89057 of 112259 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200411.xml.gz
20129  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 745/857 [26:27:15<6:00:58, 193.38s/it]
Total articles processed: 89177 of 112388 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199901.xml.gz
8203  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 746/857 [26:28:04<4:37:23, 149.94s/it]
Total articles processed: 89297 of 112539 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200901.xml.gz
16137  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 747/857 [26:33:20<6:06:33, 199.94s/it]
Total articles processed: 89417 of 112664 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200606.xml.gz
19682  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 748/857 [26:38:46<7:11:51, 237.72s/it]
Total articles processed: 89537 of 112792 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200910.xml.gz
20312  articles in this file. Extracting 100 article / headline pairs
 87%|████████▋ | 749/857 [26:43:36<7:35:49, 253.24s/it]
Total articles processed: 89657 of 112927 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200402.xml.gz
15194  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 750/857 [26:49:17<8:18:48, 279.71s/it]
Total articles processed: 89777 of 113071 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200511.xml.gz
9275  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 751/857 [26:50:30<6:24:27, 217.61s/it]
Total articles processed: 89897 of 113220 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201006.xml.gz
12147  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 752/857 [26:53:56<6:14:57, 214.26s/it]
Total articles processed: 90017 of 113344 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200403.xml.gz
10694  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 753/857 [26:55:20<5:03:28, 175.08s/it]
Total articles processed: 90137 of 113488 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199803.xml.gz
814  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 754/857 [26:55:33<3:36:58, 126.40s/it]
Total articles processed: 90257 of 113631 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_201007.xml.gz
11544  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 755/857 [26:57:07<3:18:24, 116.71s/it]
Total articles processed: 90377 of 113775 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200709.xml.gz
1082  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 756/857 [26:57:21<2:24:32, 85.86s/it] 
Total articles processed: 90497 of 113930 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_201012.xml.gz
18253  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 757/857 [27:01:38<3:48:44, 137.24s/it]
Total articles processed: 90617 of 114063 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200807.xml.gz
18338  articles in this file. Extracting 100 article / headline pairs
 88%|████████▊ | 758/857 [27:06:17<4:56:56, 179.96s/it]
Total articles processed: 90737 of 114260 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200504.xml.gz
1135  articles in this file. Extracting 100 article / headline pairs
 89%|████████▊ | 759/857 [27:06:32<3:33:02, 130.43s/it]
Total articles processed: 90857 of 114430 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200308.xml.gz
2792  articles in this file. Extracting 100 article / headline pairs
 89%|████████▊ | 760/857 [27:07:02<2:42:00, 100.21s/it]
Total articles processed: 90977 of 114570 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199701.xml.gz
13939  articles in this file. Extracting 100 article / headline pairs
 89%|████████▉ | 761/857 [27:10:34<3:34:04, 133.80s/it]
Total articles processed: 91097 of 114703 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200312.xml.gz
3700  articles in this file. Extracting 100 article / headline pairs
 89%|████████▉ | 762/857 [27:12:14<3:15:36, 123.55s/it]
Total articles processed: 91217 of 114889 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200109.xml.gz
687  articles in this file. Extracting 100 article / headline pairs
 89%|████████▉ | 763/857 [27:12:26<2:21:12, 90.14s/it] 
Total articles processed: 91337 of 115039 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200308.xml.gz
884  articles in this file. Extracting 100 article / headline pairs
 89%|████████▉ | 764/857 [27:12:39<1:43:37, 66.86s/it]
Total articles processed: 91457 of 115184 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199708.xml.gz
13859  articles in this file. Extracting 100 article / headline pairs
 89%|████████▉ | 765/857 [27:16:14<2:51:02, 111.55s/it]
Total articles processed: 91577 of 115330 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200205.xml.gz
2783  articles in this file. Extracting 100 article / headline pairs
 89%|████████▉ | 766/857 [27:16:36<2:08:17, 84.59s/it] 
Total articles processed: 91697 of 115478 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200403.xml.gz
4118  articles in this file. Extracting 100 article / headline pairs
 89%|████████▉ | 767/857 [27:18:25<2:18:02, 92.03s/it]
Total articles processed: 91817 of 115645 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200107.xml.gz
898  articles in this file. Extracting 100 article / headline pairs
 90%|████████▉ | 768/857 [27:18:39<1:41:24, 68.36s/it]
Total articles processed: 91937 of 115791 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200202.xml.gz
705  articles in this file. Extracting 100 article / headline pairs
 90%|████████▉ | 769/857 [27:18:50<1:15:02, 51.16s/it]
Total articles processed: 92057 of 115930 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200909.xml.gz
12890  articles in this file. Extracting 100 article / headline pairs
 90%|████████▉ | 770/857 [27:22:34<2:29:33, 103.14s/it]
Total articles processed: 92177 of 116053 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199609.xml.gz
20587  articles in this file. Extracting 100 article / headline pairs
 90%|████████▉ | 771/857 [27:26:07<3:15:00, 136.05s/it]
Total articles processed: 92297 of 116174 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200505.xml.gz
8930  articles in this file. Extracting 100 article / headline pairs
 90%|█████████ | 772/857 [27:27:11<2:42:02, 114.38s/it]
Total articles processed: 92417 of 116327 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200806.xml.gz
3006  articles in this file. Extracting 100 article / headline pairs
 90%|█████████ | 773/857 [27:28:23<2:22:24, 101.73s/it]
Total articles processed: 92537 of 116501 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200411.xml.gz
9131  articles in this file. Extracting 100 article / headline pairs
 90%|█████████ | 774/857 [27:30:40<2:35:35, 112.47s/it]
Total articles processed: 92657 of 116628 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201007.xml.gz
10131  articles in this file. Extracting 100 article / headline pairs
 90%|█████████ | 775/857 [27:33:20<2:52:50, 126.47s/it]
Total articles processed: 92777 of 116754 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200009.xml.gz
5349  articles in this file. Extracting 100 article / headline pairs
 91%|█████████ | 776/857 [27:34:49<2:35:48, 115.42s/it]
Total articles processed: 92897 of 116886 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201011.xml.gz
2107  articles in this file. Extracting 100 article / headline pairs
 91%|█████████ | 777/857 [27:35:36<2:06:36, 94.96s/it] 
Total articles processed: 93017 of 117048 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200503.xml.gz
19144  articles in this file. Extracting 100 article / headline pairs
 91%|█████████ | 778/857 [27:40:15<3:17:39, 150.12s/it]
Total articles processed: 93137 of 117186 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200108.xml.gz
9882  articles in this file. Extracting 100 article / headline pairs
 91%|█████████ | 779/857 [27:41:18<2:41:00, 123.85s/it]
Total articles processed: 93257 of 117345 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199803.xml.gz
3201  articles in this file. Extracting 100 article / headline pairs
 91%|█████████ | 780/857 [27:42:36<2:21:15, 110.07s/it]
Total articles processed: 93377 of 117538 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200602.xml.gz
18996  articles in this file. Extracting 100 article / headline pairs
 91%|█████████ | 781/857 [27:47:28<3:28:38, 164.72s/it]
Total articles processed: 93497 of 117676 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200305.xml.gz
9486  articles in this file. Extracting 100 article / headline pairs
 91%|█████████ | 782/857 [27:48:33<2:48:22, 134.69s/it]
Total articles processed: 93617 of 117835 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200803.xml.gz
3359  articles in this file. Extracting 100 article / headline pairs
 91%|█████████▏| 783/857 [27:49:54<2:26:31, 118.80s/it]
Total articles processed: 93737 of 118002 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200309.xml.gz
9026  articles in this file. Extracting 100 article / headline pairs
 91%|█████████▏| 784/857 [27:50:57<2:04:11, 102.07s/it]
Total articles processed: 93857 of 118150 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199906.xml.gz
910  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 785/857 [27:51:10<1:30:28, 75.39s/it] 
Total articles processed: 93977 of 118296 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200907.xml.gz
12233  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 786/857 [27:54:43<2:17:53, 116.53s/it]
Total articles processed: 94097 of 118420 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199409.xml.gz
8249  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 787/857 [27:59:05<3:07:02, 160.32s/it]
Total articles processed: 94217 of 118580 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200310.xml.gz
23272  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 788/857 [28:04:56<4:10:04, 217.46s/it]
Total articles processed: 94337 of 118716 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_201001.xml.gz
15780  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 789/857 [28:09:53<4:33:29, 241.31s/it]
Total articles processed: 94457 of 118843 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200706.xml.gz
7491  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 790/857 [28:14:46<4:46:40, 256.72s/it]
Total articles processed: 94577 of 119002 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_199804.xml.gz
10408  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 791/857 [28:21:24<5:28:58, 299.07s/it]
Total articles processed: 94697 of 119168 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200509.xml.gz
6898  articles in this file. Extracting 100 article / headline pairs
 92%|█████████▏| 792/857 [28:25:53<5:14:07, 289.96s/it]
Total articles processed: 94817 of 119338 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200004.xml.gz
9196  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 793/857 [28:26:50<3:54:54, 220.23s/it]
Total articles processed: 94937 of 119488 (79.5%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200508.xml.gz
1165  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 794/857 [28:27:04<2:46:20, 158.43s/it]
Total articles processed: 95057 of 119654 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200902.xml.gz
5650  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 795/857 [28:30:20<2:55:16, 169.61s/it]
Total articles processed: 95177 of 119833 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200409.xml.gz
3897  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 796/857 [28:32:00<2:31:04, 148.59s/it]
Total articles processed: 95297 of 120015 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200604.xml.gz
3311  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 797/857 [28:33:20<2:08:12, 128.20s/it]
Total articles processed: 95417 of 120200 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200212.xml.gz
15760  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 798/857 [28:38:24<2:57:51, 180.88s/it]
Total articles processed: 95537 of 120344 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199511.xml.gz
2959  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 799/857 [28:39:31<2:21:46, 146.66s/it]
Total articles processed: 95657 of 120516 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201001.xml.gz
5188  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 800/857 [28:42:18<2:25:13, 152.86s/it]
Total articles processed: 95777 of 120682 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199907.xml.gz
8698  articles in this file. Extracting 100 article / headline pairs
 93%|█████████▎| 801/857 [28:43:10<1:54:28, 122.65s/it]
Total articles processed: 95897 of 120829 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200405.xml.gz
4401  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▎| 802/857 [28:44:18<1:37:14, 106.07s/it]
Total articles processed: 96017 of 120965 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200606.xml.gz
8616  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▎| 803/857 [28:50:01<2:39:25, 177.13s/it]
Total articles processed: 96137 of 121137 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200901.xml.gz
3121  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▍| 804/857 [28:51:19<2:10:22, 147.59s/it]
Total articles processed: 96257 of 121347 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200301.xml.gz
5940  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▍| 805/857 [28:55:13<2:30:12, 173.33s/it]
Total articles processed: 96377 of 121516 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199904.xml.gz
16803  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▍| 806/857 [28:59:57<2:55:46, 206.79s/it]
Total articles processed: 96497 of 121656 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_201005.xml.gz
1349  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▍| 807/857 [29:00:16<2:05:09, 150.19s/it]
Total articles processed: 96617 of 121802 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201007.xml.gz
5720  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▍| 808/857 [29:03:31<2:13:42, 163.73s/it]
Total articles processed: 96737 of 121959 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200505.xml.gz
19415  articles in this file. Extracting 100 article / headline pairs
 94%|█████████▍| 809/857 [29:08:32<2:43:56, 204.93s/it]
Total articles processed: 96857 of 122094 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200208.xml.gz
6644  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▍| 810/857 [29:09:35<2:07:14, 162.44s/it]
Total articles processed: 96977 of 122233 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199602.xml.gz
15269  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▍| 811/857 [29:12:09<2:02:36, 159.93s/it]
Total articles processed: 97097 of 122355 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_201008.xml.gz
5965  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▍| 812/857 [29:15:36<2:10:22, 173.82s/it]
Total articles processed: 97217 of 122516 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200711.xml.gz
3436  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▍| 813/857 [29:17:00<1:47:41, 146.85s/it]
Total articles processed: 97337 of 122711 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199509.xml.gz
13351  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▍| 814/857 [29:19:10<1:41:48, 142.05s/it]
Total articles processed: 97457 of 122835 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_199604.xml.gz
16091  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▌| 815/857 [29:21:56<1:44:20, 149.06s/it]
Total articles processed: 97577 of 122958 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200012.xml.gz
8007  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▌| 816/857 [29:22:47<1:21:49, 119.73s/it]
Total articles processed: 97697 of 123113 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199903.xml.gz
840  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▌| 817/857 [29:23:00<58:33, 87.83s/it]   
Total articles processed: 97817 of 123257 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199907.xml.gz
16154  articles in this file. Extracting 100 article / headline pairs
 95%|█████████▌| 818/857 [29:27:49<1:36:09, 147.94s/it]
Total articles processed: 97937 of 123397 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200011.xml.gz
715  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▌| 819/857 [29:28:01<1:07:59, 107.36s/it]
Total articles processed: 98057 of 123539 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200701.xml.gz
3690  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▌| 820/857 [29:29:31<1:02:57, 102.08s/it]
Total articles processed: 98177 of 123705 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200905.xml.gz
5913  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▌| 821/857 [29:32:48<1:18:20, 130.57s/it]
Total articles processed: 98297 of 123869 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199811.xml.gz
867  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▌| 822/857 [29:33:02<55:39, 95.42s/it]   
Total articles processed: 98417 of 124006 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_199808.xml.gz
853  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▌| 823/857 [29:33:14<40:01, 70.64s/it]
Total articles processed: 98537 of 124133 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200304.xml.gz
10231  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▌| 824/857 [29:34:24<38:42, 70.37s/it]
Total articles processed: 98657 of 124278 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200401.xml.gz
1850  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▋| 825/857 [29:34:58<31:43, 59.48s/it]
Total articles processed: 98777 of 124452 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200506.xml.gz
4616  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▋| 826/857 [29:37:07<41:32, 80.42s/it]
Total articles processed: 98897 of 124618 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_199609.xml.gz
7828  articles in this file. Extracting 100 article / headline pairs
 96%|█████████▋| 827/857 [29:37:53<35:00, 70.03s/it]
Total articles processed: 99017 of 124763 (79.4%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200510.xml.gz
1093  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 828/857 [29:38:08<25:45, 53.31s/it]
Total articles processed: 99137 of 124945 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/wpb_eng_201009.xml.gz
2114  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 829/857 [29:38:56<24:10, 51.82s/it]
Total articles processed: 99257 of 125112 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200409.xml.gz
9099  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 830/857 [29:39:59<24:54, 55.34s/it]
Total articles processed: 99377 of 125259 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199611.xml.gz
12692  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 831/857 [29:42:56<39:46, 91.78s/it]
Total articles processed: 99497 of 125408 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200703.xml.gz
1300  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 832/857 [29:43:11<28:38, 68.75s/it]
Total articles processed: 99617 of 125579 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200102.xml.gz
759  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 833/857 [29:43:23<20:43, 51.80s/it]
Total articles processed: 99737 of 125722 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/afp_eng_200604.xml.gz
12320  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 834/857 [29:46:08<32:46, 85.50s/it]
Total articles processed: 99857 of 125866 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/nyt_eng_200603.xml.gz
9359  articles in this file. Extracting 100 article / headline pairs
 97%|█████████▋| 835/857 [29:52:24<1:03:20, 172.77s/it]
Total articles processed: 99977 of 126034 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/cna_eng_200803.xml.gz
1380  articles in this file. Extracting 100 article / headline pairs
 98%|█████████▊| 836/857 [29:52:44<44:27, 127.03s/it]  
Total articles processed: 100097 of 126200 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_200410.xml.gz
12005  articles in this file. Extracting 100 article / headline pairs
 98%|█████████▊| 837/857 [29:56:12<50:23, 151.16s/it]
Total articles processed: 100217 of 126330 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_199712.xml.gz
3051  articles in this file. Extracting 100 article / headline pairs
 98%|█████████▊| 838/857 [29:57:22<40:12, 126.96s/it]
Total articles processed: 100337 of 126507 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/xin_eng_200001.xml.gz
8516  articles in this file. Extracting 100 article / headline pairs
 98%|█████████▊| 839/857 [29:58:10<30:55, 103.07s/it]
Total articles processed: 100457 of 126655 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/ltw_eng_200511.xml.gz
3839  articles in this file. Extracting 100 article / headline pairs
 98%|█████████▊| 840/857 [29:59:45<28:34, 100.83s/it]
Total articles processed: 100577 of 126838 (79.3%)
Parsing file: /Volumes/Alex Hard Drive/anno_eng_gigaword_5/data/xml/apw_eng_199604.xml.gz
13000  articles in this file. Extracting 100 article / headline pairs
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-21-f7c3ee7803d6> in <module>()
     15     print('Parsing file: %s' % file)
     16     #proc = mp.Process(target=
---> 17     write_tab_seperated(file, headlines, articles, articles_per_file=120)
     18     #proc.start()
     19     #proc.join()

<ipython-input-19-e74de0f4513b> in write_tab_seperated(file, headlines, articles, articles_per_file)
     19             headline = extract_headline(hdln)
     20             article = extract_art_txt(txt, n_sents = NUM_ART_SENTS)
---> 21             if len(headline) > MAX_HDLN_LEN or len(article) > MAX_ART_LEN: continue
     22 
     23             headline = ' '.join(headline)

TypeError: object of type 'NoneType' has no len()

In [4]:
with open('articles.pkl', 'wb') as f:
    pickle.dump(articles, f)

with open('headlines.pkl', 'wb') as f:
    pickle.dump(headlines, f)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-427334b793f9> in <module>()
      1 with open('articles.pkl', 'wb') as f:
----> 2     pickle.dump(articles, f)
      3 
      4 with open('headlines.pkl', 'wb') as f:
      5     pickle.dump(headlines, f)

NameError: name 'articles' is not defined

In [23]:
len(articles), len(headlines)


Out[23]:
99812