In [ ]:
# Parse XMLs
# Preprocess documents
# Create time-window bins
# Output: processed document bins

EOS to MongoDB


In [1]:
import pymongo
from pymongo import MongoClient
import glob
import datetime
import pprint
import codecs
import xmltodict
import gensim
import string 
from bson.objectid import ObjectId
from multiprocessing import Pool
import gzip

import pickle
from matplotlib import pyplot as plt
%matplotlib notebook
import dateparser
from datetime import datetime

import spacy
nlp = spacy.load('en')  # 'en' shortcut link; newer spaCy versions use spacy.load('en_core_web_sm')

In [3]:
def get_db():
    """Connect to the local MongoDB instance and return the database handle."""
    client = MongoClient('localhost:27017')
    db = client.db
    return db

# get_db()

def get_collection_cursor():
    """Return a cursor over the full eos_violence collection."""
    db = get_db()
    return db.get_collection('eos_violence').find({}).batch_size(1000)

def get_collection_stats():
    """Print the names of the non-system collections in the database."""
    collections = get_db().collection_names(include_system_collections=False)
    for collect in collections:
        print(collect)

get_collection_stats()
# get_db().drop_collection('eos_violence.collection.collection')
print(get_collection_cursor().count())


eos_violence
collection
680456
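
Cursor.count() still works on this pymongo version but is deprecated in current releases; the same count expressed with the newer API (count_documents, available from pymongo 3.7 on) would be:

In [ ]:
# Newer pymongo equivalent of the deprecated cursor.count() used above
print(get_db().get_collection('eos_violence').count_documents({}))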

In [4]:
%%time


eos_list = glob.glob('data/violence_iraq_v1/*.xml')
eos_list.sort()
print (len(eos_list))


680475
CPU times: user 1.66 s, sys: 204 ms, total: 1.87 s
Wall time: 1.87 s

In [5]:
%%time


def doc_generator(path):
    """
    Generator that reads each XML file in the given list of paths
    and yields its parsed 'Document' element as a dict.
    """

    for fname in path:
        with codecs.open(fname, encoding='utf_8') as doc_file:
            try:
                yield xmltodict.parse(doc_file.read())['Document']
            except Exception as e:
                pass  # skip files that fail to parse


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.81 µs

In [ ]:
i=0
# for item in enumerate(doc_generator(eos_list)):
#     i += 1
#     print(item)
#     break

# count how many of the XML files parse successfully
print(sum(1 for _ in doc_generator(eos_list)))

In [ ]:
%%time

def insert_docs():
    """Bulk-insert every parsed document into the eos_violence collection."""
    db = get_db()
    print(type(db))
    print(pymongo.version)

    result = db.get_collection('eos_violence').insert_many(doc_generator(eos_list))
    print(result)

insert_docs()
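
If a single malformed document should not abort the whole load, insert_many also accepts ordered=False, and batching keeps the failure domain small. A sketch only (insert_docs_batched is not part of the original run; it reuses get_db, doc_generator and eos_list from the cells above):

In [ ]:
from itertools import islice
from pymongo.errors import BulkWriteError

def insert_docs_batched(batch_size=10000):
    """Insert parsed documents in fixed-size batches. With ordered=False,
    MongoDB attempts every document in a batch even if some fail; the
    failures are reported in a BulkWriteError at the end of the batch."""
    db = get_db()
    docs = doc_generator(eos_list)
    total = 0
    while True:
        batch = list(islice(docs, batch_size))
        if not batch:
            break
        try:
            result = db.get_collection('eos_violence').insert_many(batch, ordered=False)
            total += len(result.inserted_ids)
        except BulkWriteError as bwe:
            total += bwe.details.get('nInserted', 0)
        print('inserted %d documents so far' % total)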

In [6]:
doc = get_db().get_collection('eos_violence').find_one({"Id": 'en_20120101_1stheadli17961_990428600000'})
# print(doc['tokens'])
print(doc)


{'_id': ObjectId('59755483bb5459719b5588ec'), 'PublicationDateTime': '01 Jan 2012 05:19:00', 'SourceName': '1st Headlines- Pakistan', 'Id': 'en_20120101_1stheadli17961_990428600000', 'CaptureDateTime': None, 'Encoding': 'UTF-8', 'Language': None, 'Content': '>One of the last major cities to mark the end of 2011 was New  \nYork, where Lady Gaga helped launched the giant glitterball in  \nTimes Square. \n  The large, illuminated crystal ball was lowered for the last  \nminute of the passing year - a tradition started in 1907. \n  Fireworks then filled the sky at midnight and confetti dropped  \non revelers in the square. \n  Around one million people attended the celebrations, which end a  \nyear of <-troubled-> economic times for the US and the tenth  \nanniversary of the September 11 <-attacks->. \n   \nLady Gaga and New York Mayor Michael Bloomberg \n  "There\'s no doubt that 2012 will bring even more change," US  \nPresident Barack Obama said in his end-of-year address. \n  "And as we head into the New Year, I\'m hopeful that we have what  \nit takes to face that change and come out even stronger." \n  Across South America the fireworks continued - in Rio de  \nJaneiro, an estimated two million white-clad partygoers watched  \na spectacular "green" fireworks extravaganza on Copacabana  \nbeach. \n  In Berlin another spectacular display lit up the night sky with  \npartying at the Brandenburg Gate, while in Paris some 360,000  \npeople flocked to the Champs-Elysees. \n  In Kiev there was a football-themed countdown before fireworks  \nlit up Independence Square. Ukraine will co-host Euro 2012 with  \nPoland. \n  In the Egyptian capital, there were no pyrotechnics - instead  \nthe assembled masses in Revolution Square held a candlelit vigil  \nto remember all those who had lost their lives during the Arab  \nSpring of 2011. \n   \nAn extravagant firework display lit up the Sydney sky \n  Earlier, Sydney and Hong Kong set the New Year\'s Eve standard  \nwith glittering extravaganzas. \n  Sydney\'s famous harbour exploded in a blaze of colour on the  \nstroke of midnight Down Under. \n  "Every year we make sure our celebrations are bigger and better  \nthan the one before," Sydney Lord Mayor Clover Moore said. \n  In Hong Kong, a barrage of fireworks were fired from several of  \nits iconic buildings, delighting partygoers crammed on to the  \nwaterfront and in pleasure boats. \n  The mood was more somber in Tokyo, but Dubai led the way in the  \nMiddle East. \n  The city\'s Burj Khalifa skyscraper, the world\'s tallest man-made  \nstructure, hosted a pyrotechnics display even more extravagant  \nthan 12 months ago. \n   \n:: Britain welcomes the New Year with a bang. \n  :: World leaders evoke 2011\'s events in their New Year\'s  \nmessages. \n  try{if(OB_SK_enabled){var  \nOB_permalink=document.location.href.indexOf("news.sky.com")==-1?OB_SK_default_permalink:document.location.href;var \nOB_widgetId="AR_1";var OB_Template="customtemplate";var  \nOB_langJS="http://widgets.outbrain.com/lang_en.js";if(typeof(OB_Script)!="undefined"){OutbrainStart()}else{var \nOB_Script=true;var str=" Weather \n  The weather servers are currently busy, please try again in a  \ncouple of minutes. 
\n  Goodyear We would like your feedback, please fill in our survey  \nLatest From Sky Sports FA reveal Suarez ban reasons \n  The Football Association has now released full written reasons  \nfor Liverpool star Luis Suarez\'s ban for racially abusing  \nPatrice Evra \n   \nUnited quiet on Rooney reports \n  Manchester United have made no comment on claims that Wayne  \nRooney missed the shock defeat by Blackburn due to disciplinary  \nreasons. \n   \nCruise for Clijsters \n  Kim Clijsters admitted to feeling a little rusty after sweeping  \naside Romania\'s Simona Halep in the first <-round-> of the Brisbane  \nInternational. \n  Latest From Sky Living The Saturdays\' Rochelle Wiseman And JLS\'  \nMarvin Humes Are Engaged Russell Brand Files For Divorce From  \nKaty Perry Our Favourite Video Interviews Of The Year With  \nJennifer Aniston, Anna Wintour and Anne Hathaway Your Photos  \nSNOW DECEMBER 2011 GENERAL CHRISTMAS 2011 BEAUTIFUL BRITAIN YOUR  \nREPORT BREAKING NEWS WILD WEATHER DREAM DESTINATIONS YOUR DOGS  \nYOUR CATS Upload your photos Your Videos Child bins christmas  \ncard!! <-syria-> regime <-crimes-> Congestion - Hammersmith <-Clashes-> with  \npolice Steam Train Departure <-syria-> regime <-crimes-> manonabeach  \nmeets Jim Woolley. joanna yeates Carnival 2011 A day in the life  \nof a £1 coin Upload your videos Sky News Extra RSS SMS  \nAlerts About Us Sky News International How To Get Sky News Sky  \nNews Panel Sky Player Podcast Sky News Radio \n \n (C) Copyright 2012, BSKYB. All Rights Reserved. \n \n URL: http://news.sky.com/skynews/Home/World-News/Celebrations-Sweep-Globe-For-Start-Of-2012-New-Year-Welcomed-By-Revellers-Around-The-World/Article/201201116140409?f=rss', 'Title': 'Celebrations Sweep Globe For Start Of 2012', 'SourceCode': None}

In [11]:
def punct_space(token):
    """
    helper function to flag tokens that are pure punctuation,
    whitespace, or shorter than two characters
    """
    return token.is_punct or token.is_space or (len(token.orth_) < 2)
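
A quick illustrative check of the filter on a sample sentence (not part of the original run):

In [ ]:
# tokens that are punctuation, whitespace, or single characters are dropped
sample = [token.lemma_ for token in nlp('Fireworks filled the sky at midnight!') if not punct_space(token)]
print(sample)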

In [13]:
with open('data/stopwords', encoding='utf-8') as fp:
    stopwords = fp.read().split('\n')
    
exclude = ["’s", "'s", "-PRON-", "'''", "``", "–", "‘a", "--", "..."]
stopwords.extend(exclude)
stopwords.extend(list(string.punctuation))
# print(stopwords)
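
Since process_corpus below does a stopword membership test for every token across ~680K documents, converting the list to a set is a cheap win (optional, not required for correctness):

In [ ]:
stopwords = set(stopwords)  # O(1) membership tests instead of scanning a list per token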

In [14]:
# load the finished model from disk
bigram_model = gensim.models.phrases.Phraser.load('data/eos/ngram/bigram_model_%s' % 'all')

print('bigram loaded')


bigram loaded
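
The phrase model was trained elsewhere; a minimal sketch of how such a Phraser is typically built with gensim (unigram_sentences and the min_count/threshold values here are illustrative, not the original training settings):

In [ ]:
from gensim.models.phrases import Phrases, Phraser

# unigram_sentences: an iterable of token lists, e.g. lemmatised documents
phrases = Phrases(unigram_sentences, min_count=20, threshold=10.0)
bigram_model = Phraser(phrases)   # lighter wrapper that only keeps the scored phrases
bigram_model.save('data/eos/ngram/bigram_model_all')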

In [15]:
%%time


def process_corpus(cursor, db):
    """Lemmatise each document, apply the bigram phrase model, drop stopwords,
    and write the resulting token list back to the document in MongoDB."""
    print ("Start documents process" + datetime.now().strftime("%H:%M:%S"))
    i = 0
    for doc in cursor:
        # lemmatize the text, removing punctuation and whitespace
        try:
            unigram_review = [token.lemma_ for token in nlp(doc['Text']) if not punct_space(token)]
        except:
            # fall back to the 'Content' field if 'Text' is missing or unparsable
            try:
                unigram_review = [token.lemma_ for token in nlp(doc['Content']) if not punct_space(token)]
            except:
                # neither field is usable; skip the document
                continue

        try:
            # apply the first-order phrase model
            ngram_doc = bigram_model[unigram_review]
            # remove any remaining stopwords and terms from the exclude list
            ngram_doc = [term for term in ngram_doc if term not in stopwords]
            # write the token list back onto the document
            # (update() is deprecated in newer pymongo; see the update_one note after the run below)
            db.get_collection('eos_violence').update({'_id': ObjectId(doc['_id'])},
                                 {'$set': {'tokens' : ngram_doc}}, upsert=False)
            i += 1
            if (i % 20000) == 0:
                print ("Process {} documents so far ".format(i) + datetime.now().strftime("%H:%M:%S"))

        except Exception as e:
            print(e)


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 14.5 µs

In [16]:
# re-create a fresh cursor and confirm the collection size before the long processing run
def get_collection_cursor():
    db = get_db()
    return db.get_collection('eos_violence').find({}).batch_size(1000)

print(get_collection_cursor().count())


680456

In [17]:
%%time


process_corpus(get_collection_cursor(), get_db())


Start documents process22:18:00
/usr/local/lib/python3.5/dist-packages/gensim/models/phrases.py:274: UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class
  warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:23: DeprecationWarning: update is deprecated. Use replace_one, update_one or update_many instead.
Process 20000 documents so far 22:30:48
Process 40000 documents so far 22:43:35
Process 60000 documents so far 22:58:34
Process 80000 documents so far 23:12:29
Process 100000 documents so far 23:25:29
Process 120000 documents so far 23:37:44
Process 140000 documents so far 23:49:06
Process 160000 documents so far 00:00:27
Process 180000 documents so far 00:11:45
Process 200000 documents so far 00:24:16
Process 220000 documents so far 00:35:48
Process 240000 documents so far 00:48:47
Process 260000 documents so far 01:01:21
Process 280000 documents so far 01:13:46
Process 300000 documents so far 01:24:17
Process 320000 documents so far 01:35:23
Process 340000 documents so far 01:45:58
Process 360000 documents so far 01:57:34
Process 380000 documents so far 02:10:17
Process 400000 documents so far 02:22:42
Process 420000 documents so far 02:34:56
Process 440000 documents so far 02:48:41
Process 460000 documents so far 03:03:24
Process 480000 documents so far 03:17:15
Process 500000 documents so far 03:30:20
Process 520000 documents so far 03:46:12
Process 540000 documents so far 03:58:51
Process 560000 documents so far 04:12:09
Process 580000 documents so far 04:26:55
Process 600000 documents so far 04:43:27
Process 620000 documents so far 04:58:18
Process 640000 documents so far 05:11:44
Process 660000 documents so far 05:24:45
Process 680000 documents so far 05:39:20
CPU times: user 7h 18min 42s, sys: 33.8 s, total: 7h 19min 16s
Wall time: 7h 21min 39s
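
The DeprecationWarning above is harmless on this pymongo version, but the same write expressed with the current API would be a drop-in replacement for the update() call inside process_corpus:

In [ ]:
# modern pymongo equivalent of the deprecated Collection.update() used above;
# doc and ngram_doc refer to the loop variables inside process_corpus
db.get_collection('eos_violence').update_one(
    {'_id': ObjectId(doc['_id'])},
    {'$set': {'tokens': ngram_doc}},
    upsert=False)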

In [18]:
%%time

from operator import itemgetter

def corpus_info():
    """Group document Ids into per-month time windows by publication date,
    pickle the mapping, and plot the number of documents per window."""
    cursor = get_collection_cursor()
    
    corpus = {}
    doc_count = 0
        
    for doc in cursor:        
        doc_date = dateparser.parse(doc['PublicationDateTime']).strftime('%Y_%m')        
        if (doc_date in corpus):
            corpus[doc_date].append(doc['Id'])   
        else:
            corpus[doc_date] = [doc['Id']]
            
        doc_count += 1       
        if (doc_count % 50000) == 0:
            print("Counted {} documents so far ".format(doc_count) + datetime.now().strftime("%H:%M:%S"))

    with open('data/eos/data_windows.pkl', 'wb') as f:
        pickle.dump(corpus, f)
    
    x = sorted(corpus) # time window
    y = [] # number of docs
    for item in x:
        y.append(len(corpus[item]))
        
    print(x)
    print(y)
    print('total number of documents: %d' % sum(y))

    plt.bar(range(len(y)), y, align='center')
    plt.xticks(range(len(x)), x, size='small')
    plt.title('Time Windows')
    plt.xlabel('Time')
    plt.xticks(rotation=30)
    plt.show()
    plt.savefig('data/eos/graphs/windows_corpus_info_2016_2017.png', bbox_inches='tight', dpi=800)

corpus_info()


Counted 50000 documents so far 05:40:58
Counted 100000 documents so far 05:42:20
Counted 150000 documents so far 05:43:37
Counted 200000 documents so far 05:44:55
Counted 250000 documents so far 05:46:13
Counted 300000 documents so far 05:47:30
Counted 350000 documents so far 05:48:49
Counted 400000 documents so far 05:50:07
Counted 450000 documents so far 05:51:25
Counted 500000 documents so far 05:52:45
Counted 550000 documents so far 05:54:05
Counted 600000 documents so far 05:55:23
Counted 650000 documents so far 05:56:43
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
[20416, 19505, 22537, 37400, 26776, 25819, 18341, 11017, 1732, 1180, 1536, 6506, 12291, 12740, 17523, 14196, 28566, 46969, 1490, 18852, 1043, 840, 6423, 14470, 19148, 9066, 11383, 16972, 13677, 13765, 2696, 10222, 17430, 21171, 12632, 5770, 6, 27, 46, 15, 61, 32, 33, 38, 55, 8462, 8217, 5285, 4928, 6386, 10071, 5585, 12457, 11344, 9810, 13358, 13840, 10838, 3431, 6834, 6441, 2447, 3664, 6166, 3665, 4670, 144]
total number of documents: 680456
CPU times: user 17min 22s, sys: 2.22 s, total: 17min 24s
Wall time: 17min 54s
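
dateparser.parse is flexible but slow per call, and accounts for much of the ~17 minute pass above. Every PublicationDateTime seen so far follows the '01 Jan 2012 05:19:00' pattern, so a strptime fast path (with dateparser only as fallback) would cut this considerably. A sketch under that single-format assumption:

In [ ]:
def parse_pub_date(value):
    """Fast path for the fixed 'DD Mon YYYY HH:MM:SS' format; dateparser as fallback."""
    try:
        return datetime.strptime(value, '%d %b %Y %H:%M:%S')
    except ValueError:
        return dateparser.parse(value)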

In [19]:
def get_collection_cursor(ids):
    """Return a cursor over the documents whose Id field is in the given list."""
    db = get_db()
    return db.get_collection('eos_violence').find({"Id": {'$in': ids }}).batch_size(1000)

In [20]:
def graph_corpus(corpus):
    x = sorted(corpus) # time window
    y = [] # number of docs
    for item in x:
        y.append(len(corpus[item]))
        
    print(x)
    print(y)
    print('total number of documents: %d' % sum(y))

    plt.bar(range(len(y)), y, align='center')
    plt.xticks(range(len(x)), x, size='small')
    plt.title('Time Windows')
    plt.xlabel('Time')
    plt.xticks(rotation=30)
    plt.show()
    plt.savefig('data/eos/graphs/windows_corpus_info_2016_2017.png', bbox_inches='tight', dpi=800)

In [22]:
%%time


import random

with open('data/eos/data_windows.pkl', 'rb') as f:
    corpus = pickle.load(f)
    
x = sorted(corpus) # time window
y = [] # number of docs
for item in x:
    y.append(len(corpus[item]))
    
print(min(y))
average = int(sum(y) / float(len(y)))
print(average)    

# for window in corpus:
#     random.shuffle(corpus[window])
#     if (len(corpus[window]) > 75000 ):
#         limit = int(len(corpus[window]) / 2)
#         doc_ids = corpus[window][:limit]
#         corpus[window] = doc_ids

bin_location = 'dynamic_nmf/data/windowbin/slices/tokenized_window_%s.gz'

graph_corpus(corpus)

# combined file holding every window's documents
f_all = gzip.open(bin_location % 'all', 'w')

for window in corpus:
    print("Processing %s documents so far " % format(window) + datetime.now().strftime("%H:%M:%S"))
    cursor = get_collection_cursor(corpus[window])

    # one line per document: "<Id>\t<space-joined tokens>"
    doc_list = []
    for doc in cursor:
        try:
            doc_list.append(doc['Id'] + '\t' + u' '.join(doc['tokens']))
        except:
            pass  # skip documents that never received a 'tokens' field
    print(len(doc_list))

    # write the per-window bin and append the same lines to the combined file
    f = gzip.open(bin_location % window, 'w')
    f.write( '\n'.join( doc_list ).encode("utf-8") )
    f_all.write( '\n'.join( doc_list ).encode("utf-8") )
    f_all.write( '\n'.encode("utf-8") )  # separator so windows do not run together in the combined file
    f.close()
    print("Finished %s documents so far " % format(window) + datetime.now().strftime("%H:%M:%S"))

f_all.close()


6
10156
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
[20416, 19505, 22537, 37400, 26776, 25819, 18341, 11017, 1732, 1180, 1536, 6506, 12291, 12740, 17523, 14196, 28566, 46969, 1490, 18852, 1043, 840, 6423, 14470, 19148, 9066, 11383, 16972, 13677, 13765, 2696, 10222, 17430, 21171, 12632, 5770, 6, 27, 46, 15, 61, 32, 33, 38, 55, 8462, 8217, 5285, 4928, 6386, 10071, 5585, 12457, 11344, 9810, 13358, 13840, 10838, 3431, 6834, 6441, 2447, 3664, 6166, 3665, 4670, 144]
total number of documents: 680456
Processing 2012_07 documents so far 17:44:59
18341
Finished 2012_07 documents so far 17:45:07
Processing 2016_10 documents so far 17:45:07
10838
Finished 2016_10 documents so far 17:45:13
Processing 2012_03 documents so far 17:45:13
22537
Finished 2012_03 documents so far 17:45:23
Processing 2015_01 documents so far 17:45:23
6
Finished 2015_01 documents so far 17:45:26
Processing 2012_02 documents so far 17:45:26
19505
Finished 2012_02 documents so far 17:45:35
Processing 2015_02 documents so far 17:45:35
27
Finished 2015_02 documents so far 17:45:37
Processing 2016_09 documents so far 17:45:37
13840
Finished 2016_09 documents so far 17:45:44
Processing 2015_04 documents so far 17:45:44
15
Finished 2015_04 documents so far 17:45:46
Processing 2013_08 documents so far 17:45:46
18852
Finished 2013_08 documents so far 17:45:55
Processing 2013_10 documents so far 17:45:55
840
Finished 2013_10 documents so far 17:45:57
Processing 2017_01 documents so far 17:45:57
6441
Finished 2017_01 documents so far 17:46:02
Processing 2014_09 documents so far 17:46:02
17430
Finished 2014_09 documents so far 17:46:10
Processing 2014_03 documents so far 17:46:10
11383
Finished 2014_03 documents so far 17:46:16
Processing 2017_07 documents so far 17:46:16
144
Finished 2017_07 documents so far 17:46:18
Processing 2012_09 documents so far 17:46:18
1732
Finished 2012_09 documents so far 17:46:21
Processing 2012_10 documents so far 17:46:21
1180
Finished 2012_10 documents so far 17:46:24
Processing 2015_11 documents so far 17:46:24
8208
Finished 2015_11 documents so far 17:46:29
Processing 2012_05 documents so far 17:46:29
26776
Finished 2012_05 documents so far 17:46:41
Processing 2017_02 documents so far 17:46:41
2447
Finished 2017_02 documents so far 17:46:44
Processing 2014_07 documents so far 17:46:44
2696
Finished 2014_07 documents so far 17:46:48
Processing 2015_12 documents so far 17:46:48
5279
Finished 2015_12 documents so far 17:46:52
Processing 2012_11 documents so far 17:46:52
1536
Finished 2012_11 documents so far 17:46:55
Processing 2013_07 documents so far 17:46:55
1490
Finished 2013_07 documents so far 17:46:58
Processing 2017_04 documents so far 17:46:58
6166
Finished 2017_04 documents so far 17:47:03
Processing 2016_03 documents so far 17:47:03
10071
Finished 2016_03 documents so far 17:47:09
Processing 2016_07 documents so far 17:47:09
9810
Finished 2016_07 documents so far 17:47:15
Processing 2016_04 documents so far 17:47:15
5585
Finished 2016_04 documents so far 17:47:20
Processing 2014_01 documents so far 17:47:20
19148
Finished 2014_01 documents so far 17:47:29
Processing 2016_08 documents so far 17:47:29
13358
Finished 2016_08 documents so far 17:47:37
Processing 2015_09 documents so far 17:47:37
55
Finished 2015_09 documents so far 17:47:39
Processing 2016_01 documents so far 17:47:39
4928
Finished 2016_01 documents so far 17:47:43
Processing 2013_01 documents so far 17:47:43
12291
Finished 2013_01 documents so far 17:47:50
Processing 2013_06 documents so far 17:47:50
46969
Finished 2013_06 documents so far 17:48:09
Processing 2015_03 documents so far 17:48:09
46
Finished 2015_03 documents so far 17:48:11
Processing 2015_06 documents so far 17:48:11
32
Finished 2015_06 documents so far 17:48:13
Processing 2015_07 documents so far 17:48:13
33
Finished 2015_07 documents so far 17:48:16
Processing 2015_08 documents so far 17:48:16
38
Finished 2015_08 documents so far 17:48:18
Processing 2015_05 documents so far 17:48:18
61
Finished 2015_05 documents so far 17:48:21
Processing 2012_01 documents so far 17:48:21
20416
Finished 2012_01 documents so far 17:48:29
Processing 2013_11 documents so far 17:48:29
6423
Finished 2013_11 documents so far 17:48:34
Processing 2017_05 documents so far 17:48:34
3665
Finished 2017_05 documents so far 17:48:38
Processing 2016_11 documents so far 17:48:38
3431
Finished 2016_11 documents so far 17:48:42
Processing 2012_06 documents so far 17:48:42
25819
Finished 2012_06 documents so far 17:48:52
Processing 2014_06 documents so far 17:48:52
13765
Finished 2014_06 documents so far 17:48:59
Processing 2013_04 documents so far 17:48:59
14196
Finished 2013_04 documents so far 17:49:06
Processing 2016_05 documents so far 17:49:06
12457
Finished 2016_05 documents so far 17:49:13
Processing 2016_12 documents so far 17:49:13
6834
Finished 2016_12 documents so far 17:49:18
Processing 2012_12 documents so far 17:49:18
6506
Finished 2012_12 documents so far 17:49:23
Processing 2017_06 documents so far 17:49:23
4670
Finished 2017_06 documents so far 17:49:27
Processing 2014_10 documents so far 17:49:27
21171
Finished 2014_10 documents so far 17:49:37
Processing 2014_12 documents so far 17:49:37
5770
Finished 2014_12 documents so far 17:49:42
Processing 2013_05 documents so far 17:49:42
28566
Finished 2013_05 documents so far 17:49:53
Processing 2014_08 documents so far 17:49:53
10222
Finished 2014_08 documents so far 17:49:59
Processing 2015_10 documents so far 17:49:59
8462
Finished 2015_10 documents so far 17:50:04
Processing 2017_03 documents so far 17:50:04
3664
Finished 2017_03 documents so far 17:50:08
Processing 2012_08 documents so far 17:50:08
11017
Finished 2012_08 documents so far 17:50:14
Processing 2014_02 documents so far 17:50:14
9066
Finished 2014_02 documents so far 17:50:19
Processing 2013_12 documents so far 17:50:19
14470
Finished 2013_12 documents so far 17:50:27
Processing 2016_02 documents so far 17:50:27
6386
Finished 2016_02 documents so far 17:50:32
Processing 2013_03 documents so far 17:50:32
17523
Finished 2013_03 documents so far 17:50:40
Processing 2014_05 documents so far 17:50:40
13677
Finished 2014_05 documents so far 17:50:48
Processing 2016_06 documents so far 17:50:48
11344
Finished 2016_06 documents so far 17:50:55
Processing 2014_11 documents so far 17:50:55
12632
Finished 2014_11 documents so far 17:51:01
Processing 2012_04 documents so far 17:51:01
37400
Finished 2012_04 documents so far 17:51:16
Processing 2013_02 documents so far 17:51:16
12740
Finished 2013_02 documents so far 17:51:23
Processing 2014_04 documents so far 17:51:23
16972
Finished 2014_04 documents so far 17:51:31
Processing 2013_09 documents so far 17:51:31
1043
Finished 2013_09 documents so far 17:51:34
CPU times: user 3min 37s, sys: 11.8 s, total: 3min 49s
Wall time: 6min 38s
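
The windows above come out in arbitrary order because a plain dict has no defined iteration order on this Python 3.5 kernel. The per-window .gz files are unaffected, but if chronological processing is preferred the loop header only needs sorting:

In [ ]:
for window in sorted(corpus):   # iterate the time windows in chronological order
    ...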

In [ ]:
length = [76147, 105437, 166768, 97922, 202805, 184538, 135195, 150944, 148744, 87716, 32146, 36870]

print(sum(length) / float(len(length)))

In [ ]: