This notebook shows how often a single word is used over time in a single mailing list.


In [2]:
%matplotlib inline

In [4]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re

In [5]:
# Mailing-list archive URLs to analyse. Only ipython-dev is enabled; the other
# scipy.org lists below can be added to the list if wanted:
#   "http://mail.scipy.org/pipermail/ipython-user/"
#   "http://mail.scipy.org/pipermail/scipy-dev/"
#   "http://mail.scipy.org/pipermail/scipy-user/"
#   "http://mail.scipy.org/pipermail/numpy-discussion/"
urls = [
    "http://mail.scipy.org/pipermail/ipython-dev/",
]

# Build one Archive per URL, in the same order as `urls`.
archives = []
for url in urls:
    archives.append(Archive(url, archive_dir="../archives"))


No data found at http://mail.scipy.org/pipermail/ipython-dev/. Attempting to collect data from URL.
This could take a while.
'Getting archive page for ipython-dev'
['2016-August.txt.gz',
 '2016-July.txt.gz',
 '2016-June.txt.gz',
 '2016-May.txt.gz',
 '2016-April.txt.gz',
 '2016-March.txt.gz',
 '2016-February.txt.gz',
 '2016-January.txt.gz',
 '2015-December.txt.gz',
 '2015-November.txt.gz',
 '2015-October.txt.gz',
 '2015-September.txt.gz',
 '2015-August.txt.gz',
 '2015-July.txt.gz',
 '2015-June.txt.gz',
 '2015-May.txt.gz',
 '2015-April.txt.gz',
 '2015-March.txt.gz',
 '2015-February.txt.gz',
 '2015-January.txt.gz',
 '2014-December.txt.gz',
 '2014-November.txt.gz',
 '2014-October.txt.gz',
 '2014-September.txt.gz',
 '2014-August.txt.gz',
 '2014-July.txt.gz',
 '2014-June.txt.gz',
 '2014-May.txt.gz',
 '2014-April.txt.gz',
 '2014-March.txt.gz',
 '2014-February.txt.gz',
 '2014-January.txt.gz',
 '2013-December.txt.gz',
 '2013-November.txt.gz',
 '2013-October.txt.gz',
 '2013-September.txt.gz',
 '2013-August.txt.gz',
 '2013-July.txt.gz',
 '2013-June.txt.gz',
 '2013-May.txt.gz',
 '2013-April.txt.gz',
 '2013-March.txt.gz',
 '2013-February.txt.gz',
 '2013-January.txt.gz',
 '2012-December.txt.gz',
 '2012-November.txt.gz',
 '2012-October.txt.gz',
 '2012-September.txt.gz',
 '2012-August.txt.gz',
 '2012-July.txt.gz',
 '2012-June.txt.gz',
 '2012-May.txt.gz',
 '2012-April.txt.gz',
 '2012-March.txt.gz',
 '2012-February.txt.gz',
 '2012-January.txt.gz',
 '2011-December.txt.gz',
 '2011-November.txt.gz',
 '2011-October.txt.gz',
 '2011-September.txt.gz',
 '2011-August.txt.gz',
 '2011-July.txt.gz',
 '2011-June.txt.gz',
 '2011-May.txt.gz',
 '2011-April.txt.gz',
 '2011-March.txt.gz',
 '2011-February.txt.gz',
 '2011-January.txt.gz',
 '2010-December.txt.gz',
 '2010-November.txt.gz',
 '2010-October.txt.gz',
 '2010-September.txt.gz',
 '2010-August.txt.gz',
 '2010-July.txt.gz',
 '2010-June.txt.gz',
 '2010-May.txt.gz',
 '2010-April.txt.gz',
 '2010-March.txt.gz',
 '2010-February.txt.gz',
 '2010-January.txt.gz',
 '2009-December.txt.gz',
 '2009-November.txt.gz',
 '2009-October.txt.gz',
 '2009-September.txt.gz',
 '2009-August.txt.gz',
 '2009-July.txt.gz',
 '2009-June.txt.gz',
 '2009-May.txt.gz',
 '2009-April.txt.gz',
 '2009-March.txt.gz',
 '2009-February.txt.gz',
 '2009-January.txt.gz',
 '2008-December.txt.gz',
 '2008-November.txt.gz',
 '2008-October.txt.gz',
 '2008-September.txt.gz',
 '2008-August.txt.gz',
 '2008-July.txt.gz',
 '2008-June.txt.gz',
 '2008-May.txt.gz',
 '2008-April.txt.gz',
 '2008-March.txt.gz',
 '2008-February.txt.gz',
 '2008-January.txt.gz',
 '2007-December.txt.gz',
 '2007-November.txt.gz',
 '2007-October.txt.gz',
 '2007-September.txt.gz',
 '2007-August.txt.gz',
 '2007-July.txt.gz',
 '2007-June.txt.gz',
 '2007-May.txt.gz',
 '2007-April.txt.gz',
 '2007-March.txt.gz',
 '2007-February.txt.gz',
 '2007-January.txt.gz',
 '2006-December.txt.gz',
 '2006-November.txt.gz',
 '2006-October.txt.gz',
 '2006-September.txt.gz',
 '2006-August.txt.gz',
 '2006-July.txt.gz',
 '2006-June.txt.gz',
 '2006-May.txt.gz',
 '2006-April.txt.gz',
 '2006-March.txt.gz',
 '2006-February.txt.gz',
 '2006-January.txt.gz',
 '2005-December.txt.gz',
 '2005-November.txt.gz',
 '2005-October.txt.gz',
 '2005-September.txt.gz',
 '2005-August.txt.gz',
 '2005-July.txt.gz',
 '2005-June.txt.gz',
 '2005-May.txt.gz',
 '2005-April.txt.gz',
 '2005-March.txt.gz',
 '2005-February.txt.gz',
 '2005-January.txt.gz',
 '2004-December.txt.gz',
 '2004-November.txt.gz',
 '2004-October.txt.gz',
 '2004-September.txt.gz',
 '2004-August.txt.gz',
 '2004-July.txt.gz',
 '2004-June.txt.gz',
 '2004-May.txt.gz',
 '2004-April.txt.gz',
 '2004-March.txt.gz',
 '2004-February.txt.gz',
 '2004-January.txt.gz',
 '2003-December.txt.gz',
 '2003-November.txt.gz',
 '2003-October.txt.gz',
 '2003-September.txt.gz',
 '2003-August.txt.gz',
 '2003-July.txt.gz',
 '2003-June.txt.gz',
 '2003-May.txt.gz',
 '2003-April.txt.gz']
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-August.txt.gz'
200 - writing file to archives/ipython-dev/2016-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-July.txt.gz'
200 - writing file to archives/ipython-dev/2016-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-June.txt.gz'
200 - writing file to archives/ipython-dev/2016-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-May.txt.gz'
200 - writing file to archives/ipython-dev/2016-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-April.txt.gz'
200 - writing file to archives/ipython-dev/2016-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-March.txt.gz'
200 - writing file to archives/ipython-dev/2016-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-February.txt.gz'
200 - writing file to archives/ipython-dev/2016-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2016-January.txt.gz'
200 - writing file to archives/ipython-dev/2016-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-December.txt.gz'
200 - writing file to archives/ipython-dev/2015-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-November.txt.gz'
200 - writing file to archives/ipython-dev/2015-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-October.txt.gz'
200 - writing file to archives/ipython-dev/2015-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-September.txt.gz'
200 - writing file to archives/ipython-dev/2015-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-August.txt.gz'
200 - writing file to archives/ipython-dev/2015-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-July.txt.gz'
200 - writing file to archives/ipython-dev/2015-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-June.txt.gz'
200 - writing file to archives/ipython-dev/2015-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-May.txt.gz'
200 - writing file to archives/ipython-dev/2015-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-April.txt.gz'
200 - writing file to archives/ipython-dev/2015-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-March.txt.gz'
200 - writing file to archives/ipython-dev/2015-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-February.txt.gz'
200 - writing file to archives/ipython-dev/2015-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2015-January.txt.gz'
200 - writing file to archives/ipython-dev/2015-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-December.txt.gz'
200 - writing file to archives/ipython-dev/2014-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-November.txt.gz'
200 - writing file to archives/ipython-dev/2014-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-October.txt.gz'
200 - writing file to archives/ipython-dev/2014-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-September.txt.gz'
200 - writing file to archives/ipython-dev/2014-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-August.txt.gz'
200 - writing file to archives/ipython-dev/2014-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-July.txt.gz'
200 - writing file to archives/ipython-dev/2014-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-June.txt.gz'
200 - writing file to archives/ipython-dev/2014-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-May.txt.gz'
200 - writing file to archives/ipython-dev/2014-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-April.txt.gz'
200 - writing file to archives/ipython-dev/2014-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-March.txt.gz'
200 - writing file to archives/ipython-dev/2014-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-February.txt.gz'
200 - writing file to archives/ipython-dev/2014-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2014-January.txt.gz'
200 - writing file to archives/ipython-dev/2014-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-December.txt.gz'
200 - writing file to archives/ipython-dev/2013-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-November.txt.gz'
200 - writing file to archives/ipython-dev/2013-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-October.txt.gz'
200 - writing file to archives/ipython-dev/2013-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-September.txt.gz'
200 - writing file to archives/ipython-dev/2013-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-August.txt.gz'
200 - writing file to archives/ipython-dev/2013-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-July.txt.gz'
200 - writing file to archives/ipython-dev/2013-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-June.txt.gz'
200 - writing file to archives/ipython-dev/2013-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-May.txt.gz'
200 - writing file to archives/ipython-dev/2013-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-April.txt.gz'
200 - writing file to archives/ipython-dev/2013-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-March.txt.gz'
200 - writing file to archives/ipython-dev/2013-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-February.txt.gz'
200 - writing file to archives/ipython-dev/2013-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2013-January.txt.gz'
200 - writing file to archives/ipython-dev/2013-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-December.txt.gz'
200 - writing file to archives/ipython-dev/2012-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-November.txt.gz'
200 - writing file to archives/ipython-dev/2012-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-October.txt.gz'
200 - writing file to archives/ipython-dev/2012-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-September.txt.gz'
200 - writing file to archives/ipython-dev/2012-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-August.txt.gz'
200 - writing file to archives/ipython-dev/2012-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-July.txt.gz'
200 - writing file to archives/ipython-dev/2012-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-June.txt.gz'
200 - writing file to archives/ipython-dev/2012-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-May.txt.gz'
200 - writing file to archives/ipython-dev/2012-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-April.txt.gz'
200 - writing file to archives/ipython-dev/2012-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-March.txt.gz'
200 - writing file to archives/ipython-dev/2012-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-February.txt.gz'
200 - writing file to archives/ipython-dev/2012-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2012-January.txt.gz'
200 - writing file to archives/ipython-dev/2012-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-December.txt.gz'
200 - writing file to archives/ipython-dev/2011-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-November.txt.gz'
200 - writing file to archives/ipython-dev/2011-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-October.txt.gz'
200 - writing file to archives/ipython-dev/2011-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-September.txt.gz'
200 - writing file to archives/ipython-dev/2011-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-August.txt.gz'
200 - writing file to archives/ipython-dev/2011-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-July.txt.gz'
200 - writing file to archives/ipython-dev/2011-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-June.txt.gz'
200 - writing file to archives/ipython-dev/2011-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-May.txt.gz'
200 - writing file to archives/ipython-dev/2011-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-April.txt.gz'
200 - writing file to archives/ipython-dev/2011-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-March.txt.gz'
200 - writing file to archives/ipython-dev/2011-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-February.txt.gz'
200 - writing file to archives/ipython-dev/2011-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2011-January.txt.gz'
200 - writing file to archives/ipython-dev/2011-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-December.txt.gz'
200 - writing file to archives/ipython-dev/2010-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-November.txt.gz'
200 - writing file to archives/ipython-dev/2010-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-October.txt.gz'
200 - writing file to archives/ipython-dev/2010-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-September.txt.gz'
200 - writing file to archives/ipython-dev/2010-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-August.txt.gz'
200 - writing file to archives/ipython-dev/2010-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-July.txt.gz'
200 - writing file to archives/ipython-dev/2010-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-June.txt.gz'
200 - writing file to archives/ipython-dev/2010-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-May.txt.gz'
200 - writing file to archives/ipython-dev/2010-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-April.txt.gz'
200 - writing file to archives/ipython-dev/2010-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-March.txt.gz'
200 - writing file to archives/ipython-dev/2010-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-February.txt.gz'
200 - writing file to archives/ipython-dev/2010-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2010-January.txt.gz'
200 - writing file to archives/ipython-dev/2010-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-December.txt.gz'
200 - writing file to archives/ipython-dev/2009-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-November.txt.gz'
200 - writing file to archives/ipython-dev/2009-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-October.txt.gz'
200 - writing file to archives/ipython-dev/2009-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-September.txt.gz'
200 - writing file to archives/ipython-dev/2009-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-August.txt.gz'
200 - writing file to archives/ipython-dev/2009-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-July.txt.gz'
200 - writing file to archives/ipython-dev/2009-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-June.txt.gz'
200 - writing file to archives/ipython-dev/2009-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-May.txt.gz'
200 - writing file to archives/ipython-dev/2009-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-April.txt.gz'
200 - writing file to archives/ipython-dev/2009-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-March.txt.gz'
200 - writing file to archives/ipython-dev/2009-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-February.txt.gz'
200 - writing file to archives/ipython-dev/2009-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2009-January.txt.gz'
200 - writing file to archives/ipython-dev/2009-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-December.txt.gz'
200 - writing file to archives/ipython-dev/2008-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-November.txt.gz'
200 - writing file to archives/ipython-dev/2008-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-October.txt.gz'
200 - writing file to archives/ipython-dev/2008-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-September.txt.gz'
200 - writing file to archives/ipython-dev/2008-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-August.txt.gz'
200 - writing file to archives/ipython-dev/2008-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-July.txt.gz'
200 - writing file to archives/ipython-dev/2008-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-June.txt.gz'
200 - writing file to archives/ipython-dev/2008-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-May.txt.gz'
200 - writing file to archives/ipython-dev/2008-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-April.txt.gz'
200 - writing file to archives/ipython-dev/2008-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-March.txt.gz'
200 - writing file to archives/ipython-dev/2008-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-February.txt.gz'
200 - writing file to archives/ipython-dev/2008-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2008-January.txt.gz'
200 - writing file to archives/ipython-dev/2008-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-December.txt.gz'
200 - writing file to archives/ipython-dev/2007-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-November.txt.gz'
200 - writing file to archives/ipython-dev/2007-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-October.txt.gz'
200 - writing file to archives/ipython-dev/2007-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-September.txt.gz'
200 - writing file to archives/ipython-dev/2007-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-August.txt.gz'
200 - writing file to archives/ipython-dev/2007-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-July.txt.gz'
200 - writing file to archives/ipython-dev/2007-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-June.txt.gz'
200 - writing file to archives/ipython-dev/2007-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-May.txt.gz'
200 - writing file to archives/ipython-dev/2007-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-April.txt.gz'
200 - writing file to archives/ipython-dev/2007-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-March.txt.gz'
200 - writing file to archives/ipython-dev/2007-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-February.txt.gz'
200 - writing file to archives/ipython-dev/2007-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2007-January.txt.gz'
200 - writing file to archives/ipython-dev/2007-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-December.txt.gz'
200 - writing file to archives/ipython-dev/2006-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-November.txt.gz'
200 - writing file to archives/ipython-dev/2006-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-October.txt.gz'
200 - writing file to archives/ipython-dev/2006-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-September.txt.gz'
200 - writing file to archives/ipython-dev/2006-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-August.txt.gz'
200 - writing file to archives/ipython-dev/2006-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-July.txt.gz'
200 - writing file to archives/ipython-dev/2006-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-June.txt.gz'
200 - writing file to archives/ipython-dev/2006-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-May.txt.gz'
200 - writing file to archives/ipython-dev/2006-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-April.txt.gz'
200 - writing file to archives/ipython-dev/2006-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-March.txt.gz'
200 - writing file to archives/ipython-dev/2006-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-February.txt.gz'
200 - writing file to archives/ipython-dev/2006-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2006-January.txt.gz'
200 - writing file to archives/ipython-dev/2006-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-December.txt.gz'
200 - writing file to archives/ipython-dev/2005-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-November.txt.gz'
200 - writing file to archives/ipython-dev/2005-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-October.txt.gz'
200 - writing file to archives/ipython-dev/2005-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-September.txt.gz'
200 - writing file to archives/ipython-dev/2005-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-August.txt.gz'
200 - writing file to archives/ipython-dev/2005-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-July.txt.gz'
200 - writing file to archives/ipython-dev/2005-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-June.txt.gz'
200 - writing file to archives/ipython-dev/2005-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-May.txt.gz'
200 - writing file to archives/ipython-dev/2005-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-April.txt.gz'
200 - writing file to archives/ipython-dev/2005-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-March.txt.gz'
200 - writing file to archives/ipython-dev/2005-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-February.txt.gz'
200 - writing file to archives/ipython-dev/2005-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2005-January.txt.gz'
200 - writing file to archives/ipython-dev/2005-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-December.txt.gz'
200 - writing file to archives/ipython-dev/2004-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-November.txt.gz'
200 - writing file to archives/ipython-dev/2004-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-October.txt.gz'
200 - writing file to archives/ipython-dev/2004-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-September.txt.gz'
200 - writing file to archives/ipython-dev/2004-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-August.txt.gz'
200 - writing file to archives/ipython-dev/2004-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-July.txt.gz'
200 - writing file to archives/ipython-dev/2004-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-June.txt.gz'
200 - writing file to archives/ipython-dev/2004-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-May.txt.gz'
200 - writing file to archives/ipython-dev/2004-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-April.txt.gz'
200 - writing file to archives/ipython-dev/2004-April.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-March.txt.gz'
200 - writing file to archives/ipython-dev/2004-March.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-February.txt.gz'
200 - writing file to archives/ipython-dev/2004-February.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2004-January.txt.gz'
200 - writing file to archives/ipython-dev/2004-January.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-December.txt.gz'
200 - writing file to archives/ipython-dev/2003-December.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-November.txt.gz'
200 - writing file to archives/ipython-dev/2003-November.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-October.txt.gz'
200 - writing file to archives/ipython-dev/2003-October.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-September.txt.gz'
200 - writing file to archives/ipython-dev/2003-September.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-August.txt.gz'
200 - writing file to archives/ipython-dev/2003-August.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-July.txt.gz'
200 - writing file to archives/ipython-dev/2003-July.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-June.txt.gz'
200 - writing file to archives/ipython-dev/2003-June.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-May.txt.gz'
200 - writing file to archives/ipython-dev/2003-May.txt.gz
'retrieving http://mail.scipy.org/pipermail/ipython-dev/2003-April.txt.gz'
200 - writing file to archives/ipython-dev/2003-April.txt.gz
unzipping 161 archive files
Opening 161 archive files
/Users/nick/code/mailing-list-analysis/bigbang/bigbang/archive.py:74: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  self.data.sort(columns='Date', inplace=True)

In [6]:
# Word whose usage is tracked. Message tokens are lower-cased and stemmed before
# being compared with it, so use lower case and a form the stemmer leaves unchanged.
checkword = "python"

You'll need to download some resources for NLTK (the natural language toolkit) in order to do the kind of processing we want on all the mailing list text. In particular, for this notebook you'll need punkt, the Punkt Tokenizer Models.

To download, from an interactive Python shell, run:

import nltk
nltk.download()

And in the graphical UI that appears, choose "punkt" from the All Packages tab and click Download. Alternatively, run nltk.download('punkt') to fetch it without the graphical UI.


In [8]:
# Count how often `checkword` occurs in every message of the first archive and
# keep one record per message that mentions it at least once.
#
# Records are collected in a list and turned into a DataFrame in one step;
# appending to a DataFrame row by row rebuilds the frame on every iteration.
result_columns = ["MessageId", "Date", "From", "In-Reply-To", "Count"]
records = []
for message_id, message in archives[0].data.iterrows():
    body = message["Body"]
    if pd.isnull(body):
        # A missing body (None/NaN) cannot be tokenized: report it and move on.
        print('!!! Detected an email with an empty Body field...')
        continue

    # Drop apostrophes, turn every other non-word character into a space and
    # lower-case the text so tokens can be compared with the lower-case checkword.
    text = re.sub(r'[^\w]', ' ', body.replace("'", "")).lower()
    tokens = nltk.tokenize.word_tokenize(text)

    count = 0
    for token in tokens:
        try:
            stem = st.stem(token)
        except Exception:
            # Best effort: show the token that could not be stemmed and skip it,
            # instead of re-testing the stem left over from the previous token.
            print(token)
            continue
        # The stemmed token is compared with checkword as given (not stemmed).
        if stem == checkword:
            count += 1

    if count:
        records.append({
            "MessageId": message_id,
            "Date": message["Date"],
            "From": message["From"],
            "In-Reply-To": message["In-Reply-To"],
            "Count": count,  # kept as an integer
        })

# Passing `columns` keeps the expected column order, also when nothing matched.
df = pd.DataFrame(records, columns=result_columns)

In [9]:
df.head(5)  # first five messages that mention the check word


Out[9]:
MessageId Date From In-Reply-To Count
0 <3E9E4094.7030802@colorado.edu> 2003-04-17 05:50:12 fperez@colorado.edu (Fernando Perez) <003d01c28a9a$3dcb8560$e301340a@cyberhigh.fcoe... 2.0
1 <3E9E4094.7030802@colorado.edu> 2003-04-17 05:50:12 fperez at colorado.edu (Fernando Perez) <003d01c28a9a$3dcb8560$e301340a@cyberhigh.fcoe... 2.0
2 <000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe... 2003-04-17 14:32:56 cdodt@fcoe.k12.ca.us (Cory Dodt) <3E9E4094.7030802@colorado.edu> 3.0
3 <000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe... 2003-04-17 14:32:56 cdodt at fcoe.k12.ca.us (Cory Dodt) <3E9E4094.7030802@colorado.edu> 3.0
4 <3E9EC1CA.3060800@colorado.edu> 2003-04-17 15:01:30 fperez@colorado.edu (Fernando Perez) <000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe... 6.0

Group the dataframe by year and month, and sum the counts for the checkword within each month to get a quick plot of how frequently that word has been used over time.


In [22]:
# Sum the per-message counts for every (year, month) pair, then plot the totals.
year_and_month = [df.Date.dt.year, df.Date.dt.month]
monthly_counts = df.groupby(year_and_month).agg({'Count':np.sum})
monthly_counts.plot(y='Count')


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ea834d0>