In [2]:
%matplotlib inline
from bigbang.archive import Archive
from bigbang.thread import Thread
from bigbang.thread import Node
import matplotlib.pyplot as plt
import datetime
First, collect data from a public email archive.
In [3]:
# Source: public pipermail archive of the Wikimedia Analytics mailing list.
url = "https://lists.wikimedia.org/pipermail/analytics/"
# assumes ../archives exists relative to this notebook's working dir — TODO confirm
arx = Archive(url,archive_dir="../archives")
We can count the number of threads in the archive easily. The first time you run Archive.get_threads()
it may take some time to compute, but the result is cached in the Archive object.
In [4]:
# Number of threads in the archive (computed on first call, then cached
# inside the Archive object). Dead commented-out assignment removed.
len(arx.get_threads())
Out[4]:
We can plot a histogram of the number of messages in each thread. In most cases this will be a power law distribution.
In [5]:
# Distribution of thread sizes: number of messages per thread.
y = [thread.get_num_messages() for thread in arx.get_threads()]
plt.hist(y, bins=30)
plt.xlabel('number of messages in a thread')
plt.show()
We can also plot the number of people participating in each thread. Here, the participants are differentiated by the From: header on the emails they've sent.
In [6]:
# Distribution of participant counts per thread (distinct From: headers).
n = [t.get_num_people() for t in arx.get_threads()]
plt.hist(n, bins=20)
# Label fixed: "email-address" was ungrammatical in the rendered figure.
plt.xlabel('number of email addresses in a thread')
plt.show()
The duration of a thread is the amount of elapsed time between its first and last message.
In [7]:
# Thread duration in whole days (time between first and last message).
y = [t.get_duration().days for t in arx.get_threads()]
plt.hist(y, bins=10)  # bins=(10) was just 10 wrapped in redundant parentheses
plt.xlabel('duration of a thread (days)')
plt.show()
In [8]:
# Thread duration in seconds. Bug fix: timedelta.seconds is only the
# seconds *component* (0–86399) and silently drops whole days, so any
# thread lasting longer than a day was misplotted; total_seconds() gives
# the full elapsed time.
y = [t.get_duration().total_seconds() for t in arx.get_threads()]
plt.hist(y, bins=10)
plt.xlabel('duration of a thread (seconds)')
plt.show()
You can examine the properties of a single thread.
In [9]:
print(arx.get_threads()[0].get_duration())
In [21]:
threads = arx.get_threads
In [22]:
# Body text of the root (first) message of the first thread.
root_message = arx.get_threads()[0].get_root()
content = root_message.data['Body']
content
Out[22]:
In [23]:
len(content.split())
Out[23]:
Suppose we want to know whether or not longer threads (that contain more distinct messages) have more words.
In [24]:
# Partition threads by size; the cutoff was a magic number, now named.
SHORT_THREAD_MAX_MESSAGES = 6  # threads with fewer messages count as "short"
short_threads = []
long_threads = []
for t in arx.get_threads():
    if t.get_num_messages() < SHORT_THREAD_MAX_MESSAGES:
        short_threads.append(t)
    else:
        long_threads.append(t)
In [25]:
# How many threads fell into each group.
for group in (short_threads, long_threads):
    print(len(group))
In [26]:
len(long_threads[0].get_content())
Out[26]:
In [27]:
def _avg_words_per_message(thread):
    """Mean whitespace-delimited word count per message in a thread."""
    contents = thread.get_content()  # fetch once; original called it twice per thread
    return sum(len(body.split()) for body in contents) / len(contents)

# Same computation was duplicated for both groups; factored into a helper.
dist_short = [_avg_words_per_message(t) for t in short_threads]
dist_long = [_avg_words_per_message(t) for t in long_threads]
In [28]:
plt.hist(dist_short, bins=15)
# Axis label added — every other histogram in this notebook is labeled.
plt.xlabel('average words per message (short threads)')
plt.show()
In [29]:
plt.hist(dist_long, bins=15)
# Axis label added — every other histogram in this notebook is labeled.
plt.xlabel('average words per message (long threads)')
plt.show()
In [30]:
# Compare mean average-message-length between the two groups.
mean_short = sum(dist_short) / len(dist_short)
mean_long = sum(dist_long) / len(dist_long)
print(mean_short)
print(mean_long)
In [31]:
%reload_ext autoreload
s_leaves = []
s_notleaves = []
for t in threads:
for node in t.get_leaves():
s_leaves.append(len(node.data['Body'].split()))
for node in t.get_not_leaves():
s_notleaves.append(len(node.data['Body'].split()))
In [20]:
plt.hist(s_leaves, bins=15)
# Axis label added — every other histogram in this notebook is labeled.
plt.xlabel('words in leaf messages')
plt.show()
In [23]:
plt.hist(s_notleaves, bins=15)
# Axis label added — every other histogram in this notebook is labeled.
plt.xlabel('words in non-leaf messages')
plt.show()
In [24]:
# Mean word counts: leaf vs. non-leaf messages.
mean_leaves = sum(s_leaves) / len(s_leaves)
mean_notleaves = sum(s_notleaves) / len(s_notleaves)
print(mean_leaves)
print(mean_notleaves)
In [25]:
# `import re` removed: re is never used anywhere in this notebook.
# NOTE(review): thread index 85 is a magic number — an arbitrary hand-picked
# example; any thread whose leaf contains quoted reply text would do.
mess = threads[85].get_leaves()[0].data['Body']
In [26]:
print(mess)
In [27]:
mess.split('\n')
message = list()
for l in mess.split('\n'):
n = len(l)
if(len(l)!=0 and l[0] != '>' and l[n-6:n] != 'wrote:'):
message.append(l)
new = str()
for l in message:
new = new + l + '\n'
In [28]:
print(new)
In [29]:
print(EmailReplyParser.parse_reply(mess))
In [ ]:
print(mess)
In [ ]:
threads[85].get_leaves()[0].data