Corr between centrality and community 0.1


An IPython notebook that explores the relationship(correlation) between betweenness centrality and community membership of a number of mailing-lists in a given time period.


In [1]:
%matplotlib inline

In [2]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
from itertools import repeat

In [3]:
urls = ["http://mail.scipy.org/pipermail/ipython-dev/",
        "http://mail.scipy.org/pipermail/ipython-user/",
        "http://mail.scipy.org/pipermail/scipy-dev/",
        "http://mail.scipy.org/pipermail/scipy-user/",
        "http://mail.scipy.org/pipermail/numpy-discussion/"]


archives= [Archive(url,archive_dir="../archives") for url in urls]


Opening 138 archive files
Opening 139 archive files
Opening 160 archive files
Opening 159 archive files
Date parsing error on: 
pon, 4 stu 2002 16:22:52
Opening 177 archive files
Date parsing error on: 
Wed, 01 Nov 2006 15:46:73 +0800
Date parsing error on: 
Wed, 01 Nov 2006 15:46:73 +0800

The following sets start month and end month, both inclusive.


In [5]:
date_from_whole = [2010,1] #Include June(Start month)
date_to_whole = [2012,12] #Include December(End month)
total_month = (date_to_whole[0] - date_from_whole[0])*12 + (date_to_whole[1]-date_from_whole[1]+1)

In [6]:
date_from = []
date_to = []
temp_year = date_from_whole[0]
temp_month = date_from_whole[1]

for i in range(total_month):
    date_from.append(pd.datetime(temp_year,temp_month,1,tzinfo=pytz.utc))
    if temp_month == 12:
        temp_year += 1
        temp_month = 0
    date_to.append(pd.datetime(temp_year,temp_month+1,1,tzinfo=pytz.utc))
    temp_month += 1

In [7]:
def filter_by_date(df,d_from,d_to):
    return df[(df['Date'] > d_from) & (df['Date'] < d_to)]

In [8]:
IG = []
for k in range(total_month):
    dfs = [filter_by_date(arx.data,
                      date_from[k],
                      date_to[k]) for arx in archives]
    bdf = pd.concat(dfs)
    IG.append(graph.messages_to_interaction_graph(bdf))

#RG = graph.messages_to_reply_graph(messages)

#IG = graph.messages_to_interaction_graph(bdf)

In [9]:
bc = []
for j in range(total_month):
    bc.append(pd.Series(nx.betweenness_centrality(IG[j])))

In [10]:
len(bc)


Out[10]:
36

new_dict is a dictionary with keys as users' names, and values of their community membership(can have different interpretation) Here the community membership for a user is defined as sum of log(Ni + 1), with Ni corresponds to the number of emails a user sent to Mailing list i.


In [31]:
new_dict = [{} for i in repeat(None, total_month)]
new_dict1 = [{} for i in repeat(None, total_month)]
for t in range(total_month):
    filtered_activity = []
    for i in range(5):
        df = archives[i].data
        fdf = filter_by_date(df,date_from[t],date_to[t])
        filtered_activity.append(Archive(fdf).get_activity().sum())
    for k in range(len(filtered_activity)):
        for g in range(len(filtered_activity[k])):
            original_key = filtered_activity[k].keys()[g]
            new_key = (original_key[original_key.index("(") + 1:original_key.rindex(")")])
            if new_key not in new_dict[t]:
                new_dict[t][new_key] = 0
                new_dict1[t][new_key] = 0
            new_dict[t][new_key] += math.log(filtered_activity[k].get_values()[g]+1)
            #can define community membership by changing the above line.
            #example, direct sum of emails would be 
            new_dict1[t][new_key] += filtered_activity[k].get_values()[g]

In [47]:
for i in range(len(new_dict1)):
    [x+1 for x in new_dict1[i].values()]
    [np.log(x) for x in new_dict1[i].values()]

In [49]:
#check if there's name difference, return nothing if perfect.
for i in range(total_month):
    set(new_dict[i].keys()).difference(bc[i].index.values)
    set(bc[i].index.values).difference(new_dict[i].keys())
    set(new_dict1[i].keys()).difference(bc[i].index.values)
    set(bc[i].index.values).difference(new_dict1[i].keys())

In [53]:
#A list of corresponding betweenness centrality and community membership for all users, monthly
comparison = []
comparison1 = []
for i in range(len(new_dict)):
    comparison.append(pd.DataFrame([new_dict[i], bc[i]]))
    comparison1.append(pd.DataFrame([new_dict1[i], bc[i]]))

In [54]:
corr = []
corr1 = []
for i in range(len(new_dict)):
    corr.append(np.corrcoef(comparison[i].get_values()[0],comparison[i].get_values()[1])[0,1])
    corr1.append(np.corrcoef(comparison1[i].get_values()[0],comparison1[i].get_values()[1])[0,1])

In [56]:
corr1


Out[56]:
[0.96037411067096634,
 0.9290138698150141,
 0.89425367964646685,
 0.89917617005470662,
 0.92991018321441909,
 0.82147285776944867,
 0.87427839105054217,
 0.90600460345005496,
 0.88271122203671915,
 0.88553587680879164,
 0.85441186827144266,
 0.93719201756052384,
 0.83104612834996139,
 0.774586996595649,
 0.84519537229516306,
 0.85677539808041736,
 0.88870809362997261,
 0.8167283158671691,
 0.90511019084905298,
 0.89887965432552719,
 0.86415159226754346,
 0.87402695037197142,
 0.76063235325679746,
 0.90447740840444568,
 0.91035098143920701,
 0.9221783659392957,
 0.94070862509041964,
 0.95431393892826144,
 0.93629494205905239,
 0.90463638705091931,
 0.86892862334827548,
 0.8758045601354455,
 0.92942914877260052,
 0.92685099851158981,
 0.88117423471033274,
 0.86581332777478748]

In [57]:
#Blue as sum of log, red as log of sum, respect to community membership
x = range(1,total_month+1)
y = corr
plt.plot(x, y, marker='o')
z = corr1
plt.plot(x, z, marker='o', linestyle='--', color='r')


Out[57]:
[<matplotlib.lines.Line2D at 0x111bdbd50>]

In [ ]: