In [1]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re

In [2]:
urls = ["http://mail.python.org/pipermail/ipython-dev/",
        "http://mail.python.org/pipermail/ipython-user/"]#,
        #"http://mail.python.org/pipermail/scipy-dev/",
        #"http://mail.python.org/pipermail/scipy-user/",
        #"http://mail.python.org/pipermail/numpy-discussion/"]


archives= [Archive(url,archive_dir="../archives") for url in urls]


Opening 138 archive files
Opening 139 archive files

In [3]:
act = archives[0].get_activity()
act1 = archives[1].get_activity()


/Users/jiabinchen/Desktop/Research/asd/bigbang/bigbang/archive.py:124: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  mdf2['Date'] = mdf['Date'].apply(lambda x: x.toordinal())

In [4]:
fig = plt.figure(figsize=(12.5, 7.5))

#act.idxmax().order().T.plot()
(act > 0).idxmax().order().plot()

fig.axes[0].yaxis_date()

In [5]:
timeorder = (act > 0).idxmax().order()
timeorder1 = (act1 > 0).idxmax().order()

In [6]:
archives[1].data[:2]


Out[6]:
From Subject Date In-Reply-To References Body
Message-ID
<3271DBB88437ED41A0AB239E6C2554A401117873@ussunm001.palmsource.com> Robin.Siebler@palmsource.com (Robin Siebler) [IPython-user] Crash 2003-03-27 12:27:08-08:00 None None I installed IPython-0.2.15pre3, played with it...
<3271DBB88437ED41A0AB239E6C2554A401117873@ussunm001.palmsource.com> Robin.Siebler at palmsource.com (Robin Siebler) [IPython-user] Crash 2003-03-27 12:27:08-08:00 None None I installed IPython-0.2.15pre3, played with it...

2 rows × 6 columns


In [7]:
for row in archives[0].data[:2].iterrows():
    print(row[1]["Body"])


Hi all,

after a suggestion by Jacek Generowicz, someone (not me) sent in a request for 
indexing the ipython lists at gmane.  I didn't do it but I'm perfectly happy 
with it, so thanks to whoever did it.

For those not familiar with the service, http://gmane.org provides a mailing 
list to news bridge, which allows you to follow the ipython lists with a news 
reader.

Cheers,

Fernando.


Hi all,

after a suggestion by Jacek Generowicz, someone (not me) sent in a request for 
indexing the ipython lists at gmane.  I didn't do it but I'm perfectly happy 
with it, so thanks to whoever did it.

For those not familiar with the service, http://gmane.org provides a mailing 
list to news bridge, which allows you to follow the ipython lists with a news 
reader.

Cheers,

Fernando.




In [8]:
arx = archives[0]

In [9]:
k = pd.DataFrame

In [10]:
first_participation = {}
for row in archives[0].data.iterrows():
    if row[1]["From"] not in first_participation:
        first_participation[row[1]["From"]] = row[1]["Date"]

In [11]:
first_participation1 = {}
for row in archives[1].data.iterrows():
    if row[1]["From"] not in first_participation1:
        first_participation1[row[1]["From"]] = row[1]["Date"]

In [67]:
#First list
wordcount={}
for row in archives[0].data.iterrows():
    w = row[1]["Body"].replace("'", "")
    k = re.sub(r'[^\w]', ' ', w)
    t = nltk.tokenize.word_tokenize(k)
    for g in t:
        try:
            word = st.stem(g)
        except:
            print(g)
            pass
        if word in stopwords.words('english'):
            continue
        if word not in wordcount:
            wordcount[word] = [1]
            wordcount[word].append(row[0])
            wordcount[word].append(row[1]["Date"])
            wordcount[word].append(row[1]["From"])
            wordcount[word].append(row[1]["In-Reply-To"])
        else:
            wordcount[word][0] += 1
wd = wordcount #In case

In [13]:
#Second List
wordcount1={}
for row in archives[1].data.iterrows():
    w = row[1]["Body"].replace("'", "")
    k = re.sub(r'[^\w]', ' ', w)
    t = nltk.tokenize.word_tokenize(k)
    for g in t:
        try:
            word = st.stem(g)
        except:
            print(g)
            pass
        if word in stopwords.words('english'):
            continue
        if word not in wordcount1:
            wordcount1[word] = [1]
            wordcount1[word].append(row[0])
            wordcount1[word].append(row[1]["Date"])
            wordcount1[word].append(row[1]["From"])
            wordcount1[word].append(row[1]["In-Reply-To"])
        else:
            wordcount1[word][0] += 1

In [14]:
#new_df = pd.DataFrame(wordcount.items(),columns=["Word","Others"])

In [15]:
#pd.concat(pd.Series(wordcount.keys()),pd.DataFrame(wordcount.values(),columns=["A","B","C","D","E"]))

In [16]:
#Wordcount information dataframe, with rows as words.
asd = pd.DataFrame(wordcount)
new_dataframe = asd.transpose()
new_dataframe.columns = ["Wordcount", "Message-ID", "Date","From","In-Reply-To"]

In [17]:
#Wordcount information dataframe, with rows as words.
asd1 = pd.DataFrame(wordcount1)
new_dataframe1 = asd1.transpose()
new_dataframe1.columns = ["Wordcount", "Message-ID", "Date","From","In-Reply-To"]

In [18]:
len(wordcount) #Number of unique words in mailing list1


Out[18]:
37146

In [19]:
len(wordcount1) #Number of unique words in mailing list2


Out[19]:
45244

In [20]:
#Number of same unique words in two mailing lists
samewordcount=0
for word in wordcount:
    if word in wordcount1:
        samewordcount += 1
samewordcount


Out[20]:
14688

In [21]:
#Total number of same words that are introduced by same people.
samecount = 0
for word in wordcount:
    if word in wordcount1:
        if wordcount[word][3] == wordcount1[word][3]:
            samecount += 1
samecount


Out[21]:
2984

In [22]:
#Among 100-500 appearance words, the number of common words between two mailing-list.
samewordcount = 0
for word in wordcount:
    if wordcount[word][0] >= 100 and wordcount[word][0] <= 500:
        if word in wordcount1:
            if wordcount1[word][0] >= 100 and wordcount1[word][0] <= 500:
                samewordcount += 1
samewordcount


Out[22]:
808

In [23]:
#Among 100-500 appearance words, the number of common words between two mailing-list that are first
#introduced by same people
same_person_count = 0
for word in wordcount:
    if wordcount[word][0] >= 100 and wordcount[word][0] <= 500:
        if word in wordcount1:
            if wordcount1[word][0] >= 100 and wordcount1[word][0] <= 500:
                if wordcount[word][3] == wordcount1[word][3]:
                    #print word
                    same_person_count += 1
samecount


Out[23]:
2984

In [24]:
#common word list(introduced by different people in different lists)
commonwords = {}
for word in wordcount:
    if wordcount[word][0] >= 100 and wordcount[word][0] <= 500:
        if word in wordcount1:
            if wordcount1[word][0] >= 100 and wordcount1[word][0] <= 500:
                if wordcount[word][3] != wordcount1[word][3]:
                    commonwords[word] = [wordcount[word][0],wordcount[word][3],wordcount[word][2],\
                                         wordcount1[word][0],wordcount1[word][3],wordcount1[word][2]]

In [25]:
len(commonwords)


Out[25]:
669

In [27]:
#Dataframe of information of those words introduced by different people
df1 = pd.DataFrame(commonwords)
commonword_differentauthor_dataframe = df1.transpose()
commonword_differentauthor_dataframe.columns = ["Wordcount1", "From1", "Date1","Wordcount2", "From2", "Date2"]
commonword_differentauthor_dataframe[:10]


Out[27]:
Wordcount1 From1 Date1 Wordcount2 From2 Date2
000 117 cggame at consultant.com (Van Dyke) 2003-08-30 13:44:50+02:00 105 thomashadim99 at netscape.net (Ciwen) 2003-10-19 15:00:32+00:00
0000 102 ralf@brainbot.com (Ralf Schmitt) 2003-09-24 13:05:14+02:00 185 fperez at colorado.edu (Fernando Perez) 2004-04-10 14:45:23+00:00
0200 369 ipython at ml.schieke.net (Jaco Schieke) 2004-08-02 18:19:31+00:00 348 ero at dkbza.org (Ero Carrera) 2004-05-09 08:11:36+00:00
0600 140 fperez@colorado.edu (Fernando Perez) 2003-04-17 12:43:47-06:00 108 gareth at wiked.org (Gareth J. Greenaway) 2003-04-15 17:03:05-07:00
0700 454 gb at cs.unc.edu (Gary Bishop) 2003-12-03 10:00:56+00:00 452 fperez@colorado.edu (Fernando Perez) 2003-05-30 12:45:15-06:00
1000 116 cmoad at indiana.edu (Charles Moad) 2005-02-21 08:47:43+00:00 178 gillet@scripps.edu (Alexandre Gillet) 2003-08-19 17:34:07-07:00
2003 184 cdodt@fcoe.k12.ca.us (Cory Dodt) 2003-04-17 07:32:56-07:00 168 Robin.Siebler at palmsource.com (Robin Siebler) 2003-03-27 13:13:13-08:00
2004 149 gb at cs.unc.edu (Gary Bishop) 2004-02-03 08:50:22+00:00 353 twl at sauria.com (Ted Leung) 2004-01-09 13:14:37+00:00
2005 386 Fernando.Perez at colorado.edu (Fernando Perez) 2005-01-23 18:56:58+00:00 489 jjl at pobox.com (John J Lee) 2004-12-31 10:49:16+00:00
2007 419 vivainio at gmail.com (Ville M. Vainio) 2007-01-17 19:19:08+01:00 442 bthom at cs.hmc.edu (belinda thom) 2007-01-07 21:36:13-08:00

10 rows × 6 columns


In [28]:
commonword_differentauthor_dataframe['Date1'][0] < commonword_differentauthor_dataframe['Date1'][1]


Out[28]:
True

In [29]:
len(commonwords)


Out[29]:
669

In [30]:
# The list of words that have potential of idea flows. Definition: A is introduced by p in list1 first, then q saw it and then 
# introduced the word A to list 2, vice versa. We defined q saw as q said sth in list1 before p poped out the word. 
# Total list of such word A. 
time_influence = 0
influnce_list = {}
for word in commonwords:
    if commonwords[word][2] > commonwords[word][5]: #Author2 comes first
        if commonwords[word][1] in first_participation1: #Check if author1 in list2
            if first_participation1[commonwords[word][1]] < commonwords[word][5]: #Check if author1\
                #in list2 and exists before the word first introduced in list2
                influnce_list[word] = commonwords[word]
                time_influence += 1
    else: #Author1 comes first
        if commonwords[word][4] in first_participation:
            if first_participation[commonwords[word][4]] < commonwords[word][2]:
                influnce_list[word] = commonwords[word]
                time_influence += 1

In [31]:
time_influence


Out[31]:
235

In [32]:
len(list(influnce_list.keys()))


Out[32]:
235

In [34]:
df2 = pd.DataFrame(influnce_list)
influnce_list_dataframe = df2.transpose()
influnce_list_dataframe.columns = ["Wordcount1", "From1", "Date1","Wordcount2", "From2", "Date2"]
influnce_list_dataframe[:20]


Out[34]:
Wordcount1 From1 Date1 Wordcount2 From2 Date2
0000 102 ralf@brainbot.com (Ralf Schmitt) 2003-09-24 13:05:14+02:00 185 fperez at colorado.edu (Fernando Perez) 2004-04-10 14:45:23+00:00
0600 140 fperez@colorado.edu (Fernando Perez) 2003-04-17 12:43:47-06:00 108 gareth at wiked.org (Gareth J. Greenaway) 2003-04-15 17:03:05-07:00
0700 454 gb at cs.unc.edu (Gary Bishop) 2003-12-03 10:00:56+00:00 452 fperez@colorado.edu (Fernando Perez) 2003-05-30 12:45:15-06:00
2004 149 gb at cs.unc.edu (Gary Bishop) 2004-02-03 08:50:22+00:00 353 twl at sauria.com (Ted Leung) 2004-01-09 13:14:37+00:00
2005 386 Fernando.Perez at colorado.edu (Fernando Perez) 2005-01-23 18:56:58+00:00 489 jjl at pobox.com (John J Lee) 2004-12-31 10:49:16+00:00
2007 419 vivainio at gmail.com (Ville M. Vainio) 2007-01-17 19:19:08+01:00 442 bthom at cs.hmc.edu (belinda thom) 2007-01-07 21:36:13-08:00
404 210 Fernando.Perez at colorado.edu (Fernando Perez) 2006-01-16 11:29:09-07:00 147 mantegazza at ill.fr (=?iso-8859-15?q?Fr=E9d=E... 2005-04-06 03:12:39+00:00
43 402 gb@cs.unc.edu (Gary Bishop) 2003-05-24 08:46:18+00:00 318 fperez@colorado.edu (Fernando Perez) 2003-05-30 12:45:15-06:00
47 297 fperez@colorado.edu (Fernando Perez) 2003-04-17 12:43:47-06:00 297 fperez at colorado.edu (Fernando Perez) 2003-04-17 12:43:47-06:00
52 351 ralf at brainbot.com (Ralf Schmitt) 2003-09-22 16:30:47+02:00 305 fperez at colorado.edu (Fernando Perez) 2004-04-10 14:45:23+00:00
58 357 gb at cs.unc.edu (Gary Bishop) 2003-05-24 16:33:59+00:00 363 fperez@colorado.edu (Fernando Perez) 2003-05-30 12:45:15-06:00
_ip 193 vivainio at gmail.com (Ville Vainio) 2006-01-30 23:18:54+02:00 220 oliphant.travis at ieee.org (Travis E. Oliphant) 2006-01-24 16:11:40-07:00
ac 264 prabhu at aero.iitm.ernet.in (Prabhu Ramachand... 2004-09-29 12:37:41+00:00 218 jhsh@sun.ac.za (Jannie Hofmeyr) 2003-08-29 11:34:13+02:00
acceiv 439 fperez at colorado.edu (Fernando Perez) 2003-05-13 13:13:27-06:00 373 gb@cs.unc.edu (Gary Bishop) 2003-05-13 15:11:30+00:00
across 409 fperez@colorado.edu (Fernando Perez) 2003-05-20 16:12:53-06:00 343 fperez at colorado.edu (Fernando Perez) 2003-05-20 16:12:53-06:00
admin 177 fperez at colorado.edu (Fernando Perez) 2003-10-07 10:35:56+00:00 262 tony at tcapp.com (Tony Cappellini) 2003-04-14 21:56:14-07:00
affect 296 pythondev-dang at lazytwinacres.net (Daniel 'D... 2004-06-24 14:18:14+00:00 185 Fernando.Perez at colorado.edu (Fernando Perez) 2004-08-13 11:05:14+00:00
afraid 227 fperez at colorado.edu (Fernando Perez) 2003-09-30 15:53:03+00:00 289 fperez@colorado.edu (Fernando Perez) 2003-06-30 17:57:40-06:00
alex 103 fperez at colorado.edu (Fernando Perez) 2003-04-17 13:12:21-06:00 418 fperez@colorado.edu (Fernando Perez) 2003-04-23 10:42:42-06:00
alph 114 fperez@colorado.edu (Fernando Perez) 2003-05-20 16:12:53-06:00 103 fperez at colorado.edu (Fernando Perez) 2003-05-20 16:12:53-06:00

20 rows × 6 columns


In [35]:
influence_words = list(influnce_list.keys())

In [36]:
#reduce the words that only contain numbers (lack of information)
reduced_influence_words = [] 
for word in influence_words:
    if word.isdigit() == False:
        reduced_influence_words.append(word)

In [37]:
len(reduced_influence_words)


Out[37]:
224

In [38]:
reduced_influence_words[:20]


Out[38]:
['lsf',
 'osx',
 'construct',
 'mom',
 'foolscap',
 'lack',
 'gz',
 'tcp',
 'subprocess',
 'catch',
 'prefix',
 'sleep',
 'decl',
 u'query',
 'perfect',
 'cython',
 'busy',
 'ver',
 u'temp',
 'autocal']

In [39]:
#Store the list
import csv
with open('test123.csv', 'w') as fp:
    a = csv.writer(fp)
    data = [reduced_influence_words]
    a.writerows(data)

#reduced_influence_words.to_csv()

End of main contents of this notebook, below are some analysis of unique word lists


In [40]:
list(influnce_list_dataframe.keys())


Out[40]:
Index([u'Wordcount1', u'From1', u'Date1', u'Wordcount2', u'From2', u'Date2'], dtype='object')

In [68]:
for key,value in list(wd.items()):
    if value <= 100 or value >= 500:
        del wd[key]

In [69]:
wc_array = np.array(list(wd.values()))

In [70]:
wc_array.sort()

In [72]:
len(wordcount)


Out[72]:
0

In [66]:
#List1's unique words and their count, power law distribution
%matplotlib inline
plt.plot(wcsort_array[:,0])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-66-518ff4ca5f3c> in <module>()
      1 #List1's unique words and their count, power law distribution
      2 get_ipython().magic(u'matplotlib inline')
----> 3 plt.plot(wcsort_array[:,0])

NameError: name 'wcsort_array' is not defined

In [12]:
t = nltk.tokenize.word_tokenize(p)

In [11]:
len(nltk.corpus.stopwords.words('english'))


Out[11]:
127

In [20]:
a = []
for i in t:
    a.append(st.stem(i))

In [ ]: