This notebook plots the month-by-month trend in how often a single word appears in a single mailing list.



In [1]:

    
%matplotlib inline



In [2]:

    
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re



In [3]:

    
# Mailing-list archives to analyse. Only ipython-dev is enabled; uncomment
# other entries to load them as well (the cells below read archives[0] only).
urls = [
    "http://mail.scipy.org/pipermail/ipython-dev/",
    # "http://mail.scipy.org/pipermail/ipython-user/",
    # "http://mail.scipy.org/pipermail/scipy-dev/",
    # "http://mail.scipy.org/pipermail/scipy-user/",
    # "http://mail.scipy.org/pipermail/numpy-discussion/",
]

# One Archive per URL, with ../archives passed as the archive directory.
archives = [Archive(list_url, archive_dir="../archives") for list_url in urls]









    



Opening 138 archive files



In [4]:

    
# Word whose frequency is tracked. Message tokens are lowercased and run
# through the Lancaster stemmer before being compared with this value, so it
# must be lowercase and already in its stemmed form.
checkword = "python"



In [5]:

    
# Count how many tokens of each message body stem to `checkword`, and keep one
# record per message that mentions the word at least once.
records = []
for message_id, message in archives[0].data.iterrows():
    # Normalise the body: drop apostrophes, turn every other non-word
    # character into a space, and lowercase the result.
    body = message["Body"].replace("'", "")
    body = re.sub(r'[^\w]', ' ', body).lower()

    count = 0
    for token in nltk.tokenize.word_tokenize(body):
        try:
            stemmed = st.stem(token)
        except Exception:
            # Report the token the stemmer could not handle and skip it.
            # Falling through here would compare the previous token's stem
            # again (or hit an undefined name on the first token).
            print(token)
            continue
        if stemmed == checkword:
            count += 1

    # Messages that never mention the word are left out of the result.
    if count == 0:
        continue

    records.append({
        "MessageId": message_id,
        "Date": message["Date"],
        "From": message["From"],
        "In-Reply-To": message["In-Reply-To"],
        "Count": count,
    })

# Build the frame once from all records instead of appending row by row,
# which is quadratic and relies on the removed DataFrame.append.
df = pd.DataFrame(records, columns=["MessageId", "Date", "From", "In-Reply-To", "Count"])



In [6]:

    
# Preview the first five messages that contain the chosen word, with their counts.
df.head(5)









    Out[6]:






  
    
      
      MessageId
      Date
      From
      In-Reply-To
      Count
    
  
  
    
      0
                         <3E9E4094.7030802@colorado.edu>
       2003-04-16 23:50:12-06:00
          fperez@colorado.edu (Fernando Perez)
       <003d01c28a9a$3dcb8560$e301340a@cyberhigh.fcoe...
       2
    
    
      1
                         <3E9E4094.7030802@colorado.edu>
       2003-04-16 23:50:12-06:00
       fperez at colorado.edu (Fernando Perez)
       <003d01c28a9a$3dcb8560$e301340a@cyberhigh.fcoe...
       2
    
    
      2
       <000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe...
       2003-04-17 07:32:56-07:00
              cdodt@fcoe.k12.ca.us (Cory Dodt)
                         <3E9E4094.7030802@colorado.edu>
       3
    
    
      3
       <000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe...
       2003-04-17 07:32:56-07:00
           cdodt at fcoe.k12.ca.us (Cory Dodt)
                         <3E9E4094.7030802@colorado.edu>
       3
    
    
      4
                         <3E9EC1CA.3060800@colorado.edu>
       2003-04-17 09:01:30-06:00
          fperez@colorado.edu (Fernando Perez)
       <000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe...
       6
    
  

5 rows × 5 columns



In [7]:

    
# Dates of the first and last rows of df, used as the bounds of the time axis.
# NOTE(review): this assumes the rows of df are in chronological order - confirm.
start_date = df["Date"].iloc[0]
end_date = df["Date"].iloc[-1]



In [8]:

    
# Show the date taken from the first row of df.
start_date









    Out[8]:





datetime.datetime(2003, 4, 16, 23, 50, 12, tzinfo=tzoffset(None, -21600))



In [9]:

    
# Show the date taken from the last row of df.
end_date









    Out[9]:





datetime.datetime(2014, 9, 22, 16, 52, 34, tzinfo=tzoffset(None, 7200))



In [10]:

    
# Number of calendar months from start_date's month through end_date's month,
# counting both of those months (hence the + 1).
total_month = (
    12 * (end_date.year - start_date.year)
    + (end_date.month - start_date.month)
    + 1
)



In [11]:

    
# Show the number of months covered by the analysis.
total_month









    Out[11]:





138



In [12]:

    
# Ordered list of (year, month) keys, one per month, starting at start_date's
# month and running for total_month consecutive months.
# A month is encoded as year * 12 + (month - 1) so that stepping by one always
# lands on the next calendar month, including across year boundaries.
first_month_index = start_date.year * 12 + (start_date.month - 1)
k_month = [
    (month_index // 12, month_index % 12 + 1)
    for month_index in range(first_month_index, first_month_index + total_month)
]



In [16]:

    
# Total occurrences of the word per (year, month), summed over all messages
# dated in that month.
time_dict = {}
for message_index, message in df.iterrows():
    month_key = (message['Date'].year, message['Date'].month)
    time_dict[month_key] = time_dict.get(month_key, 0) + message["Count"]



In [17]:

    
# Monthly totals aligned with k_month; months with no matching message get 0.
by_month = [time_dict.get(month_key, 0) for month_key in k_month]



In [19]:

    
# Plot the monthly totals. The x axis is the 1-based month number counted from
# start_date's month, so x = i + 1 corresponds to k_month[i].
x = range(1, total_month + 1)
y = by_month
plt.plot(x, y)
plt.xlabel("month (1 = first month with a match)")
plt.ylabel("occurrences per month")
plt.title("Monthly occurrences of '%s'" % checkword)









    Out[19]:





<matplotlib.text.Text at 0x10829fa50>



In [20]:

    
# Highest monthly total. Computed rather than read from a hard-coded index,
# so it stays correct when the word or the mailing list changes.
max(y)









    Out[20]:





469



In [21]:

    
# (year, month) of the peak: the first month whose total equals the maximum.
# The index is derived from the data instead of being hard-coded.
k_month[by_month.index(max(by_month))]









    Out[21]:





(2008, 6)



In [ ]:

	MessageId	Date	From	In-Reply-To	Count
0	<3E9E4094.7030802@colorado.edu>	2003-04-16 23:50:12-06:00	fperez@colorado.edu (Fernando Perez)	<003d01c28a9a$3dcb8560$e301340a@cyberhigh.fcoe...	2
1	<3E9E4094.7030802@colorado.edu>	2003-04-16 23:50:12-06:00	fperez at colorado.edu (Fernando Perez)	<003d01c28a9a$3dcb8560$e301340a@cyberhigh.fcoe...	2
2	<000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe...	2003-04-17 07:32:56-07:00	cdodt@fcoe.k12.ca.us (Cory Dodt)	<3E9E4094.7030802@colorado.edu>	3
3	<000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe...	2003-04-17 07:32:56-07:00	cdodt at fcoe.k12.ca.us (Cory Dodt)	<3E9E4094.7030802@colorado.edu>	3
4	<3E9EC1CA.3060800@colorado.edu>	2003-04-17 09:01:30-06:00	fperez@colorado.edu (Fernando Perez)	<000c01c304ee$3cb79e60$e901340a@cyberhigh.fcoe...	6