In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re
In [3]:
# Mailing-list archive URLs to analyse. Only ipython-dev is enabled; the
# remaining lists are kept here, commented out, for reference.
urls = [
    "http://mail.scipy.org/pipermail/ipython-dev/",
    # "http://mail.scipy.org/pipermail/ipython-user/",
    # "http://mail.scipy.org/pipermail/scipy-dev/",
    # "http://mail.scipy.org/pipermail/scipy-user/",
    # "http://mail.scipy.org/pipermail/numpy-discussion/",
]

# One Archive object per URL, each constructed with archive_dir="../archives".
archives = []
for url in urls:
    archives.append(Archive(url, archive_dir="../archives"))
In [4]:
# Word whose occurrences are counted in the message bodies. Bodies are
# lower-cased before tokenising, so this value must be lower case as well.
# NOTE(review): tokens are Lancaster-stemmed before the comparison, so this
# presumably needs to be the stemmed form of the word -- confirm.
checkword = "python" # can be changed to any other (lower-case) word
In [5]:
# Build `df`: one row per message of the first archive whose body contains
# `checkword` at least once, with the number of occurrences in "Count".
#
# Rows are collected in a plain list and the DataFrame is created once at the
# end; appending to a DataFrame inside the loop copies the whole frame on
# every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
df_columns = ["MessageId", "Date", "From", "In-Reply-To", "Count"]
records = []
for message_id, message in archives[0].data.iterrows():
    # Normalise the body: drop apostrophes, turn every other non-word
    # character into a space, and lower-case the result.
    body = message["Body"].replace("'", "")
    body = re.sub(r'[^\w]', ' ', body).lower()
    tokens = nltk.tokenize.word_tokenize(body)

    count = 0
    for token in tokens:
        try:
            word = st.stem(token)
        except Exception:
            # Report the token that could not be stemmed and skip it, so a
            # stale `word` left over from the previous token is never
            # compared (and counted) a second time.
            print(token)
            continue
        if word == checkword:
            count += 1

    # Messages that never mention the word are left out of the frame.
    if count == 0:
        continue

    records.append({
        "MessageId": message_id,
        "Date": message["Date"],
        "From": message["From"],
        "In-Reply-To": message["In-Reply-To"],
        "Count": count,
    })

# Passing `columns` keeps the column order and yields an empty frame with the
# expected columns when no message matched.
df = pd.DataFrame(records, columns=df_columns)
In [6]:
# Preview the first five rows of the per-message counts for the chosen word.
df.head(5)
Out[6]:
In [7]:
# Dates of the first and the last row of df; they bound the monthly range
# that is built from them further down.
date_column = df["Date"]
start_date = date_column.iloc[0]
end_date = date_column.iloc[-1]
In [8]:
# Display the Date taken from the first row of df.
start_date
Out[8]:
In [9]:
# Display the Date taken from the last row of df.
end_date
Out[9]:
In [10]:
# Number of calendar months from start_date's month through end_date's
# month, counting both end points.
year_gap = end_date.year - start_date.year
month_gap = end_date.month - start_date.month
total_month = year_gap * 12 + month_gap + 1
In [11]:
# Display the number of months covered by the start_date..end_date range.
total_month
Out[11]:
In [12]:
# k_month: consecutive (year, month) pairs, starting at start_date's month
# and containing total_month entries.
k_month = []
temp_year, temp_month = start_date.year, start_date.month
for i in range(total_month):
    k_month.append((temp_year, temp_month))
    if temp_month < 12:
        temp_month += 1
    else:
        # December rolls over to January of the following year.
        temp_year, temp_month = temp_year + 1, 1
In [16]:
# time_dict maps (year, month) -> summed "Count" of all rows of df whose
# Date falls in that month.
time_dict = {}
for row_index, message in df.iterrows():
    month_key = (message["Date"].year, message["Date"].month)
    time_dict[month_key] = time_dict.get(month_key, 0) + message["Count"]
In [17]:
# Monthly totals in the order of k_month; months without any match get 0.
by_month = [time_dict.get(item, 0) for item in k_month]
In [19]:
# Plot the monthly occurrence counts of `checkword`.
# x is a 1-based month number, so x == 1 corresponds to k_month[0].
x = range(1, total_month + 1)
y = by_month
plt.plot(x, y)
plt.xlabel("month")
# Label the y axis and title the figure so it can be read on its own.
plt.ylabel("count")
plt.title("Monthly occurrences of '%s'" % checkword)
Out[19]:
In [20]:
# Highest monthly count. Computed from the data rather than read from a
# hard-coded position, so it stays correct when the word or archive changes.
max(y)
Out[20]:
In [21]:
# (year, month) of the first month that reaches the highest count. y and
# k_month are index-aligned, so the position of the maximum in y is the
# position of that month in k_month.
k_month[y.index(max(y))]
Out[21]:
In [ ]: