In [59]:
import time
import pandas as pd
import numpy as np
from mwclient import Site
import pickle
import csv
%matplotlib inline

In [60]:
# Load the pickled revision table into `df`.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by a trusted source.
#df = pickle.load(open('findf.p','rb'))
with open('consolidated-profile','rb') as fh:  # context manager closes the handle
    df = pickle.load(fh)

In [61]:
# Preview the first five rows of the loaded table.
df.head(n=5)


Out[61]:
page year month links_to
0 Peder_E._Vorum 2016.0 1.0 [Category:Use dmy dates from May 2011, Andenes...
1 Peder_E._Vorum 2012.0 7.0 [Category:Use dmy dates from May 2011, Andenes...
2 Peder_E._Vorum 2011.0 8.0 [Category:Use dmy dates from May 2011, Andenes...
3 Peder_E._Vorum 2013.0 8.0 [Category:Use dmy dates from May 2011, Andenes...
4 Peder_E._Vorum 2011.0 1.0 [Andenes, County council (Norway), Egge, Germa...

In [62]:
# Bucket the rows by (year, month); `ymg` is iterated by the export loop below.
ymg = df.groupby(by=['year', 'month'])
# GroupBy.head() returns the first 5 rows of *each* group, not of the whole frame.
ymg.head(n=5)


Out[62]:
page year month links_to
0 Peder_E._Vorum 2016.0 1.0 [Category:Use dmy dates from May 2011, Andenes...
1 Peder_E._Vorum 2012.0 7.0 [Category:Use dmy dates from May 2011, Andenes...
2 Peder_E._Vorum 2011.0 8.0 [Category:Use dmy dates from May 2011, Andenes...
3 Peder_E._Vorum 2013.0 8.0 [Category:Use dmy dates from May 2011, Andenes...
4 Peder_E._Vorum 2011.0 1.0 [Andenes, County council (Norway), Egge, Germa...
5 Peder_E._Vorum 2010.0 11.0 [Andenes, County council (Norway), Egge, Germa...
6 Peder_E._Vorum 2014.0 12.0 [Category:Use dmy dates from May 2011, Andenes...
7 Peder_E._Vorum 2012.0 3.0 [Category:Use dmy dates from May 2011, Andenes...
8 Peder_E._Vorum 2015.0 8.0 [Category:Use dmy dates from May 2011, Andenes...
9 Peder_E._Vorum 2013.0 6.0 [Category:Use dmy dates from May 2011, Andenes...
10 Peder_E._Vorum 2016.0 8.0 [Category:Use dmy dates from May 2011, Andenes...
11 Peder_E._Vorum 2012.0 9.0 [Category:Use dmy dates from May 2011, Andenes...
12 Peder_E._Vorum 2011.0 2.0 [Andenes, County council (Norway), Egge, Germa...
13 Peder_E._Vorum 2015.0 2.0 [Category:Use dmy dates from May 2011, Andenes...
14 Peder_E._Vorum 2015.0 7.0 [Category:Use dmy dates from May 2011, Andenes...
15 Peder_E._Vorum 2012.0 1.0 [Category:Use dmy dates from May 2011, Andenes...
16 Peder_E._Vorum 2011.0 4.0 [Andenes, County council (Norway), Egge, Germa...
17 Peder_E._Vorum 2015.0 9.0 [Category:Use dmy dates from May 2011, Andenes...
18 Peder_E._Vorum 2012.0 12.0 [Category:Use dmy dates from May 2011, Andenes...
19 Peder_E._Vorum 2013.0 1.0 [Category:Use dmy dates from May 2011, Andenes...
20 Peder_E._Vorum 2012.0 10.0 [Category:Use dmy dates from May 2011, Andenes...
21 Peder_E._Vorum 2016.0 4.0 [Category:Use dmy dates from May 2011, Andenes...
22 Peder_E._Vorum 2011.0 11.0 [Category:Use dmy dates from May 2011, Andenes...
23 Peder_E._Vorum 2015.0 4.0 [Category:Use dmy dates from May 2011, Andenes...
24 Peder_E._Vorum 2013.0 10.0 [Category:Use dmy dates from May 2011, Andenes...
25 Peder_E._Vorum 2011.0 12.0 [Category:Use dmy dates from May 2011, Andenes...
26 Peder_E._Vorum 2015.0 11.0 [Category:Use dmy dates from May 2011, Andenes...
27 Peder_E._Vorum 2015.0 1.0 [Category:Use dmy dates from May 2011, Andenes...
28 Peder_E._Vorum 2014.0 9.0 [Category:Use dmy dates from May 2011, Andenes...
29 Peder_E._Vorum 2015.0 12.0 [Category:Use dmy dates from May 2011, Andenes...
... ... ... ... ...
158 Òscar_Ribas_Reig 2013.0 9.0 [Template:ISO 639 name es, Template:Prime Mini...
159 Òscar_Ribas_Reig 2008.0 1.0 [Template:Prime Ministers of Andorra, Template...
160 Òscar_Ribas_Reig 2014.0 8.0 [Template:ISO 639 name es, Template:Prime Mini...
161 Òscar_Ribas_Reig 2016.0 3.0 [Template:ISO 639 name es, Template:Prime Mini...
162 Òscar_Ribas_Reig 2010.0 7.0 [Template:ISO 639 name es, Template:Prime Mini...
163 Òscar_Ribas_Reig 2012.0 8.0 [Template:ISO 639 name es, Template:Prime Mini...
164 Òscar_Ribas_Reig 2008.0 5.0 [Template:Prime Ministers of Andorra, Template...
165 Òscar_Ribas_Reig 2016.0 2.0 [Template:ISO 639 name es, Template:Prime Mini...
166 Òscar_Ribas_Reig 2009.0 3.0 [Template:Prime Ministers of Andorra, Template...
167 Òscar_Ribas_Reig 2011.0 10.0 [Template:ISO 639 name es, Template:Prime Mini...
168 Òscar_Ribas_Reig 2011.0 8.0 [Template:ISO 639 name es, Template:Prime Mini...
169 Òscar_Ribas_Reig 2013.0 7.0 [Template:ISO 639 name es, Template:Prime Mini...
170 Òscar_Ribas_Reig 2008.0 6.0 [Template:Prime Ministers of Andorra, Template...
171 Òscar_Ribas_Reig 2007.0 7.0 [Template:Europe-politician-stub, Marc Forne M...
172 Òscar_Ribas_Reig 2015.0 6.0 [Template:ISO 639 name es, Template:Prime Mini...
173 Òscar_Ribas_Reig 2016.0 6.0 [Template:ISO 639 name es, Template:Prime Mini...
174 Òscar_Ribas_Reig 2007.0 10.0 [Template:Andorra-politician-stub, 1981, Decem...
175 Òscar_Ribas_Reig 2011.0 12.0 [Template:ISO 639 name es, Template:Prime Mini...
176 Òscar_Ribas_Reig 2013.0 11.0 [Template:ISO 639 name es, Template:Prime Mini...
177 Òscar_Ribas_Reig 2009.0 12.0 [Template:ISO 639 name es, Template:Prime Mini...
178 Òscar_Ribas_Reig 2010.0 10.0 [Template:ISO 639 name es, Template:Prime Mini...
179 Òscar_Ribas_Reig 2010.0 4.0 [Template:ISO 639 name es, Template:Prime Mini...
180 Òscar_Ribas_Reig 2009.0 2.0 [Template:Prime Ministers of Andorra, Template...
181 Òscar_Ribas_Reig 2016.0 7.0 [Template:ISO 639 name es, Template:Prime Mini...
182 Òscar_Ribas_Reig 2013.0 3.0 [Template:ISO 639 name es, Template:Prime Mini...
183 Òscar_Ribas_Reig 2016.0 12.0 [Template:ISO 639 name es, Template:Prime Mini...
184 Òscar_Ribas_Reig 2014.0 6.0 [Template:ISO 639 name es, Template:Prime Mini...
185 Òscar_Ribas_Reig 2011.0 9.0 [Template:ISO 639 name es, Template:Prime Mini...
186 Òscar_Ribas_Reig 2015.0 10.0 [Template:ISO 639 name es, Template:Prime Mini...
187 Òscar_Ribas_Reig 2009.0 11.0 [Template:ISO 639 name es, Template:Prime Mini...

188 rows × 4 columns


In [63]:
# Distinct page titles present in the table; used below for membership tests on link targets.
pageset = {title for title in df['page'].values}
# Number of distinct pages.
len(pageset)


Out[63]:
2

In [64]:
# Write one edge-list CSV per (year, month) group. Each output row is
# "page,target" for a link whose target (spaces replaced by underscores)
# is itself one of the pages in `pageset`.
korn = "" # Base path
name_d = {}  # "YYYY_MM" -> path of that month's CSV
name_l = []  # "YYYY_MM" keys, in the order the groups are iterated
for name, grp in ymg: #Iterating over groups, each group - one month
    conn = str(int(name[0])) + "_" + str(int(name[1])).zfill(2) #Constructed name, month zero-padded
    cpth = korn + conn+'.csv' #Constructing pathname from base path and year_month
    name_l.append(conn)
    name_d[conn]=cpth #Storing names and paths
    # `with` guarantees the file is closed even if a row fails to process;
    # newline='' is what the csv module expects so that the '\n' terminator
    # is not translated to '\r\n' on Windows.
    with open(cpth,'w',encoding='utf-8',newline='') as f:
        writer = csv.writer(f, delimiter=',',lineterminator='\n') # Writes the edge list row by row
        # Only the two needed columns are walked; avoids building a Series per row.
        for page, links in zip(grp['page'], grp['links_to']):
            for lk in links:
                lkr = lk.replace(' ','_') # link titles use spaces, page titles use underscores
                if lkr in pageset:
                    writer.writerow((page,lkr)) # If article links to one of our articles we add the edge pair
    print(name)


(2007.0, 7.0)
(2007.0, 8.0)
(2007.0, 9.0)
(2007.0, 10.0)
(2007.0, 11.0)
(2007.0, 12.0)
(2008.0, 1.0)
(2008.0, 2.0)
(2008.0, 3.0)
(2008.0, 4.0)
(2008.0, 5.0)
(2008.0, 6.0)
(2008.0, 7.0)
(2008.0, 8.0)
(2008.0, 9.0)
(2008.0, 10.0)
(2008.0, 11.0)
(2008.0, 12.0)
(2009.0, 1.0)
(2009.0, 2.0)
(2009.0, 3.0)
(2009.0, 4.0)
(2009.0, 5.0)
(2009.0, 6.0)
(2009.0, 7.0)
(2009.0, 8.0)
(2009.0, 9.0)
(2009.0, 10.0)
(2009.0, 11.0)
(2009.0, 12.0)
(2010.0, 1.0)
(2010.0, 2.0)
(2010.0, 3.0)
(2010.0, 4.0)
(2010.0, 5.0)
(2010.0, 6.0)
(2010.0, 7.0)
(2010.0, 8.0)
(2010.0, 9.0)
(2010.0, 10.0)
(2010.0, 11.0)
(2010.0, 12.0)
(2011.0, 1.0)
(2011.0, 2.0)
(2011.0, 3.0)
(2011.0, 4.0)
(2011.0, 5.0)
(2011.0, 6.0)
(2011.0, 7.0)
(2011.0, 8.0)
(2011.0, 9.0)
(2011.0, 10.0)
(2011.0, 11.0)
(2011.0, 12.0)
(2012.0, 1.0)
(2012.0, 2.0)
(2012.0, 3.0)
(2012.0, 4.0)
(2012.0, 5.0)
(2012.0, 6.0)
(2012.0, 7.0)
(2012.0, 8.0)
(2012.0, 9.0)
(2012.0, 10.0)
(2012.0, 11.0)
(2012.0, 12.0)
(2013.0, 1.0)
(2013.0, 2.0)
(2013.0, 3.0)
(2013.0, 4.0)
(2013.0, 5.0)
(2013.0, 6.0)
(2013.0, 7.0)
(2013.0, 8.0)
(2013.0, 9.0)
(2013.0, 10.0)
(2013.0, 11.0)
(2013.0, 12.0)
(2014.0, 1.0)
(2014.0, 2.0)
(2014.0, 3.0)
(2014.0, 4.0)
(2014.0, 5.0)
(2014.0, 6.0)
(2014.0, 7.0)
(2014.0, 8.0)
(2014.0, 9.0)
(2014.0, 10.0)
(2014.0, 11.0)
(2014.0, 12.0)
(2015.0, 1.0)
(2015.0, 2.0)
(2015.0, 3.0)
(2015.0, 4.0)
(2015.0, 5.0)
(2015.0, 6.0)
(2015.0, 7.0)
(2015.0, 8.0)
(2015.0, 9.0)
(2015.0, 10.0)
(2015.0, 11.0)
(2015.0, 12.0)
(2016.0, 1.0)
(2016.0, 2.0)
(2016.0, 3.0)
(2016.0, 4.0)
(2016.0, 5.0)
(2016.0, 6.0)
(2016.0, 7.0)
(2016.0, 8.0)
(2016.0, 9.0)
(2016.0, 10.0)
(2016.0, 11.0)
(2016.0, 12.0)

In [65]:
# Forward-fill the monthly edge lists. For every month, pages that appear as a
# source in that month's file replace their remembered links; pages seen in an
# earlier month but absent from this one get their remembered links appended.
# The file is then rewritten in place, sorted by source page, with a header row.
#
# NOTE: the rewritten files carry a 'from,to' header while this cell reads them
# with header=None, so re-running this cell without first re-running the export
# cell above would treat the header line as an edge.
# NOTE(review): "revised this month" is inferred from the page appearing in the
# 'from' column; a page revised to have no in-set links would therefore keep
# its older links - confirm whether that is intended.
temp_d = {}  # page -> link targets from the latest month in which it appeared as a source
for n in name_l: #Iterating over months
    print(n)
    try:
        # dtype=str / keep_default_na=False keep titles such as "1981" or "NA"
        # as literal text instead of numbers / missing values.
        cdf = pd.read_csv(name_d[n], header=None, names=['from','to'],
                          dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # A month with no in-set links produces an empty file.
        cdf = pd.DataFrame(columns=['from','to'])
    # Links recorded this month, grouped by source page (file order preserved).
    fresh = {}
    for src, dst in zip(cdf['from'], cdf['to']):
        fresh.setdefault(src, []).append(dst)
    # Edges of previously seen pages that are not a source this month.
    carried = [(src, dst)
               for src, targets in temp_d.items() if src not in fresh
               for dst in targets]
    temp_d.update(fresh) # This month's links supersede the remembered ones
    if carried:
        # Append all carried edges in one step instead of growing the frame row by row.
        cdf = pd.concat([cdf, pd.DataFrame(carried, columns=['from','to'])],
                        ignore_index=True)
    cdf = cdf.sort_values('from', kind='mergesort') # stable sort keeps a deterministic row order
    cdf.to_csv(name_d[n],index=False) #Saving it to the same file


2007_07
2007_08
2007_09
2007_10
2007_11
2007_12
2008_01
2008_02
2008_03
2008_04
2008_05
2008_06
2008_07
2008_08
2008_09
2008_10
2008_11
2008_12
2009_01
2009_02
2009_03
2009_04
2009_05
2009_06
2009_07
2009_08
2009_09
2009_10
2009_11
2009_12
2010_01
2010_02
2010_03
2010_04
2010_05
2010_06
2010_07
2010_08
2010_09
2010_10
2010_11
2010_12
2011_01
2011_02
2011_03
2011_04
2011_05
2011_06
2011_07
2011_08
2011_09
2011_10
2011_11
2011_12
2012_01
2012_02
2012_03
2012_04
2012_05
2012_06
2012_07
2012_08
2012_09
2012_10
2012_11
2012_12
2013_01
2013_02
2013_03
2013_04
2013_05
2013_06
2013_07
2013_08
2013_09
2013_10
2013_11
2013_12
2014_01
2014_02
2014_03
2014_04
2014_05
2014_06
2014_07
2014_08
2014_09
2014_10
2014_11
2014_12
2015_01
2015_02
2015_03
2015_04
2015_05
2015_06
2015_07
2015_08
2015_09
2015_10
2015_11
2015_12
2016_01
2016_02
2016_03
2016_04
2016_05
2016_06
2016_07
2016_08
2016_09
2016_10
2016_11
2016_12

In [ ]: