In [70]:
import pickle
import pandas as pd
from mwclient import Site
import datetime
import time
import os

In [71]:
file_name = 'politician-data-0-100'
# Tracker CSV that lists every politician handle and whether it has been read.
# It must sit in the same folder as this notebook.
tracker_file = file_name + '-tracker.csv'

# Folder that receives the profile data; it is created when it does not exist.
# Give a plain folder name here: no path delimiters please, don't use //
base_path = 'profile-data'
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.isdir() test.
os.makedirs(base_path, exist_ok=True)

In [72]:
# User-Agent string sent with every request; it identifies the author and
# carries a contact address.
user_agent = 'Uni Koblenz-Landau student, kandhasamy@uni-koblenz.de'
# mwclient handle for English Wikipedia; the scraping loop looks pages up
# through wiki.pages[...].
wiki = Site(host='en.wikipedia.org', clients_useragent=user_agent)

In [73]:
# Record the current wall-clock time (seconds since the epoch) and display it.
# Note: the scraping loop cell assigns start_time again right before it starts.
start_time = time.time()
start_time


Out[73]:
1483621788.168219

In [74]:
# Module-level tracker frame; filled in by read_politician_tracker().
pol_tracker = None
def read_politician_tracker():
    """Reload the tracker CSV into the global ``pol_tracker`` frame.

    Reads ``tracker_file`` and keeps only the 'handle', 'finished_reading'
    and 'time_taken_in_mins' columns, in that order. Nothing is returned;
    the result is published through the ``pol_tracker`` global.
    """
    global pol_tracker
    kept_columns = ['handle', 'finished_reading', 'time_taken_in_mins']
    pol_tracker = pd.read_csv(tracker_file)
    # Drop every other column the CSV may carry.
    pol_tracker = pol_tracker[kept_columns]
# Load the tracker once up front, then preview its first rows.
read_politician_tracker()
pol_tracker.head()


Out[74]:
handle finished_reading time_taken_in_mins
0 Quincy_Timberlake False 0.0
1 Fanny_Edelman False 0.0
2 Stojan_Novaković False 0.0
3 Paul_Scheffer False 0.0
4 Ed_Robb False 0.0

In [75]:
def get_unread_politician(pol_tracker):
    """Return the handle of the first politician not yet read, or None.

    Parameters
    ----------
    pol_tracker : pandas.DataFrame
        Tracker frame with at least the 'handle' and 'finished_reading'
        columns.

    Returns
    -------
    The 'handle' value of the first row (in frame order) whose
    'finished_reading' equals False, or None when there is no such row.
    """
    still_unread = pol_tracker['finished_reading'].eq(False)
    pending_handles = pol_tracker.loc[still_unread, 'handle']
    return None if pending_handles.empty else pending_handles.iloc[0]
    
def write_read_politician(pol_tracker, politician, time_taken):
    """Mark ``politician`` as read in the tracker and save the tracker to disk.

    Parameters
    ----------
    pol_tracker : pandas.DataFrame
        Tracker frame; it is modified in place. Rows are matched on 'handle'.
    politician : str
        Handle of the politician that has just been read.
    time_taken : float
        Value stored in the 'time_taken_in_mins' column of the matching rows.

    Side effects
    ------------
    Overwrites ``tracker_file`` with the whole tracker after every call, so
    an interrupted run can pick up where it stopped.
    """
    # Build the row filter once and reuse it for both column updates.
    is_politician = pol_tracker['handle'] == politician
    pol_tracker.loc[is_politician, 'finished_reading'] = True
    pol_tracker.loc[is_politician, 'time_taken_in_mins'] = time_taken

    pol_tracker.to_csv(tracker_file)
    
# Quick check: show which handle the scraping loop would pick up next.
get_unread_politician(pol_tracker)


Out[75]:
'Quincy_Timberlake'

In [76]:
# Read every politician that the tracker still lists as unread.
# For each one, go through all years 2001-2016 and all months and keep a single
# revision per month (or the latest previous one). The tracker is saved after
# every politician, so this cell can be interrupted and re-run to resume.
start_time = time.time()
politician_count = 0
while True:
    start_politician_time = time.time()
    unread_politician = get_unread_politician(pol_tracker)
    if not unread_politician:
        # No row with finished_reading == False is left.
        print('Hooray!!!!!!! The job is over.')
        break

    politician_page = wiki.pages[unread_politician]
    # Maps '<year>-<month>' (month not zero-padded) to the revision kept for it.
    pol_list = {}
    for year in range(2001, 2017):
        for mon in range(1, 13):
            dateObj = datetime.date(year, mon, 1)
            # Cut-off is the end of the 1st day of the month.
            endDate = dateObj.isoformat() + 'T23:59:59Z'
            # Revisions are listed from endDate towards older ones; only the
            # first item is kept. If nothing is yielded, the month gets no key.
            for article in politician_page.revisions(start=endDate, prop='ids|timestamp|content', dir='older', limit=1):
                pol_list[str(year)+'-'+str(mon)] = article
                break
    politician_count += 1
    end_politician_time = time.time()
    print(politician_count,') '+unread_politician+' is read. '+'time taken(mins) - ',(end_politician_time - start_politician_time)/60)
    # Use a context manager so the pickle file is flushed and closed right
    # away instead of leaving one open handle behind per politician.
    with open(base_path + '/'+unread_politician,'wb') as profile_file:
        pickle.dump(pol_list, profile_file)
    # Only mark the politician as read after the profile has been written.
    write_read_politician(pol_tracker, unread_politician, (end_politician_time - start_politician_time)/60 )
end_time = time.time()
print('Total Time taken -',(end_time - start_time) / 60)


1 ) Quincy_Timberlake is read. time taken(mins) -  0.8100204745928447
2 ) Fanny_Edelman is read. time taken(mins) -  0.7990706483523051
3 ) Stojan_Novaković is read. time taken(mins) -  0.8789964079856872
4 ) Paul_Scheffer is read. time taken(mins) -  0.7661576906840006
5 ) Ed_Robb is read. time taken(mins) -  0.7450967232386271
6 ) Nizamettin_Erkmen is read. time taken(mins) -  0.6623029510180155
7 ) Claudio_Scajola is read. time taken(mins) -  0.7038844704627991
8 ) Lawrence_Lual_Lual is read. time taken(mins) -  0.7397859930992127
9 ) Thomas_Clausen_(Louisiana) is read. time taken(mins) -  0.6808087031046549
10 ) Yang_Ti-liang is read. time taken(mins) -  0.70073535044988
11 ) Georges_Colombier is read. time taken(mins) -  0.6332885384559631
12 ) Gottlieb_Duttweiler is read. time taken(mins) -  0.6467467784881592
13 ) Mohammad_Natsir is read. time taken(mins) -  0.7488250454266866
14 ) Ján_Ševčík is read. time taken(mins) -  0.7234708944956462
15 ) Georges_Othily is read. time taken(mins) -  0.6745899319648743
16 ) Athanasios_N._Miaoulis is read. time taken(mins) -  0.7897930343945821
17 ) Kristalina_Georgieva is read. time taken(mins) -  0.8221425374348958
18 ) Otto_Hoetzsch is read. time taken(mins) -  0.8370245893796285
19 ) Rafiq_Ahmed_Jamali is read. time taken(mins) -  0.8406186938285828
20 ) Carolyn_Maloney is read. time taken(mins) -  0.9626320600509644
21 ) Sir_Gilbert_Pickering,_1st_Baronet is read. time taken(mins) -  0.8398561318715413
22 ) Karel_Rüütli is read. time taken(mins) -  0.7955053726832072
23 ) Todd_Rokita is read. time taken(mins) -  0.9043528079986572
24 ) Salmir_Kaplan is read. time taken(mins) -  0.8247261802355449
25 ) Ramón_Jiménez_Fuentes is read. time taken(mins) -  0.8128219604492187
26 ) Krasen_Kralev is read. time taken(mins) -  0.8261215647061666
27 ) Denver_Butler is read. time taken(mins) -  0.8121077140172323
28 ) Massimo_Giorgetti is read. time taken(mins) -  0.828660790125529
29 ) Nancy_Shukri is read. time taken(mins) -  0.8401762962341308
30 ) Julie_Green is read. time taken(mins) -  0.8394718249638875
31 ) Elżbieta_Pierzchała is read. time taken(mins) -  0.8190289855003356
32 ) Brice_Lalonde is read. time taken(mins) -  0.8357439557711284
33 ) Chandrasiri_Gajadeera is read. time taken(mins) -  0.7447451790173848
34 ) Slavica_Đukić_Dejanović is read. time taken(mins) -  0.6876219590504964
35 ) Ho_Dam is read. time taken(mins) -  0.6564692934354146
36 ) José_Piñera_Carvallo is read. time taken(mins) -  0.6961373686790466
37 ) Bruce_Jesson is read. time taken(mins) -  0.7883004824320475
38 ) Thomas_Ravenel is read. time taken(mins) -  0.7290303786595662
39 ) Akhilesh_Yadav is read. time taken(mins) -  0.7609363277753194
40 ) Luc_Recordon is read. time taken(mins) -  0.7107905586560567
41 ) Benjamin_D._Dwinnell is read. time taken(mins) -  0.6928639491399129
42 ) José_Francisco_Rábago is read. time taken(mins) -  0.6497402667999268
43 ) Gustaw_Przeczek is read. time taken(mins) -  0.7226025621096294
44 ) Jacques_Lavoie is read. time taken(mins) -  0.7138471086819966
45 ) Nessa_Childers is read. time taken(mins) -  0.8138584733009339
46 ) S._Nijamudeen is read. time taken(mins) -  0.6899546225865681
47 ) María_García_Quiroz is read. time taken(mins) -  0.6861859560012817
48 ) Tunji_Olurin is read. time taken(mins) -  0.6760647535324097
49 ) Jalagam_Vengala_Rao is read. time taken(mins) -  0.6863072673479717
50 ) Jim_McGovern_(British_politician) is read. time taken(mins) -  0.7042711297671
51 ) Mohammad_Reza_Aref is read. time taken(mins) -  0.7001890103022258
52 ) Alexander_Hermann,_Count_of_Wartensleben is read. time taken(mins) -  0.6893832206726074
53 ) Ali_Ahsan_Mohammad_Mojaheed is read. time taken(mins) -  0.6756738305091858
54 ) Faustino_Félix_Chávez is read. time taken(mins) -  0.6640854358673096
55 ) Saadaldeen_Talib is read. time taken(mins) -  0.5991753419240315
56 ) Laxmikant_Parsekar is read. time taken(mins) -  0.6366589466730753
57 ) Tulsidas_Jadhav is read. time taken(mins) -  0.6163785497347514
58 ) Rigoberta_Menchú is read. time taken(mins) -  0.744253412882487
59 ) Lucie_Leblanc is read. time taken(mins) -  0.6253040949503581
60 ) David_Milwyn_Duggan is read. time taken(mins) -  0.641196056207021
61 ) John_Wilson_(Mid_Durham_MP) is read. time taken(mins) -  0.6337432702382405
62 ) Yōsuke_Tsuruho is read. time taken(mins) -  0.6645492156346638
63 ) Mark_Souder is read. time taken(mins) -  0.7490802526473999
64 ) Benjamin_Boyd_(South_Carolina) is read. time taken(mins) -  0.7245477954546611
65 ) Joaquín_Abril_Martorell is read. time taken(mins) -  0.685699458916982
66 ) Martin_Lidegaard is read. time taken(mins) -  0.6672566572825114
67 ) Paul_Janson is read. time taken(mins) -  0.6292046626408895
68 ) Iain_Smith_(Scottish_politician) is read. time taken(mins) -  0.8398627320925395
69 ) Jacques_Germeaux is read. time taken(mins) -  0.7823602437973023
70 ) R._Avudaiyappan is read. time taken(mins) -  0.6933762033780416
71 ) Al_Smith is read. time taken(mins) -  0.7395054856936137
72 ) William_Gupton is read. time taken(mins) -  0.7009296417236328
73 ) Bill_Malarky is read. time taken(mins) -  0.7632142821947734
74 ) Jeanette_Dousdebes_Rubio is read. time taken(mins) -  0.7854939222335815
75 ) Buzz_Thomas is read. time taken(mins) -  0.808622415860494
76 ) Sachin_Ahir is read. time taken(mins) -  0.7690012335777283
77 ) Maria_Lohela is read. time taken(mins) -  0.7758005619049072
78 ) Carlos_Morales_Vázquez is read. time taken(mins) -  0.7055675307909648
79 ) Daniel_Spagnou is read. time taken(mins) -  0.6833850701649984
80 ) Antoine_Wright_(politician) is read. time taken(mins) -  0.6729867458343506
81 ) Pawan_Kumar_Tinu is read. time taken(mins) -  0.6822998245557149
82 ) Sardar_Tufail_Ahmad_Khan_Mayo is read. time taken(mins) -  0.5998750686645508
83 ) Mohammad_Khan_Qajar is read. time taken(mins) -  0.7575670798619588
84 ) Tom_Tauke is read. time taken(mins) -  0.8318018356959025
85 ) Wojciech_Olejniczak is read. time taken(mins) -  0.8031588792800903
86 ) Ayisha_Osori is read. time taken(mins) -  0.7786232630411783
87 ) Samuel_Miller_Quincy is read. time taken(mins) -  0.8178558627764384
88 ) Arsalan_Fathipour is read. time taken(mins) -  0.7849269072214763
89 ) Claus_Nissen_Riiber_Berg is read. time taken(mins) -  0.7521092732747395
90 ) Alun_Davies_(politician) is read. time taken(mins) -  0.7319616556167603
91 ) Muhammad_Khan_Achakzai is read. time taken(mins) -  0.6722055474917094
92 ) Dragan_Đilas is read. time taken(mins) -  0.719046676158905
93 ) Elżbieta_Bieńkowska is read. time taken(mins) -  0.7255872090657552
94 ) Sándor_Rónai is read. time taken(mins) -  0.6410789608955383
95 ) V._J._Sukselainen is read. time taken(mins) -  0.579851222038269
96 ) Peter_Bossman is read. time taken(mins) -  0.5791989843050639
97 ) Muhammad_Fazal_Karim is read. time taken(mins) -  0.5731673121452332
98 ) Hortensia_Aragón_Castillo is read. time taken(mins) -  0.6526701966921489
99 ) Walter_E._Fauntroy is read. time taken(mins) -  0.7210908969243367
100 ) Leopold_De_Wael is read. time taken(mins) -  0.6458297451337178
Hooray!!!!!!! The job is over.
Total Time taken - 73.15664520661036

You can interrupt the loop above whenever you want; then run the following cells to check how much time it has taken so far.


In [52]:
# Snapshot the current time, then show when the scraping run started.
curr_time = time.time()
start_time


Out[52]:
1483616594.0808723

In [53]:
# Number of politicians the scraping loop has finished reading so far.
politician_count


Out[53]:
10

In [54]:
# Overall time in hrs: elapsed seconds between start_time and curr_time / 3600
(curr_time - start_time) / 3600


Out[54]:
0.1152450070116255

In [55]:
# Overall time in mins: elapsed seconds between start_time and curr_time / 60
(curr_time - start_time) / 60


Out[55]:
6.91470042069753

In [56]:
# Average time for one profile in mins.
# politician_count is still 0 when the loop was interrupted before the first
# profile finished; show None in that case instead of raising ZeroDivisionError.
(curr_time - start_time) / (60 * politician_count) if politician_count else None


Out[56]:
0.691470042069753

In [23]:
#pol_list
#unread_politician
#pickle.dump(pol_list,open('profile-data//'+unread_politician,'wb'))