In [1]:
import json
with open('generated_entries_v2', 'r') as f:
    wikilist = json.load(f)

In [2]:
len(wikilist)


Out[2]:
716

In [3]:
wikilist[1]


Out[3]:
{'cameo_coding': [],
 'harvested_from': [{'arabic': 'https://ar.wikipedia.org/wiki/السيد_أحمد_عبد_الخالق',
   'english': ''}],
 'names': [{'arabic': 'السيد أحمد عبد الخالق', 'english': ''}],
 'pageid': [{'arabic': 2152585, 'english': ''}],
 'wiki_roles': [{'arabic': [{'end_date': ' 17 سبتمبر 2015 ',
     'role': ' وزير التعليم العالي الأسبق',
     'role_id': 1,
     'start_date': ' 17 يونيو 2014 '}],
   'english': []}],
 'wiki_scrape_date': '2018-01-16 16:45'}

In [4]:
monthdic={
    'January':"01",
    'February':"02",
    'March':"03",
    'April':"04",
    "May":"05",
    "June":"06",
    "July":"07",
    "August":"08",
    "September":"09",
    "October":"10",
    "November":"11",
    "December":"12"
}

In [7]:
def changeEnglishToEng():
    count=0
    for data in wikilist:
        count=count+1;
        try:
            data['wiki_roles'][0]['en']=data['wiki_roles'][0].pop("english")
        except Exception as e:
            print(e)
            print(count)
changeEnglishToEng()

In [10]:
def changeRoleToTitle():
    count=0
    for data in wikilist:
        count=count+1;
        try:
            for role in data['wiki_roles'][0]['en']:
                role['title']=role.pop("role")
            for role in data["wiki_roles"][0]["arabic"]:
                role['title']=role.pop("role")
        except Exception as e:
            print(e)
            print(count)
changeRoleToTitle()

In [12]:
wikilist[101]


Out[12]:
{'cameo_coding': [],
 'harvested_from': [{'arabic': '',
   'english': 'https://en.wikipedia.org/wiki/Abdirahman_Duale_Beyle'}],
 'names': [{'arabic': '', 'english': 'Abdirahman Duale Beyle'}],
 'pageid': [{'arabic': '', 'english': 41669456}],
 'wiki_roles': [{'arabic': [],
   'en': [{'end_date': ' Ministry of Foreign Affairs Somalia Minister of Foreign Affairs of Somalia ',
     'role_id': 1,
     'start_date': ' 29 March 2017',
     'title': ' Ministry of Finance Somalia Minister of Finance '},
    {'end_date': ' 27 January 2015',
     'role_id': 2,
     'start_date': ' 17 January 2014',
     'title': ' Ministry of Finance Somalia Minister of Finance '}]}],
 'wiki_scrape_date': '2018-01-16 16:46'}

In [17]:
def transferDate (date):
    datestring=date.split(" ");
    if len(datestring)==2:
        return datestring[1]+"-01-01";
    if len(datestring)==3:
        return datestring[2]+"-"+monthdic[datestring[1]]+"-01";
    if len(datestring)==4:
        #sometimes it is mm-dd-yyyy, sometimes it is dd-mm-yyyy.
        if str.isdigit(datestring[1])==False:
            return datestring[3]+"-"+monthdic[datestring[1]]+"-"+datestring[2];
        elif str.isdigit(datestring[2])==False:
            return datestring[3]+"-"+monthdic[datestring[2]]+"-"+datestring[1];
    return ""

In [18]:
def changeAllDate(wiki):
    data=wiki["wiki_roles"][0]['en']
    for element in data:
        element['end_date']=transferDate(element['end_date'])
        element['start_date']=transferDate(element['start_date'])
    data=wiki["wiki_roles"][0]['arabic']
    for element in data:
        element['end_date']=transferDate(element['end_date'])
        element['start_date']=transferDate(element['start_date'])

In [19]:
count=0;
for element in wikilist:
    count=count+1;
    try:
        changeAllDate(element)
    except Exception as e:
        print(count)


1
3
4
5
7
8
9
10
11
12
13
14
15
16
18
24
26
33
38
39
40
44
45
49
51
58
65
66
69
70
74
75
81
117
173
200
203
204
207
211
213
222
223
224
228
231
232
238
243
244
245
246
248
255
256
259
266
267
269
270
271
273
278
281
282
283
284
285
287
289
290
292
293
294
295
296
297
300
301
302
303
304
306
307
308
309
310
311
314
315
316
317
319
320
321
324
326
329
330
332
333
334
335
336
337
338
339
340
341
343
345
349
350
351
352
354
357
358
359
361
363
364
365
367
379
381
383
386
389
393
398
404
406
407
408
412
415
417
421
424
427
432
435
437
439
443
444
451
453
455
459
462
463
465
466
474
475
477
478
479
480
482
487
492
496
497
512
515
519
522
527
528
529
535
536
539
545
546
547
549
555
556
557
559
562
564
565
567
570
577
582
583
585
588
589
591
592
593
600
603
606
607
608
613
617
619
621
624
625
626
627
629
650
651
652
655
662
668
673
678
684
708

In [27]:
for wiki in wikilist:
    wiki.pop("namesnew")

In [34]:
for wiki in wikilist:
    wiki["wiki_roles"][0]["arabicname"]=wiki["names"][0]["arabic"]
    wiki["wiki_roles"][0]["englishname"]=wiki["names"][0]["english"]
    wiki["wiki_roles"][0]["englishlink"]=wiki["harvested_from"][0]["english"]
    wiki["wiki_roles"][0]["arabiclink"]=wiki["harvested_from"][0]["arabic"]

In [35]:
wikilist[0]


Out[35]:
{'cameo_coding': [],
 'harvested_from': [{'arabic': 'https://ar.wikipedia.org/wiki/أمين_المهدي',
   'english': ''}],
 'names': [{'arabic': 'أمين المهدي', 'english': ''}],
 'pageid': [{'arabic': 1740155, 'english': ''}],
 'wiki_roles': [{'arabic': [{'end_date': '',
     'role_id': 1,
     'start_date': '16 يوليو 2013',
     'title': 'وزير العدالة الانتقالية والمصالحة الوطنية'},
    {'end_date': ' 30 سبتمبر 2001',
     'role_id': 2,
     'start_date': ' 1 أكتوبر 2000',
     'title': 'وزير العدالة الانتقالية والمصالحة الوطنية'}],
   'arabiclink': 'https://ar.wikipedia.org/wiki/أمين_المهدي',
   'arabicname': 'أمين المهدي',
   'en': [],
   'englishlink': '',
   'englishname': ''}],
 'wiki_scrape_date': '2018-01-16 16:45'}

In [36]:
with open('transfered_wikidata', 'w') as fout:
    json.dump(wikilist, fout)

insert data into db


In [ ]:
from pymongo import MongoClient
import pymongo
import pickle
import logging
import time
import logging.config

start_time = time.time()
client1 =MongoClient('mongodb://user:pswd@portland.cs.ou.edu:port',maxPoolSize=5)
db1 = client1['lexisnexis']
wikientity = db1.wikientity
logging.basicConfig(filename='logging.python',level=logging.DEBUG)
import json

In [ ]:
with open('transfered_wikidata', 'r') as f:
    wikilist = json.load(f)
for wiki in wikilist:
    wiki["tagged"]=False

In [ ]:
for item in wikilist:
    wikientity.insert_one(item)

In [ ]:
wikientity.delete_many({ "taggingtime" : { "$exists" : False } })