In [1]:
import json
# Read the whole input file and decode its JSON content into `wikilist`.
with open('generated_entries_v2', 'r') as source_file:
    wikilist = json.loads(source_file.read())
In [2]:
# Show how many top-level entries were loaded from 'generated_entries_v2'.
len(wikilist)
Out[2]:
In [3]:
# Display the entry at index 1 to inspect the layout of a loaded record.
wikilist[1]
Out[3]:
In [4]:
# Lookup table: English month name -> zero-padded two-digit month number
# ("January" -> "01", ..., "December" -> "12").
monthdic = {
    month_name: "%02d" % month_number
    for month_number, month_name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1
    )
}
In [7]:
def changeEnglishToEng():
    """Rename the "english" key to "en" in the first wiki_roles entry of every record.

    Works in place on the module-level ``wikilist``. A record that cannot be
    processed (for example because the "english" key is absent) is left as it
    is; the error and the record's 1-based position are printed and the loop
    moves on to the next record.
    """
    for position, record in enumerate(wikilist, start=1):
        try:
            first_entry = record['wiki_roles'][0]
            # pop() runs before the assignment, so nothing is written on failure.
            first_entry['en'] = first_entry.pop("english")
        except Exception as err:
            print(err)
            print(position)
# Apply the "english" -> "en" key rename to every record in wikilist (in place).
changeEnglishToEng()
In [10]:
def changeRoleToTitle():
    """Rename the "role" key to "title" in every role dict of every record.

    For each record in the module-level ``wikilist`` the role lists stored
    under "en" and then "arabic" in the first wiki_roles entry are walked and
    each role dict is modified in place. The first error inside a record stops
    the processing of that record (roles already renamed stay renamed); the
    error and the record's 1-based position are printed and the next record is
    handled.
    """
    for position, record in enumerate(wikilist, start=1):
        try:
            for language in ("en", "arabic"):
                for role_entry in record['wiki_roles'][0][language]:
                    role_entry['title'] = role_entry.pop("role")
        except Exception as err:
            print(err)
            print(position)
# Apply the "role" -> "title" key rename to every record in wikilist (in place).
changeRoleToTitle()
In [12]:
# Display the entry at index 101 as a spot check of the records.
wikilist[101]
Out[12]:
In [17]:
def transferDate(date):
    """Convert a space-separated textual date into a "YYYY-MM-DD" string.

    The input is split on single spaces. Token 0 is never read, so the
    meaningful parts start at token 1:

    * 2 tokens -> token 1 is the year; month and day default to "01".
    * 3 tokens -> token 1 is the month name, token 2 the year; day is "01".
    * 4 tokens -> either "<month> <day> <year>" or "<day> <month> <year>";
      whichever of tokens 1/2 is not purely digits is taken as the month name.

    The day is normalised to two digits (a trailing comma is dropped and the
    value is zero-padded) so that the result matches the "-01" padding used by
    the other branches, e.g. "5" or "5," becomes "05".

    Parameters:
        date: the textual date (a string).

    Returns:
        The converted date string, or "" when the token count is not 2, 3 or 4,
        or when a 4-token date has digits in both token 1 and token 2.

    Raises:
        KeyError: if the month name is not a key of ``monthdic``.
    """
    parts = date.split(" ")
    token_count = len(parts)
    if token_count == 2:
        return parts[1] + "-01-01"
    if token_count == 3:
        return parts[2] + "-" + monthdic[parts[1]] + "-01"
    if token_count == 4:
        # Sometimes the order is month-day-year, sometimes day-month-year.
        if not parts[1].isdigit():
            month_name, day = parts[1], parts[2]
        elif not parts[2].isdigit():
            month_name, day = parts[2], parts[1]
        else:
            # Both candidates are numeric: no month name to look up.
            return ""
        # Look the month up before touching the day so an unknown month still
        # surfaces as KeyError exactly as before.
        month_number = monthdic[month_name]
        return parts[3] + "-" + month_number + "-" + day.rstrip(",").zfill(2)
    return ""
In [18]:
def changeAllDate(wiki):
    """Rewrite the start/end dates of every role of one record via transferDate.

    The role lists stored under "en" and then "arabic" in the first wiki_roles
    entry of ``wiki`` are processed in that order; for each role dict the
    "end_date" is converted first, then the "start_date". The record is
    modified in place and nothing is returned. Errors are not caught here.
    """
    for language in ('en', 'arabic'):
        for role_entry in wiki["wiki_roles"][0][language]:
            for date_key in ('end_date', 'start_date'):
                role_entry[date_key] = transferDate(role_entry[date_key])
In [19]:
# Convert the dates of every record in wikilist. A failure in one record does
# not stop the run: the error is printed followed by the record's 1-based
# position (same reporting as the key-rename cells), so the failing record can
# be diagnosed. A record that fails part-way keeps the dates already converted.
count = 0  # stays 0 when wikilist is empty, otherwise ends as the record count
for count, element in enumerate(wikilist, start=1):
    try:
        changeAllDate(element)
    except Exception as e:
        print(e)
        print(count)
In [27]:
# Remove the "namesnew" field from every record. A default is passed to pop()
# so records without the key (or a second run of this cell) are skipped instead
# of raising KeyError and leaving the list half-processed.
for wiki in wikilist:
    wiki.pop("namesnew", None)
In [34]:
# Copy each record's names and source links into its first wiki_roles entry.
# Every row is (destination key, source field, language key); the value is read
# from the first element of the source field. Rows are applied in order.
for wiki in wikilist:
    for dest_key, src_field, src_lang in (
        ("arabicname", "names", "arabic"),
        ("englishname", "names", "english"),
        ("englishlink", "harvested_from", "english"),
        ("arabiclink", "harvested_from", "arabic"),
    ):
        wiki["wiki_roles"][0][dest_key] = wiki[src_field][0][src_lang]
In [35]:
# Display the first entry to inspect the record after the transformations.
wikilist[0]
Out[35]:
In [36]:
# Serialise the transformed records as JSON into 'transfered_wikidata',
# overwriting any existing file of that name.
with open('transfered_wikidata', 'w') as target_file:
    json.dump(obj=wikilist, fp=target_file)
In [ ]:
from pymongo import MongoClient
import pymongo
import pickle
import logging
import time
import logging.config
import os

start_time = time.time()
# The connection string embeds credentials, so it can be supplied through the
# MONGO_URI environment variable instead of being edited into the notebook.
# The original literal is kept as the fallback when the variable is not set.
mongo_uri = os.environ.get('MONGO_URI', 'mongodb://user:pswd@portland.cs.ou.edu:port')
client1 = MongoClient(mongo_uri, maxPoolSize=5)
db1 = client1['lexisnexis']
# Collection that receives the records.
wikientity = db1.wikientity
# Send log records at DEBUG level and above to the file 'logging.python'.
logging.basicConfig(filename='logging.python', level=logging.DEBUG)
import json
In [ ]:
# Reload the records written to 'transfered_wikidata' and mark each one as not
# yet tagged.
with open('transfered_wikidata', 'r') as saved_file:
    wikilist = json.loads(saved_file.read())
for entry in wikilist:
    entry["tagged"] = False
In [ ]:
# Insert all records with one batched call instead of one network round-trip
# per record. The default ordered mode inserts in list order and stops at the
# first failure. insert_many() rejects an empty list, so the call is skipped
# when there is nothing to insert (the per-record loop was a no-op then too).
if wikilist:
    wikientity.insert_many(wikilist)
In [ ]:
# Delete every document in the collection that has no "taggingtime" field.
# NOTE(review): the records prepared in this notebook only receive a "tagged"
# flag and no "taggingtime", so this filter appears to match them as well;
# presumably a manual cleanup/rollback step. Confirm before running.
wikientity.delete_many({ "taggingtime" : { "$exists" : False } })