In [1]:
import pickle

In [2]:
with open("arabicEntity.local","rb") as f:
    arabic_senlist=pickle.load(f)

In [7]:
import pickle
with open("stuff_makesense_spacy_ner","rb") as f1:
    spacy_entity=pickle.load(f1)

In [25]:
# check the label assigned to the first extracted entity
for key, value in spacy_entity[0].items():
    print(value == 'PER')


True

In [10]:
# peek at the first entity and its first sentence ID
count = 0
for key, value in arabic_senlist.items():
    count = count + 1
    if count < 2:
        print(key, value[0])
    else:
        break


الفترة 58e0455306036312a1dfbeb9

In [15]:
test=[0,1,2,3]

In [18]:
test[0:8]


Out[18]:
[0, 1, 2, 3]

Plan: attach up to five sentence IDs to each entity. As the slice test above shows, taking [0:5] is safe even when an entity appears in fewer than five sentences.


In [14]:
# build dictionaries mapping PER and ORG entities to their sentence IDs

In [59]:
%%time
# map each PER/ORG entity to at most five of the sentence IDs it appears in;
# count tallies stored mentions, and duplicate entity strings overwrite the
# same key, so count can exceed the final dictionary sizes
person_sentences_dic = {}
org_sentences_dic = {}
count = 0
for entity in spacy_entity:
    for key, value in entity.items():
        if value == 'PER':
            target = person_sentences_dic
        elif value == 'ORG':
            target = org_sentences_dic
        else:
            continue
        try:
            target[key] = arabic_senlist[key][0:5]
        except KeyError:
            # entity has no indexed sentences in arabic_senlist; skip it
            continue
        count = count + 1
        if count % 5000 == 0:
            print(str(count) + " processed")


5000 processed
10000 processed
15000 processed
20000 processed
25000 processed
30000 processed
35000 processed
40000 processed
45000 processed
50000 processed
55000 processed
60000 processed
65000 processed
70000 processed
75000 processed
80000 processed
85000 processed
90000 processed
95000 processed
100000 processed
105000 processed
110000 processed
115000 processed
120000 processed
125000 processed
130000 processed
135000 processed
140000 processed
145000 processed
150000 processed
155000 processed
160000 processed
165000 processed
170000 processed
175000 processed
180000 processed
185000 processed
190000 processed
CPU times: user 638 ms, sys: 14.9 ms, total: 653 ms
Wall time: 648 ms

In [47]:
len(person_sentences_dic)


Out[47]:
111316

In [34]:
# spot-check: the first four sentence IDs for the first extracted entity
count = 0
for entity in spacy_entity:
    count = count + 1
    if count < 2:
        for key, value in entity.items():
            print(arabic_senlist[key][0:4])
    else:
        break


[ObjectId('58e0455306036312a1dfbf00'), ObjectId('58e0455306036312a1dfbf0c'), ObjectId('58e0455306036312a1dfbf0d'), ObjectId('58e0455306036312a1dfbf1a')]

In [64]:
# print the first two entries of the person dictionary
count = 0
for key, value in person_sentences_dic.items():
    count = count + 1
    if count < 3:
        print(key)
        print(value)
    else:
        break


العام
[ObjectId('58e0455306036312a1dfbf00'), ObjectId('58e0455306036312a1dfbf0c'), ObjectId('58e0455306036312a1dfbf0d'), ObjectId('58e0455306036312a1dfbf1a'), ObjectId('58e0455306036312a1dfbf28')]
الرئيس
[ObjectId('58e0455306036312a1dfbedb'), ObjectId('58e0455306036312a1dfbef5'), ObjectId('58e0455306036312a1dfbf23'), ObjectId('58e0455306036312a1dfbf31'), ObjectId('58e0455406036312a1dfbf3c')]

In [76]:
# verify that each stored word actually occurs in a sentence referenced by its sentence IDs

In [77]:
sentence="ويقول الادعاء العام الايطالي إن الإمام المصري اختطف على أيدي عملاء السي.آي.إيه الذين نقلوه إلى قاعدة «آفيانو» الجوية والتي تقيم فيها وحدات إيطالية واميركية، ثم نقل إلى ألمانيا وبعدها إلى مصر، حيث تعرض للتعذيب، بحسب قوله."

In [78]:
word="العام"

In [79]:
word in sentence


Out[79]:
True
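
The check above is manual. Below is a minimal sketch of automating it for a sample of entities; it assumes the MongoDB connection set up later in this notebook, and the collection name sentences and text field text are hypothetical, since the sentence schema is not shown here.

# hypothetical spot-check: fetch each referenced sentence and confirm
# the entity string occurs in it (collection/field names are assumptions)
for word, sids in list(person_sentences_dic.items())[:10]:
    for sid in sids:
        doc = db['sentences'].find_one({"_id": sid})
        if doc is not None and word not in doc['text']:
            print("mismatch:", word, sid)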

In [65]:
import pickle
try:
    with open("per_entities_sentences_dic.data",'wb') as f:
        pickle.dump(person_sentences_dic,f,pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print(e)
    pass

In [66]:
import pickle
try:
    with open("org_entities_sentences_dic.data",'wb') as f:
        pickle.dump(org_sentences_dic,f,pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print(e)
    pass
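
As a quick round-trip check, the pickle just written can be reloaded and compared against the in-memory dictionary; a minimal sketch using the file name from the cell above:

# reload the pickled person dictionary and confirm it round-trips intact
with open("per_entities_sentences_dic.data", "rb") as f:
    per_check = pickle.load(f)
print(len(per_check) == len(person_sentences_dic))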

In [80]:
from pymongo import MongoClient
import pickle
# connect to the lexisnexis database on portland.cs.ou.edu
client = MongoClient('mongodb://portland.cs.ou.edu')
db = client['lexisnexis']
table = db["fast_per_entities"]

In [82]:
len(person_sentences_dic)


Out[82]:
111316

In [85]:
# write the person dictionary to MongoDB, one document per entity
count = 0
for key, value in person_sentences_dic.items():
    count = count + 1
    if count % 5000 == 0:
        print(count)
    table.insert_one({"word": key, "sentenceids": value})


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
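
Inserting one document per call costs a network round trip each, roughly 111k round trips here. A sketch of a batched alternative using pymongo's insert_many; the batch size of 1000 is an arbitrary choice:

# batch the documents to cut network round trips
docs = [{"word": k, "sentenceids": v} for k, v in person_sentences_dic.items()]
for i in range(0, len(docs), 1000):
    table.insert_many(docs[i:i + 1000])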

In [86]:
len(org_sentences_dic)


Out[86]:
7864

In [ ]:
table = db["fast_org_entities"]
# insert_one replaces the deprecated Collection.insert
for key, value in org_sentences_dic.items():
    table.insert_one({"word": key, "sentenceids": value})

In [ ]: