In [1]:

    
import pymongo
from pymongo import InsertOne,UpdateOne
from pymongo.errors import BulkWriteError
client = pymongo.MongoClient('101.132.114.125:27017')
db = client.tweet



In [2]:

    
from tqdm import tqdm
from datetime import datetime,timedelta

classses = [] for i in db.current_event.find({},{'event.class':1}): classses.append(i['event']['class']) len(classses) from collections import Counter count = Counter(classses)

def is_type_one(type_str): type_str = type_str.lower() return 'armed' in type_str or 'attack' in type_str or 'conflict' in type_str def is_type_two(type_str): type_str = type_str.lower() return 'disaster' in type_str or 'accident' in type_str def is_type_three(type_str): type_str = type_str.lower() return 'law' in type_str or 'crime' in type_str def is_type_four(type_str): type_str = type_str.lower() return 'politic' in type_str or 'election' in type_str def is_type_five(type_str): type_str = type_str.lower() return 'international' in type_str or 'relation' in type_str def get_type(type_str): type_str = type_str.lower() if 'armed' in type_str or 'attack' in type_str or 'conflict' in type_str: return 1 elif 'disaster' in type_str or 'accident' in type_str: return 2 elif 'law' in type_str or 'crime' in type_str: return 3 elif 'politic' in type_str or 'election' in type_str: return 4 elif 'international' in type_str or 'relation' in type_str: return 5 elif 'science' in type_str or 'technology' in type_str: return 6 elif 'business' in type_str or 'econom' in type_str: return 7 elif 'art' in type_str or 'culture' in type_str: return 8 elif 'sport' in type_str: return 9 elif 'Health' in type_str or 'environment' in type_str: return 10 else: return 0

for i in db.current_event.find({},{'event.class':1}): db.current_event.find_one_and_update({'_id': i['_id']}, {'$set': {'type':get_type(i['event']['class'])}})

requests = [UpdateOne({'_id': i['_id']}, {'$set': {'type':get_type(i['event']['class'])}}) for i in tqdm(db.current_event.find({},{'event.class':1}))] try: result = db.current_event.bulk_write(requests) pprint(result.bulk_api_result) except BulkWriteError as bwe: pprint(bwe.details)

LOCATION MISC ORGANIZATION PERSON NUMBER MONEY DATE DURATION ORDINAL



In [13]:

    
def get_ner_w(ner_dict):
    where_who = []
    why_what = []
    for k,v in ner_dict.iteritems():
        if k in ['LOCATION','MISC','ORGANIZATION','PERSON']:
            where_who.extend(v)
        elif k in ['NUMBER','MONEY','DATE']:#,'DURATION']:#,'ORDINAL']:
            why_what.extend(v)
        else:
            pass
    where_who = ['"'+i+'"' for i in where_who]
    where_who = '('+' OR '.join(where_who)+')'
    why_what = ['"'+i+'"' for i in why_what] 
    why_what = '('+' OR '.join(why_what)+')'
    return where_who,why_what



In [14]:

    
def get_query_str(item):
    date = datetime.strptime(item['event']['date'],'%Y-%m-%d')
    since = (date+timedelta(days=-1)).strftime('%Y-%m-%d')
    until = (date+timedelta(days=1)).strftime('%Y-%m-%d')
    ner_dict = item['ie']['ner_dict']
    where_who,why_what = get_ner_w(ner_dict)
    return where_who+' '+why_what+' since:'+since+' until:'+until

l=en



In [15]:

    
for item in db.current_event.find({'type':2},{'event.date':1,'ie.ner_dict':1}).limit(15):
    print get_query_str(item)









    



("Hurricane Alex" OR "Atlantic" OR "Texas" OR "Mexico") ("2010" OR "165" OR "2" OR "105") since:2010-06-30 until:2010-07-02
("BP" OR "Endangered Species Act" OR "United States" OR "Gulf of Mexico") () since:2010-06-30 until:2010-07-02
("Shenzhen") ("Six" OR "ten") since:2010-06-30 until:2010-07-02
("Russian") () since:2010-08-02 until:2010-08-04
("Iraqi" OR "Sulaimaniya") ("29" OR "21") since:2010-07-15 until:2010-07-17
("Israeli" OR "Palestinians" OR "West Bank") () since:2010-07-16 until:2010-07-18
("Southern China" OR "Philippines" OR "Vietnam" OR "Hai Phong") ("65") since:2010-07-16 until:2010-07-18
("Gazan" OR "Israel") () since:2010-07-16 until:2010-07-18
("Dalian") ("2,000" OR "two") since:2010-07-16 until:2010-07-18
("Greece") ("summer") since:2010-07-16 until:2010-07-18
("Pukë" OR "Albania") ("12" OR "14") since:2010-07-17 until:2010-07-19
("United Nations" OR "Israel" OR "Gaza") ("December 2008" OR "225") since:2010-07-17 until:2010-07-19
("New Britain" OR "Papua New Guinea") ("7.3" OR "two") since:2010-07-17 until:2010-07-19
("India" OR "Birbhum" OR "West Bengal") ("Two" OR "50") since:2010-07-18 until:2010-07-20
("Chinese" OR "Dalian") ("50" OR "two") since:2010-07-18 until:2010-07-20



In [17]:

    
import sys
reload(sys)
sys.setdefaultencoding('utf-8')



In [18]:

    
word2vec_dir = u'C:/Users/lxp/Desktop/nlp/事件识别与抽取/dataset/glove.840B.300d.word2vec.txt'



In [19]:

    
from gensim.models.word2vec import Word2Vec
wv = Word2Vec.load_word2vec_format(word2vec_dir,binary=False)

wv.wv['disaster']



In [21]:

    
wv.most_similar(positive=['disaster'])









    Out[21]:





[(u'disasters', 0.8176724910736084),
 (u'catastrophe', 0.7899595499038696),
 (u'catastrophic', 0.731852650642395),
 (u'calamity', 0.68067467212677),
 (u'crisis', 0.6719868779182434),
 (u'Disaster', 0.6718685626983643),
 (u'aftermath', 0.6681604981422424),
 (u'devastation', 0.6608259677886963),
 (u'tsunami', 0.6606495380401611),
 (u'tragedy', 0.6545232534408569)]



In [28]:

    
wv.similarity('Disater','hurricane')









    Out[28]:





0.12740960260070727



In [25]:

    
wv.similarity('disaster','texas')









    Out[25]:





0.17239310841106883



In [24]:

    
item









    Out[24]:





{u'_id': ObjectId('59fbc9fa60b18848c5a4d015'),
 u'event': {u'date': u'2010-07-19'},
 u'ie': {u'ner_dict': {u'LOCATION': [u'Dalian'],
   u'MISC': [u'Chinese'],
   u'NUMBER': [u'50', u'two']}}}



In [37]:

    
import re



In [44]:

    
def trigger_class_description(class_,des,top=1):
    print class_
    class_ = re.sub('and','',class_)
    class_ = class_.lower().split(' ')[0]
    print class_
    des = des.lower().split(' ')
    sims = [wv.similarity(class_,i) for i in des]
    sims.sort()
    return sims[-top:-1]



In [51]:

    
for item in db.current_event.find({'type':2}).limit(3):
    print item
    #print trigger_class_description(item['event']['class'],item['event']['title']+' .'+item['event']['description'])



In [ ]:

    
def preprocess_title_description(doc):
    doc = re.sub('\([\w ]+\)','',doc)
    doc = re.sub('\.','',doc)



In [48]:

    
item









    Out[48]:





{u'_id': ObjectId('59fbc9fa60b18848c5a4ce63'),
 u'event': {u'class': u'Disasters and accidents',
  u'date': u'2010-07-01',
  u'description': u'Six people are killed and ten others injured in Shenzhen after a space shuttle simulator ride plunged to the ground at a popular amusement park. (ntdtv)',
  u'title': u''},
 u'ie': {u'ner_dict': {u'LOCATION': [u'Shenzhen'],
   u'NUMBER': [u'Six', u'ten']},
  u'openie': {u'object': u'Shenzhen',
   u'relation': u'are',
   u'subject': u'people ten others'}},
 u'type': 2}



In [63]:

    
wv.similarity('kills','injure')









    Out[63]:





0.40956288158834531



In [65]:

    
def basic_cleaning2(string):
    string = str(string)
    string = re.sub('[0-9\(\)\!\^\%\$\'\"\.;,-\?\{\}\[\]\\/]', ' ', string)
    string = re.sub(' +', ' ', string)
    return string



In [66]:

    
basic_cleaning2('Six people are killed and ten others injured in Shenzhen after a space shuttle simulator ride plunged to the ground at a popular amusement park. (ntdtv)')









    Out[66]:





'Six people are killed and ten others injured in Shenzhen after a space shuttle simulator ride plunged to the ground at a popular amusement park ntdtv '



In [ ]: