In [1]:
import pymongo
from pymongo import InsertOne,UpdateOne
from pymongo.errors import BulkWriteError
client = pymongo.MongoClient('101.132.114.125:27017')
db = client.tweet

In [2]:
from tqdm import tqdm
from datetime import datetime,timedelta
classes = []
for i in db.current_event.find({}, {'event.class': 1}):
    classes.append(i['event']['class'])
len(classes)
from collections import Counter
count = Counter(classes)
def is_type_one(type_str):
    type_str = type_str.lower()
    return 'armed' in type_str or 'attack' in type_str or 'conflict' in type_str

def is_type_two(type_str):
    type_str = type_str.lower()
    return 'disaster' in type_str or 'accident' in type_str

def is_type_three(type_str):
    type_str = type_str.lower()
    return 'law' in type_str or 'crime' in type_str

def is_type_four(type_str):
    type_str = type_str.lower()
    return 'politic' in type_str or 'election' in type_str

def is_type_five(type_str):
    type_str = type_str.lower()
    return 'international' in type_str or 'relation' in type_str

def get_type(type_str):
    type_str = type_str.lower()
    if 'armed' in type_str or 'attack' in type_str or 'conflict' in type_str:
        return 1
    elif 'disaster' in type_str or 'accident' in type_str:
        return 2
    elif 'law' in type_str or 'crime' in type_str:
        return 3
    elif 'politic' in type_str or 'election' in type_str:
        return 4
    elif 'international' in type_str or 'relation' in type_str:
        return 5
    elif 'science' in type_str or 'technology' in type_str:
        return 6
    elif 'business' in type_str or 'econom' in type_str:
        return 7
    elif 'art' in type_str or 'culture' in type_str:
        return 8
    elif 'sport' in type_str:
        return 9
    elif 'health' in type_str or 'environment' in type_str:  # lower-case: type_str is already lower-cased
        return 10
    else:
        return 0
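# Quick sanity check (not part of the original run): get_type on the class string
# that appears later in this notebook, plus a made-up label to show the fallback.
print get_type('Disasters and accidents')  # -> 2
print get_type('Weekly summary')           # -> 0, no keyword matches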
for i in db.current_event.find({}, {'event.class': 1}):
    db.current_event.find_one_and_update({'_id': i['_id']},
                                         {'$set': {'type': get_type(i['event']['class'])}})
from pprint import pprint

requests = [UpdateOne({'_id': i['_id']}, {'$set': {'type': get_type(i['event']['class'])}})
            for i in tqdm(db.current_event.find({}, {'event.class': 1}))]
try:
    result = db.current_event.bulk_write(requests)
    pprint(result.bulk_api_result)
except BulkWriteError as bwe:
    pprint(bwe.details)
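# Hedged follow-up (not run in the original notebook): inspect the resulting
# type distribution with the same Counter idiom used above.
type_count = Counter(i['type'] for i in db.current_event.find({}, {'type': 1}))
type_count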
NER tags available in ie.ner_dict: LOCATION, MISC, ORGANIZATION, PERSON, NUMBER, MONEY, DATE, DURATION, ORDINAL

In [13]:
def get_ner_w(ner_dict):
    where_who = []
    why_what = []
    for k,v in ner_dict.iteritems():
        if k in ['LOCATION','MISC','ORGANIZATION','PERSON']:
            where_who.extend(v)
        elif k in ['NUMBER','MONEY','DATE']:#,'DURATION']:#,'ORDINAL']:
            why_what.extend(v)
        else:
            pass
    where_who = ['"'+i+'"' for i in where_who]
    where_who = '('+' OR '.join(where_who)+')'
    why_what = ['"'+i+'"' for i in why_what] 
    why_what = '('+' OR '.join(why_what)+')'
    return where_who,why_what

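A small usage sketch of get_ner_w (not part of the original run), built from the ner_dict of the Dalian item shown further down; the OR order inside each group depends on dict iteration order.

sample_ner = {u'LOCATION': [u'Dalian'], u'MISC': [u'Chinese'], u'NUMBER': [u'50', u'two']}
where_who, why_what = get_ner_w(sample_ner)
print where_who  # e.g. ("Dalian" OR "Chinese")
print why_what   # e.g. ("50" OR "two")
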
In [14]:
def get_query_str(item):
    date = datetime.strptime(item['event']['date'],'%Y-%m-%d')
    since = (date+timedelta(days=-1)).strftime('%Y-%m-%d')
    until = (date+timedelta(days=1)).strftime('%Y-%m-%d')
    ner_dict = item['ie']['ner_dict']
    where_who,why_what = get_ner_w(ner_dict)
    return where_who+' '+why_what+' since:'+since+' until:'+until

In [15]:
for item in db.current_event.find({'type':2},{'event.date':1,'ie.ner_dict':1}).limit(15):
    print get_query_str(item)


("Hurricane Alex" OR "Atlantic" OR "Texas" OR "Mexico") ("2010" OR "165" OR "2" OR "105") since:2010-06-30 until:2010-07-02
("BP" OR "Endangered Species Act" OR "United States" OR "Gulf of Mexico") () since:2010-06-30 until:2010-07-02
("Shenzhen") ("Six" OR "ten") since:2010-06-30 until:2010-07-02
("Russian") () since:2010-08-02 until:2010-08-04
("Iraqi" OR "Sulaimaniya") ("29" OR "21") since:2010-07-15 until:2010-07-17
("Israeli" OR "Palestinians" OR "West Bank") () since:2010-07-16 until:2010-07-18
("Southern China" OR "Philippines" OR "Vietnam" OR "Hai Phong") ("65") since:2010-07-16 until:2010-07-18
("Gazan" OR "Israel") () since:2010-07-16 until:2010-07-18
("Dalian") ("2,000" OR "two") since:2010-07-16 until:2010-07-18
("Greece") ("summer") since:2010-07-16 until:2010-07-18
("Pukë" OR "Albania") ("12" OR "14") since:2010-07-17 until:2010-07-19
("United Nations" OR "Israel" OR "Gaza") ("December 2008" OR "225") since:2010-07-17 until:2010-07-19
("New Britain" OR "Papua New Guinea") ("7.3" OR "two") since:2010-07-17 until:2010-07-19
("India" OR "Birbhum" OR "West Bengal") ("Two" OR "50") since:2010-07-18 until:2010-07-20
("Chinese" OR "Dalian") ("50" OR "two") since:2010-07-18 until:2010-07-20

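If the generated queries need to be reused, one option is to write them back to the documents with the same bulk UpdateOne pattern used for the type field. This is only a sketch, not something run above, and query_str is a hypothetical field name.

requests = [UpdateOne({'_id': item['_id']},
                      {'$set': {'query_str': get_query_str(item)}})  # 'query_str' is a hypothetical field
            for item in tqdm(db.current_event.find({'type': 2}, {'event.date': 1, 'ie.ner_dict': 1}))]
if requests:
    db.current_event.bulk_write(requests)
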
In [17]:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

In [18]:
word2vec_dir = u'C:/Users/lxp/Desktop/nlp/事件识别与抽取/dataset/glove.840B.300d.word2vec.txt'

In [19]:
from gensim.models.word2vec import Word2Vec
wv = Word2Vec.load_word2vec_format(word2vec_dir,binary=False)
wv.wv['disaster']

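Word2Vec.load_word2vec_format is deprecated and was removed in later gensim releases; on a newer gensim, the equivalent load of the same converted GloVe file would be the following sketch (not what was run above).

from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format(word2vec_dir, binary=False)
wv['disaster']  # KeyedVectors is indexed directly; no .wv attribute needed
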
In [21]:
wv.most_similar(positive=['disaster'])


Out[21]:
[(u'disasters', 0.8176724910736084),
 (u'catastrophe', 0.7899595499038696),
 (u'catastrophic', 0.731852650642395),
 (u'calamity', 0.68067467212677),
 (u'crisis', 0.6719868779182434),
 (u'Disaster', 0.6718685626983643),
 (u'aftermath', 0.6681604981422424),
 (u'devastation', 0.6608259677886963),
 (u'tsunami', 0.6606495380401611),
 (u'tragedy', 0.6545232534408569)]

In [28]:
wv.similarity('Disater','hurricane')


Out[28]:
0.12740960260070727

In [25]:
wv.similarity('disaster','texas')


Out[25]:
0.17239310841106883

In [24]:
item


Out[24]:
{u'_id': ObjectId('59fbc9fa60b18848c5a4d015'),
 u'event': {u'date': u'2010-07-19'},
 u'ie': {u'ner_dict': {u'LOCATION': [u'Dalian'],
   u'MISC': [u'Chinese'],
   u'NUMBER': [u'50', u'two']}}}

In [37]:
import re

In [44]:
def trigger_class_description(class_,des,top=1):
    print class_
    class_ = re.sub('and','',class_)
    class_ = class_.lower().split(' ')[0]
    print class_
    des = des.lower().split(' ')
    sims = [wv.similarity(class_,i) for i in des]
    sims.sort()
    return sims[-top:]  # the `top` highest class/word similarity scores

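A usage sketch for trigger_class_description on the Shenzhen event shown below. It assumes every lower-cased description word is in the GloVe vocabulary; out-of-vocabulary tokens would raise a KeyError, so real input should first go through a cleaner such as basic_cleaning2 further down.

trigger_class_description('Disasters and accidents',
                          'Six people are killed and ten others injured in Shenzhen')
# prints the raw class and the derived trigger word 'disasters',
# then returns the `top` highest similarity scores
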
In [51]:
for item in db.current_event.find({'type':2}).limit(3):
    print item
    #print trigger_class_description(item['event']['class'],item['event']['title']+' .'+item['event']['description'])

In [ ]:
def preprocess_title_description(doc):
    doc = re.sub('\([\w ]+\)', '', doc)  # drop parenthesised source tags such as "(ntdtv)"
    doc = re.sub('\.', '', doc)
    return doc

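A quick check of preprocess_title_description on the Shenzhen description; the parenthesised source tag and the period are stripped.

preprocess_title_description('Six people are killed and ten others injured in Shenzhen '
                             'after a space shuttle simulator ride plunged to the ground '
                             'at a popular amusement park. (ntdtv)')
# -> the same sentence without '.' and without '(ntdtv)'
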
In [48]:
item


Out[48]:
{u'_id': ObjectId('59fbc9fa60b18848c5a4ce63'),
 u'event': {u'class': u'Disasters and accidents',
  u'date': u'2010-07-01',
  u'description': u'Six people are killed and ten others injured in Shenzhen after a space shuttle simulator ride plunged to the ground at a popular amusement park. (ntdtv)',
  u'title': u''},
 u'ie': {u'ner_dict': {u'LOCATION': [u'Shenzhen'],
   u'NUMBER': [u'Six', u'ten']},
  u'openie': {u'object': u'Shenzhen',
   u'relation': u'are',
   u'subject': u'people ten others'}},
 u'type': 2}

In [63]:
wv.similarity('kills','injure')


Out[63]:
0.40956288158834531

In [65]:
def basic_cleaning2(string):
    string = str(string)
    string = re.sub('[0-9\(\)\!\^\%\$\'\"\.;,-\?\{\}\[\]\\/]', ' ', string)
    string = re.sub(' +', ' ', string)
    return string

In [66]:
basic_cleaning2('Six people are killed and ten others injured in Shenzhen after a space shuttle simulator ride plunged to the ground at a popular amusement park. (ntdtv)')


Out[66]:
'Six people are killed and ten others injured in Shenzhen after a space shuttle simulator ride plunged to the ground at a popular amusement park ntdtv '

In [ ]: