In [1]:
import pymongo
from pymongo import InsertOne,UpdateOne
from pymongo.errors import BulkWriteError
# Open a MongoDB connection and select the `tweet` database used by the cells below.
# NOTE(review): the server address is hardcoded here — consider reading it from
# configuration or an environment variable before sharing this notebook.
client = pymongo.MongoClient('101.132.114.125:27017')
db = client.tweet
In [2]:
from tqdm import tqdm
from datetime import datetime,timedelta
In [13]:
def get_ner_w(ner_dict):
    """Turn a NER result into two quoted, OR-joined search clauses.

    Parameters
    ----------
    ner_dict : dict
        Maps an entity label (e.g. ``'LOCATION'``) to an iterable of entity
        strings.

    Returns
    -------
    tuple of (str, str)
        ``(where_who, why_what)``. ``where_who`` collects the LOCATION, MISC,
        ORGANIZATION and PERSON entities; ``why_what`` collects the NUMBER,
        MONEY and DATE entities. Each clause has the form
        ``("a" OR "b" OR ...)``; a category with no entities yields ``()``.
        Entities under any other label are ignored.
    """
    where_who_labels = ('LOCATION', 'MISC', 'ORGANIZATION', 'PERSON')
    # 'DURATION' and 'ORDINAL' are intentionally not part of this group.
    why_what_labels = ('NUMBER', 'MONEY', 'DATE')

    where_who = []
    why_what = []
    # .items() instead of .iteritems(): available on both Python 2 and Python 3.
    for label, entities in ner_dict.items():
        if label in where_who_labels:
            where_who.extend(entities)
        elif label in why_what_labels:
            why_what.extend(entities)

    def _or_clause(terms):
        # Wrap every term in double quotes and join them into one parenthesised OR group.
        return '(' + ' OR '.join('"' + term + '"' for term in terms) + ')'

    return _or_clause(where_who), _or_clause(why_what)
In [14]:
def get_query_str(item):
    """Build the search query string for one event document.

    Reads ``item['event']['date']`` (formatted ``YYYY-MM-DD``) and
    ``item['ie']['ner_dict']``. The result is the two entity clauses produced
    by ``get_ner_w`` followed by ``since:`` / ``until:`` terms set to one day
    before and one day after the event date.
    """
    day_format = '%Y-%m-%d'
    one_day = timedelta(days=1)

    event_day = datetime.strptime(item['event']['date'], day_format)
    window_start = (event_day - one_day).strftime(day_format)
    window_end = (event_day + one_day).strftime(day_format)

    entity_clause, detail_clause = get_ner_w(item['ie']['ner_dict'])
    pieces = [entity_clause, detail_clause, 'since:' + window_start, 'until:' + window_end]
    return ' '.join(pieces)
In [15]:
# Preview the generated query string for the first 15 events of type 2.
# Only the two fields that get_query_str reads are fetched from MongoDB.
for item in (
    db.current_event
    .find({'type': 2}, {'event.date': 1, 'ie.ner_dict': 1})
    .limit(15)
):
    print(get_query_str(item))
In [17]:
# Python 2-only workaround: reload(sys) brings back sys.setdefaultencoding so the
# interpreter's default string encoding can be switched to UTF-8.
# NOTE(review): reload() is not a builtin on Python 3 and this hack is widely
# discouraged — confirm it is still required before keeping it.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
In [18]:
# Path of the word-vector text file loaded in the next cell.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere
# without editing it; consider a configurable data directory.
word2vec_dir = u'C:/Users/lxp/Desktop/nlp/事件识别与抽取/dataset/glove.840B.300d.word2vec.txt'
In [19]:
# Load the text-format (binary=False) word vectors into `wv`, which later cells query.
# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format — confirm against the installed gensim version.
from gensim.models.word2vec import Word2Vec
wv = Word2Vec.load_word2vec_format(word2vec_dir,binary=False)
In [21]:
# Ask the model for the words it considers most similar to "disaster".
wv.most_similar(positive=['disaster'])
Out[21]:
In [28]:
# Similarity score between two words.
# NOTE(review): 'Disater' looks like a misspelling of 'Disaster' — confirm whether it is intentional.
wv.similarity('Disater','hurricane')
Out[28]:
In [25]:
# Similarity score between "disaster" and "texas".
wv.similarity('disaster','texas')
Out[25]:
In [24]:
# Display `item`. In the visible cells it is only ever assigned as a loop variable,
# so this relies on whatever document the last executed loop left behind (hidden state).
item
Out[24]:
In [37]:
import re
In [44]:
def trigger_class_description(class_, des, top=1, model=None):
    """Return the highest similarity scores between an event class and a description.

    The keyword is the first word of ``class_`` after lower-casing and dropping
    the standalone word "and". It is compared with every whitespace-separated,
    lower-cased token of ``des`` through ``model.similarity(keyword, token)``.

    Parameters
    ----------
    class_ : str
        Event class label; only its first remaining word is used.
    des : str
        Text whose tokens are scored against the keyword.
    top : int, optional
        Number of best scores to return (default 1).
    model : object, optional
        Anything exposing ``similarity(word_a, word_b)``. Defaults to the
        notebook-level ``wv`` model.

    Returns
    -------
    list
        Up to ``top`` best scores in ascending order (best score last). Empty
        when ``top`` is not positive or when ``class_`` / ``des`` contain no
        usable words.

    Notes
    -----
    Errors raised by ``model.similarity`` (presumably for words missing from
    the model's vocabulary) are not caught here.
    """
    if model is None:
        model = wv  # fall back to the word-vector model loaded earlier in the notebook

    print(class_)
    # \b keeps "and" inside other words intact (e.g. "Scandals" is not mangled into "Scals").
    class_words = re.sub(r'\band\b', ' ', class_.lower()).split()
    if not class_words:
        return []
    keyword = class_words[0]
    print(keyword)

    # split() without an argument also discards the empty tokens produced by repeated spaces.
    scores = sorted(model.similarity(keyword, token) for token in des.lower().split())
    # Keep the last `top` entries; a slice ending at -1 would always drop the best score.
    return scores[max(len(scores) - top, 0):]
In [51]:
# Dump the first three events of type 2 in full, to inspect the document layout.
for item in db.current_event.find({'type': 2}).limit(3):
    print(item)
    # Disabled experiment: score the class label against title + description instead.
    #print trigger_class_description(item['event']['class'],item['event']['title']+' .'+item['event']['description'])
In [ ]:
def preprocess_title_description(doc):
    """Strip parenthesised word groups and periods from a text.

    Parameters
    ----------
    doc : str
        Text to clean, e.g. an event title followed by its description.

    Returns
    -------
    str
        ``doc`` with every ``(...)`` group made only of word characters and
        spaces removed (for example a trailing ``(ntdtv)``), and with every
        literal period removed.
    """
    # Raw strings keep the regex escapes from being read as string escapes.
    doc = re.sub(r'\([\w ]+\)', '', doc)
    doc = re.sub(r'\.', '', doc)
    # The cleaned text must be handed back; without this the function yields None.
    return doc
In [48]:
# Display `item` again; it holds whatever document the most recently executed loop
# left in the variable (hidden state — the value depends on cell execution order).
item
Out[48]:
In [63]:
# Similarity score between "kills" and "injure".
wv.similarity('kills','injure')
Out[63]:
In [65]:
def basic_cleaning2(string):
    """Replace digits and common punctuation with spaces and collapse space runs.

    Parameters
    ----------
    string : object
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The text with every digit and each of ``( ) ! ^ % $ ' " . ; , - / : < =
        > ? { } [ ]`` replaced by a space, and every run of spaces reduced to
        one space. Leading/trailing spaces are not stripped.
    """
    text = str(string)
    # Raw, explicit character class. ':', '<', '=' and '>' are listed on purpose:
    # they have always been removed by this function (they used to be covered by
    # a ',' to '?' range) and are kept so the output does not change.
    # NOTE(review): the backslash character is not in the set — confirm that is intended.
    text = re.sub(r"""[0-9()!^%$'".;,\-/:<=>?{}\[\]]""", ' ', text)
    return re.sub(' +', ' ', text)
In [66]:
# Try the cleaner on a sample event sentence that ends with a parenthesised tag.
basic_cleaning2('Six people are killed and ten others injured in Shenzhen after a space shuttle simulator ride plunged to the ground at a popular amusement park. (ntdtv)')
Out[66]:
In [ ]: