In [1]:
from datetime import datetime,timedelta
from collections import defaultdict,Counter
from pprint import pprint
from tqdm import tqdm
import re
import pymongo
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
from pymongo.errors import BulkWriteError
In [2]:
from nltk.corpus import stopwords
In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import itertools
In [4]:
# NLTK English stop words, deduplicated; used below to filter candidate query
# keywords. NOTE(review): only membership tests are performed on this — a set
# would be O(1) per lookup; kept as a list to preserve existing behavior.
list_stopWords=list(set(stopwords.words('english')))
In [5]:
# Connect to the local MongoDB instance and grab the `tweet` database handle.
# NOTE(review): host is hardcoded — consider reading it from configuration.
client = pymongo.MongoClient('localhost:27017')
db = client.tweet
In [6]:
# NOTE(review): authentication is disabled here; the original commented-out line
# embedded a hardcoded username/password. If auth is re-enabled, read the
# credentials from environment variables — never commit them to the notebook.
In [7]:
import pandas as pd
import spacy
In [8]:
# Load spaCy's medium English model (word vectors are needed for .similarity()).
nlp = spacy.load('en_core_web_md')
In [9]:
# Pull every current-event document, projecting only the fields used below.
events = list(db.current_event.find(
    {},
    {'_id': 1, 'event.class': 1, 'event.date': 1, 'event.title': 1, 'event.description': 1},
))
In [10]:
# Peek at one raw document to confirm the projected structure.
events[0]
Out[10]:
In [11]:
# Flatten the nested `event` sub-documents into plain records for a DataFrame.
events = [
    {
        'id': e['_id'],
        'class': e['event']['class'],
        'date': e['event']['date'],
        'title': e['event']['title'],
        'description': e['event']['description'],
    }
    for e in events
]
In [12]:
# Tabulate the flattened event records for easier inspection and transforms.
df_events = pd.DataFrame.from_records(events)
In [13]:
# Sanity-check the first few rows.
df_events.head()
Out[13]:
In [14]:
def class_code(type_str):
    """Map a free-text event class label to a numeric category code.

    The label is lowercased and matched against keyword groups in a fixed
    priority order (earlier groups win); 0 is returned when nothing matches.

    Parameters:
        type_str: event class label, e.g. 'Armed conflicts and attacks'.

    Returns:
        int in 0..10 — the first matching category, 0 for "other".
    """
    type_str = type_str.lower()
    # Ordered (keywords, code) table — order matters and mirrors the original
    # if/elif chain, so overlapping keywords resolve identically.
    keyword_codes = [
        (('armed', 'attack', 'conflict'), 1),
        (('disaster', 'accident'), 2),
        (('law', 'crime'), 3),
        (('politic', 'election'), 4),
        (('international', 'relation'), 5),
        (('science', 'technology'), 6),
        (('business', 'econom'), 7),
        (('art', 'culture'), 8),
        (('sport',), 9),
        (('health', 'environment'), 10),
    ]
    for keywords, code in keyword_codes:
        if any(kw in type_str for kw in keywords):
            return code
    return 0
In [15]:
# Derive the numeric category for each event from its free-text class label.
df_events['class_code'] = df_events['class'].apply(class_code)
In [16]:
# Confirm the new class_code column.
df_events.head()
Out[16]:
In [17]:
def description_clean(description):
    """Strip the trailing source citation from an event description.

    Descriptions end with a citation of the form '. (Source)'; everything from
    the first such marker onward is dropped and the final period restored,
    e.g. 'A quake struck. (BBC)' -> 'A quake struck.'.

    Parameters:
        description: raw event description text.

    Returns:
        The description up to the first citation, ending with a single '.'.
    """
    head = description.split('. (')[0]
    # Fix: only re-append the period when it is actually missing — the original
    # unconditionally added '.', turning citation-free 'text.' into 'text..'.
    if not head.endswith('.'):
        head += '.'
    return head
In [18]:
# Citation-stripped descriptions become the input documents for query building.
df_events['des_clean'] = df_events['description'].apply(description_clean)
In [23]:
def class_similarity(class_text, span):
    """Word-vector similarity between the event class label and a text span."""
    class_doc = nlp(class_text)
    span_doc = nlp(span)
    return class_doc.similarity(span_doc)
In [27]:
def get_query(doc, class_text, doc_index, doc_date):
    """Build Twitter advanced-search query strings for one event.

    Keywords are collected from the event description: all named entities, the
    top-3 verb "triggers" ranked by similarity to the class text, and the head
    word of the most class-similar noun chunk. Every keyword pair is then
    combined with two date windows: [date-3d, date] and [date, date+1d].

    Parameters:
        doc: cleaned event description text.
        class_text: event class label used for similarity ranking.
        doc_index: row index of the event (unused here; kept for callers).
        doc_date: event date as a 'YYYY-MM-DD' string.

    Returns:
        List of query strings of the form '"kw1" "kw2" since:D1 until:D2'.
    """
    doc_date = datetime.strptime(doc_date, '%Y-%m-%d')
    date_0 = doc_date.strftime('%Y-%m-%d')
    date_0_ = (doc_date + timedelta(days=-3)).strftime('%Y-%m-%d')
    date_1 = (doc_date + timedelta(days=1)).strftime('%Y-%m-%d')
    date_1_ = date_0
    doc = nlp(doc)

    # Named entities are always keywords.
    kws = [ent.text for ent in doc.ents]

    # Candidate trigger verbs (non-stop-word, verb-tagged tokens).
    # Fix: similarities are kept as floats and sorted numerically — the
    # original compared str(float) lexicographically, which mis-orders values.
    triggers = []
    for token in doc:
        if (not token.is_stop and token.tag_.startswith('V')
                and token.text.lower() not in list_stopWords):
            triggers.append((token.text, token.tag_,
                             class_similarity(class_text, token.text)))
    triggers = sorted(triggers, key=lambda t: t[2], reverse=True)[:3]
    kws.extend(t[0] for t in triggers)

    # Head word of the noun chunk most similar to the class text, if any.
    # (Replaces a bare `except: pass` that silently swallowed the IndexError
    # raised when the description has no noun chunks.)
    noun_chunks = [(chunk.text, class_similarity(class_text, chunk.text))
                   for chunk in doc.noun_chunks]
    if noun_chunks:
        best_chunk = max(noun_chunks, key=lambda c: c[1])[0]
        kws.append(best_chunk.split(' ')[-1])

    # Drop stop words and duplicates (set order is arbitrary, as before).
    kws = list(set(w for w in kws if w not in list_stopWords))

    pairs = list(itertools.combinations(kws, 2))
    window_early = ' since:' + date_0_ + ' until:' + date_0
    window_late = ' since:' + date_1_ + ' until:' + date_1
    query = (['"' + a + '" "' + b + '"' + window_early for a, b in pairs]
             + ['"' + a + '" "' + b + '"' + window_late for a, b in pairs])
    # Leftover debug print(query) removed; callers collect the returned list.
    return query
In [28]:
# One list of query strings per event, aligned with df_events row order.
queries = []
In [ ]:
# Build the search queries for every event; df_events row order determines the
# position of each entry in `queries`.
for doc_index, event in df_events.iterrows():
    doc_date = event['date']
    doc_class = event['class']
    doc_title = event['title']  # kept for parity with original (unused below)
    doc = event['des_clean']
    class_text = doc_class.replace('and', '')
    query = get_query(doc, class_text, doc_index, doc_date)
    queries.append(query)
In [ ]:
# Collect the _id of every current-event document, preserving cursor order so
# positions line up with the `queries` list built above.
query = db.current_event.find({}, {'_id': 1})
ids = [record['_id'] for record in query]
In [ ]:
# One UpdateOne per event document: positions of `ids` (cursor order of the
# find() above) and `queries` (df_events row order) are assumed to line up —
# NOTE(review): this relies on both collections enumerating in the same order;
# verify if documents were inserted/removed between the two reads.
# Fix: pass total= so tqdm can render an actual progress bar over the generator.
requests = [UpdateOne({'_id': _id}, {'$set': {'queries': queries[index]}})
            for index, _id in tqdm(enumerate(ids), total=len(ids))]
try:
    result = db.current_event.bulk_write(requests)
    pprint(result.bulk_api_result)
except BulkWriteError as bwe:
    # Surface per-operation failure details instead of an opaque traceback.
    pprint(bwe.details)
In [ ]: