In [1]:

    
from datetime import datetime,timedelta
from collections import defaultdict,Counter
from pprint import pprint
from tqdm import tqdm
import re

import pymongo
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
from pymongo.errors import BulkWriteError



In [2]:

    
from nltk.corpus import stopwords



In [3]:

    
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import itertools



In [4]:

    
# NLTK's English stop words, de-duplicated through a set; the resulting list order is not guaranteed.
list_stopWords=list(set(stopwords.words('english')))



In [5]:

    
# Client for the MongoDB server at localhost:27017; `db` is its `tweet` database.
client = pymongo.MongoClient('localhost:27017')
db = client.tweet



In [6]:

    
# db.authenticate('<user>', '<password>')  # credentials redacted; read them from the environment rather than hardcoding



In [7]:

    
import pandas as pd
import spacy



In [8]:

    
# spaCy pipeline shared by the cells below for entity/noun-chunk extraction and similarity scoring.
nlp = spacy.load('en_core_web_md')



In [9]:

    
# Pull every document from `current_event`, projecting only the fields used below.
events = list(
    db.current_event.find(
        {},
        {
            '_id': 1,
            'event.class': 1,
            'event.date': 1,
            'event.title': 1,
            'event.description': 1,
        },
    )
)



In [10]:

    
# Show the first fetched document to check its nested `event` structure.
events[0]









    Out[10]:





{'_id': ObjectId('59fbc9fa60b18848c5a4ce5a'),
 'event': {'class': 'Armed conflicts and attacks',
  'date': '2010-07-01',
  'description': 'In response to a rocket attack on southern Israel, Israeli airforce jets strike several targets in Gaza overnight, including Yasser Arafat International Airport, a weapons manufacturing facility and an infiltration tunnel into Israel. (Xinhua) (Arab News) (The Jerusalem Post)\nEuropean Union High Representative of the Union for Foreign Affairs and Security Policy Catherine Ashton says the planned demolition of Palestinian houses in East Jerusalem for an archeological park and tourist center is "an obstacle to peace" and "illegal under international law". (Arab News) (Israel National News)\nThe Methodist Church of Great Britain votes in Portsmouth to boycott Israeli-produced goods and services from the West Bank because of what it termed Israel\'s "illegal occupation of Palestinian lands". (The Jerusalem Post)\nUnited States Middle East envoy George J. Mitchell accuses Hamas of “unacceptable and inhuman behavior” with regard to captive soldier Gilad Shalit, held in isolation in Gaza for the last four years, and calls for his immediate release. (The Jerusalem Post)\nThe son of a founding member of Hamas is granted asylum in the United States. (Aljazeera)',
  'title': 'Israeli-Palestinian conflict'}}



In [11]:

    
# Flatten each document: lift the nested `event` fields to the top level and
# expose `_id` as `id`.
events = [
    {
        'id': record['_id'],
        'class': record['event']['class'],
        'date': record['event']['date'],
        'title': record['event']['title'],
        'description': record['event']['description'],
    }
    for record in events
]



In [12]:

    
# One row per event, with the columns id / class / date / title / description.
df_events = pd.DataFrame.from_records(events)



In [13]:

    
# Preview the first rows of the flattened events table.
df_events.head()









    Out[13]:






  
    
      
      class
      date
      description
      id
      title
    
  
  
    
      0
      Armed conflicts and attacks
      2010-07-01
      In response to a rocket attack on southern Isr...
      59fbc9fa60b18848c5a4ce5a
      Israeli-Palestinian conflict
    
    
      1
      Armed conflicts and attacks
      2010-07-01
      11 Kurdish, a soldier and three members of a K...
      59fbc9fa60b18848c5a4ce5b
      
    
    
      2
      Armed conflicts and attacks
      2010-07-01
      At least 42 people are killed and at least 175...
      59fbc9fa60b18848c5a4ce5c
      
    
    
      3
      Armed conflicts and attacks
      2010-07-01
      Somali President Sharif Ahmed joins Somali tro...
      59fbc9fa60b18848c5a4ce5d
      
    
    
      4
      Arts and culture
      2010-07-01
      People take to the streets in Ottawa to celebr...
      59fbc9fa60b18848c5a4ce5f



In [14]:

    
def class_code(type_str):
    """Map an event class label to an integer category code.

    The label is lower-cased and checked against keyword groups in a fixed
    order; the code of the first group with a keyword occurring as a
    substring is returned.

    Parameters
    ----------
    type_str : str
        Event class label, e.g. ``'Armed conflicts and attacks'``.

    Returns
    -------
    int
        A code from 1 to 10, or 0 when no keyword matches.
    """
    lowered = type_str.lower()
    # Order matters: earlier groups win when a label matches several.
    keyword_groups = (
        (1, ('armed', 'attack', 'conflict')),
        (2, ('disaster', 'accident')),
        (3, ('law', 'crime')),
        (4, ('politic', 'election')),
        (5, ('international', 'relation')),
        (6, ('science', 'technology')),
        (7, ('business', 'econom')),
        (8, ('art', 'culture')),
        (9, ('sport',)),
        (10, ('health', 'environment')),
    )
    for code, keywords in keyword_groups:
        if any(keyword in lowered for keyword in keywords):
            return code
    return 0



In [15]:

    
# Add the integer category code derived from each event's class label.
df_events['class_code'] = df_events['class'].apply(class_code)



In [16]:

    
# Preview again to check the new `class_code` column.
df_events.head()









    Out[16]:






  
    
      
      class
      date
      description
      id
      title
      class_code
    
  
  
    
      0
      Armed conflicts and attacks
      2010-07-01
      In response to a rocket attack on southern Isr...
      59fbc9fa60b18848c5a4ce5a
      Israeli-Palestinian conflict
      1
    
    
      1
      Armed conflicts and attacks
      2010-07-01
      11 Kurdish, a soldier and three members of a K...
      59fbc9fa60b18848c5a4ce5b
      
      1
    
    
      2
      Armed conflicts and attacks
      2010-07-01
      At least 42 people are killed and at least 175...
      59fbc9fa60b18848c5a4ce5c
      
      1
    
    
      3
      Armed conflicts and attacks
      2010-07-01
      Somali President Sharif Ahmed joins Somali tro...
      59fbc9fa60b18848c5a4ce5d
      
      1
    
    
      4
      Arts and culture
      2010-07-01
      People take to the streets in Ottawa to celebr...
      59fbc9fa60b18848c5a4ce5f
      
      8



In [17]:

    
def description_clean(description):
    """Return the description text up to the first ``'. ('`` marker.

    In the sample data this marker is where a parenthesised source citation
    such as ``(Xinhua)`` begins, so the result is the leading sentence(s)
    without citations or later entries.

    Parameters
    ----------
    description : str
        Raw event description.

    Returns
    -------
    str
        The text before the first ``'. ('``, terminated by exactly one
        period. When the marker is absent the whole description is returned,
        with a period appended only if it does not already end in one.
    """
    head = description.split('. (', 1)[0]
    # Avoid producing "..": without the marker the text may already end in '.'.
    if head.endswith('.'):
        return head
    return head + '.'



In [18]:

    
# Store the citation-stripped description next to the raw one.
df_events['des_clean'] = df_events['description'].apply(description_clean)

def efitf(X):
    """Compute per-document TF-IDF term weights for a corpus.

    Parameters
    ----------
    X : iterable of str
        Raw text documents.

    Returns
    -------
    collections.defaultdict
        Maps a document's position in ``X`` to a ``{term: weight}`` dict
        holding every term whose TF-IDF weight is positive. A position only
        becomes a key once such a term is found (accessing a missing key
        creates an empty dict, as with any ``defaultdict(dict)``).
    """
    count = CountVectorizer(stop_words='english')
    X_train_count = count.fit_transform(X)
    tfidf = TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=True)
    X_train_tfidf = tfidf.fit_transform(X_train_count).tocsr()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on installs that predate it.
    if hasattr(count, 'get_feature_names_out'):
        tf_feature_names = count.get_feature_names_out()
    else:
        tf_feature_names = count.get_feature_names()

    EFITF = defaultdict(dict)
    # Walk only the stored entries of each sparse row rather than densifying
    # the whole (documents x vocabulary) matrix.
    for doc_index in range(X_train_tfidf.shape[0]):
        row = X_train_tfidf.getrow(doc_index)
        for term_index, value in zip(row.indices, row.data):
            if value > 0.0:
                EFITF[doc_index][tf_feature_names[term_index]] = value
    return EFITF

# Cleaned descriptions, one document per event, in DataFrame row order.
X = df_events['des_clean'].tolist()

EFITF = efitf(X)



In [23]:

    
def class_similarity(class_text,span):
    """Return the spaCy similarity between two pieces of text.

    Both ``class_text`` and ``span`` are parsed with the module-level ``nlp``
    pipeline, and the value of ``Doc.similarity`` between the two parsed
    documents is returned unchanged.
    """
    class_doc = nlp(class_text)
    span_doc = nlp(span)
    return class_doc.similarity(span_doc)



In [27]:

    
def get_query(doc,class_text,doc_index,doc_date,verbose=True):
    """Build quoted keyword-pair search queries for one event description.

    Keywords are collected from three sources:

    * every named entity spaCy finds in ``doc``;
    * up to three non-stop-word verbs whose text is most similar to
      ``class_text``;
    * the last space-separated word of the noun chunk most similar to
      ``class_text``.

    Stop words are dropped and duplicates removed. Every unordered pair of
    the remaining keywords is emitted twice, once per date window.

    Parameters
    ----------
    doc : str
        Event description text.
    class_text : str
        Text describing the event class, used as the similarity reference.
    doc_index
        Not used by this function; kept so existing callers keep working.
    doc_date : str
        Event date formatted as ``YYYY-MM-DD``.
    verbose : bool, optional
        When true (the default), print the generated query list.

    Returns
    -------
    list of str
        Strings of the form ``'"kw1" "kw2" since:<date> until:<date>'``: all
        pairs for the window from three days before the event date to the
        event date, followed by all pairs for the window from the event date
        to the next day. Empty when fewer than two keywords remain.

    Raises
    ------
    ValueError
        If ``doc_date`` does not match ``%Y-%m-%d``.
    """
    date_format = '%Y-%m-%d'
    event_day = datetime.strptime(doc_date, date_format)
    day_of_event = event_day.strftime(date_format)
    windows = [
        ((event_day + timedelta(days=-3)).strftime(date_format), day_of_event),
        (day_of_event, (event_day + timedelta(days=1)).strftime(date_format)),
    ]

    doc = nlp(doc)
    # Parse the class text once; it is the same for every candidate scored below.
    class_doc = nlp(class_text)
    stop_words = set(list_stopWords)

    kws = [ent.text for ent in doc.ents]

    # Keep scores as floats: comparing their string form ranks negative
    # values and scientific notation (e.g. '5e-05') incorrectly.
    triggers = [
        (token.text, class_doc.similarity(nlp(token.text)))
        for token in doc
        if not token.is_stop
        and token.tag_.startswith('V')
        and token.text.lower() not in stop_words
    ]
    triggers.sort(key=lambda scored: scored[1], reverse=True)
    kws.extend(text for text, _ in triggers[:3])

    noun_chunks = [
        (chunk.text, class_doc.similarity(nlp(chunk.text)))
        for chunk in doc.noun_chunks
    ]
    # A description may have no noun chunks; skip explicitly rather than
    # hiding every possible error behind a bare except.
    if noun_chunks:
        best_chunk = max(noun_chunks, key=lambda scored: scored[1])[0]
        kws.append(best_chunk.split(' ')[-1])

    # Case-sensitive on purpose, matching the stop-word list as stored.
    kws = [w for w in kws if w not in stop_words]
    # De-duplicate while keeping first-seen order, so the generated queries
    # are the same from run to run (set() order varies between processes).
    kws = list(dict.fromkeys(kws))

    pairs = list(itertools.combinations(kws, 2))
    query = [
        '"{}" "{}" since:{} until:{}'.format(first, second, since, until)
        for since, until in windows
        for first, second in pairs
    ]
    if verbose:
        print(query)
    return query



In [28]:

    
# Collects one list of query strings per event, in DataFrame row order.
queries = []



In [ ]:

    
# Start from an empty list so re-running this cell cannot append duplicates
# and break the positional pairing between `queries` and the event ids.
queries = []
for doc_index, event in df_events.iterrows():
    doc_date = event['date']
    doc_class = event['class']
    doc = event['des_clean']
    # Drop the conjunction only as a whole word, so class names that merely
    # contain the letters "and" are left intact.
    class_text = re.sub(r'\band\b', '', doc_class)
    query = get_query(doc,class_text,doc_index,doc_date)
    queries.append(query)



In [ ]:

    
# Take the ids from the same rows that produced `queries`. A second, unsorted
# find() is not guaranteed to return documents in the same order (or the same
# set of documents), which would attach queries to the wrong events.
ids = df_events['id'].tolist()



In [ ]:

    
# Ids and query lists are paired by position, so refuse to write anything
# when the two sequences are not the same length.
if len(ids) != len(queries):
    raise ValueError(
        'ids and queries are misaligned: {} ids vs {} query lists'.format(len(ids), len(queries))
    )

# Wrapping `ids` (which has a length) lets tqdm display a total.
requests = [
    UpdateOne({'_id': _id}, {'$set': {'queries': event_queries}})
    for _id, event_queries in zip(tqdm(ids), queries)
]

# bulk_write raises InvalidOperation on an empty request list, so skip it.
if requests:
    try:
        result = db.current_event.bulk_write(requests)
        pprint(result.bulk_api_result)
    except BulkWriteError as bwe:
        pprint(bwe.details)



In [ ]:

	class	date	description	id	title
0	Armed conflicts and attacks	2010-07-01	In response to a rocket attack on southern Isr...	59fbc9fa60b18848c5a4ce5a	Israeli-Palestinian conflict
1	Armed conflicts and attacks	2010-07-01	11 Kurdish, a soldier and three members of a K...	59fbc9fa60b18848c5a4ce5b
2	Armed conflicts and attacks	2010-07-01	At least 42 people are killed and at least 175...	59fbc9fa60b18848c5a4ce5c
3	Armed conflicts and attacks	2010-07-01	Somali President Sharif Ahmed joins Somali tro...	59fbc9fa60b18848c5a4ce5d
4	Arts and culture	2010-07-01	People take to the streets in Ottawa to celebr...	59fbc9fa60b18848c5a4ce5f