In [1]:
from datetime import datetime,timedelta
from collections import defaultdict,Counter
from pprint import pprint
from tqdm import tqdm
import re

import pymongo
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
from pymongo.errors import BulkWriteError

In [2]:
from nltk.corpus import stopwords

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import itertools

In [4]:
# NLTK's English stop words, de-duplicated through a set and kept as a list.
# get_query() uses it for membership tests on keywords and verb tokens.
list_stopWords=list(set(stopwords.words('english')))

In [5]:
# Connect to the MongoDB server at localhost:27017 and select its `tweet`
# database; every collection access below goes through `db`.
client = pymongo.MongoClient('localhost:27017')
db = client.tweet

In [6]:
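# No authentication is performed here. If the server requires it, pass a
# username/password read from environment variables to pymongo.MongoClient;
# never hard-code credentials in the notebook.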

In [7]:
import pandas as pd
import spacy

In [8]:
# spaCy pipeline 'en_core_web_md'; get_query() relies on it for named
# entities, fine-grained tags, noun chunks and similarity scores.
nlp = spacy.load('en_core_web_md')

In [9]:
# Materialise every document of `current_event`, projecting only the fields
# that are used further down (id, class, date, title, description).
events = list(
    db.current_event.find(
        {},
        {
            '_id': 1,
            'event.class': 1,
            'event.date': 1,
            'event.title': 1,
            'event.description': 1,
        },
    )
)

In [10]:
events[0]


Out[10]:
{'_id': ObjectId('59fbc9fa60b18848c5a4ce5a'),
 'event': {'class': 'Armed conflicts and attacks',
  'date': '2010-07-01',
  'description': 'In response to a rocket attack on southern Israel, Israeli airforce jets strike several targets in Gaza overnight, including Yasser Arafat International Airport, a weapons manufacturing facility and an infiltration tunnel into Israel. (Xinhua) (Arab News) (The Jerusalem Post)\nEuropean Union High Representative of the Union for Foreign Affairs and Security Policy Catherine Ashton says the planned demolition of Palestinian houses in East Jerusalem for an archeological park and tourist center is "an obstacle to peace" and "illegal under international law". (Arab News) (Israel National News)\nThe Methodist Church of Great Britain votes in Portsmouth to boycott Israeli-produced goods and services from the West Bank because of what it termed Israel\'s "illegal occupation of Palestinian lands". (The Jerusalem Post)\nUnited States Middle East envoy George J. Mitchell accuses Hamas of “unacceptable and inhuman behavior” with regard to captive soldier Gilad Shalit, held in isolation in Gaza for the last four years, and calls for his immediate release. (The Jerusalem Post)\nThe son of a founding member of Hamas is granted asylum in the United States. (Aljazeera)',
  'title': 'Israeli-Palestinian conflict'}}

In [11]:
# Flatten each Mongo document: lift the nested `event` fields to the top
# level and expose `_id` under the key `id`.
events = [
    {
        'id': record['_id'],
        'class': record['event']['class'],
        'date': record['event']['date'],
        'title': record['event']['title'],
        'description': record['event']['description'],
    }
    for record in events
]

In [12]:
# One row per flattened event record.
df_events = pd.DataFrame.from_records(events)

In [13]:
df_events.head()


Out[13]:
class date description id title
0 Armed conflicts and attacks 2010-07-01 In response to a rocket attack on southern Isr... 59fbc9fa60b18848c5a4ce5a Israeli-Palestinian conflict
1 Armed conflicts and attacks 2010-07-01 11 Kurdish, a soldier and three members of a K... 59fbc9fa60b18848c5a4ce5b
2 Armed conflicts and attacks 2010-07-01 At least 42 people are killed and at least 175... 59fbc9fa60b18848c5a4ce5c
3 Armed conflicts and attacks 2010-07-01 Somali President Sharif Ahmed joins Somali tro... 59fbc9fa60b18848c5a4ce5d
4 Arts and culture 2010-07-01 People take to the streets in Ottawa to celebr... 59fbc9fa60b18848c5a4ce5f

In [14]:
def class_code(type_str):
    """Map an event class label to an integer category code.

    The label is lower-cased and checked against keyword groups in a fixed
    order; the first group with any keyword occurring as a substring of the
    label decides the code. A label that matches no group gets 0.
    """
    label = type_str.lower()
    # Order matters: groups are tried top to bottom and the first hit wins.
    keyword_groups = (
        (1, ('armed', 'attack', 'conflict')),
        (2, ('disaster', 'accident')),
        (3, ('law', 'crime')),
        (4, ('politic', 'election')),
        (5, ('international', 'relation')),
        (6, ('science', 'technology')),
        (7, ('business', 'econom')),
        (8, ('art', 'culture')),
        (9, ('sport',)),
        (10, ('health', 'environment')),
    )
    for code, keywords in keyword_groups:
        if any(keyword in label for keyword in keywords):
            return code
    return 0

In [15]:
# Integer category code for each event's class label (0 when no keyword matches).
df_events['class_code'] = df_events['class'].apply(class_code)

In [16]:
df_events.head()


Out[16]:
class date description id title class_code
0 Armed conflicts and attacks 2010-07-01 In response to a rocket attack on southern Isr... 59fbc9fa60b18848c5a4ce5a Israeli-Palestinian conflict 1
1 Armed conflicts and attacks 2010-07-01 11 Kurdish, a soldier and three members of a K... 59fbc9fa60b18848c5a4ce5b 1
2 Armed conflicts and attacks 2010-07-01 At least 42 people are killed and at least 175... 59fbc9fa60b18848c5a4ce5c 1
3 Armed conflicts and attacks 2010-07-01 Somali President Sharif Ahmed joins Somali tro... 59fbc9fa60b18848c5a4ce5d 1
4 Arts and culture 2010-07-01 People take to the streets in Ottawa to celebr... 59fbc9fa60b18848c5a4ce5f 8

In [17]:
def description_clean(description):
    """Return the leading text of an event description, ending in one period.

    Everything from the first ``'. ('`` onwards is dropped. In the sample
    record each sentence is followed by parenthesised source names, so this
    keeps the first sentence; the period consumed by the split is restored.

    The period is only appended when the kept text does not already end with
    one, so a description that contains no ``'. ('`` marker (e.g. ``'Foo.'``)
    is returned unchanged instead of becoming ``'Foo..'``.

    Args:
        description: Raw event description string.

    Returns:
        The text before the first ``'. ('``, terminated by a single ``'.'``.
    """
    head = description.split('. (')[0]
    if head.endswith('.'):
        return head
    return head + '.'

In [18]:
# Shortened description per event: the text before the first '. (' marker.
df_events['des_clean'] = df_events['description'].apply(description_clean)
def efitf(X):
    """Compute the non-zero TF-IDF term weights of every document.

    The documents are vectorised with ``CountVectorizer`` (English stop words
    removed) and re-weighted with ``TfidfTransformer`` (IDF enabled and
    smoothed, sublinear term frequency).

    Args:
        X: Iterable of document strings.

    Returns:
        ``defaultdict(dict)`` mapping a document's position in ``X`` to a
        ``{term: weight}`` dict holding only the weights greater than zero.
        Documents without any such term have no entry.
    """
    count = CountVectorizer(stop_words='english')
    X_train_count = count.fit_transform(X)
    tfidf = TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=True)
    X_train_tfidf = tfidf.fit_transform(X_train_count).tocsr()
    # Keep column indices ascending so terms are inserted in vocabulary order.
    X_train_tfidf.sort_indices()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(count, 'get_feature_names_out'):
        tf_feature_names = list(count.get_feature_names_out())
    else:
        tf_feature_names = count.get_feature_names()

    # Walk the sparse rows directly instead of densifying the whole matrix
    # into nested Python lists (documents x vocabulary entries).
    indptr = X_train_tfidf.indptr
    indices = X_train_tfidf.indices
    data = X_train_tfidf.data
    EFITF = defaultdict(dict)
    for Type in range(X_train_tfidf.shape[0]):
        start, end = indptr[Type], indptr[Type + 1]
        for index, value in zip(indices[start:end], data[start:end]):
            if value > 0.0:
                EFITF[Type][tf_feature_names[index]] = value
    return EFITF
# Per-document TF-IDF weights of the shortened descriptions, keyed by row position.
X = df_events['des_clean'].tolist()
EFITF = efitf(X)

In [23]:
def class_similarity(class_text,span):
    """Return spaCy's similarity score between two pieces of text.

    Both arguments are plain strings; each is run through the module-level
    ``nlp`` pipeline and the resulting docs are compared.
    """
    class_doc = nlp(class_text)
    span_doc = nlp(span)
    return class_doc.similarity(span_doc)

In [27]:
def get_query(doc,class_text,doc_index,doc_date):
    """Build keyword-pair search queries for one event description.

    Keywords are collected from ``doc`` in this order:

    1. the text of every named entity,
    2. up to three non-stop-word tokens whose fine-grained tag starts with
       ``'V'``, ranked by similarity to ``class_text``,
    3. the last word of the noun chunk most similar to ``class_text``.

    Stop words and duplicates are dropped (first occurrence kept, so the
    result is reproducible between runs). Every unordered keyword pair is
    emitted twice: once for the three days before the event date, and once
    from the event date to the following day.

    Args:
        doc: Event description text.
        class_text: Text describing the event class; similarity reference.
        doc_index: Row index of the event. Unused; kept so existing callers
            keep working.
        doc_date: Event date as a ``'YYYY-MM-DD'`` string.

    Returns:
        List of strings shaped like
        ``'"kw1" "kw2" since:YYYY-MM-DD until:YYYY-MM-DD'``: all pairs for the
        earlier window first, then all pairs for the later one.

    Raises:
        ValueError: If ``doc_date`` does not match ``'%Y-%m-%d'``.
    """
    doc_date = datetime.strptime(doc_date,'%Y-%m-%d')
    date_0 = doc_date.strftime('%Y-%m-%d')
    date_0_ = (doc_date+timedelta(days=-3)).strftime('%Y-%m-%d')
    date_1 = (doc_date+timedelta(days=1)).strftime('%Y-%m-%d')
    date_1_ = date_0

    doc = nlp(doc)
    # Parse the class text once; it is the same for every token and chunk.
    class_doc = nlp(class_text)

    def _similarity(span):
        # Similarity of the class text to a freshly parsed span of text.
        return class_doc.similarity(nlp(span))

    kws = [ent.text for ent in doc.ents]

    triggers = [
        (token.text, _similarity(token.text))
        for token in doc
        if not token.is_stop
        and token.tag_.startswith('V')
        and token.text.lower() not in list_stopWords
    ]
    # Rank on the numeric score. Ranking on str(score) compares text, so
    # '5e-05' would outrank '0.9' and negative scores would order backwards.
    triggers.sort(key=lambda item: item[1], reverse=True)
    kws.extend(text for text, _ in triggers[:3])

    noun_chunks = [(chunk.text, _similarity(chunk.text)) for chunk in doc.noun_chunks]
    # A description may have no noun chunk at all; then there is nothing to add.
    if noun_chunks:
        best_chunk = max(noun_chunks, key=lambda item: item[1])[0]
        kws.append(best_chunk.split(' ')[-1])

    kws = [w for w in kws if w not in list_stopWords]
    # Order-preserving de-duplication: set() ordering of strings changes
    # between interpreter runs, which made the query order irreproducible.
    kws = list(dict.fromkeys(kws))

    pairs = list(itertools.combinations(kws,2))
    template = '"{}" "{}" since:{} until:{}'
    query = (
        [template.format(first, second, date_0_, date_0) for first, second in pairs]
        + [template.format(first, second, date_1_, date_1) for first, second in pairs]
    )
    print(query)
    return query

In [28]:
# Collects one list of query strings per event, appended in df_events row order.
queries = []

In [ ]:
# Build the search queries of every event, one list per row, in row order.
for doc_index, event_row in df_events.iterrows():
    doc_date = event_row['date']
    doc_class = event_row['class']
    doc_title = event_row['title']
    doc = event_row['des_clean']
    # NOTE(review): this removes every literal 'and' substring, not only the
    # standalone word, and leaves a double space behind — confirm intended.
    class_text = doc_class.replace('and','')
    queries.append(get_query(doc,class_text,doc_index,doc_date))

In [ ]:
# Collect the _id of every document in `current_event`, in cursor order.
query = db.current_event.find({},{'_id':1})
ids = [record['_id'] for record in query]

In [ ]:
# Pair every event with the queries built from its own DataFrame row.
# `queries` was appended in df_events row order, so df_events['id'] is the
# matching id sequence. A second, unsorted find() is not guaranteed to return
# documents in that order, so pairing by position against it can attach
# queries to the wrong event.
event_ids = df_events['id'].tolist()
if len(queries) < len(event_ids):
    raise ValueError(
        'only {} query lists for {} events; rebuild `queries` before writing'.format(
            len(queries), len(event_ids)
        )
    )
requests = [
    UpdateOne({'_id': _id}, {'$set': {'queries': event_queries}})
    # zip() has no length, so give tqdm the total explicitly.
    for _id, event_queries in tqdm(zip(event_ids, queries), total=len(event_ids))
]
# bulk_write() raises InvalidOperation on an empty request list.
if requests:
    try:
        result = db.current_event.bulk_write(requests)
        pprint(result.bulk_api_result)
    except BulkWriteError as bwe:
        pprint(bwe.details)

In [ ]: