In [2]:
from datetime import datetime,timedelta
from collections import defaultdict,Counter
from pprint import pprint
from tqdm import tqdm
import re

import pymongo
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
from pymongo.errors import BulkWriteError

In [3]:
client = pymongo.MongoClient('localhost:27017')
db = client.tweet

In [4]:
import pandas as pd
import spacy

In [13]:
nlp = spacy.load('en_core_web_md')

In [108]:
events = [e for e in db.current_event.find({'event.date':{'$gt':'2010-09-01','$lt':'2010-09-03'}},{'_id':1,'abstracts':1,'type':1})]

In [109]:
for i in events:
    print(i)


{'_id': ObjectId('59fbc9fa60b18848c5a4d4e4'), 'type': 1, 'abstracts': [{'media': ['BBC', 'Reuters'], 'abstract': '10 civilians are killed and 2 others are wounded after being struck by NATO during an election campaign in Rostaq, Afghanistan. Originally, a spokesman had said a "precision air strike" had hit a militant vehicle.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4e5'), 'type': 1, 'abstracts': [{'media': ['YNet News'], 'abstract': 'Palestinians stone an Israeli car travelling through the West Bank resulting in a 12-year-old being injured.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4e6'), 'type': 1, 'abstracts': [{'media': ['BBC'], 'abstract': 'The Mexican Army claims to have killed 25 drug cartel gunmen in a clash in Tamaulipas state near the United States border.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4e7'), 'type': 2, 'abstracts': [{'media': ['BBC'], 'abstract': 'Permanent Representative of Pakistan to the United Nations Hussain Haroon calls for an inquiry after allegations emerge of Pakistani floodwaters being diverted into vulnerable villages in a bid to preserve crops.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4e8'), 'type': 2, 'abstracts': [{'media': ['Straits Times'], 'abstract': "Severe Tropical Storm Lionrock lands in China's Fujian province with warnings of strong winds and torrential rains."}, {'media': ['Straits Times', 'AP via Sign On San Diego'], 'abstract': 'Typhoon Kompasu (Glenda) hits South Korea resulting in three deaths and leading to cancellation of flights and school classes.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4e9'), 'type': 2, 'abstracts': [{'media': ['AP via Washington Post', 'Reuters'], 'abstract': 'A tropical storm warning is issued for the coast of Long Island in New York as Hurricane Earl approaches the east coast of the United States.'}, {'media': ['Bloomberg'], 'abstract': 'A state of emergency is declared in the states of North Carolina and Virginia.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4ea'), 'type': 2, 'abstracts': [{'media': ['AP'], 'abstract': 'An oil rig explodes in the Gulf of Mexico, west of the Deepwater Horizon oil rig that exploded in April, killing no people.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4eb'), 'type': 5, 'abstracts': [{'media': ['Jerusalem Post'], 'abstract': 'Lebanon requests an Interpol arrest warrant for Ghassan al-Jidd, a former General in the Lebanese Army who allegedly spied for the Mossad.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4ec'), 'type': 3, 'abstracts': [{'media': ['Al Jazeera', 'RFI'], 'abstract': 'At least 17 migrants are kidnapped by suspected human traffickers in Tijuana, Baja California, in northwestern Mexico.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4ed'), 'type': 3, 'abstracts': [{'media': ['AP', 'Reuters Africa'], 'abstract': 'Price riots continue in Mozambique, leaving a further four people dead.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4ee'), 'type': 3, 'abstracts': [{'media': ['BBC', 'Latin American Herald Tribune'], 'abstract': 'Police in Brazil arrest almost the entire council in the city of Dourados, Mato Grosso do Sul, on suspicion of fraud and corruption, leaving the city without a government.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4ef'), 'type': 3, 'abstracts': [{'media': ['BBC'], 'abstract': 'Figures show that at least 2,000 British police officers had 3 or more complaints made against them in 2009-10.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4f0'), 'type': 4, 'abstracts': [{'media': ['SBS'], 'abstract': 'Independent Member of the Australian House of Representatives Andrew Wilkie announces that he will support the Gillard Labor Government on supply and confidence.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4f1'), 'type': 4, 'abstracts': [{'media': ['New York Times'], 'abstract': 'BP warns the United States Congress that it might not be able to pay compensation for the Deepwater Horizon oil spill if it is barred from getting new offshore drilling permits.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4f2'), 'type': 4, 'abstracts': [{'media': ['AFP via Google News'], 'abstract': 'Sebastian Vlădescu is replaced as Romanian Minister of Finance by Gheorghe Ialomitianu as part of a Cabinet reshuffle.'}]}
{'_id': ObjectId('59fbc9fa60b18848c5a4d4f3'), 'type': 9, 'abstracts': [{'media': ['BBC Sport', 'Sky News'], 'abstract': "Pakistan's high commissioner Wajid Shamsul Hasan says 3 of his country's cricketers connected to allegations of betting corruption by the News of the World tabloid could have been set up."}, {'media': ['AFP via ABC Online'], 'abstract': 'The International Cricket Council charges Pakistan captain Salman Butt and fast bowlers Mohammad Aamer and Mohammad Asif with corruption offences and provisionally suspends them from international cricket pending hearings.'}]}
events = [{'id':e['_id'],'class':e['event']['class'],'date':e['event']['date'],'title':e['event']['title'],'description':e['event']['description']} for e in events]
df_events = pd.DataFrame.from_records(events)
df_events.head()
df_events.iloc[0]['id']

In [110]:
events[0]


Out[110]:
{'_id': ObjectId('59fbc9fa60b18848c5a4d4e4'),
 'abstracts': [{'abstract': '10 civilians are killed and 2 others are wounded after being struck by NATO during an election campaign in Rostaq, Afghanistan. Originally, a spokesman had said a "precision air strike" had hit a militant vehicle.',
   'media': ['BBC', 'Reuters']}],
 'type': 1}

In [111]:
abst


Out[111]:
'Seven people are killed and 13 are injured after suicide bombers hit a state-run power station near the northern city of Samarra, Iraq. The Islamic State of Iraq and the Levant claims responsibility for the attack.'

In [113]:
records


Out[113]:
'903363851781099522'
def standard_text_clean()
def reference_similatity(event): abst = ''.join([i['abstract'] for i in event['abstracts']]) doc_reference = nlp(abst) print(abst) tweets = [] filter_dict = {'event_id':events[0]['_id'],'tweet.lang':'en','tweet.media.card_url':{'$ne':None}} query_dict = {'_id':1} first_news_tweet_id = min([i['_id'] for i in db.pos.find(filter_dict,query_dict).sort('_id').limit(1)]+[i['_id'] for i in db.paper.find(filter_dict,query_dict).sort('_id').limit(1)]) filter_dict = {'_id':{'$lt':first_news_tweet_id},'event_id':event['_id'],'tweet.lang':'en','tweet.media.card_url':None} query_dict = {'tweet.standard_text':1,} records = [i for i in db.pos.find(filter_dict,query_dict)]+[i for i in db.paper.find(filter_dict,query_dict)] print(len(records)) for tweet in records: tweet_id = tweet['_id'] tweet_text = tweet['tweet']['standard_text'] doc_tweet = nlp(tweet_text) tweets.append((tweet_id,doc_reference.similarity(doc_tweet),tweet_text)) tweets = sorted(tweets,key=lambda x:x[1],reverse=True) df_candidate = pd.DataFrame.from_records(tweets,index=range(len(tweets)),columns=['id','simi','text']) df_candidate = df_candidate.drop_duplicates(['simi']) return df_candidate['']

In [125]:
def reference_similatity(event):
    abst = ''.join([i['abstract'] for i in event['abstracts']])
    doc_reference = nlp(abst)
    print(abst)
    tweets = []
    #filter_dict = {'event_id':events[0]['_id'],'tweet.lang':'en','tweet.media.card_url':{'$ne':None}}
    #query_dict = {'_id':1}
    #first_news_tweet_id  = min([i['_id'] for i in db.pos.find(filter_dict,query_dict).sort('_id').limit(1)]+[i['_id'] for i in db.paper.find(filter_dict,query_dict).sort('_id').limit(1)])
    filter_dict = {'event_id':event['_id'],'tweet.lang':'en','tweet.media.card_url':None}
    query_dict = {'tweet.standard_text':1}
    records  = [i for i in db.pos.find(filter_dict,query_dict)]+[i for i in db.paper.find(filter_dict,query_dict)]
    print(len(records))
    for tweet in records:
        tweet_id = tweet['_id']
        tweet_text = tweet['tweet']['standard_text'] 
        doc_tweet = nlp(tweet_text)
        tweets.append((tweet_id,doc_reference.similarity(doc_tweet),tweet_text))
    tweets = sorted(tweets,key=lambda x:x[1],reverse=True)
    df_candidate = pd.DataFrame.from_records(tweets,index=range(len(tweets)),columns=['id','simi','text'])
    df_candidate = df_candidate.drop_duplicates(['simi'])
    return df_candidate[df_candidate['simi'] > 0.75][:100]

In [128]:
'\n'.join(['a','b'])


Out[128]:
'a\nb'

In [126]:
x[:3]


Out[126]:
id simi text
0 22805330372 0.962651 The three Pakistan cricketers accused of corru...
1 22840215879 0.946432 The International Cricket Council has suspende...
2 22838266153 0.942557 All three Pakistan cricketers accused of corru...

In [127]:
for i in events:
    x = reference_similatity(i)
    print(len(x))


10 civilians are killed and 2 others are wounded after being struck by NATO during an election campaign in Rostaq, Afghanistan. Originally, a spokesman had said a "precision air strike" had hit a militant vehicle.
155
100
Palestinians stone an Israeli car travelling through the West Bank resulting in a 12-year-old being injured.
31
19
The Mexican Army claims to have killed 25 drug cartel gunmen in a clash in Tamaulipas state near the United States border.
329
100
Permanent Representative of Pakistan to the United Nations Hussain Haroon calls for an inquiry after allegations emerge of Pakistani floodwaters being diverted into vulnerable villages in a bid to preserve crops.
183
100
Severe Tropical Storm Lionrock lands in China's Fujian province with warnings of strong winds and torrential rains.Typhoon Kompasu (Glenda) hits South Korea resulting in three deaths and leading to cancellation of flights and school classes.
143
80
A tropical storm warning is issued for the coast of Long Island in New York as Hurricane Earl approaches the east coast of the United States.A state of emergency is declared in the states of North Carolina and Virginia.
554
100
An oil rig explodes in the Gulf of Mexico, west of the Deepwater Horizon oil rig that exploded in April, killing no people.
382
100
Lebanon requests an Interpol arrest warrant for Ghassan al-Jidd, a former General in the Lebanese Army who allegedly spied for the Mossad.
20
15
At least 17 migrants are kidnapped by suspected human traffickers in Tijuana, Baja California, in northwestern Mexico.
29
18
Price riots continue in Mozambique, leaving a further four people dead.
437
100
Police in Brazil arrest almost the entire council in the city of Dourados, Mato Grosso do Sul, on suspicion of fraud and corruption, leaving the city without a government.
191
100
Figures show that at least 2,000 British police officers had 3 or more complaints made against them in 2009-10.
539
100
Independent Member of the Australian House of Representatives Andrew Wilkie announces that he will support the Gillard Labor Government on supply and confidence.
305
100
BP warns the United States Congress that it might not be able to pay compensation for the Deepwater Horizon oil spill if it is barred from getting new offshore drilling permits.
330
100
Sebastian Vlădescu is replaced as Romanian Minister of Finance by Gheorghe Ialomitianu as part of a Cabinet reshuffle.
66
42
Pakistan's high commissioner Wajid Shamsul Hasan says 3 of his country's cricketers connected to allegations of betting corruption by the News of the World tabloid could have been set up.The International Cricket Council charges Pakistan captain Salman Butt and fast bowlers Mohammad Aamer and Mohammad Asif with corruption offences and provisionally suspends them from international cricket pending hearings.
677
100

In [ ]: