In [1]:
import pandas as pd
import spacy

In [2]:
nlp = spacy.load('en_core_web_md')

In [7]:
df_demo = pd.read_csv('demo.csv',index_col=0)

In [8]:
df_demo.head()


Out[8]:
class date description title
0 Disasters and accidents 2017/9/1 Hurricane Irma, now a Category 2 hurricane wit... 2017 Atlantic hurricane season
1 Health and medicine 2017/9/1 Researchers report, in the Environmental Scien... Great Lakes
2 International relations 2017/9/1 South Korean President Moon Jae-in and U.S. Pr... South Korea鈥揢nited States relations
3 Law and crime 2017/9/1 United States federal judge Richard Posner has... United States Court of Appeals for the Seventh...
4 Politics and elections 2017/9/1 The Labour Party, led by Jacinda Ardern, surge... New Zealand general election, 2017

In [12]:
def class_code(type_str):
    type_str = type_str.lower()
    if 'armed' in type_str or 'attack' in type_str or 'conflict' in type_str:
        return 1
    elif 'disaster' in type_str or 'accident' in type_str:
        return 2
    elif 'law' in type_str or 'crime' in type_str:
        return 3
    elif 'politic' in type_str or 'election' in type_str:
        return 4
    elif 'international' in type_str or 'relation' in type_str:
        return 5
    elif 'science' in type_str or 'technology' in type_str:
        return 6
    elif 'business' in type_str or 'econom' in type_str:
        return 7
    elif 'art' in type_str or 'culture' in type_str:
        return 8
    elif 'sport' in type_str:
        return 9
    elif 'health' in type_str or 'environment' in type_str:
        return 10
    else:
        return 0

In [13]:
df_demo['class_code'] = df_demo['class'].apply(class_code)

In [18]:
df_demo.head()


Out[18]:
class date description title class_code
0 Disasters and accidents 2017/9/1 Hurricane Irma, now a Category 2 hurricane wit... 2017 Atlantic hurricane season 2
1 Health and medicine 2017/9/1 Researchers report, in the Environmental Scien... Great Lakes 10
2 International relations 2017/9/1 South Korean President Moon Jae-in and U.S. Pr... South Korea鈥揢nited States relations 5
3 Law and crime 2017/9/1 United States federal judge Richard Posner has... United States Court of Appeals for the Seventh... 3
4 Politics and elections 2017/9/1 The Labour Party, led by Jacinda Ardern, surge... New Zealand general election, 2017 4

In [42]:
def description_clean(description):
    description = description.split('. (')[0]+'.'
    return description

In [43]:
df_demo['des_clean'] = df_demo['description'].apply(description_clean)
for i in df_demo.itertuples(): print(i[3]) print(i[6])

In [57]:
from spacy import displacy
from IPython.display import display, HTML

In [122]:
def class_similarity(class_text,span):
    return nlp(class_text).similarity(nlp(span))

In [123]:
def print_pos_(doc,class_text):
    doc_pos_ = []
    for token in doc:
        if token.pos_ in ['VERB']:   #['AUX','NOUN','PROPN','VERB']:
            doc_pos_.extend([token.text,token.pos_,str(class_similarity(class_text,token.text))])
    print('|'.join(doc_pos_))

In [124]:
def print_noun_chunks(doc,class_text):
    noun_chunks =[]
    for i in doc.noun_chunks:
        noun_chunks.extend([i.text,str(class_similarity(class_text,i.text))])
    print('|'.join(noun_chunks))

In [148]:
def expand_with_noun_chuncks(noun_chuncks,ent):
    for noun_chunck in noun_chuncks:
        if ent in noun_chunck:
            return noun_chunck
    return ent

In [205]:
def argument_candidate(doc,class_text):
    arguments = []
    noun_chuncks = [i.text for i in doc.noun_chunks]
    for i in doc.ents:
        #arguments.append((expand_with_noun_chuncks(noun_chuncks,i.text),i.label_,str(class_similarity(class_text,expand_with_noun_chuncks(noun_chuncks,i.text)))))
        arguments.append((i.text,i.label_,str(class_similarity(class_text,i.text))))
    related_when = [i for i in arguments if i[1] in ['TIME','DATE']]
    related_where = [i for i in arguments if i[1] in ['GPE','LOC','FACILITY']]
    related_who = [i for i in arguments if i[1] in ['PERSON','NORP','ORG','']]
    related_what = [i for i in arguments if i[1] in ['PRODUCT','EVENT','WORK_OF_ART','LAW','LANGUAGE','PERCENT','MONEY','QUANTITY','ORDINAL','CARDINAL']]
    return arguments,{
        'related_when':related_when,
        'related_where':related_where,
        'related_who':related_who,
        'related_what':related_what,
    }

In [207]:
def trigger_candidate(doc,arguments,class_text):
    triggers = []
    arguments = [argu[0] for argu in arguments]
    noun_chuncks = [i.text for i in doc.noun_chunks]
    triggers.extend([(i,str(class_similarity(class_text,i))) for i in set(noun_chuncks)-set(arguments)])
    for token in doc:
        if token.tag_.startswith('V'):
            triggers.append((token.text,token.tag_,str(class_similarity(class_text,token.text))))
    return triggers

In [224]:
def print_arguments_triggers(arguments_dict,triggers):
    print('Arguments:\n')
    #pdb.set_trace()
    for k,v in arguments_dict.items():
        print(k+' : '+'|'+'|'.join(['|'.join(i) for i in v]))
    triggers_str = ''
    for i in triggers:
        triggers_str+='|'+'|'.join(i)
    print('Triggers:\n',triggers_str)

In [225]:
def display_doc(doc,style='ent',class_text=None):
    doc = nlp(doc)
    display(HTML(displacy.render(doc, style=style)))
    arguments,arguments_dict = argument_candidate(doc,class_text)
    triggers = trigger_candidate(doc,arguments,class_text)
    print_arguments_triggers(arguments_dict,triggers)
    #print_pos_(doc,class_text)
    #print_noun_chunks(doc,class_text)

In [226]:
for event in df_demo.iterrows():
    print(event[0])
    print(event[1]['class'],'\t',event[1]['date'],'\t',event[1]['title'])
    display_doc(event[1]['des_clean'],class_text=event[1]['class'].replace('and',''))


0
Disasters and accidents 	 2017/9/1 	 2017 Atlantic hurricane season
Hurricane Irma PERSON , now a Category 2 QUANTITY hurricane with maximum sustained winds of 110 miles per hour QUANTITY ( 175 kilometers per hour QUANTITY ), is expected to resume strengthening this weekend DATE increasing the danger when it nears the Leeward Islands LOC in the Caribbean LOC next Thursday DATE .
Arguments:

related_when : |this weekend|DATE|0.283249331654|next Thursday|DATE|0.160565826671
related_where : |the Leeward Islands|LOC|0.247873127015|the Caribbean|LOC|0.208435926875
related_who : |Hurricane Irma|PERSON|0.357908785797
related_what : |now a Category 2 hurricane|QUANTITY|0.380235448992|110 miles per hour|QUANTITY|0.185376154623|175 kilometers per hour|QUANTITY|0.148967080751
Triggers:
 |it|0.264386930191|the danger|0.462647103084|110 miles|0.13337395046|maximum sustained winds|0.374320373452|hour|0.162428970276|175 kilometers|0.0666239555169|is|VBZ|0.168897048868|expected|VBN|0.298831563871|resume|VB|0.146153933788|strengthening|VBG|0.212357447734|increasing|VBG|0.360947489227|nears|VBZ|0.138839396512
1
Health and medicine 	 2017/9/1 	 Great Lakes
Researchers report, in the Environmental Science & Technology ORG journal, the discovery of antidepressant concentrations in 10 CARDINAL kinds of fish in the Niagara River LOC , which links to the Great Lakes LOC via Lakes Erie LOC and Ontario GPE . Speculated causes include inadequate treatment of water re-introduced into said bodies of water.
Arguments:

related_when : |
related_where : |the Niagara River|LOC|0.388616151142|the Great Lakes|LOC|0.358734376058|Lakes Erie|LOC|0.32894182904|Ontario|GPE|0.32870366923
related_who : |the Environmental Science & Technology journal|ORG|0.626985301713
related_what : |10 kinds|CARDINAL|0.338455982602
Triggers:
 |antidepressant concentrations|0.621126117878|Speculated causes|0.402148572707|water|0.354915577487|Researchers|0.486778847535|inadequate treatment|0.606032998365|said bodies|0.362389242075|fish|0.307684981229|the discovery|0.409114996831|report|VBP|0.333384865889|links|VBZ|0.247388982788|Speculated|VBN|0.190679936706|include|VBP|0.338148698629|introduced|VBN|0.260558532936|said|VBN|0.247601628986
2
International relations 	 2017/9/1 	 South Korea鈥揢nited States relations
South Korean NORP President Moon Jae PERSON -in and U.S. GPE President Donald Trump PERSON agree to revise the South Korea GPE Ballistic Missile Range Guidelines which caps South Korea's GPE missile development.
Arguments:

related_when : |
related_where : |U.S. President Donald Trump|GPE|0.413748916657|South Korean President Moon Jae-in|GPE|0.40294889442|South Korea's missile development|GPE|0.51063238924
related_who : |South Korean President Moon Jae-in|NORP|0.40294889442|South Korean President Moon Jae-in|PERSON|0.40294889442|U.S. President Donald Trump|PERSON|0.413748916657
related_what : |
Triggers:
 |the South Korea Ballistic Missile Range Guidelines|0.486171632835|agree|VBP|0.277149975514|revise|VB|0.307453127518|caps|VBZ|0.0916833181096
3
Law and crime 	 2017/9/1 	 United States Court of Appeals for the Seventh Circuit
United States GPE federal judge Richard Posner PERSON has announced his retirement after three decades DATE of service on the bench of the 7th ORDINAL U.S. Circuit Court of Appeals ORG , headquartered in Chicago GPE , Illinois GPE . Appointed by President Ronald Reagan PERSON in 1981 DATE , Justice Posner PERSON has written more than 3,300 CARDINAL opinions from the bench.(AP ORG ).
Arguments:

related_when : |three decades|DATE|0.423120644064|1981|DATE|0.188043900903
related_where : |United States federal judge Richard Posner|GPE|0.631050195264|Chicago|GPE|0.29550988158|Illinois|GPE|0.342008780656
related_who : |United States federal judge Richard Posner|PERSON|0.631050195264|U.S. Circuit Court of Appeals|ORG|0.588310700173|President Ronald Reagan|PERSON|0.265534860052|United States federal judge Richard Posner|PERSON|0.631050195264|the bench.(AP|ORG|0.403803427413
related_what : |the 7th U.S. Circuit Court|ORDINAL|0.529322510022|more than 3,300 opinions|CARDINAL|0.398155457616
Triggers:
 |Appeals|0.462955921122|his retirement|0.423290169946|service|0.284943002868|Justice Posner|0.45301231031|the bench|0.343527492927|has|VBZ|0.404189224658|announced|VBN|0.185691645479|headquartered|VBN|0.230955994087|Appointed|VBN|0.292058855466|has|VBZ|0.404189224658|written|VBN|0.363548749008
4
Politics and elections 	 2017/9/1 	 New Zealand general election, 2017
The Labour Party ORG , led by Jacinda Ardern PERSON , surges in the polls and is neck-and-neck with the National Party ORG , led by Bill English PERSON , for the race for Prime Minister of New Zealand GPE .
Arguments:

related_when : |
related_where : |New Zealand|GPE|0.254721162151
related_who : |The Labour Party|ORG|0.574051702776|Jacinda Ardern|PERSON|0.021859615811|the National Party|ORG|0.589717172463|Bill English|PERSON|0.352255377966
related_what : |
Triggers:
 |neck|0.0705136826232|Prime Minister|0.48925934282|the polls|0.690135010383|the race|0.450207550477|-neck|0.0|led|VBN|0.311364358709|surges|VBZ|0.244959923431|is|VBZ|0.271034264143|led|VBN|0.311364358709
5
Politics and elections 	 2017/9/1 	 Kenyan general election, 2017
The Supreme Court ORG of Kenya GPE annuls the results of the recent presidential election, that indicated President Uhuru Kenyatta PERSON was reelected, due to irregularities, and orders a new election.
Arguments:

related_when : |
related_where : |Kenya|GPE|0.257459323197
related_who : |The Supreme Court|ORG|0.446410985937|President Uhuru Kenyatta|PERSON|0.274939847061
related_what : |
Triggers:
 |the results|0.359382037781|the recent presidential election|0.793275848827|a new election|0.653051979632|irregularities|0.186572255|annuls|VBP|0.196242995334|indicated|VBD|0.238709179065|was|VBD|0.280595929036|reelected|VBN|0.655730050033
6
Politics and elections 	 2017/9/1 	 Presidency of Donald Trump
Paul Ryan PERSON (R-WI), U.S. GPE Speaker of the House of Representatives ORG , urges President Donald Trump PERSON not to rescind the Deferred Action for Childhood Arrivals ORG ( DACA ORG ) program that protects immigrants who illegally entered the U.S. GPE as children from deportation.
Arguments:

related_when : |
related_where : |U.S. Speaker|GPE|0.369958426959|U.S. Speaker|GPE|0.369958426959
related_who : |Paul Ryan|PERSON|0.250817680569|the House of Representatives|ORG|0.447343834705|President Donald Trump|PERSON|0.482175436017|the Deferred Action for Childhood Arrivals|ORG|0.394030276319|Childhood Arrivals (DACA) program|ORG|0.242730180572
related_what : |
Triggers:
 |Representatives|0.431843478778|who|0.298166567757|the Deferred Action|0.393154236352|deportation|0.290521074462|the U.S.|0.437061635613|children|0.219449375835|R-WI|0.147229129185|the House|0.325866711781|immigrants|0.396761080045|urges|VBZ|0.3407898764|rescind|VB|0.304193407434|protects|VBZ|0.0743851382011|entered|VBD|0.223232574812
7
Science and technology 	 2017/9/1 	 2017 in astronomy
Asteroid 3122 Florence PERSON , which is roughly 2.7 miles QUANTITY ( 4.4 kilometers QUANTITY ) wide, comes within 4.4 million miles QUANTITY ( 7 million km QUANTITY ) of Earth LOC 鈥?approximately 18 CARDINAL times the distance from our planet to the Moon PERSON .
Arguments:

related_when : |
related_where : |Earth|LOC|0.415404542524
related_who : |Asteroid 3122 Florence|PERSON|0.192262113256|the Moon|PERSON|0.362654722181
related_what : |roughly 2.7 miles|QUANTITY|0.122975495387|4.4 kilometers|QUANTITY|-0.00234738796058|4.4 million miles|QUANTITY|0.143568291562|7 million km|QUANTITY|0.119574371354|18|CARDINAL|0.0484323900101
Triggers:
 |Asteroid|0.20533818023|our planet|0.471326009147|3122 Florence|0.08808609462|is|VBZ|0.368288025514|comes|VBZ|0.391523651506
8
Armed attacks and conflicts 	 2017/9/2 	 Iraqi Civil War
Seven CARDINAL people are killed and 13 CARDINAL are injured after suicide bombers hit a state-run power station near the northern city of Samarra GPE , Iraq GPE . The Islamic State of ORG Iraq GPE and the Levant ORG claims responsibility for the attack.
Arguments:

related_when : |
related_where : |Samarra|GPE|0.0|Iraq|GPE|0.371648268077|Iraq|GPE|0.371648268077
related_who : |The Islamic State of|ORG|0.48784271855|the Levant|ORG|0.0788780730697
related_what : |Seven people|CARDINAL|0.499983535135|13|CARDINAL|0.081986881148
Triggers:
 |suicide bombers|0.620420973623|responsibility|0.483108068312|the northern city|0.429933340975|a state-run power station|0.42076337093|the attack|0.703502175466|The Islamic State|0.480012098147|are|VBP|0.37283502209|killed|VBN|0.558928219765|are|VBP|0.37283502209|injured|VBN|0.415216759|hit|VBD|0.296802789259|run|VBN|0.304107976846|claims|VBZ|0.433081274207
9
Business and economy 	 2017/9/2 	 Trump Tower wiretapping allegations
Both the FBI ORG and NSD ORG declare that they possess no records indicating that Trump Tower ORG in New York City GPE , New York GPE was wiretapped earlier in March DATE .
Arguments:

related_when : |earlier in March|DATE|0.399820229254
related_where : |New York City|GPE|0.455834564546|New York City|GPE|0.455834564546
related_who : |Both the FBI|ORG|0.414553402528|NSD|ORG|-0.110817321485|Trump Tower|ORG|0.292076124271
related_what : |
Triggers:
 |they|0.433423319239|March|0.189796103938|no records|0.340568678527|declare|VB|0.216978955155|possess|VBP|0.256154632404|indicating|VBG|0.239218453283|was|VBD|0.295191275485|wiretapped|VBN|0.222028863382
10
Disasters and accidents 	 2017/9/2 	 2017 disasters in Kenya
Seven CARDINAL schoolgirls are killed and ten CARDINAL hospitalised after a fire at the Moi Girls School ORG in Nairobi GPE , Kenya GPE .
Arguments:

related_when : |
related_where : |Nairobi|GPE|0.222126340306|Kenya|GPE|0.222126340306
related_who : |the Moi Girls School|ORG|0.22810133241
related_what : |Seven schoolgirls|CARDINAL|0.218349128256|ten|CARDINAL|0.261438554171
Triggers:
 |a fire|0.330839454957|are|VBP|0.334823165857|killed|VBN|0.364402171227|hospitalised|VBN|0.426725006401
11
International relations 	 2017/9/2 	 Timor Gap
Australia GPE and East Timor GPE settle a dispute between the two CARDINAL countries in the Timor Sea LOC .
Arguments:

related_when : |
related_where : |Australia|GPE|0.313285475626|East Timor|GPE|0.278371843071|the Timor Sea|LOC|0.365749520623
related_who : |
related_what : |the two countries|CARDINAL|0.584559746707
Triggers:
 |a dispute|0.509121296394|settle|VBP|0.303386619771
12
Law and crime 	 2017/9/2 	 Law enforcement in Cambodia
Cambodian NORP opposition leader Kem Sokha PERSON is arrested for alleged treason.
Arguments:

related_when : |
related_where : |
related_who : |Cambodian opposition leader Kem Sokha|NORP|0.342590643695|Cambodian opposition leader Kem Sokha|PERSON|0.342590643695
related_what : |
Triggers:
 |alleged treason|0.581041341985|is|VBZ|0.422295721413|arrested|VBN|0.509579831637|alleged|VBN|0.529273999204
13
Business and economy 	 2017/9/3 	 Economy of Cambodia
The Cambodia Daily newspaper ORG announces it will publish its final edition after being ordered to pay USD$6.3 million CARDINAL in taxes as a result of an investigation, initiated by Prime Minister of Cambodia GPE Hun Sen PERSON , into private companies operating in Cambodia GPE .
Arguments:

related_when : |
related_where : |The Cambodia Daily newspaper|GPE|0.470491825123|The Cambodia Daily newspaper|GPE|0.470491825123
related_who : |The Cambodia Daily newspaper|ORG|0.470491825123|Cambodia Hun Sen|PERSON|0.0984757782364
related_what : |USD$6.3 million|CARDINAL|0.353454167983
Triggers:
 |it|0.471583147914|private companies|0.624084544149|Prime Minister|0.382542511416|a result|0.443271208147|taxes|0.545024875318|an investigation|0.380493395073|its final edition|0.367476797751|Cambodia|0.222366604159|announces|VBZ|0.255008009299|publish|VB|0.222224701048|being|VBG|0.392026718307|ordered|VBN|0.106949859847|pay|VB|0.451283369926|initiated|VBN|0.239341900738|operating|VBG|0.457765880604
14
Disasters and accidents 	 2017/9/3 	 Hurricane Harvey
Hurricane Harvey PERSON is projected to possibly become the second ORDINAL costliest hurricane in the United States's GPE history, with estimates ranging from US$72 billion MONEY to over $125 billion MONEY . In comparison, Hurricane Katrina's EVENT total damage is estimated to be around $118 to $160 billion MONEY .
Arguments:

related_when : |
related_where : |the United States's history|GPE|0.351613673761
related_who : |Hurricane Harvey|PERSON|0.392259451014
related_what : |the second costliest hurricane|ORDINAL|0.35900132965|US$72 billion|MONEY|0.21560053351|over $125 billion|MONEY|0.188517916882|Hurricane Katrina's total damage|EVENT|0.565762162693|around $118 to $160 billion|MONEY|0.154305351587
Triggers:
 |estimates|0.318047078796|comparison|0.175073361951|is|VBZ|0.168897048868|projected|VBN|0.271110314601|become|VB|0.291297778686|ranging|VBG|0.296154392485|is|VBZ|0.168897048868|estimated|VBN|0.324633880452|be|VB|0.281163323306
15
Disasters and accidents 	 2017/9/3 	 2017 California wildfires
The La Tuna Fire ORG continues to burn in Los Angeles GPE , California GPE , becoming the largest fire in the city's history at around 5,800 acres QUANTITY . Mayor Eric Garcetti PERSON declares a state of emergency.
Arguments:

related_when : |
related_where : |Los Angeles|GPE|0.203899121292|California|GPE|0.266263241727
related_who : |The La Tuna Fire|ORG|0.288991587071|Mayor Eric Garcetti|PERSON|0.146733920305
related_what : |around 5,800 acres|QUANTITY|0.234410498845
Triggers:
 |the largest fire|0.366123665129|emergency|0.507407083596|a state|0.260954882189|the city's history|0.315794170104|continues|VBZ|0.272902433907|burn|VB|0.258744036808|becoming|VBG|0.28174227587|declares|VBZ|0.197068026023
16
Disasters and accidents 	 2017/9/3 	 Unexploded ordnance
Large portions of Frankfurt GPE , Germany GPE , are evacuated as local authorities work to defuse a bomb left over from a Royal Air Force ORG raid during World War II EVENT . The evacuation is the largest to occur in Europe LOC since World War II EVENT .
Arguments:

related_when : |
related_where : |Frankfurt|GPE|0.108952229158|Germany|GPE|0.169187339897|Europe|LOC|0.217309333547
related_who : |a Royal Air Force raid|ORG|0.270302638226
related_what : |World War II|EVENT|0.361291829215|World War II|EVENT|0.361291829215
Triggers:
 |local authorities|0.351156004567|a bomb|0.27832351347|The evacuation|0.474433452575|Large portions|0.22377748531|are|VBP|0.334823165857|evacuated|VBN|0.246880536844|work|VBP|0.261331804331|defuse|VB|0.386446738386|left|VBN|0.140788984351|is|VBZ|0.168897048868|occur|VB|0.544515261547
17
International relations 	 2017/9/3 	 2017 North Korea crisis
2017 CARDINAL North Korean NORP nuclear test FAC A 6.3 CARDINAL magnitude earthquake is detected near the Punggye-ri Nuclear Test Site LOC in Kilju County GPE , North Korea GPE , after North Korea GPE tests its sixth ORDINAL and most powerful nuclear weapon to date. North Korean NORP state media claims the country tested a hydrogen bomb that can be fitted on an ICBM NORP .
Arguments:

related_when : |
related_where : |the Punggye-ri Nuclear Test Site|LOC|0.436814241617|Kilju County|GPE|0.215681359642|2017 North Korean nuclear test|GPE|0.427116084126|2017 North Korean nuclear test|GPE|0.427116084126
related_who : |2017 North Korean nuclear test|NORP|0.427116084126|2017 North Korean nuclear test|NORP|0.427116084126|an ICBM|NORP|0.319470720851
related_what : |2017 North Korean nuclear test|CARDINAL|0.427116084126|A 6.3 magnitude earthquake|CARDINAL|0.266066753153|its sixth and most powerful nuclear weapon|ORDINAL|0.484554837886
Triggers:
 |North Korean state media|0.528687920285|date|0.206718475618|the country|0.538792147084|North Korea|0.343575725129|a hydrogen bomb|0.253070026941|is|VBZ|0.288241151705|detected|VBN|0.076053307989|tests|VBZ|0.219511198468|claims|VBZ|0.352559890085|tested|VBD|0.147766338173|be|VB|0.332742543705|fitted|VBN|0.0415482072214
18
Science and technology 	 2017/9/3 	 Free-electron laser
The European NORP X-ray free-electron laser is inaugurated in Hamburg GPE , Germany GPE .
Arguments:

related_when : |
related_where : |Hamburg|GPE|0.16417236916|Germany|GPE|0.22523847438
related_who : |The European X-ray free-electron laser|NORP|0.424491526623
related_what : |
Triggers:
 |is|VBZ|0.368288025514|inaugurated|VBN|0.220502195244
19
Armed attacks and conflicts 	 2017/9/4 	 Syrian Civil War
Syrian NORP state television reports the Syrian Army ORG reaches a point 3 kilometers QUANTITY from Deir GPE ez-Zor, a city besieged by the Islamic NORP State of Iraq GPE and the Levant ORG since 2014 DATE .
Arguments:

related_when : |2014|DATE|0.0491542888513
related_where : |Deir ez-Zor|GPE|-0.0590066802067|Iraq|GPE|0.371648268077
related_who : |Syrian state television|NORP|0.438559804509|the Syrian Army|ORG|0.573829628291|the Islamic State|NORP|0.480012098147|the Levant|ORG|0.0788780730697
related_what : |3 kilometers|QUANTITY|0.0414803111491
Triggers:
 |a city|0.363040196838|a point|0.370069577379|reports|VBZ|0.386635915025|reaches|VBZ|0.214020959367|besieged|VBN|0.521823484769
20
Business and economy 	 2017/9/4 	 Economy of the United States
United Technologies Corp ORG will buy airplane parts maker Rockwell Collins ORG for USD$30 Billion FAC , including seven billion CARDINAL in debt previously incurred by Rockwell Collins PERSON .
Arguments:

related_when : |
related_where : |
related_who : |United Technologies Corp|ORG|0.506393096195|Rockwell Collins|ORG|0.105153675115|Rockwell Collins|PERSON|0.105153675115
related_what : |seven billion|CARDINAL|0.434848684479
Triggers:
 |airplane parts maker|0.383601014482|debt|0.546465344851|buy|VB|0.30092415467|including|VBG|0.33253101402|incurred|VBN|0.46038781498
21
Business and economy 	 2017/9/4 	 Media of the United States
Tronc Inc. ORG , the Los Angeles Times ORG and the Chicago Tribune ORG publisher, buys the New York Daily News ORG .
Arguments:

related_when : |
related_where : |
related_who : |Tronc Inc.|ORG|0.186215667038|the Los Angeles Times|ORG|0.333799038299|the Chicago Tribune publisher|ORG|0.434655137008|the New York Daily News|ORG|0.516343650488
related_what : |
Triggers:
 |buys|VBZ|0.32326229952
22
Disasters and accidents 	 2017/9/4 	 2017 Atlantic hurricane season
Hurricane Irma EVENT Hurricane Irma PERSON is now a Category 4 QUANTITY hurricane with maximum sustained winds of 130 mph QUANTITY ( 215 km/h QUANTITY ). Hurricane EVENT warnings are issued for the Leeward Islands LOC in the Caribbean LOC , which are expected to be affected Tuesday DATE . Rainfalls PERSON of up to 10 inches QUANTITY ( 25 centimeters QUANTITY ) are possible. Irma PERSON is forecast to strengthen over the next 48 hours TIME . The governors of Puerto Rico GPE and Florida GPE issue states of emergency.
Arguments:

related_when : |Tuesday|DATE|0.12120660954|the next 48 hours|TIME|0.203661716508
related_where : |the Leeward Islands|LOC|0.247873127015|the Caribbean|LOC|0.208435926875|Puerto Rico|GPE|0.136524945534|Florida|GPE|0.247440250985
related_who : |Hurricane Irma|PERSON|0.357908785797|Rainfalls|PERSON|0.0|Hurricane Irma|PERSON|0.357908785797
related_what : |Hurricane Irma|EVENT|0.357908785797|a Category 4 hurricane|QUANTITY|0.375361751773|130 mph|QUANTITY|0.179235986107|215 km/h|QUANTITY|0.0774449918472|Hurricane Irma|EVENT|0.357908785797|up to 10 inches|QUANTITY|0.162286805315|25 centimeters|QUANTITY|0.0562783222763
Triggers:
 |h|0.0384976888206|The governors|0.203248037528|Hurricane warnings|0.561845313666|Irma|-0.00522683117666|215 km|0.0646489319549|states|0.327626734376|maximum sustained winds|0.374320373452|emergency|0.507407083596|is|VBZ|0.168897048868|are|VBP|0.334823165857|issued|VBN|0.178657014528|are|VBP|0.334823165857|expected|VBN|0.298831563871|be|VB|0.281163323306|affected|VBN|0.531034018282|are|VBP|0.334823165857|is|VBZ|0.168897048868|forecast|VBN|0.393553587171|strengthen|VB|0.212357447734
23
International relations 	 2017/9/4 	 2017 North Korea crisis
South Korean NORP Defense Minister Song Young PERSON -moo says it is worth reviewing deployment of U.S. GPE strategic assets (aircraft carriers, nuclear submarines, and B-52 PRODUCT bombers) to South Korea GPE more regularly.
Arguments:

related_when : |
related_where : |U.S. strategic assets|GPE|0.584852217979|South Korean Defense Minister Song Young-moo|GPE|0.408137876699
related_who : |South Korean Defense Minister Song Young-moo|NORP|0.408137876699|South Korean Defense Minister Song Young-moo|PERSON|0.408137876699
related_what : |B-52 bombers|PRODUCT|0.145975781535
Triggers:
 |it|0.310377940963|South Korea|0.351919548799|deployment|0.254030296323|nuclear submarines|0.334960129915|aircraft carriers|0.312171874531|says|VBZ|0.270846762412|is|VBZ|0.288241151705|reviewing|VBG|0.261712066068
24
International relations 	 2017/9/4 	 Crisis in Venezuela
The opposition movement in Venezuela GPE seeks help from France GPE .
Arguments:

related_when : |
related_where : |Venezuela|GPE|0.196721537552|France|GPE|0.315508702161
related_who : |
related_what : |
Triggers:
 |help|0.342547838316|The opposition movement|0.482940938463|seeks|VBZ|0.400500035446
25
Law and crime 	 2017/9/4 	 Crime in Italy
Italian NORP fugitive and 'Ndrangheta member Rocco Morabito PERSON is arrested in Montevideo GPE , Uruguay GPE , after 23 years DATE on the run. He is now expected to be extradited to Italy GPE in the coming months DATE .
Arguments:

related_when : |23 years|DATE|0.295314025423|the coming months|DATE|0.385382241268
related_where : |Montevideo|GPE|0.12830496218|Uruguay|GPE|0.12830496218|Italy|GPE|0.17969688972
related_who : |Italian fugitive and 'Ndrangheta member Rocco Morabito|NORP|0.559902132756|Italian fugitive and 'Ndrangheta member Rocco Morabito|PERSON|0.559902132756
related_what : |
Triggers:
 |He|0.446581606332|the run|0.394003976412|is|VBZ|0.422295721413|arrested|VBN|0.509579831637|is|VBZ|0.422295721413|expected|VBN|0.313002139728|be|VB|0.413073447268|extradited|VBN|0.464506605587|coming|VBG|0.333901245547
26
Politics and elections 	 2017/9/4 	 Politics of Taiwan
Premier of the Republic of China GPE Lin Chuan PERSON offers his resignation as head of the Executive Branch ORG of Taiwan GPE .
Arguments:

related_when : |
related_where : |the Republic of China|GPE|0.469497943587|Taiwan|GPE|0.15698466789
related_who : |Lin Chuan|PERSON|0.0236051202478|the Executive Branch|ORG|0.39457847627
related_what : |
Triggers:
 |his resignation|0.443677278273|Premier|0.199533175525|China|0.156207825947|head|0.20214664283|the Republic|0.534595955881|offers|VBZ|0.0491723153135
27
Armed conflicts and attacks 	 2017/9/5 	 Syrian Civil War
Siege of Deir ez-Zor ( 2014鈥?7 PERSON ) GPE The Syrian Army ORG lifts the 28-month CARDINAL -long siege of Deir PRODUCT ez-Zor by the Islamic State of Iraq ORG and the Levant ORG .
Arguments:

related_when : |
related_where : |

|GPE|0.0
related_who : |2014鈥?7|PERSON|0.0|The Syrian Army|ORG|0.573829618817|the Islamic State of Iraq|ORG|0.527890415259|the Levant|ORG|0.0788780717674
related_what : |the 28-month-long siege|CARDINAL|0.470934522013|Deir ez-Zor|PRODUCT|-0.0590066792325
Triggers:
 |Siege|0.521823476154|Iraq|0.371648261941|the Islamic State|0.480012090222|lifts|VBZ|0.100133733104
28
Armed conflicts and attacks 	 2017/9/5 	 2016鈥?7 Rohingya persecution in Myanmar
More than 123,000 CARDINAL Rohingya PERSON refugees have fled Myanmar GPE and crossed into Bangladesh GPE due to escalating violence by the Myanmar Army ORG .
Arguments:

related_when : |
related_where : |Myanmar|GPE|0.110897563343|Bangladesh|GPE|0.243876441692
related_who : |More than 123,000 Rohingya refugees|PERSON|0.476712677498|the Myanmar Army|ORG|0.516522867508
related_what : |More than 123,000 Rohingya refugees|CARDINAL|0.476712677498
Triggers:
 |escalating violence|0.70722532895|have|VBP|0.425711278369|fled|VBN|0.449656575014|crossed|VBD|0.324421933027|escalating|VBG|0.533119524967
29
Disasters and accidents 	 2017/9/5 	 2017 Atlantic hurricane season
Hurricane Irma EVENT Hurricane Irma EVENT strengthens to a maximum Category 5 QUANTITY hurricane, becoming the strongest Atlantic LOC hurricane since 2005 DATE 's Hurricane Wilma EVENT in terms of maximum sustained winds, described as "extremely dangerous." The National Hurricane Center WORK_OF_ART ( NHC ORG ) predicts that Irma PERSON could strengthen even more due to favorable conditions.
Arguments:

related_when : |2005's Hurricane Wilma|DATE|0.345699893042
related_where : |the strongest Atlantic hurricane|LOC|0.420068582962
related_who : |(NHC|ORG|-0.00701237427904|Hurricane Irma|PERSON|0.357908785797
related_what : |Hurricane Irma|EVENT|0.357908785797|Hurricane Irma|EVENT|0.357908785797|a maximum Category 5 hurricane|QUANTITY|0.360076674442|2005's Hurricane Wilma|EVENT|0.345699893042|The National Hurricane Center|WORK_OF_ART|0.418866405429
Triggers:
 |maximum sustained winds|0.374320373452|Irma|-0.00522683117666|favorable conditions|0.380056787407|terms|0.242992989322|strengthens|VBZ|0.212357447734|becoming|VBG|0.28174227587|described|VBN|0.200793607194|predicts|VBZ|0.339945043135|strengthen|VB|0.212357447734
30
International relations 	 2017/9/5 	 India鈥揗yanmar relations
Indian NORP Prime Minister Narendra Modi PERSON heads to Myanmar GPE for a state visit.
Arguments:

related_when : |
related_where : |Myanmar|GPE|0.268959022181
related_who : |Indian Prime Minister Narendra Modi|NORP|0.333002686707|Indian Prime Minister Narendra Modi|PERSON|0.333002686707
related_what : |
Triggers:
 |a state visit|0.433511615993
31
International relations 	 2017/9/5 	 2017 North Korea crisis
Japan鈥揢nited States GPE relations, South Korea鈥揢nited States GPE relations FAC Amid tensions from South Korea GPE , U.S. GPE President Donald Trump PERSON announces that he is "allowing Japan GPE and South Korea GPE to buy a substantially increased amount of highly sophisticated military equipment from the United States GPE ." ( The Independent ORG ).
Arguments:

related_when : |
related_where : |Japan鈥揢nited States relations|GPE|0.793477104429|South Korea鈥揢nited States relations|GPE|0.708819330836|South Korea鈥揢nited States relations|GPE|0.708819330836|U.S. President Donald Trump|GPE|0.413748916657|Japan鈥揢nited States relations|GPE|0.793477104429|South Korea鈥揢nited States relations|GPE|0.708819330836|the United States|GPE|0.554561069776
related_who : |U.S. President Donald Trump|PERSON|0.413748916657|(The Independent|ORG|0.442440109051
related_what : |
Triggers:
 |he|0.329023423376|Japan|0.286642157745|South Korea|0.351919548799|a substantially increased amount|0.356462363157|highly sophisticated military equipment|0.500948522712|tensions|0.306615237557|announces|VBZ|0.291151386038|is|VBZ|0.288241151705|allowing|VBG|0.32130924819|buy|VB|0.176762466585|increased|VBN|0.330483266389
32
Law and crime 	 2017/9/5 	 Journalists killed in India
Prominent Indian NORP journalist and Lankesh Patrike PERSON editor Gauri Lankesh PERSON is shot dead by unidentified men outside her house in the city of Bengaluru GPE .
Arguments:

related_when : |
related_where : |Bengaluru|GPE|0.0651254167868
related_who : |Prominent Indian journalist|NORP|0.42397370554|Lankesh Patrike editor|PERSON|0.17564787189|Gauri Lankesh|PERSON|0.0
related_what : |
Triggers:
 |the city|0.500354653334|unidentified men|0.388254146727|her house|0.409643719279|is|VBZ|0.422295721413|shot|VBN|0.265817437951
33
Law and crime 	 2017/9/5 	 Australian Marriage Law Postal Survey (Same-sex marriage)
The High Court of Australia ORG receives a complaint against the Australian NORP Marriage Law Postal Survey citing the postal survey as "unique and offensive." (The Guardian) WORK_OF_ART .
Arguments:

related_when : |
related_where : |
related_who : |The High Court of Australia|ORG|0.587607746366|the Australian Marriage Law Postal Survey|NORP|0.711241499187
related_what : |(The Guardian)|WORK_OF_ART|0.361361944156
Triggers:
 |Australia|0.25587200476|(The Guardian|0.413507017033|The High Court|0.593857575155|a complaint|0.515442505477|the postal survey|0.453356289133|receives|VBZ|0.184443863222|citing|VBG|0.33119984825
34
Politics and elections 	 2017/9/5 	 Politics of Taiwan
Tainan GPE Mayor William Lai Ching-te PERSON is appointed as the new Premier by the President of the Republic of China GPE Tsai Ing-wen PERSON .
Arguments:

related_when : |
related_where : |Tainan Mayor William Lai Ching-te|GPE|0.269547018921|the Republic of China|GPE|0.469497943587
related_who : |Tainan Mayor William Lai Ching-te|PERSON|0.269547018921|Tsai Ing-wen|PERSON|0.1266557292
related_what : |
Triggers:
 |the Republic|0.534595955881|China|0.156207825947|the new Premier|0.332105353284|the President|0.521588055935|is|VBZ|0.271034264143|appointed|VBN|0.302930283368
35
Politics and elections 	 2017/9/5 	 Deferred Action for Childhood Arrivals
The Trump Administration ORG announces that, during the next six months DATE , it will be ending the DACA ORG program that has halted the deportation of about 800,000 CARDINAL people who were brought to the U.S. GPE illegally as children. Congress ORG is called upon to pass legislation to correct the situation.
Arguments:

related_when : |the next six months|DATE|0.338907360369
related_where : |the U.S.|GPE|0.437061635613
related_who : |The Trump Administration|ORG|0.591466005224|the DACA program|ORG|0.280110416205|Congress|ORG|0.57855267145
related_what : |about 800,000 people|CARDINAL|0.327572046668
Triggers:
 |it|0.359504523576|who|0.298166567757|the situation|0.45906628286|children|0.219449375835|legislation|0.497035700057|the deportation|0.395922391098|announces|VBZ|0.216135332413|be|VB|0.313054534805|ending|VBG|0.300788915757|has|VBZ|0.305889900052|halted|VBN|0.26774693886|were|VBD|0.29053448751|brought|VBN|0.31831567405|is|VBZ|0.271034264143|called|VBN|0.245750025744|pass|VB|0.185807649204|correct|VB|0.251773396354
36
Sports 	 2017/9/5 	 2017鈥?8 NBA season
Billionaire investor Tilman Fertitta PERSON buys the Houston Rockets NBA ORG basketball team for US$2.2 billion MONEY .
Arguments:

related_when : |
related_where : |
related_who : |Billionaire investor Tilman Fertitta|PERSON|0.11165074302|the Houston Rockets NBA basketball team|ORG|0.599674816765
related_what : |US$2.2 billion|MONEY|0.0792160550851
Triggers:
 |buys|VBZ|0.19532524527

In [314]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from collections import defaultdict

In [317]:
def efitf(X):
    count = CountVectorizer()
    X_train_count = count.fit_transform(X)
    tfidf = TfidfTransformer(use_idf=True)
    X_train_tfidf = tfidf.fit_transform(X_train_count)
    tf_feature_names = count.get_feature_names()
    X_train_tfidf = [list(i) for i in list(X_train_tfidf.toarray())]
    EFITF = defaultdict(list)
    for Type,values in enumerate(X_train_tfidf):
        #pdb.set_trace()
        for index,value in enumerate(values):
            if value > 0.0:
                EFITF[Type].append({tf_feature_names[index]:value}) 
    return EFITF

In [229]:
df_demo.head()


Out[229]:
class date description title class_code des_clean
0 Disasters and accidents 2017/9/1 Hurricane Irma, now a Category 2 hurricane wit... 2017 Atlantic hurricane season 2 Hurricane Irma, now a Category 2 hurricane wit...
1 Health and medicine 2017/9/1 Researchers report, in the Environmental Scien... Great Lakes 10 Researchers report, in the Environmental Scien...
2 International relations 2017/9/1 South Korean President Moon Jae-in and U.S. Pr... South Korea鈥揢nited States relations 5 South Korean President Moon Jae-in and U.S. Pr...
3 Law and crime 2017/9/1 United States federal judge Richard Posner has... United States Court of Appeals for the Seventh... 3 United States federal judge Richard Posner has...
4 Politics and elections 2017/9/1 The Labour Party, led by Jacinda Ardern, surge... New Zealand general election, 2017 4 The Labour Party, led by Jacinda Ardern, surge...

In [233]:
df_demo[df_demo['class_code'] == 1]['des_clean'].tolist()


Out[233]:
['Seven people are killed and 13 are injured after suicide bombers hit a state-run power station near the northern city of Samarra, Iraq. The Islamic State of Iraq and the Levant claims responsibility for the attack.',
 'Syrian state television reports the Syrian Army reaches a point 3 kilometers from Deir ez-Zor, a city besieged by the Islamic State of Iraq and the Levant since 2014.',
 'Siege of Deir ez-Zor (2014鈥?7)\r\n\r\nThe Syrian Army lifts the 28-month-long siege of Deir ez-Zor by the Islamic State of Iraq and the Levant.',
 'More than 123,000 Rohingya refugees have fled Myanmar and crossed into Bangladesh due to escalating violence by the Myanmar Army.']

In [239]:
X = [0]*11

In [242]:
for i in range(11):
    X[i] = ' '.join(df_demo[df_demo['class_code'] == i]['des_clean'].tolist())

In [318]:
EFITF =  efitf(X)

In [415]:
doc = nlp('The Cambodia Daily newspaper announces it will publish its final edition after being ordered to pay million in taxes as result of an investigation initiated by Prime Minister of Cambodia Hun Sen into private companies operating in Cambodia')

In [329]:
doc


Out[329]:
The Cambodia Daily newspaper announces it will publish its final edition after being ordered to pay USD$6.3 million in taxes as a result of an investigation, initiated by Prime Minister of Cambodia Hun Sen , into private companies operating in Cambodia .

In [332]:
doc.ents


Out[332]:
(The Cambodia Daily newspaper, USD$6.3 million, Cambodia, Hun Sen, Cambodia)

In [338]:
ents_texts = [i.text for i in doc.ents]

In [419]:
[i.text for i in doc.noun_chunks if i.text not in ents_texts and not i.is_stop]


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-419-acd94cc15e2b> in <module>()
----> 1 [i.text for i in doc.noun_chunks if i.text not in ents_texts and not i.is_stop]

<ipython-input-419-acd94cc15e2b> in <listcomp>(.0)
----> 1 [i.text for i in doc.noun_chunks if i.text not in ents_texts and not i.is_stop]

AttributeError: 'spacy.tokens.span.Span' object has no attribute 'is_stop'

efitf vs doc2vec

for i in doc: print(i.text,i.tag_,i.pos_)
import json import codecs
EFITF = ''
with codecs.open('EFITF.json','r',encoding='utf-8') as f: EFITF = json.load(f)
EFITF['6']['european']
doc1 = nlp('The European free electron laser is inaugurated in Hamburg Germany')
for i in doc: print(i.text,i.tag_,i.pos_)
for i in doc: print(i.text,EFITF['7'][i.text.lower()])
for i in doc: print(i.text,nlp('business').similarity(nlp(i.text.lower())))