In [1]:
import math
import pandas as pd
import numpy as np
np.random.seed(2018)
import time

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')


from textblob import Word, TextBlob as tb

import spacy
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package wordnet to /Users/tashlin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tashlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

In [2]:
#read in data
data_path = "../data/csv_files/data_astro-ph.csv"
df = pd.read_csv(data_path, index_col= [0])

In [3]:
df.head()


Out[3]:
title summary
0 Gamma-Ray Bursts as the Death Throes of Massiv... It is proposed that gamma-ray bursts are cre...
1 Gravitational Lensing and the Variability of G The four observables associated with gravita...
2 The Ptolemaic Gamma-Ray Burst Universe The BATSE experiment on GRO has demonstrated...
3 Expanding Photospheres of Type II Supernovae a... We use the Expanding Photosphere Method to d...
4 Radiation Transfer in Gamma-Ray Bursts We have calculated gamma-ray radiative trans...

Clean Up and Conversion for Processing


In [4]:
def get_blob(some_string):
    #return converted textblob from string
    return tb(some_string)

def newline_replace(some_string):
    #replace newline character with space
    return some_string.replace('- ', '').replace('\n', ' ')

TF-IDF algo


In [5]:
def tf(word, blob):
    #term frequency, word count over all words
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    #returns the number of documents containing word
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    #inverse document frequency, measures how common a word is among all documents in bloblist
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    #computes the TF-IDF score, product of tf and idf.
    return tf(word, blob) * idf(word, bloblist)

Noun_Phrases from TextBlob


In [6]:
def get_noun_phrases(textblob_string):
    #return list of unique noun phrases
    np = textblob_string.noun_phrases
    return list(set(np))

def clean_up_kw(strng_lst):
    #filter out all non alpha characters 
    #convert empty space and dashes to underscores inline with tfidf bigram/trigrams
    new_list = []
    for strng in strng_lst:
        strng = strng.replace("-"," ")
        check_alpha = ''.join([i for i in strng if i.isalpha() or i=="_" or i==' ']) 
        join_alpha = "_".join(check_alpha.split())
        new_list.append(join_alpha)
    return list(set(new_list))

Arxiv Data Specific processing


In [7]:
#combine summary and title for analysis
#replace newline characters in string
df['comb'] = (df['title']+ " " + df["summary"].map(str)).apply(newline_replace)
df.head()


Out[7]:
title summary comb
0 Gamma-Ray Bursts as the Death Throes of Massiv... It is proposed that gamma-ray bursts are cre... Gamma-Ray Bursts as the Death Throes of Massiv...
1 Gravitational Lensing and the Variability of G The four observables associated with gravita... Gravitational Lensing and the Variability of G...
2 The Ptolemaic Gamma-Ray Burst Universe The BATSE experiment on GRO has demonstrated... The Ptolemaic Gamma-Ray Burst Universe The B...
3 Expanding Photospheres of Type II Supernovae a... We use the Expanding Photosphere Method to d... Expanding Photospheres of Type II Supernovae a...
4 Radiation Transfer in Gamma-Ray Bursts We have calculated gamma-ray radiative trans... Radiation Transfer in Gamma-Ray Bursts We ha...

In [8]:
def clean_up_text(strng):
    strng_replaced = strng.replace("-"," ")
    filter_alnum = ''.join([i for i in strng_replaced if i.isalnum() or i=="_" or i==' ' or i=='.'])
    return filter_alnum

In [9]:
df['cleaned'] = df['comb'].apply(clean_up_text)

In [10]:
df['cleaned'][0]


Out[10]:
'Gamma Ray Bursts as the Death Throes of Massive Binary Stars   It is proposed that gamma ray bursts are created in the mergers of double neutron star binaries and black hole neutron star binaries at cosmological distances. Bursts with complex profiles and relatively long durations are the result of magnetic flares generated by the Parker instability in a post merger differentially rotating disk. Some bursts may also be produced through neutrino antineutrino annihilation into electrons and positrons. In both cases an optically thick fireball of size sles 100 km is initially created which expands ultrarelativistically to large radii before radiating. Several previous objections to the cosmological merger model are eliminated. It is predicted that gamma ray bursts will be accompanied by a burst of gravitational radiation from the spiraling in binary which could be detected by LIGO. '

In [ ]:


In [ ]:


In [ ]:


In [11]:
#convert string to textblob
df['summary_tb'] = df['comb'].apply(get_blob)

In [12]:
df.head()


Out[12]:
title summary comb cleaned summary_tb
0 Gamma-Ray Bursts as the Death Throes of Massiv... It is proposed that gamma-ray bursts are cre... Gamma-Ray Bursts as the Death Throes of Massiv... Gamma Ray Bursts as the Death Throes of Massiv... (G, a, m, m, a, -, R, a, y, , B, u, r, s, t, ...
1 Gravitational Lensing and the Variability of G The four observables associated with gravita... Gravitational Lensing and the Variability of G... Gravitational Lensing and the Variability of G... (G, r, a, v, i, t, a, t, i, o, n, a, l, , L, ...
2 The Ptolemaic Gamma-Ray Burst Universe The BATSE experiment on GRO has demonstrated... The Ptolemaic Gamma-Ray Burst Universe The B... The Ptolemaic Gamma Ray Burst Universe The B... (T, h, e, , P, t, o, l, e, m, a, i, c, , G, ...
3 Expanding Photospheres of Type II Supernovae a... We use the Expanding Photosphere Method to d... Expanding Photospheres of Type II Supernovae a... Expanding Photospheres of Type II Supernovae a... (E, x, p, a, n, d, i, n, g, , P, h, o, t, o, ...
4 Radiation Transfer in Gamma-Ray Bursts We have calculated gamma-ray radiative trans... Radiation Transfer in Gamma-Ray Bursts We ha... Radiation Transfer in Gamma Ray Bursts We ha... (R, a, d, i, a, t, i, o, n, , T, r, a, n, s, ...

Implement Noun Phrases


In [13]:
#get noun phrases from TextBlob
df['np_dirty'] = df['summary_tb'].apply(get_noun_phrases)

In [14]:
#get cleaned noun phrases
df['np_clean'] = df['np_dirty'].apply(clean_up_kw)

In [ ]:


In [ ]:


In [ ]:

Preproccess for TF-IDF


In [15]:
def word_lemmatize(text):
    #lemmatize text
    return WordNetLemmatizer().lemmatize(text)

def sent_to_words(sentences):
    #convert sentences to individual words
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
    
def remove_stopwords(texts):
    # remove stopwords from corpus
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
              
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [16]:
words = list(sent_to_words(df['comb']))
data_words = [[word_lemmatize(kw) for kw in lst] for lst in words]



# hyperparameters threshold need tuning
# higher threshold fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=4, threshold=20) 
trigram = gensim.models.Phrases(bigram[data_words], threshold=20)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
print("\n\n")
print(data_words[0])


['gamma_ray_burst', 'a', 'the', 'death', 'throe', 'of', 'massive', 'binary', 'star', 'it', 'is', 'proposed', 'that', 'gamma_ray_burst', 'are', 'created', 'in', 'the', 'merger', 'of', 'double', 'neutron_star', 'binary', 'and', 'black_hole', 'neutron_star', 'binary', 'at', 'cosmological', 'distance', 'burst', 'with', 'complex', 'profile', 'and', 'relatively', 'long', 'duration', 'are', 'the', 'result', 'of', 'magnetic_flare', 'generated', 'by', 'the', 'parker_instability', 'in', 'post', 'merger', 'differentially_rotating', 'disk', 'some', 'burst', 'may', 'also', 'be', 'produced', 'through', 'neutrino_antineutrino', 'annihilation', 'into', 'electron', 'and', 'positron', 'in', 'both', 'case', 'an', 'optically_thick', 'fireball', 'of', 'size', 'sle', 'km', 'is', 'initially', 'created', 'which', 'expands', 'to', 'large', 'radius', 'before', 'radiating', 'several', 'previous', 'objection', 'to', 'the', 'cosmological', 'merger', 'model', 'are', 'eliminated', 'it', 'is', 'predicted', 'that', 'gamma_ray_burst', 'will_be', 'accompanied_by', 'burst', 'of', 'gravitational_radiation', 'from', 'the', 'spiraling', 'in', 'binary', 'which', 'could_be', 'detected', 'by', 'ligo']



['gamma', 'ray', 'burst', 'a', 'the', 'death', 'throe', 'of', 'massive', 'binary', 'star', 'it', 'is', 'proposed', 'that', 'gamma', 'ray', 'burst', 'are', 'created', 'in', 'the', 'merger', 'of', 'double', 'neutron', 'star', 'binary', 'and', 'black', 'hole', 'neutron', 'star', 'binary', 'at', 'cosmological', 'distance', 'burst', 'with', 'complex', 'profile', 'and', 'relatively', 'long', 'duration', 'are', 'the', 'result', 'of', 'magnetic', 'flare', 'generated', 'by', 'the', 'parker', 'instability', 'in', 'post', 'merger', 'differentially', 'rotating', 'disk', 'some', 'burst', 'may', 'also', 'be', 'produced', 'through', 'neutrino', 'antineutrino', 'annihilation', 'into', 'electron', 'and', 'positron', 'in', 'both', 'case', 'an', 'optically', 'thick', 'fireball', 'of', 'size', 'sle', 'km', 'is', 'initially', 'created', 'which', 'expands', 'to', 'large', 'radius', 'before', 'radiating', 'several', 'previous', 'objection', 'to', 'the', 'cosmological', 'merger', 'model', 'are', 'eliminated', 'it', 'is', 'predicted', 'that', 'gamma', 'ray', 'burst', 'will', 'be', 'accompanied', 'by', 'burst', 'of', 'gravitational', 'radiation', 'from', 'the', 'spiraling', 'in', 'binary', 'which', 'could', 'be', 'detected', 'by', 'ligo']

In [17]:
paper_number = 12
print(trigram_mod[bigram_mod[data_words[paper_number]]])
print("\n\n")
print(data_words[paper_number])


['dynamic', 'of', 'the', 'intermediate_age', 'elliptical', 'lmc', 'cluster', 'ngc', 'in', 'this_paper', 'we', 'investigate', 'the', 'internal', 'dynamic', 'of', 'the', 'lmc', 'cluster', 'ngc', 'through', 'the', 'use', 'of', 'photometric', 'ccd_image', 'and', 'kinematic', 'stellar', 'radial_velocity', 'data', 'we', 'apply', 'variety', 'of', 'dynamical', 'model', 'to', 'this', 'data', 'including', 'multi', 'mass', 'king_michie', 'model', 'and', 'rotating', 'and', 'non_rotating', 'oblate_spheroid', 'model', 'we', 'discus', 'the', 'cluster', 'mass', 'to', 'light_ratio', 'and', 'place_constraint_on', 'the', 'cluster', 'mass', 'function']



['dynamic', 'of', 'the', 'intermediate', 'age', 'elliptical', 'lmc', 'cluster', 'ngc', 'in', 'this', 'paper', 'we', 'investigate', 'the', 'internal', 'dynamic', 'of', 'the', 'lmc', 'cluster', 'ngc', 'through', 'the', 'use', 'of', 'photometric', 'ccd', 'image', 'and', 'kinematic', 'stellar', 'radial', 'velocity', 'data', 'we', 'apply', 'variety', 'of', 'dynamical', 'model', 'to', 'this', 'data', 'including', 'multi', 'mass', 'king', 'michie', 'model', 'and', 'rotating', 'and', 'non', 'rotating', 'oblate', 'spheroid', 'model', 'we', 'discus', 'the', 'cluster', 'mass', 'to', 'light', 'ratio', 'and', 'place', 'constraint', 'on', 'the', 'cluster', 'mass', 'function']

In [ ]:


In [18]:
#Snowball stemmer aka porter2 stemmer regarded as better stemmer vs regular Porter
stemmer = SnowballStemmer("english")
stop_words = stopwords.words('english')
stop_words.extend(['compare', 'constraint'])   # expand the stopwords dictionary includes high frequence words

In [ ]:


In [19]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_trigrams = make_trigrams(data_words_bigrams)

In [20]:
# Preproccesed with bi/trigrams in pandas series
df_pre_tb = pd.Series(data_words_trigrams).to_frame(name='summary')

In [ ]:


In [21]:
def first_join_list(some_list):
    return " ".join(some_list)

In [22]:
df_pre = df_pre_tb['summary'].apply(first_join_list)

In [23]:
print(data_words[paper_number])
print("****")
print(df_pre[12])


['dynamic', 'of', 'the', 'intermediate', 'age', 'elliptical', 'lmc', 'cluster', 'ngc', 'in', 'this', 'paper', 'we', 'investigate', 'the', 'internal', 'dynamic', 'of', 'the', 'lmc', 'cluster', 'ngc', 'through', 'the', 'use', 'of', 'photometric', 'ccd', 'image', 'and', 'kinematic', 'stellar', 'radial', 'velocity', 'data', 'we', 'apply', 'variety', 'of', 'dynamical', 'model', 'to', 'this', 'data', 'including', 'multi', 'mass', 'king', 'michie', 'model', 'and', 'rotating', 'and', 'non', 'rotating', 'oblate', 'spheroid', 'model', 'we', 'discus', 'the', 'cluster', 'mass', 'to', 'light', 'ratio', 'and', 'place', 'constraint', 'on', 'the', 'cluster', 'mass', 'function']
****
dynamic intermediate_age elliptical lmc cluster ngc paper investigate internal dynamic lmc cluster ngc use photometric ccd_image kinematic stellar radial_velocity data apply variety dynamical model data including multi mass king_michie model rotating non_rotating oblate_spheroid model discus cluster mass light_ratio place cluster mass function

In [24]:
df_tb = df_pre.apply(get_blob)

In [25]:
df_tb.head()


Out[25]:
0    (g, a, m, m, a, _, r, a, y, _, b, u, r, s, t, ...
1    (g, r, a, v, i, t, a, t, i, o, n, a, l, _, l, ...
2    (p, t, o, l, e, m, a, i, c,  , g, a, m, m, a, ...
3    (e, x, p, a, n, d, i, n, g, _, p, h, o, t, o, ...
4    (r, a, d, i, a, t, i, o, n,  , t, r, a, n, s, ...
Name: summary, dtype: object

For timesake limit the amount of data to 1000 rows


In [26]:
df_limit = df_tb[:1000]

In [27]:
#return top 15 keywords for each document/summary
def tfidf_keywords(blob,df_blob):
    scores = {word: tfidf(word, blob,df_blob) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [item[0] for item in sorted_words[:15]]

In [28]:
#run tfidf and print time taken
start = time.perf_counter()
results = [tfidf_keywords(blob, df_limit) for blob in df_limit]
finish = time.perf_counter()
print(f'finished in: {round(finish-start,2)}')


finished in: 130.16

In [ ]:


In [ ]:


In [29]:
df.head()


Out[29]:
title summary comb cleaned summary_tb np_dirty np_clean
0 Gamma-Ray Bursts as the Death Throes of Massiv... It is proposed that gamma-ray bursts are cre... Gamma-Ray Bursts as the Death Throes of Massiv... Gamma Ray Bursts as the Death Throes of Massiv... (G, a, m, m, a, -, R, a, y, , B, u, r, s, t, ... [double neutron star binaries, massive binary,... [gamma_ray_bursts, black_hole_neutron_star_bin...
1 Gravitational Lensing and the Variability of G The four observables associated with gravita... Gravitational Lensing and the Variability of G... Gravitational Lensing and the Variability of G... (G, r, a, v, i, t, a, t, i, o, n, a, l, , L, ... [$ \dot {, distant quasars, variability, relat... [, estimate_constraints, omega_, variability, ...
2 The Ptolemaic Gamma-Ray Burst Universe The BATSE experiment on GRO has demonstrated... The Ptolemaic Gamma-Ray Burst Universe The B... The Ptolemaic Gamma Ray Burst Universe The B... (T, h, e, , P, t, o, l, e, m, a, i, c, , G, ... [cosmic gamma-ray bursts, jennings, $ { \it, m... [log_s, gamma_ray_bursts, jennings, high_recoi...
3 Expanding Photospheres of Type II Supernovae a... We use the Expanding Photosphere Method to d... Expanding Photospheres of Type II Supernovae a... Expanding Photospheres of Type II Supernovae a... (E, x, p, a, n, d, i, n, g, , P, h, o, t, o, ... [variable stars, surface brightness fluctuatio... [, distance_measurement, mpc, good_agreement, ...
4 Radiation Transfer in Gamma-Ray Bursts We have calculated gamma-ray radiative trans... Radiation Transfer in Gamma-Ray Bursts We ha... Radiation Transfer in Gamma Ray Bursts We ha... (R, a, d, i, a, t, i, o, n, , T, r, a, n, s, ... [march, spherical, flare models, mev, gamma-ra... [march, effective_collimation, two_photon_pair...

In [30]:
#create new df for limited*  results
df_test = df[:1000]

In [31]:
#add keywords results to dataframe
df_test['keywords'] = results


/Users/tashlin/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  

In [32]:
df_test.head()


Out[32]:
title summary comb cleaned summary_tb np_dirty np_clean keywords
0 Gamma-Ray Bursts as the Death Throes of Massiv... It is proposed that gamma-ray bursts are cre... Gamma-Ray Bursts as the Death Throes of Massiv... Gamma Ray Bursts as the Death Throes of Massiv... (G, a, m, m, a, -, R, a, y, , B, u, r, s, t, ... [double neutron star binaries, massive binary,... [gamma_ray_bursts, black_hole_neutron_star_bin... [binary, merger, created, burst, gamma_ray_bur...
1 Gravitational Lensing and the Variability of G The four observables associated with gravita... Gravitational Lensing and the Variability of G... Gravitational Lensing and the Variability of G... (G, r, a, v, i, t, a, t, i, o, n, a, l, , L, ... [$ \dot {, distant quasars, variability, relat... [, estimate_constraints, omega_, variability, ... [dot, gravitational_lensing, lensing, splittin...
2 The Ptolemaic Gamma-Ray Burst Universe The BATSE experiment on GRO has demonstrated... The Ptolemaic Gamma-Ray Burst Universe The B... The Ptolemaic Gamma Ray Burst Universe The B... (T, h, e, , P, t, o, l, e, m, a, i, c, , G, ... [cosmic gamma-ray bursts, jennings, $ { \it, m... [log_s, gamma_ray_bursts, jennings, high_recoi... [gamma_ray_burst, born, recoil, become, neutro...
3 Expanding Photospheres of Type II Supernovae a... We use the Expanding Photosphere Method to d... Expanding Photospheres of Type II Supernovae a... Expanding Photospheres of Type II Supernovae a... (E, x, p, a, n, d, i, n, g, , P, h, o, t, o, ... [variable stars, surface brightness fluctuatio... [, distance_measurement, mpc, good_agreement, ... [sn, type_ii, distance, supernova, extinction,...
4 Radiation Transfer in Gamma-Ray Bursts We have calculated gamma-ray radiative trans... Radiation Transfer in Gamma-Ray Bursts We ha... Radiation Transfer in Gamma Ray Bursts We ha... (R, a, d, i, a, t, i, o, n, , T, r, a, n, s, ... [march, spherical, flare models, mev, gamma-ra... [march, effective_collimation, two_photon_pair... [discrete, gamma_ray_burst, energy, optical_de...

Combine Noun Phrases and TFIDF


In [33]:
df_test['np+tf'] = df_test['keywords'] + df_test['np_clean']


/Users/tashlin/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [ ]:


In [34]:
#get textblob of keywords for analysis
def get_textblob_kw(data):
    string_list = ' '.join(data)
    text_blob = tb(string_list)
    return text_blob

In [35]:
#textblob of keywords to dataframe column
df_test['key_tb'] = df_test['np+tf'].apply(get_textblob_kw)


/Users/tashlin/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  

In [36]:
df_test.head()


Out[36]:
title summary comb cleaned summary_tb np_dirty np_clean keywords np+tf key_tb
0 Gamma-Ray Bursts as the Death Throes of Massiv... It is proposed that gamma-ray bursts are cre... Gamma-Ray Bursts as the Death Throes of Massiv... Gamma Ray Bursts as the Death Throes of Massiv... (G, a, m, m, a, -, R, a, y, , B, u, r, s, t, ... [double neutron star binaries, massive binary,... [gamma_ray_bursts, black_hole_neutron_star_bin... [binary, merger, created, burst, gamma_ray_bur... [binary, merger, created, burst, gamma_ray_bur... (b, i, n, a, r, y, , m, e, r, g, e, r, , c, ...
1 Gravitational Lensing and the Variability of G The four observables associated with gravita... Gravitational Lensing and the Variability of G... Gravitational Lensing and the Variability of G... (G, r, a, v, i, t, a, t, i, o, n, a, l, , L, ... [$ \dot {, distant quasars, variability, relat... [, estimate_constraints, omega_, variability, ... [dot, gravitational_lensing, lensing, splittin... [dot, gravitational_lensing, lensing, splittin... (d, o, t, , g, r, a, v, i, t, a, t, i, o, n, ...
2 The Ptolemaic Gamma-Ray Burst Universe The BATSE experiment on GRO has demonstrated... The Ptolemaic Gamma-Ray Burst Universe The B... The Ptolemaic Gamma Ray Burst Universe The B... (T, h, e, , P, t, o, l, e, m, a, i, c, , G, ... [cosmic gamma-ray bursts, jennings, $ { \it, m... [log_s, gamma_ray_bursts, jennings, high_recoi... [gamma_ray_burst, born, recoil, become, neutro... [gamma_ray_burst, born, recoil, become, neutro... (g, a, m, m, a, _, r, a, y, _, b, u, r, s, t, ...
3 Expanding Photospheres of Type II Supernovae a... We use the Expanding Photosphere Method to d... Expanding Photospheres of Type II Supernovae a... Expanding Photospheres of Type II Supernovae a... (E, x, p, a, n, d, i, n, g, , P, h, o, t, o, ... [variable stars, surface brightness fluctuatio... [, distance_measurement, mpc, good_agreement, ... [sn, type_ii, distance, supernova, extinction,... [sn, type_ii, distance, supernova, extinction,... (s, n, , t, y, p, e, _, i, i, , d, i, s, t, ...
4 Radiation Transfer in Gamma-Ray Bursts We have calculated gamma-ray radiative trans... Radiation Transfer in Gamma-Ray Bursts We ha... Radiation Transfer in Gamma Ray Bursts We ha... (R, a, d, i, a, t, i, o, n, , T, r, a, n, s, ... [march, spherical, flare models, mev, gamma-ra... [march, effective_collimation, two_photon_pair... [discrete, gamma_ray_burst, energy, optical_de... [discrete, gamma_ray_burst, energy, optical_de... (d, i, s, c, r, e, t, e, , g, a, m, m, a, _, ...

In [37]:
df_test['np+tf'][0]


Out[37]:
['binary',
 'merger',
 'created',
 'burst',
 'gamma_ray_burst',
 'neutron_star',
 'throe',
 'magnetic_flare',
 'neutrino_antineutrino',
 'objection',
 'spiraling',
 'ligo',
 'death',
 'sle',
 'accompanied',
 'gamma_ray_bursts',
 'black_hole_neutron_star_binaries',
 'complex_profiles',
 'large_radii',
 'km',
 'death_throes',
 'cosmological_distances',
 'massive_binary',
 'neutrino_antineutrino_annihilation',
 'spiraling_in_binary',
 'magnetic_flares',
 'gamma_ray',
 'gravitational_radiation',
 'previous_objections',
 'parker',
 'double_neutron_star_binaries',
 'cosmological_merger_model',
 'size_sles',
 'thick_fireball',
 'ligo']

In [38]:
df_test['key_tb'][0]


Out[38]:
TextBlob("binary merger created burst gamma_ray_burst neutron_star throe magnetic_flare neutrino_antineutrino objection spiraling ligo death sle accompanied gamma_ray_bursts black_hole_neutron_star_binaries complex_profiles large_radii km death_throes cosmological_distances massive_binary neutrino_antineutrino_annihilation spiraling_in_binary magnetic_flares gamma_ray gravitational_radiation previous_objections parker double_neutron_star_binaries cosmological_merger_model size_sles thick_fireball ligo")

In [39]:
# function using textblob 
# POS-Tag and remove all keywords that don't have a Noun associated with it 
# and Remove N-grams that have Verb at end.

def textblob_noun_filter(i, df):
    temp2 = []
    for word,tag in df['key_tb'][i].tags:
        if '_' in word:
            word_change = word.replace("_", " ")
            try:
                word_check = word_change.split()[-1]
                word_change2 = Word(word_check).singularize()
                if "V" in tb(word_change2).tags[-1][-1]:
                    pass
                else:
                    for w_word,t_tag in tb(word_change).tags:
                        if 'N' in t_tag and 'VBN' not in t_tag and 'IN' not in t_tag:
                            temp2.append(word)
            except:
                for w_word,t_tag in tb(word_change).tags:
                    if 'N' in t_tag and 'VBN' not in t_tag and 'IN' not in t_tag:
                        temp2.append(word)
        else: 
            if 'N' in tag and 'VBN' not in tag and 'IN' not in tag:
                temp2.append(word)
    #temp2 = [word for word,tag in df['key_tb'][i].tags if 'N' or 'VBG' in tag]
    temp1 = df['np+tf'][i]
    temp3 = [str(x) for x in temp1 if x in set(temp2)]
    return temp3

In [ ]:


In [40]:
# Similar to above function but using Spacy 
# POS-Tag and remove all keywords that don't have a Noun associated with it 
# and Remove N-grams that have Verb at end.
def spacy_noun_filter(lst):
    kw_list = []
    for kw in lst:
        if kw.count("_") > 2:
            end_kw = str(Word(kw.split("_")[-1]).singularize())
            doc = nlp(end_kw)
            for token in doc:
                if token.pos_ != 'VERB':
                    kw_list.append(kw)
        elif kw.count("_") == 1:
            kw_list.append(kw)
        else:
            doc = nlp(kw)
            for token in doc:
                if token.pos_ == 'NOUN':
                    kw_list.append(kw)
    return kw_list

In [41]:
# function to filter out substrings
def substr_in_list(elem, lst):
    for s in lst:
        if elem != s and elem in s:
            return True
    return False

In [42]:
#function to r emove all words of length 2 or less and duplicates
def remove_smallwords(lst):
    kw_list3 = []
    for kw in lst:
        if '_' in kw:
            if len(max(kw.split("_"), key=len)) > 2:
                kw_list3.append(kw)
        else:
            if len(kw) > 2:
                kw_list3.append(kw)
    return list(set(kw_list3))

In [43]:
tb_filtered = [textblob_noun_filter(i, df_test) for i in range(df_test.shape[0])]

spacy_filtered = [spacy_noun_filter(lst) for lst in tb_filtered]

no_substrings = [[j for j in i if not substr_in_list(j, i)] for i in spacy_filtered]

final_kw = [remove_smallwords(lst) for lst in no_substrings]

In [44]:
#add filtered keywords to dataframe
df_test["tfidf_np"] = final_kw


/Users/tashlin/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  

In [45]:
df_test["tfidf_np"][0]


Out[45]:
['magnetic_flares',
 'black_hole_neutron_star_binaries',
 'gamma_ray_bursts',
 'double_neutron_star_binaries',
 'cosmological_distances',
 'massive_binary',
 'complex_profiles',
 'merger',
 'size_sles',
 'thick_fireball',
 'gravitational_radiation',
 'large_radii',
 'previous_objections',
 'neutrino_antineutrino_annihilation',
 'ligo',
 'death_throes',
 'parker']

In [ ]:


In [ ]:


In [46]:
df_test.columns


Out[46]:
Index(['title', 'summary', 'comb', 'cleaned', 'summary_tb', 'np_dirty',
       'np_clean', 'keywords', 'np+tf', 'key_tb', 'tfidf_np'],
      dtype='object')

In [ ]:


In [47]:
#clean dataframe for easy reading
final_df = df_test.drop(['comb','cleaned', 'summary_tb','np_dirty','np_clean','key_tb','keywords', 'np+tf'], axis=1)
#final_df.rename(columns={"2nd_filter": "tfidf"}, inplace=True)
word_num = 500
#print(final_df['tfidf'][word_num])
print(final_df['tfidf_np'][word_num])
print(final_df.columns)


['experiments', 'dark_clusters', 'brown_dwarfs', 'intensive_survey', 'event_rate']
Index(['title', 'summary', 'tfidf_np'], dtype='object')

In [ ]:


In [48]:
#output dataframe
final_df.to_csv("kw-astro-jan16.csv")

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: