In [1]:
import math
import pandas as pd
import numpy as np
np.random.seed(2018)
import time
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
from textblob import Word, TextBlob as tb
import spacy
nlp = spacy.load("en_core_web_sm")
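If this notebook is run in a fresh environment, TextBlob's .tags / .noun_phrases and the spaCy model generally need their corpora installed once up front (shell commands shown as comments; the exact corpus set can vary by version):
In [ ]:
# one-time setup, run from a shell:
# python -m textblob.download_corpora
# python -m spacy download en_core_web_sm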
In [2]:
#read in data
data_path = "../data/csv_files/data_astro-ph.csv"
df = pd.read_csv(data_path, index_col= [0])
In [3]:
df.head()
Out[3]:
In [4]:
def get_blob(some_string):
    #return a TextBlob built from the string
    return tb(some_string)

def newline_replace(some_string):
    #remove hyphenation artifacts ("- ") and replace newline characters with spaces
    return some_string.replace('- ', '').replace('\n', ' ')
In [5]:
def tf(word, blob):
    #term frequency: count of word divided by the total number of words in the document
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    #number of documents in bloblist that contain word
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    #inverse document frequency: measures how rare a word is across all documents in bloblist
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    #TF-IDF score: product of tf and idf
    return tf(word, blob) * idf(word, bloblist)
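As a quick sanity check, these helpers can be exercised on a toy three-document corpus (made up for illustration, not taken from the arXiv data); because idf uses 1 + n_containing, a word appearing in most documents scores near zero:
In [ ]:
toy_blobs = [tb("dark matter halo"), tb("stellar population synthesis"), tb("dark energy survey")]
print(tfidf("dark", toy_blobs[0], toy_blobs))   # 0.0: "dark" is in 2 of 3 docs, log(3 / (1 + 2)) = 0
print(tfidf("halo", toy_blobs[0], toy_blobs))   # ~0.135: "halo" is unique to this document, so it scores higher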
In [6]:
def get_noun_phrases(textblob_string):
    #return list of unique noun phrases
    phrases = textblob_string.noun_phrases
    return list(set(phrases))

def clean_up_kw(strng_lst):
    #filter out all non-alphabetic characters
    #convert spaces and dashes to underscores, consistent with the TF-IDF bigram/trigram format
    new_list = []
    for strng in strng_lst:
        strng = strng.replace("-", " ")
        check_alpha = ''.join([i for i in strng if i.isalpha() or i == "_" or i == ' '])
        join_alpha = "_".join(check_alpha.split())
        new_list.append(join_alpha)
    return list(set(new_list))
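For reference, the cleaning step joins hyphenated or multi-word noun phrases into single underscore-separated tokens (toy input for illustration; output order may vary because a set is used):
In [ ]:
clean_up_kw(["dark-matter halo", "n-body simulation"])
# ['dark_matter_halo', 'n_body_simulation']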
In [7]:
#combine title and summary into a single text field for analysis
#strip hyphenation artifacts and newline characters from the string
df['comb'] = (df['title']+ " " + df["summary"].map(str)).apply(newline_replace)
df.head()
Out[7]:
In [8]:
def clean_up_text(strng):
    #replace dashes with spaces, then keep only alphanumerics, underscores, spaces, and periods
    strng_replaced = strng.replace("-", " ")
    filter_alnum = ''.join([i for i in strng_replaced if i.isalnum() or i == "_" or i == ' ' or i == '.'])
    return filter_alnum
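A quick illustration on a made-up string (not from the dataset): parentheses are dropped, dashes become spaces, and periods survive so decimal numbers stay intact:
In [ ]:
clean_up_text("X-ray flux (0.5-2 keV)")
# 'X ray flux 0.5 2 keV'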
In [9]:
df['cleaned'] = df['comb'].apply(clean_up_text)
In [10]:
df['cleaned'][0]
Out[10]:
In [ ]:
In [ ]:
In [ ]:
In [11]:
#convert string to textblob
df['summary_tb'] = df['comb'].apply(get_blob)
In [12]:
df.head()
Out[12]:
In [13]:
#get noun phrases from TextBlob
df['np_dirty'] = df['summary_tb'].apply(get_noun_phrases)
In [14]:
#get cleaned noun phrases
df['np_clean'] = df['np_dirty'].apply(clean_up_kw)
In [ ]:
In [ ]:
In [ ]:
In [15]:
def word_lemmatize(text):
    #lemmatize a single token with WordNet
    return WordNetLemmatizer().lemmatize(text)

def sent_to_words(sentences):
    #tokenize each document into a list of lowercase words (deacc=True strips accents and punctuation)
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def remove_stopwords(texts):
    #remove stopwords from the corpus (uses the stop_words list defined below)
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    #apply the trained bigram model to each tokenized document
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    #apply the trigram model on top of the bigram model
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
In [16]:
words = list(sent_to_words(df['comb']))
data_words = [[word_lemmatize(kw) for kw in lst] for lst in words]
# the min_count and threshold hyperparameters need tuning;
# a higher threshold yields fewer detected phrases.
bigram = gensim.models.Phrases(data_words, min_count=4, threshold=20)
trigram = gensim.models.Phrases(bigram[data_words], threshold=20)
# Phraser is a faster, frozen wrapper for applying a trained Phrases model
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
print("\n\n")
print(data_words[0])
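For context on threshold=20: if I am reading gensim's default phrase scorer correctly (the original Mikolov et al. formula), a candidate pair a_b is promoted only when (count(a,b) - min_count) * vocab_size / (count(a) * count(b)) exceeds the threshold, so raising it prunes rarer co-occurrences. A purely hypothetical calculation:
In [ ]:
# hypothetical counts, only to illustrate the default scorer
count_ab, count_a, count_b, vocab_size, min_count = 50, 400, 300, 20000, 4
score = (count_ab - min_count) * vocab_size / (count_a * count_b)
print(score)   # ~7.7, below threshold=20, so this pair would not be merged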
In [17]:
paper_number = 12
print(trigram_mod[bigram_mod[data_words[paper_number]]])
print("\n\n")
print(data_words[paper_number])
In [ ]:
In [18]:
#the Snowball stemmer (aka Porter2) is generally regarded as an improvement over the original Porter stemmer
stemmer = SnowballStemmer("english")
stop_words = stopwords.words('english')
stop_words.extend(['compare', 'constraint'])  # extend the stopword list with domain-specific high-frequency words
In [ ]:
In [19]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_trigrams = make_trigrams(data_words_bigrams)
In [20]:
# Preprocessed tokens (with bi/trigrams) stored in a pandas Series
df_pre_tb = pd.Series(data_words_trigrams).to_frame(name='summary')
In [ ]:
In [21]:
def first_join_list(some_list):
    #join a token list back into a single space-separated string
    return " ".join(some_list)
In [22]:
df_pre = df_pre_tb['summary'].apply(first_join_list)
In [23]:
print(data_words[paper_number])
print("****")
print(df_pre[paper_number])
In [24]:
df_tb = df_pre.apply(get_blob)
In [25]:
df_tb.head()
Out[25]:
In [26]:
#limit to the first 1,000 abstracts; the TF-IDF pass below scales poorly with corpus size
df_limit = df_tb[:1000]
In [27]:
#return the top 15 keywords for each document/summary
def tfidf_keywords(blob, df_blob):
    #score every word in the document against the whole corpus, then sort by score
    scores = {word: tfidf(word, blob, df_blob) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [item[0] for item in sorted_words[:15]]
In [28]:
#run tfidf and print time taken
start = time.perf_counter()
results = [tfidf_keywords(blob, df_limit) for blob in df_limit]
finish = time.perf_counter()
print(f'finished in: {round(finish-start, 2)} seconds')
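The run above is capped at 1,000 abstracts because n_containing() rescans every blob for every candidate word, making the pass roughly quadratic in corpus size. If a larger corpus is ever needed, a simple document-frequency cache is one option (a sketch, assuming the df_limit corpus stays fixed once the cache is populated):
In [ ]:
from functools import lru_cache

@lru_cache(maxsize=None)
def n_containing_cached(word):
    # document frequency of word over the fixed df_limit corpus
    return sum(1 for blob in df_limit if word in blob.words)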
In [ ]:
In [ ]:
In [29]:
df.head()
Out[29]:
In [30]:
#create a new dataframe for the limited (first 1,000 documents) results
df_test = df[:1000].copy()
In [31]:
#add keywords results to dataframe
df_test['keywords'] = results
In [32]:
df_test.head()
Out[32]:
In [33]:
#combine the TF-IDF keywords with the cleaned noun phrases into one candidate keyword list
df_test['np+tf'] = df_test['keywords'] + df_test['np_clean']
In [ ]:
In [34]:
#get textblob of keywords for analysis
def get_textblob_kw(data):
    #join the keyword list into one string and wrap it in a TextBlob
    string_list = ' '.join(data)
    text_blob = tb(string_list)
    return text_blob
In [35]:
#textblob of keywords to dataframe column
df_test['key_tb'] = df_test['np+tf'].apply(get_textblob_kw)
In [36]:
df_test.head()
Out[36]:
In [37]:
df_test['np+tf'][0]
Out[37]:
In [38]:
df_test['key_tb'][0]
Out[38]:
In [39]:
# keyword filter using TextBlob POS tags:
# drop any keyword that contains no noun, and drop n-grams whose final word is a verb
def textblob_noun_filter(i, df):
    temp2 = []
    for word, tag in df['key_tb'][i].tags:
        if '_' in word:
            #n-gram keyword: check the last word, then require at least one noun in the phrase
            word_change = word.replace("_", " ")
            try:
                word_check = word_change.split()[-1]
                word_change2 = Word(word_check).singularize()
                if "V" in tb(word_change2).tags[-1][-1]:
                    #last word tags as a verb, so skip this n-gram
                    pass
                else:
                    for w_word, t_tag in tb(word_change).tags:
                        if 'N' in t_tag and 'VBN' not in t_tag and 'IN' not in t_tag:
                            temp2.append(word)
            except:
                #tagging the last word failed; fall back to checking the whole phrase for a noun
                for w_word, t_tag in tb(word_change).tags:
                    if 'N' in t_tag and 'VBN' not in t_tag and 'IN' not in t_tag:
                        temp2.append(word)
        else:
            #single-word keyword: keep it only if it tags as a noun
            if 'N' in tag and 'VBN' not in tag and 'IN' not in tag:
                temp2.append(word)
    temp1 = df['np+tf'][i]
    #keep the original keyword order, filtered to those that passed the POS checks
    temp3 = [str(x) for x in temp1 if x in set(temp2)]
    return temp3
In [ ]:
In [40]:
# Similar to the function above but using spaCy:
# POS-tag the keywords, drop any that contain no noun,
# and drop n-grams whose final word is a verb
def spacy_noun_filter(lst):
    kw_list = []
    for kw in lst:
        if kw.count("_") > 2:
            #long n-gram (4+ words): keep it unless the singularized last word parses as a verb
            end_kw = str(Word(kw.split("_")[-1]).singularize())
            doc = nlp(end_kw)
            for token in doc:
                if token.pos_ != 'VERB':
                    kw_list.append(kw)
        elif kw.count("_") == 1:
            #bigrams are kept as-is
            kw_list.append(kw)
        else:
            #single words and trigrams: keep only if spaCy finds a noun
            doc = nlp(kw)
            for token in doc:
                if token.pos_ == 'NOUN':
                    kw_list.append(kw)
    return kw_list
In [41]:
# return True if elem is a strict substring of another element in lst (used to filter out substrings)
def substr_in_list(elem, lst):
    for s in lst:
        if elem != s and elem in s:
            return True
    return False
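For example, on a made-up keyword list, a keyword contained inside a longer keyword is flagged for removal while distinct keywords are not:
In [ ]:
substr_in_list("dark_matter", ["dark_matter_halo", "galaxy_cluster"])    # True: substring of a longer keyword
substr_in_list("galaxy_cluster", ["dark_matter_halo", "galaxy_cluster"]) # False: only equal to itself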
In [42]:
#function to remove all words of length 2 or less, and drop duplicates
def remove_smallwords(lst):
    kw_list3 = []
    for kw in lst:
        if '_' in kw:
            #keep an n-gram only if its longest component word is longer than 2 characters
            if len(max(kw.split("_"), key=len)) > 2:
                kw_list3.append(kw)
        else:
            if len(kw) > 2:
                kw_list3.append(kw)
    return list(set(kw_list3))
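On a toy list, the short token is dropped, n-grams are kept when their longest component exceeds two characters, and duplicates collapse (output order may vary because a set is used):
In [ ]:
remove_smallwords(["xy", "x_ray", "dark_matter", "dark_matter"])
# ['x_ray', 'dark_matter']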
In [43]:
#apply the filters in sequence: TextBlob noun filter -> spaCy noun filter -> substring removal -> short-word removal
tb_filtered = [textblob_noun_filter(i, df_test) for i in range(df_test.shape[0])]
spacy_filtered = [spacy_noun_filter(lst) for lst in tb_filtered]
no_substrings = [[j for j in i if not substr_in_list(j, i)] for i in spacy_filtered]
final_kw = [remove_smallwords(lst) for lst in no_substrings]
In [44]:
#add filtered keywords to dataframe
df_test["tfidf_np"] = final_kw
In [45]:
df_test["tfidf_np"][0]
Out[45]:
In [ ]:
In [ ]:
In [46]:
df_test.columns
Out[46]:
In [ ]:
In [47]:
#drop intermediate columns so the final dataframe is easier to read
final_df = df_test.drop(['comb', 'cleaned', 'summary_tb', 'np_dirty', 'np_clean', 'key_tb', 'keywords', 'np+tf'], axis=1)
word_num = 500
print(final_df['tfidf_np'][word_num])
print(final_df.columns)
In [ ]:
In [48]:
#output dataframe
final_df.to_csv("kw-astro-jan16.csv")
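To reload this output later with the same index handling as the raw data (In [2]), something like the following should work; note that list-valued columns such as tfidf_np come back as strings after a CSV round trip and need e.g. ast.literal_eval to become lists again:
In [ ]:
import ast
reloaded = pd.read_csv("kw-astro-jan16.csv", index_col=[0])
reloaded['tfidf_np'] = reloaded['tfidf_np'].apply(ast.literal_eval)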