In [1]:
import pandas as pd
import sqlite3
import gensim
import nltk
import glob
import json
import pickle
## Helpers
def save_pkl(target_object, filename):
with open(filename, "wb") as file:
pickle.dump(target_object, file)
def load_pkl(filename):
return pickle.load(open(filename, "rb"))
def save_json(target_object, filename):
with open(filename, 'w') as file:
json.dump(target_object, file)
def load_json(filename):
with open(filename, 'r') as file:
data = json.load(file)
return data
In [2]:
# Loading metadata from trainning database
con = sqlite3.connect("F:/FMR/data.sqlite")
db_documents = pd.read_sql_query("SELECT * from documents", con)
db_authors = pd.read_sql_query("SELECT * from authors", con)
data = db_documents # just a handy alias
data.head()
Out[2]:
In [3]:
tokenised = load_json("abstract_tokenised.json")
In [4]:
# Let's have a peek
tokenised["acis2001/1"][:10]
Out[4]:
In this stage, we preprocess the data so it could be read by Gensim. Then we will furthur clean up the data to better train the model.
First of all, we need a dictionary of our corpus, i.e., the whole collection of our full texts. However, there are documents in our dataset written in some other languages. We need to stay with one language (in the example, English) in order to best train the model, so let's filter them out first.
TextBlob ships with a handy API wrapper of Google's language detection service. We will store the id of these non-English documents in a list called non_en and save it as a pickled file for later use.
In [ ]:
from textblob import TextBlob
non_en = [] # a list of ids of the documents in other languages
count = 0
for id_, entry in data.iterrows():
count += 1
try:
lang = TextBlob(entry["title"] + " " + entry["abstract"]).detect_language()
except:
raise
if lang != 'en':
non_en.append(id_)
print(lang, data.iloc[id_]["title"])
if (count % 100) == 0:
print("Progress: ", count)
save_pkl(non_en, "non_en.list.pkl")
In [5]:
non_en = load_pkl("non_en.list.pkl")
In [6]:
# Convert our dict-based structure to be a list-based structure that are readable by Gensim and at the same time,
# filter out those non-English documents
tokenised_list = [tokenised[i] for i in data["submission_path"] if i not in non_en]
Although we tried to handle these hyphenations in the previous tutorial, now we still have them for some reasons. The most conveient way to remove them is to remove them in the corpus and rebuild the dictionary. Then re-apply our previous filter.
In [7]:
def remove_hyphenation(l):
return [i.replace("- ", "").replace("-", "") for i in l]
tokenised_list = [remove_hyphenation(i) for i in tokenised_list]
In [8]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize(l):
return [" ".join([lemmatizer.lemmatize(token)
for token
in phrase.split(" ")])
for phrase in l]
def lemmatize_all(tokenised):
# Lemmatize the documents.
lemmatized = [lemmatize(entry) for entry in tokenised]
return lemmatized
In [9]:
" ".join([lemmatizer.lemmatize(token)
for token
in 'assistive technologies'.split(" ")])
Out[9]:
In [10]:
tokenised_list = lemmatize_all(tokenised_list)
In [12]:
# In case we need it in the future
save_json(tokenised_list, "abstract_lemmatized.json")
In [11]:
# To load it:
tokenised_list = load_json("abstract_lemmatized.json")
Then we can create our lemmatized vocabulary.
In [12]:
from gensim.corpora import Dictionary
# Create a dictionary for all the documents. This might take a while.
dictionary = Dictionary(tokenised_list)
In [13]:
# Let's see what's inside, note the spelling :)
# But there is really nothing we can do with that.
dictionary[0]
Out[13]:
In [14]:
len(dictionary)
Out[14]:
Obviously we have a way too large vocabulary size. This is because the algorithm used in TextBlob's noun phrase extraction is not very robust in complicated scenario. Let's see what we can do about this.
First of all, let's rule out the most obvious ones: words and phrases that appear in too many documents and ones that appear only 1-5 documents. Gensim provides a very convenient built-in function to filter them out:
In [15]:
# remove tokens that appear in less than 20 documents and tokens that appear in more than 50% of the documents.
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=None)
In [16]:
len(dictionary)
Out[16]:
Now we have drastically reduced the size of the vocabulary from 2936116 to 102508. However this is not enough. For example:
In [17]:
# Helpers
display_limit = 10
def shorter_than(n):
bad = []
count = 0
for i in dictionary:
if len(dictionary[i]) < n:
count += 1
if count < display_limit:
print(dictionary[i])
bad.append(i)
print(count)
return bad
def if_in(symbol):
bad = []
count = 0
for i in dictionary:
if symbol in dictionary[i]:
count += 1
if count < display_limit:
print(dictionary[i])
bad.append(i)
print(count)
return bad
def more_than(symbol, n):
bad = []
count = 0
for i in dictionary:
if dictionary[i].count(symbol) > n:
count += 1
if count < display_limit:
print(dictionary[i])
bad.append(i)
print(count)
return bad
In [18]:
bad = shorter_than(3)
We have 752 such meaningless tokens in our vocabulary. Presumably this is because that during the extraction of the PDF, some mathenmatical equations are parsed as plain text (of course).
Now we are going to remove these:
In [19]:
dictionary.filter_tokens(bad_ids=bad)
In [21]:
display_limit = 10
bad = if_in("*")
In [22]:
dictionary.filter_tokens(bad_ids=bad)
In [23]:
bad = if_in("<")
In [24]:
dictionary.filter_tokens(bad_ids=bad)
In [25]:
bad = if_in(">")
In [26]:
dictionary.filter_tokens(bad_ids=bad)
In [27]:
bad = if_in("%")
In [28]:
dictionary.filter_tokens(bad_ids=bad)
In [29]:
bad = if_in("/")
In [30]:
dictionary.filter_tokens(bad_ids=bad)
In [31]:
bad = if_in("[")
bad += if_in("]")
bad += if_in("}")
bad += if_in("{")
dictionary.filter_tokens(bad_ids=bad)
In [33]:
display_limit = 20
bad = more_than(" ", 3)
dictionary.filter_tokens(bad_ids=bad)
In [34]:
bad = if_in("- ") # verify that there is no hyphenation problem
In [35]:
bad = if_in("quarter")
dictionary.filter_tokens(bad_ids=bad)
There are a lot of citations and references in the PDFs, and they are extremely difficult to be recoginsed given that they come in a lot of variants.
We will demostrate how to identify these names and locations in another tutorial (see TOC) using a Stanford NLP library, and eventually we can get a list of names and locations in names.json and locations.json respectively.
In [36]:
names = load_json("names.json")
name_ids = [i for i, v in dictionary.iteritems() if v in names]
dictionary.filter_tokens(bad_ids=name_ids)
locations = load_json("locations.json")
location_ids = [i for i, v in dictionary.iteritems() if v in locations]
dictionary.filter_tokens(bad_ids=location_ids)
In [83]:
locations[:10]
Out[83]:
In [84]:
names[:15] # not looking good, but it seems like it won't do much harm either
Out[84]:
In [37]:
corpus = [dictionary.doc2bow(l) for l in tokenised_list]
In [87]:
# Save it for future usage
from gensim.corpora.mmcorpus import MmCorpus
MmCorpus.serialize("aisnet_abstract_np_cleaned.mm", corpus)
In [88]:
# Also save the dictionary
dictionary.save("aisnet_abstract_np_cleaned.ldamodel.dictionary")
In [38]:
# To load the corpus:
from gensim.corpora.mmcorpus import MmCorpus
corpus = MmCorpus("aisnet_abstract_cleaned.mm")
# To load the dictionary:
from gensim.corpora import Dictionary
dictionary = Dictionary.load("aisnet_abstract_np_cleaned.ldamodel.dictionary")
In [46]:
# Train LDA model.
from gensim.models import LdaModel
# Set training parameters.
num_topics = 150
chunksize = 2000
passes = 1
iterations = 150
eval_every = None # Don't evaluate model perplexity, takes too much time.
# Make a index to word dictionary.
print("Dictionary test: " + dictionary[0]) # This is only to "load" the dictionary.
id2word = dictionary.id2token
model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
alpha='auto', eta='auto', \
iterations=iterations, num_topics=num_topics, \
passes=passes, eval_every=eval_every)
In [47]:
# Save the LDA model
model.save("aisnet_abstract_150_cleaned.ldamodel")
In [92]:
from gensim.models import LdaModel
model = LdaModel.load("aisnet_abstract_150_cleaned.ldamodel")
In [48]:
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
In [49]:
pyLDAvis.display(vis)
Out[49]: