Topic Modelling using Latent Dirichlet Allocation (LDA)


In [117]:
## additional installations in colab
!pip install pyLDAvis
!python -m spacy download en_core_web_lg  ## restart once download is complete.

## general imports

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import spacy
from spacy import displacy

import pyLDAvis
from pyLDAvis import sklearn
pyLDAvis.enable_notebook()


Requirement already satisfied: pyLDAvis (2.1.2) and its dependencies
Requirement already satisfied: en_core_web_lg==2.2.5 (spacy 2.2.4) and its dependencies
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')

Step 1: Load and Analyze data

As part of this step we will load and analyze the data in order to build a general intuition of what it contains.

The source of this dataset is the BBC News Insight data.

About the Data: All rights, including copyright, in the content of the original articles are owned by the BBC.


In [0]:
#from google.colab import drive
#drive.mount('/content/drive')

In [119]:
### Reading the dataset from path

filename = '/content/drive/My Drive/Colab Notebooks/data/bbc/bbc_raw_dict.json'
data = pd.read_json(filename)
data.head()


Out[119]:
category content filename
0 entertainment Musicians to tackle US red tape\n\nMusicians' ... /Users/rishushrivastava/Documents/AI/Dataset/b...
1 entertainment U2's desire to be number one\n\nU2, who have w... /Users/rishushrivastava/Documents/AI/Dataset/b...
2 entertainment Rocker Doherty in on-stage fight\n\nRock singe... /Users/rishushrivastava/Documents/AI/Dataset/b...
3 entertainment Snicket tops US box office chart\n\nThe film a... /Users/rishushrivastava/Documents/AI/Dataset/b...
4 entertainment Ocean's Twelve raids box office\n\nOcean's Twe... /Users/rishushrivastava/Documents/AI/Dataset/b...

In [120]:
### get the shape of the data

data.shape


Out[120]:
(2225, 3)

In [121]:
### Check for any null or na

data['content'].isna().value_counts()   # no null records found, so the data quality is good


Out[121]:
False    2225
Name: content, dtype: int64

In [122]:
### distribution of the dataset

data['category'].value_counts()


Out[122]:
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [123]:
### visualize the category - total article spread.

plt.figure(figsize=(10,5))
sns.set_style("whitegrid")
sns.countplot(x='category', data=data, palette='Spectral_r')
plt.title("Article Counts by Category (BBC) - Actual Data Source")
plt.show()


The BBC news data is divided into 5 major categories: Entertainment, Business, Sport, Politics and Tech.

Note: Our goal is to understand the BBC news content and build topic clusters for it. We assume no prior knowledge of the categories above; the categories in the original dataset are used only to cross-check and validate the topics at the end.

Step 2: Data Cleaning and Transformation

In this step, we clean and transform the dataset into more manageable chunks that can be fed into the LDA model. Along the way, we will also build a deeper intuition of the dataset using NLP techniques.


In [0]:
# load spacy
nlp = spacy.load('en_core_web_lg')

In [125]:
# selecting and analysing the first item in the dataframe
doc_one = nlp(data['content'].iloc[0])

displacy.render(doc_one,style='ent',jupyter=True)


Musicians to tackle US GPE red tapeMusicians' groups are to tackle US GPE visa regulations which are blamed for hindering British NORP acts' chances of succeeding across the Atlantic LOC .A singer hoping to perform in the US GPE can expect to pay $ 1,300 MONEY (£680) simply for obtaining a visa. Groups including the Musicians' Union ORG are calling for an end to the "raw deal" faced by British NORP performers. US GPE acts are not faced with comparable expense and bureaucracy when visiting the UK GPE for promotional purposes. Nigel McCune PERSON from the Musicians' Union ORG said British NORP musicians are "disadvantaged" compared to their US GPE counterparts. A sponsor has to make a petition on their behalf, which is a form amounting to nearly 30 CARDINAL pages, while musicians face tougher regulations than athletes and journalists. "If you make a mistake on your form, you risk a five-year DATE ban and thus the ability to further your career," says Mr McCune PERSON ."The US GPE is the world's biggest music market, which means something has to be done about the creaky bureaucracy," says Mr McCune PERSON . "The current situation is preventing British NORP acts from maintaining momentum and developing in the US GPE ," he added.The Musicians' Union stance is being endorsed by the Music Managers' Forum ORG (MMF), who say British NORP artists face "an uphill struggle" to succeed in the US GPE , thanks to the tough visa requirements, which are also seen as impractical. The MMF ORG 's general secretary James Seller PERSON said: "Imagine if you were an orchestra from the Orkneys PERSON ? Every member would have to travel to London GPE to have their visas processed.""The US GPE market is seen as the holy grail FAC and one CARDINAL of the benchmarks of success, and we're still going to fight to get in there. "It's still very important, but there are other markets like Europe LOC , India GPE and China GPE ," added Mr Seller PERSON . A Department for Media ORG , Culture and Sport spokeswoman said: "We're aware that people are experiencing problems, and are working with the US GPE embassy and record industry to see what we can do about it." A US Embassy ORG spokesman said: "We are aware that entertainers require visas for time-specific visas and are doing everything we can to process those applications speedily." "We are aware of the importance of cultural exchange and we will do our best to facilitate that," he added.

From the rendered entities above we can see the wide range of named entities in the text. We would like to remove most of these entities, such as PERSON, ORG, CARDINAL, GPE and LOC, as this helps narrow the vocabulary down to useful topic words.
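As a rough illustration of entity-based filtering, the sketch below drops tokens that spaCy tagged with selected entity types; the blocklist is an assumption for illustration. Note that the cleaning function defined in the next cell takes a different route and simply keeps nouns and verbs, which removes most entity tokens indirectly.

# a minimal sketch (illustration only, not the approach used below): drop tokens of selected entity types
ENTITY_BLOCKLIST = {'PERSON', 'ORG', 'GPE', 'LOC', 'NORP', 'CARDINAL', 'MONEY', 'DATE'}

def drop_named_entities(doc):
  # token.ent_type_ is an empty string for tokens that are not part of a named entity
  return ' '.join(tok.text for tok in doc if tok.ent_type_ not in ENTITY_BLOCKLIST)

print(drop_named_entities(doc_one)[:300])   # preview of the entity-stripped text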


In [126]:
## Let's reduce each document down to selected parts of speech.

additional_stop_words = ['say','go','come','get','see','use','take','want','tell','need']

def parse_filter_document(doc):
  '''
    1. Remove stop words, punctuation and whitespace
    2. Keep only selected POS tags (NOUN, VERB)
    3. Lemmatise the remaining tokens
    @returns: a single string of filtered, lemmatised tokens
  '''
  filtered_doc = []
  for token in doc:
    # keep tokens that are not stop words, punctuation or whitespace
    if not (token.is_stop or token.is_punct or token.is_space):
      if token.pos_ in ['NOUN','VERB']:
        if token.lemma_ not in additional_stop_words:
          filtered_doc.append(token.lemma_)

  return ' '.join(filtered_doc)

print(parse_filter_document(doc_one))


musician tackle tape musician group tackle visa regulation blame hinder act chance succeed singer hope perform expect pay £680 obtain visa group include call end deal face performer act face expense bureaucracy visit purpose musician compare counterpart sponsor petition behalf form amount page musician face regulation athlete journalist mistake form risk year ban ability career world music market mean bureaucracy situation prevent act maintain momentum develop add stance endorse artist face struggle succeed thank visa requirement secretary imagine orchestra member travel visa process market benchmark success fight market add spokeswoman people experience problem work embassy record industry spokesman entertainer require visa time visa process application importance exchange facilitate add

By applying the above method to a single document / news article, we can see that the article has been cleaned and reduced to a meaningful chunk of text. We will reuse this method when parsing the rest of the dataset.

Next, we apply the parser to the whole dataframe and store the result in a new column named parsed_content.


In [127]:
df = data.copy()

df['parsed_content'] = data['content'].apply(lambda x: parse_filter_document(nlp(x)))

df[['category','content','parsed_content']].head(10)


Out[127]:
category content parsed_content
0 entertainment Musicians to tackle US red tape\n\nMusicians' ... musician tackle tape musician group tackle vis...
1 entertainment U2's desire to be number one\n\nU2, who have w... desire number win hit cling status band world ...
2 entertainment Rocker Doherty in on-stage fight\n\nRock singe... rocker stage fight singer involve fight band g...
3 entertainment Snicket tops US box office chart\n\nThe film a... top office chart film adaptation novel top box...
4 entertainment Ocean's Twelve raids box office\n\nOcean's Twe... raid box office crime caper sequel star number...
5 entertainment 'Landmark movies' of 2004 hailed\n\nUS film pr... movie hail film professional declare passion m...
6 entertainment Pete Doherty misses bail deadline\n\nSinger Pe... miss bail deadline singer spend weekend jail b...
7 entertainment Fockers retain film chart crown\n\nComedy Meet... focker retain film chart crown hold number spo...
8 entertainment Top gig award for Scissor Sisters\n\nNew York ... gig award band win gig year award performance ...
9 entertainment Johnny Depp: The acting outlaw\n\nJohnny Depp,... act outlaw role creator celebrate talent star ...
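Calling nlp(x) inside .apply processes one article at a time. On 2,225 articles this can be slow; a minimal alternative (a sketch, not the cell used above) is to batch the texts through spaCy's nlp.pipe:

# sketch: batch-process the articles with nlp.pipe instead of one nlp() call per row
parsed = [parse_filter_document(doc) for doc in nlp.pipe(data['content'], batch_size=50)]
df['parsed_content'] = parsed   # equivalent result to the .apply version above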

Next, we apply CountVectorizer to the parsed dataframe. This vectorises the parsed_content column; once the data is vectorised, we can fit it to the model.


In [0]:
cnt_vec = CountVectorizer(max_df=0.95, min_df=1, lowercase=True)

In [0]:
df_cnt_vec = cnt_vec.fit_transform(df['parsed_content'])

In [130]:
df_cnt_vec            # a sparse matrix is generated.


Out[130]:
<2225x9297 sparse matrix of type '<class 'numpy.int64'>'
	with 175983 stored elements in Compressed Sparse Row format>
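As a quick sanity check (a sketch, using the vectorizer defined above), the vocabulary learned by CountVectorizer should match the number of columns in the sparse matrix:

vocab = cnt_vec.get_feature_names()
print(len(vocab))     # expected to equal the column count reported above (9297)
print(vocab[:10])     # a few sample terms from the learned vocabulary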

At this point we are ready to fit the LDA model: the data has been cleaned, parsed and vectorised. In the next step, we feed the vectorised matrix to the model.

Step 3: Defining LDA model

In the following steps we fit the LDA model to the transformed and vectorised data created in the previous step. Based on our initial analysis, we target a number of topic clusters similar to the number of categories in the original data, which should make the resulting topics easier to interpret and validate.
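The choice of 5 topics below leans on the original category labels. If that assumption were not available, one way to pick the number of topics would be a small grid search over n_components using LDA's built-in log-likelihood score. A minimal sketch (can be slow on the full corpus):

# sketch: search over the number of topics; LDA's .score() (approximate log-likelihood) is used for comparison
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(LatentDirichletAllocation(random_state=42),
                      param_grid={'n_components': [4, 5, 6, 8, 10]},
                      cv=3)
search.fit(df_cnt_vec)
print(search.best_params_)   # the best-scoring number of topics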


In [0]:
### Define the LDA model and set the topic size to 5.

topic_clusters = 5            ## assumption is based on the original dataset.

lda_model = LatentDirichletAllocation(n_components=topic_clusters, 
                                      learning_decay=0.7, 
                                      batch_size=128, 
                                      random_state=42)

In [132]:
### Fit the filtered data to the model

lda_model.fit(df_cnt_vec)


Out[132]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

Note: Fitting the model to the dataset may take a while. On success, the model summary is shown as the cell output.
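A quick way to sanity-check the fitted model (a sketch; the actual values depend on the run) is to look at the approximate log-likelihood and perplexity on the same document-term matrix:

print(lda_model.score(df_cnt_vec))       # approximate log-likelihood (higher is better)
print(lda_model.perplexity(df_cnt_vec))  # perplexity (lower is better)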


In [0]:
### Transform the dataset with the generated model

result_df = lda_model.transform(df_cnt_vec)

Let us now inspect the model output: the generated topics and the distribution of words within each topic.


In [134]:
### List down the number of generated Topics

# sport ; business  ; politics  ; tech   ; entertainment 

topic_word_dict = {}

print("Topic ID| Word Distribution")
print("--------|----------------------------------------------------------------")
for index, topic in enumerate(lda_model.components_):
  topic_words_max = [cnt_vec.get_feature_names()[i] for i in topic.argsort()[-15:]]
  topic_word_dict[index] = topic_words_max

  print(f"Topic:{index:{2}}| {' ,'.join(topic_words_max)}")


Topic ID| Word Distribution
--------|----------------------------------------------------------------
Topic: 0| way ,work ,device ,file ,firm ,company ,network ,video ,year ,tv ,music ,service ,technology ,phone ,people
Topic: 1| change ,campaign ,law ,year ,leader ,work ,country ,claim ,issue ,minister ,plan ,party ,election ,people ,government
Topic: 2| report ,business ,fall ,rate ,month ,economy ,growth ,share ,price ,sale ,firm ,rise ,market ,company ,year
Topic: 3| virus ,security ,computer ,album ,release ,year ,band ,song ,mail ,user ,people ,number ,software ,site ,music
Topic: 4| club ,set ,world ,match ,think ,team ,star ,award ,player ,time ,year ,film ,play ,game ,win

Looking at the distribution of words in each of the 5 generated topics, we can make a rough interpretation:

  1. Topic 0 -> technology and media services (device, network, tv, phone, music) - closest to Tech
  2. Topic 1 -> politics, government and elections (Politics)
  3. Topic 2 -> business and the economy (market, growth, sale, price) (Business)
  4. Topic 3 -> a mix of computing (virus, software, site) and music (album, band, song) - spans Tech and Entertainment
  5. Topic 4 -> sport (match, team, player, win), with some film and award vocabulary (Sport)

The topics line up broadly with the original categories, although the technology and entertainment vocabulary is spread across Topics 0, 3 and 4 rather than falling into single clean topics.

Let's dig into more detail and generate the weight distribution of topics across the documents.


In [135]:
topics = [ "Topic "+str(t) for t in range(lda_model.n_components)]
indexes = [ i for i in range(len(df))]

topic_dist_df = pd.DataFrame(data=np.round(result_df,decimals=2), columns=topics, index=indexes)

dominant_topic = np.argmax(topic_dist_df.values, axis=1)
topic_dist_df['dominant_topic'] = dominant_topic

# copy the per-topic weights and the dominant topic back into the main dataframe
for t in topics:
  df[t] = topic_dist_df[t]
df['dominant_topic'] = topic_dist_df['dominant_topic']

df[['content','Topic 0','Topic 1','Topic 2','Topic 3','Topic 4','dominant_topic']].head(10)


Out[135]:
content Topic 0 Topic 1 Topic 2 Topic 3 Topic 4 dominant_topic
0 Musicians to tackle US red tape\n\nMusicians' ... 0.00 0.54 0.17 0.24 0.05 1
1 U2's desire to be number one\n\nU2, who have w... 0.00 0.00 0.02 0.75 0.23 3
2 Rocker Doherty in on-stage fight\n\nRock singe... 0.24 0.00 0.00 0.37 0.38 4
3 Snicket tops US box office chart\n\nThe film a... 0.00 0.00 0.00 0.35 0.64 4
4 Ocean's Twelve raids box office\n\nOcean's Twe... 0.17 0.00 0.00 0.10 0.73 4
5 'Landmark movies' of 2004 hailed\n\nUS film pr... 0.28 0.37 0.00 0.00 0.35 1
6 Pete Doherty misses bail deadline\n\nSinger Pe... 0.54 0.06 0.00 0.24 0.16 0
7 Fockers retain film chart crown\n\nComedy Meet... 0.00 0.00 0.16 0.14 0.69 4
8 Top gig award for Scissor Sisters\n\nNew York ... 0.00 0.00 0.00 0.46 0.53 4
9 Johnny Depp: The acting outlaw\n\nJohnny Depp,... 0.00 0.00 0.00 0.05 0.94 4
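Since the original categories were kept aside for validation, a simple cross-check (a sketch; the topic labels below are manual judgements based on the word lists above, not model output) is to map each dominant topic to a label and cross-tabulate it against the original category column:

# hypothetical, manually chosen labels for the 5 topics
topic_labels = {0: 'tech/media', 1: 'politics', 2: 'business',
                3: 'computing/music', 4: 'sport/film'}

df['topic_label'] = df['dominant_topic'].map(topic_labels)
pd.crosstab(df['category'], df['topic_label'])   # rows: original category, columns: inferred topic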

Step 4: Visualizing the LDA Model distribution

In this step we use a specialised visualisation library called pyLDAvis, which generates an interactive visualisation of an LDA model.

Read more about pyLDAvis in its official documentation.


In [0]:
viz = sklearn.prepare(lda_model=lda_model, dtm=df_cnt_vec, vectorizer=cnt_vec)

In [137]:
pyLDAvis.display(viz)


Out[137]:

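The pyLDAvis panel above is interactive and is lost in a static export of the notebook. One way to keep it (a sketch; the output path is an assumption) is to save it as a standalone HTML file:

pyLDAvis.save_html(viz, 'lda_topics.html')   # writes an interactive HTML page for the prepared visualisation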