In [61]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn
import planecrashinfo_light as pci
from sklearn.cluster import KMeans
import joblib  # sklearn.externals.joblib is deprecated; use the standalone package
from gensim import corpora, models, utils
from gensim.models import TfidfModel, LsiModel
import nltk
from nltk.corpus import stopwords
%matplotlib inline
In [2]:
df = pd.read_csv('data/data.csv')
In [3]:
df = pci.clean_database(df)
df.head()
Out[3]:
In [4]:
print('Total number of records: {}'.format(df.shape[0]))
print('Number of non-empty summaries: {}'.format(df[df.Summary.notnull()].shape[0]))
We'll use Latent Dirichlet allocation (LDA). LDA is a topic model that generates topics based on word frequency from a set of documents. LDA is particularly useful for finding reasonably accurate mixtures of topics within a given document set.
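As a quick illustration of the idea (the mini-corpus below is invented and is not part of the dataset), LDA represents each document as a mixture over topics:
In [ ]:
# toy example on a hypothetical 4-document corpus, just to show the API and the output shape
toy_texts = [
    ['engine', 'failure', 'takeoff'],
    ['pilot', 'error', 'landing'],
    ['engine', 'fire', 'takeoff'],
    ['storm', 'icing', 'landing'],
]
toy_dictionary = corpora.Dictionary(toy_texts)
toy_corpus = [toy_dictionary.doc2bow(text) for text in toy_texts]
toy_lda = models.LdaModel(toy_corpus, id2word=toy_dictionary, num_topics=2, passes=10)
# each document comes back as a list of (topic_id, probability) pairs, e.g. [(0, 0.8), (1, 0.2)]
toy_lda.get_document_topics(toy_corpus[0])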
In [5]:
# replace missing summaries with an empty string
df.Summary.fillna('', inplace=True)
In [6]:
splitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.TreebankWordTokenizer()
stopset = set(stopwords.words('english'))
In [7]:
def text_to_words(text):
    tokenized_sentences = []
    sentences = splitter.tokenize(text)
    for sentence in sentences:
        tokens = []
        # remove punctuation and stopwords
        for token in utils.tokenize(sentence, lowercase=True, deacc=True, errors="ignore"):
            if token not in stopset:
                tokens.append(token)
        tokenized_sentences.extend(tokens)
        tokenized_sentences.extend([' '.join(bigram) for bigram in nltk.ngrams(tokens, 2)])
    return tokenized_sentences
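As a quick sanity check (the sentence below is made up for illustration), the function returns lowercase unigrams with punctuation and stopwords stripped, followed by the bigrams built from those unigrams:
In [ ]:
# should yield something like:
# ['aircraft', 'crashed', 'shortly', 'takeoff',
#  'aircraft crashed', 'crashed shortly', 'shortly takeoff']
text_to_words('The aircraft crashed shortly after takeoff.')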
In [8]:
texts = [text_to_words(text) for text in df.Summary.values]
In [9]:
# create dictionary
dictionary = corpora.Dictionary(texts)
In [10]:
# get tokens that appear too often
popular_tokens = [token_id for token_id, frequency in dictionary.dfs.items() if frequency > 680]
for token_id in popular_tokens:
    print(dictionary.get(token_id))
In [11]:
# remove the overly frequent tokens from the dictionary
dictionary.filter_tokens(bad_ids=popular_tokens)
In [12]:
# create documents corpus
corpus = [dictionary.doc2bow(text) for text in texts]
In [13]:
tfidf = TfidfModel(corpus, dictionary=dictionary, normalize=True)
corpus_tfidf = tfidf[corpus]
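To see what the transformation produces (shown purely for inspection), a single summary becomes a sparse list of (token id, tf-idf weight) pairs:
In [ ]:
# tf-idf representation of the first summary: (token_id, weight) pairs
tfidf[corpus[0]]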
We'll use the LsiModel module for Latent Semantic Analysis (also known as Latent Semantic Indexing), which implements a fast truncated SVD (Singular Value Decomposition).
In [53]:
np.random.seed(42)
# project to 2 dimensions for visualization
lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
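For intuition (this cell is illustrative only), a single document can be projected into this 2-dimensional space by chaining the tf-idf and LSI transformations:
In [ ]:
# the result is a list like [(0, x), (1, y)]: the document's two LSI coordinates
lsi_model[tfidf[corpus[0]]]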
In [54]:
# project each summary (tf-idf weighted, as used for training) onto the two LSI dimensions
coords = []
for coord in lsi_model[corpus_tfidf]:
    if len(coord) > 1:
        coords.append((coord[0][1], coord[1][1]))
In [55]:
max_clusters = 10
clusters_num = range(1, max_clusters + 1)
inertias = np.zeros(max_clusters)
In [56]:
for cluster_num in clusters_num:
    kmeans = KMeans(n_clusters=cluster_num, random_state=42).fit(coords)
    # "inertia_" is the sum of squared distances of samples to their closest cluster center
    inertias[cluster_num - 1] = kmeans.inertia_
In [286]:
plt.plot(clusters_num, inertias, "b*-")
plt.ylabel('Inertia')
plt.xlabel('K')
plt.title('Sum of squared distances of samples to their closest cluster center')
plt.show()
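To complement the visual elbow check, a rough numerical heuristic (not part of the original analysis) is the relative drop in inertia when adding one more cluster; the elbow is roughly where this drop levels off:
In [ ]:
# relative inertia reduction from K-1 to K clusters (heuristic; values depend on the data)
drops = -np.diff(inertias) / inertias[:-1]
list(zip(clusters_num[1:], np.round(drops, 3)))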
In [58]:
# based on the elbow plot above, choose 6 clusters
final_clusters_number = 6
kmeans = KMeans(n_clusters=final_clusters_number, random_state=42).fit(coords)
In [281]:
# distribution of summaries per cluster
label_counts = Counter(kmeans.labels_)
plt.bar(list(label_counts.keys()), list(label_counts.values()))
plt.title('Distribution per cluster');
In [60]:
colors = ["g", "r", "m", "c", "y", "k"]
for i in range(len(coords)):
    plt.scatter(coords[i][0], coords[i][1], c=colors[kmeans.labels_[i]], s=10)
plt.title('Clustering of the summaries in the 2-D LSI space')
plt.show()
In [35]:
np.random.seed(42)
lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=final_clusters_number, passes=10)
In [63]:
joblib.dump(lda, 'dumps/lda.pkl')
# lda = joblib.load('dumps/lda.pkl')
Out[63]:
In [36]:
lda.show_topics(final_clusters_number, num_words=10, formatted=False)
Out[36]:
In [37]:
# approximate names of the topics
summaries_topics = [
    'takeoff',
    'crashed shortly',
    'landing',
    'weather conditions',
    'en route',
    'crashed in',
]
In [47]:
def compute_topic_summary_matrix(model, corpus, summaries_number):
    """Build a (topics x summaries) matrix of topic probabilities for the first summaries_number documents."""
    ts_matrix = pd.DataFrame(
        data=np.zeros((model.num_topics, summaries_number)), columns=range(summaries_number)
    )
    for i in range(summaries_number):
        summary_topics = model.get_document_topics(corpus[i])
        for topic, prob in summary_topics:
            ts_matrix.loc[topic, i] += prob
    return ts_matrix
In [107]:
plt.figure(figsize=(10, 6))
seaborn.heatmap(compute_topic_summary_matrix(lda, corpus, 20).transpose())
plt.title('Heat map of the summary topics')
plt.xticks(np.arange(len(summaries_topics)) + 0.5, summaries_topics, rotation=50, ha='center');
In [38]:
def get_topic_summary(summary_number):
    """Get the dominant topic label of the given summary"""
    summary_topics = lda.get_document_topics(corpus[summary_number])
    summary_topics.sort(key=lambda tup: tup[1], reverse=True)
    return summaries_topics[summary_topics[0][0]]
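For example (the actual label depends on the fitted model), calling it on a single summary returns one of the labels defined above:
In [ ]:
# dominant topic label of the first summary, e.g. 'takeoff' or 'landing'
get_topic_summary(0)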
In [39]:
# assign the dominant topic label to each record
df['Summary_topic'] = [get_topic_summary(i) for i in range(df.shape[0])]
In [282]:
df.Summary_topic.value_counts().sort_values(ascending=True).plot.barh()
plt.title('Distribution of the topics');
In [283]:
s = df.groupby(by=['Summary_topic']).Fatalities_total.sum().sort_values(ascending=True, na_position='first')[-20:]
s.plot.barh(title='Number of #fatalities per summary topic');
In [291]:
grouped = df.groupby(by=['Summary_topic', (df.index.year // 10) * 10]).size()
margin = np.arange(-3.75, 3.75, 1.25)
colors = ["c", "m", "y", "g", "k", "r"]
plt.figure(figsize=(12, 8))
ax = plt.subplot()
legend = []
decades = np.array(grouped.index.levels[1])
for i, topic in enumerate(grouped.index.levels[0]):
    # reindex so every topic has a value for every decade (0 where missing)
    values = grouped[topic].reindex(grouped.index.levels[1], fill_value=0).values
    xx = ax.bar(decades - margin[i], values, color=colors[i], width=1.2)
    legend.append(xx)
plt.legend(legend, grouped.index.levels[0])
plt.title('Distribution of the #accidents by topic and decade')
plt.show()
In [292]:
grouped = df.groupby(by=['Summary_topic', (df.index.year // 10) * 10]).Fatalities_total.sum()
margin = np.arange(-3.75, 3.75, 1.25)
colors = ["c", "m", "y", "g", "k", "r"]
plt.figure(figsize=(12, 8))
ax = plt.subplot()
legend = []
decades = np.array(grouped.index.levels[1])
for i, topic in enumerate(grouped.index.levels[0]):
    # reindex so every topic has a value for every decade (0 where missing)
    values = grouped[topic].reindex(grouped.index.levels[1], fill_value=0).values
    xx = ax.bar(decades - margin[i], values, color=colors[i], width=1.2)
    legend.append(xx)
plt.legend(legend, grouped.index.levels[0])
plt.title('#Fatalities (total) by topic and decade')
plt.show()
In [ ]: