In [61]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn

import planecrashinfo_light as pci

from sklearn.cluster import KMeans
import joblib  # sklearn.externals.joblib is deprecated; use the standalone joblib package

from gensim import corpora, models, utils
from gensim.models import TfidfModel, LsiModel

import nltk
from nltk.corpus import stopwords

%matplotlib inline

Raw data


In [2]:
df = pd.read_csv('data/data.csv')

Clean(er) Data


In [3]:
df = pci.clean_database(df)
df.head()


Out[3]:
Time Location Operator Route AC_Type Aboard Fatalities Ground Summary Origin Destination Fatalities_total Location_Country Accident_type
Date
1921-02-03 NaN Mendotta, Minnesota US Aerial Mail Service NaN De Havilland DH-4 1 (passengers:0 crew:1) 1 (passengers:0 crew:1) 0.0 Shortly after takeoff from Minneapolis-World C... NaN NaN 1.0 USA 0
1921-02-09 NaN La Crosse, Wisconsin US Aerial Mail Service NaN Junkers F-13 3 (passengers:0 crew:3) 3 (passengers:0 crew:3) 0.0 Crashed for unknown reasons. Both pilots and t... NaN NaN 3.0 USA 0
1921-02-15 NaN Gibraltar Aeropostale NaN Breguet 14 2 (passengers:0 crew:2) 2 (passengers:0 crew:2) 0.0 The mail fligh encountered poor weather condit... NaN NaN 2.0 Gibraltar 0
1921-02-22 NaN Elko, Nevada US Aerial Mail Service NaN De Havilland DH-4 1 (passengers:0 crew:1) 1 (passengers:0 crew:1) 0.0 Shortly after taking off, the aircraft stalled... NaN NaN 1.0 USA 0
1921-04-06 NaN Point Cook, Australia Military - Royal Australian Air Force NaN Avro 504 2 (passengers:0 crew:0) 2 (passengers:0 crew:0) 0.0 Shortly after taking off on a training flight,... NaN NaN 2.0 Australia 0

In [4]:
print('Total number of records: {}'.format(df.shape[0]))
print('Number of empty summaries: {}'.format(df[df.Summary.isnull()].shape[0]))


Total number of records: 5686
Number of empty summaries: 231

Let's try to cluster the summaries

We'll use Latent Dirichlet Allocation (LDA). LDA is a topic model that generates topics based on word frequencies across a set of documents, and it is particularly useful for finding reasonably accurate mixtures of topics within a given document set.
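
For intuition, here is a minimal, self-contained sketch of the gensim LDA API on a toy corpus (illustrative only; the toy_* names are ours and not part of the pipeline below):

from gensim import corpora, models

toy_texts = [['engine', 'fire', 'engine'],
             ['storm', 'wind', 'storm'],
             ['engine', 'failure', 'fire']]
toy_dictionary = corpora.Dictionary(toy_texts)
toy_corpus = [toy_dictionary.doc2bow(text) for text in toy_texts]

toy_lda = models.LdaModel(toy_corpus, id2word=toy_dictionary, num_topics=2, passes=5)
# each document comes back as a mixture of topics, e.g. [(0, 0.87), (1, 0.13)]
print(toy_lda.get_document_topics(toy_corpus[0]))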


In [5]:
# replace missing summaries with an empty string
df.Summary.fillna('', inplace=True)

In [6]:
splitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.TreebankWordTokenizer()
stopset = set(stopwords.words('english'))

In [7]:
def text_to_words(text):
    """Tokenize a summary into lowercased unigrams and bigrams, dropping stopwords."""
    result = []
    for sentence in splitter.tokenize(text):
        # remove punctuation and stopwords
        tokens = [token
                  for token in utils.tokenize(sentence, lowercase=True, deacc=True, errors="ignore")
                  if token not in stopset]

        result.extend(tokens)
        result.extend(' '.join(bigram) for bigram in nltk.ngrams(tokens, 2))

    return result
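
A quick sanity check of the tokenizer (assuming the standard NLTK English stopword list, so 'the' and 'after' are dropped):

text_to_words('The aircraft crashed shortly after takeoff.')
# -> ['aircraft', 'crashed', 'shortly', 'takeoff',
#     'aircraft crashed', 'crashed shortly', 'shortly takeoff']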

In [8]:
texts = [text_to_words(text) for text in df.Summary.values]

In [9]:
# create dictionary
dictionary = corpora.Dictionary(texts)

In [10]:
# get tokens that appear too often
popular_tokens = [token for token, frequency in dictionary.dfs.items() if frequency > 680]

for token in popular_tokens:
    print(dictionary.get(token))


aircraft
crashed
engine
crew
approach
flight
runway
failure
pilot
plane

In [11]:
# remove the overly frequent tokens from the dictionary
dictionary.filter_tokens(popular_tokens)

In [12]:
# create documents corpus
corpus = [dictionary.doc2bow(text) for text in texts]
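
For reference, doc2bow maps a token list to sparse (token_id, count) pairs; tokens that were filtered out of the dictionary are silently dropped. The ids below are placeholders, as they depend on the fitted dictionary:

dictionary.doc2bow(['fire', 'stalled', 'fire'])
# -> something like [(fire_id, 2), (stalled_id, 1)]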

The next question is: how many clusters do we have? Let's reduce the number of dimensions so we can visualize the data.


In [13]:
tfidf = TfidfModel(corpus, dictionary=dictionary, normalize=True)
corpus_tfidf = tfidf[corpus]
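
Each document is now a sparse list of (token_id, tf-idf weight) pairs; a quick peek at the first one (the exact ids and weights depend on the corpus):

print(corpus_tfidf[0][:5])
# -> e.g. [(0, 0.21), (3, 0.17), ...]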

We'll use gensim's module for Latent Semantic Analysis (a.k.a. Latent Semantic Indexing), which implements fast truncated SVD (Singular Value Decomposition).


In [53]:
np.random.seed(42)

# project to 2 dimensions for visualization
lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

In [54]:
# save 2D coordinates; project the tf-idf corpus, i.e. the representation the model was trained on
coords = []

for coord in lsi_model[corpus_tfidf]:
    # documents with empty summaries may project onto fewer than 2 dimensions
    if len(coord) > 1:
        coords.append((coord[0][1], coord[1][1]))

In [55]:
max_clusters = 10
clusters_num = range(1, max_clusters + 1)
inertias = np.zeros(max_clusters)

In [56]:
for cluster_num in clusters_num:
    kmeans = KMeans(cluster_num, random_state=42).fit(coords)
    # "inertia_" is sum of distances of samples to their closest cluster center
    inertias[cluster_num - 1] = kmeans.inertia_

In [286]:
plt.plot(clusters_num, inertias, "b*-")

plt.ylabel('Inertia')
plt.xlabel('K')
plt.title('Sum of squared distances of samples to their closest cluster center')
plt.show()



In [58]:
# the inertia curve flattens out around K = 6, so let's choose 6 clusters
final_clusters_number = 6

kmeans = KMeans(final_clusters_number, random_state=42).fit(coords)

In [281]:
# distribution of summaries per cluster
counter = Counter(kmeans.labels_)
plt.bar(list(counter.keys()), list(counter.values()))
plt.title('Distribution per cluster');



In [60]:
colors = ["g", "r", "m", "c", "y", "k"]

for i in range(len(coords)):
    plt.scatter(coords[i][0], coords[i][1], c=colors[kmeans.labels_[i]], s=10)
    
plt.title('Clusterization of the topics')
plt.show()


Build an LDA model with num_topics = 6


In [35]:
np.random.seed(42)

lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=final_clusters_number, passes=10)

In [63]:
joblib.dump(lda, 'dumps/lda.pkl')
# lda = joblib.load('dumps/lda.pkl')


Out[63]:
['dumps/lda.pkl']

In [36]:
lda.show_topics(final_clusters_number, num_words=10, formatted=False)


Out[36]:
[(0,
  [('crashed takeoff', 0.0019627148510899133),
   ('takeoff', 0.0014971220431443486),
   ('mountain poor', 0.00077419310143919261),
   ('poor weather', 0.00075855142118699587),
   ('poor', 0.00071142943923526304),
   ('weather', 0.00066648193278992149),
   ('mountain', 0.00061560579528629545),
   ('ft', 0.0005779274754642822),
   ('landing', 0.00048048182931915913),
   ('crashed burned', 0.00046000037823708631)]),
 (1,
  [('crashed shortly', 0.001054787198294648),
   ('taking', 0.0010225011707444053),
   ('shot', 0.00098489588862058245),
   ('shortly', 0.0009213643819228173),
   ('shortly taking', 0.00082935510809260609),
   ('rebels', 0.00056728356805099754),
   ('takeoff', 0.00048241039372204158),
   ('forces', 0.00047834136067647997),
   ('crashed taking', 0.00046165310804528314),
   ('crashed take', 0.00043040271906388984)]),
 (2,
  [('control', 0.0010502088750443317),
   ('landing', 0.00092092187905057344),
   ('takeoff', 0.0009146352097339918),
   ('taking', 0.00090008645589918031),
   ('airport', 0.00089582445651587613),
   ('cargo', 0.00079947455997207413),
   ('loss', 0.0007929171135743747),
   ('engine failure', 0.0007754645440090737),
   ('lost', 0.0007507932824560048),
   ('emergency', 0.00074530935511506983)]),
 (3,
  [('mountain', 0.0018874761122867317),
   ('weather', 0.0013794695081746587),
   ('struck', 0.0012922257034500698),
   ('conditions', 0.0012582128002179029),
   ('crashed mountain', 0.0011850419722558676),
   ('struck mountain', 0.0010828233147423577),
   ('vfr', 0.0010695278489258856),
   ('weather conditions', 0.0010335208682359194),
   ('vfr flight', 0.0010314754919316961),
   ('adverse weather', 0.00094256458803784705)]),
 (4,
  [('cargo', 0.0016886585286911954),
   ('cargo plane', 0.0016642886151924806),
   ('crashed en', 0.0016306859176653998),
   ('en route', 0.0015256158586003919),
   ('en', 0.0015242060318883235),
   ('route', 0.0014915714680539341),
   ('crashed sea', 0.0014141533291409597),
   ('attempting', 0.0013950369684239401),
   ('land', 0.0013677125744226055),
   ('attempting land', 0.0013442038206933119)]),
 (5,
  [('crashed approach', 0.0010846626608930639),
   ('route', 0.00099963700396901289),
   ('mountain', 0.00096211368805744035),
   ('en route', 0.00094225686483102704),
   ('en', 0.00094031223631040137),
   ('mountains', 0.00072381263546614021),
   ('mountain en', 0.00067956609645579912),
   ('crashed mountain', 0.00062850719720220481),
   ('crashed mountains', 0.00057469077337268478),
   ('mountainside', 0.00047827880468909954)])]

In [37]:
# approximate, human-readable names for the topics

summaries_topics = [
    'takeoff',
    'crashed shortly',
    'landing',
    'weather conditions',
    'en route',
    'crashed in'
]

Model interpretation


In [47]:
def compute_topic_summary_matrix(model, corpus, summaries_number):
    """Build a (topics x summaries) matrix of topic probabilities."""
    ts_matrix = pd.DataFrame(
        data=np.zeros((model.num_topics, summaries_number)), columns=range(summaries_number)
    )

    for i in range(summaries_number):
        for topic, prob in model.get_document_topics(corpus[i]):
            ts_matrix.loc[topic, i] += prob

    return ts_matrix

In [107]:
plt.figure(figsize=(10, 6))
seaborn.heatmap(compute_topic_summary_matrix(lda, corpus_tfidf, 20).transpose())

plt.title('Heatmap of the summary topics')
# center the 6 topic labels under the heatmap columns
plt.xticks(np.arange(final_clusters_number) + 0.5, summaries_topics, rotation=50, ha='center');



In [38]:
def get_topic_summary(summary_number):
    """Return the name of the most probable topic for the given summary."""
    summary_topics = lda.get_document_topics(corpus_tfidf[summary_number])
    summary_topics.sort(key=lambda tup: tup[1], reverse=True)

    return summaries_topics[summary_topics[0][0]]
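
For example (hypothetical output; the label depends on the fitted model):

get_topic_summary(0)
# -> e.g. 'takeoff'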

In [39]:
# assign the dominant topic label to every record
df['Summary_topic'] = [get_topic_summary(i) for i in range(df.shape[0])]

In [282]:
df.Summary_topic.value_counts().sort_values(ascending=True).plot.barh()
plt.title('Distribution of the topics');



In [283]:
s = df.groupby(by=['Summary_topic']).Fatalities_total.sum().sort_values(ascending=True)
s.plot.barh(title='Total fatalities per summary topic');



In [291]:
grouped = df.groupby(by=['Summary_topic', (df.index.year // 10) * 10]).size()

margin = np.arange(-3.75, 3.75, 1.25)
colors = ["c", "m", "y", "g", "k", "r"]

plt.figure(figsize=(12, 8))
ax = plt.subplot()
legend = []

for i, topic in enumerate(grouped.index.levels[0]):
    # use each topic's own decade index so the bars stay aligned even if a topic misses a decade
    xx = ax.bar(np.array(grouped[topic].index) - margin[i], grouped[topic].values, color=colors[i], width=1.2)
    legend.append(xx)

plt.legend(legend, grouped.index.levels[0])
plt.title('Number of accidents per topic by decade')
plt.show()



In [292]:
grouped = df.groupby(by=['Summary_topic', (df.index.year // 10) * 10]).Fatalities_total.sum()

margin = np.arange(-3.75, 3.75, 1.25)
colors = ["c", "m", "y", "g", "k", "r"]

plt.figure(figsize=(12, 8))
ax = plt.subplot()
legend = []

for i, topic in enumerate(grouped.index.levels[0]):
    # use each topic's own decade index so the bars stay aligned even if a topic misses a decade
    xx = ax.bar(np.array(grouped[topic].index) - margin[i], grouped[topic].values, color=colors[i], width=1.2)
    legend.append(xx)

plt.legend(legend, grouped.index.levels[0])
plt.title('Total fatalities per topic by decade')
plt.show()


