In [1]:
import random
import palettable
import pandas as pd
from time import sleep
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
def main():
    """Build and upload one bar chart per LDA run and return the upload results.

    For every topic count from 5 to 15 this reads two header-less CSV files,
    ``../../results/pd_topics_<n>.csv`` (topic assignments) and
    ``../../results/topics_definitions_<n>.csv`` (topic definitions), counts
    the rows assigned to each topic, and uploads a bar chart named
    ``topic<n>`` through ``py.plot``. The text attached to each bar is the
    first ten whitespace-separated tokens of that topic's definition, joined
    with ``', '``.

    Returns:
        list: the values returned by ``py.plot``, one per topic count, in
        ascending order of topic count.
    """
    first_n, last_n = 5, 15
    results_dir = '../../results/'

    # Urls
    plots = []

    # Colors: 8 + 6 = 14 hex colours, shuffled with a fixed seed so the colour
    # order is the same on every run and in every chart.
    palette = (palettable.colorbrewer.qualitative.Accent_8.hex_colors +
               palettable.colorbrewer.qualitative.Dark2_6.hex_colors)
    random.seed(18675309)
    random.shuffle(palette)

    for n_topics in range(first_n, last_n + 1):
        assignments_csv = results_dir + 'pd_topics_' + str(n_topics) + '.csv'
        definitions_csv = results_dir + 'topics_definitions_' + str(n_topics) + '.csv'
        filename = 'topic' + str(n_topics)

        # Topic assignments
        assignments = pd.read_csv(assignments_csv, header=None)
        assignments.columns = ['tokenized_description', 'topic']
        # Drop rows whose tokenized description is the literal string '[]'.
        assignments = assignments[assignments['tokenized_description'] != '[]']
        topics = pd.DataFrame(
            {'count': assignments.groupby('topic')['topic'].count()}
        ).reset_index()

        # Topic definitions: keep only the first ten words of each definition.
        definitions = pd.read_csv(definitions_csv, header=None)
        definitions.columns = ['topic', 'words']
        definitions['words'] = definitions['words'].apply(
            lambda words: ', '.join(words.split()[:10]))

        # Merge (inner join: only topics present in both files are charted)
        merged = pd.merge(topics, definitions, on='topic')

        # One colour per bar. The palette has only 14 entries, so wrap around
        # when a chart has more bars than that (the 15-topic run); otherwise
        # the extra bar would be left without an assigned colour.
        bar_colors = [palette[k % len(palette)] for k in range(len(merged))]

        # Chart data
        data = Data([
            Bar(
                x=merged['topic'].tolist(),
                y=merged['count'].tolist(),
                text=merged['words'].tolist(),
                textfont=Font(
                    family='Times new roman'
                ),
                marker=Marker(
                    color=bar_colors
                ),
                # NOTE(review): bar outlines are usually configured under
                # marker.line -- confirm this trace-level `line` takes effect.
                line=Line(
                    color='white',
                    width=1.0
                )
            )
        ])
        layout = Layout(
            title='Purchase Orders per Topic',
            font=Font(
                family='Times new roman'
            ),
            showlegend=False
        )
        fig = Figure(data=data, layout=layout)

        # Figure
        plots.append(py.plot(fig, filename=filename, auto_open=False,
                             world_readable=True))

        # Sleep 30-60 s between uploads (presumably to go easy on the plotting
        # service -- confirm). Nothing follows the last upload, so skip it there.
        if n_topics != last_n:
            sleep(random.randint(30, 60))

    # Return
    return plots
In [2]:
# Build and upload every chart. `plots` is main()'s return list: one py.plot result per topic count, 5 to 15 in ascending order.
plots = main()
Note that the topic numbers are not consistent across graphs: topic 3 in one graph need not correspond to topic 3 in another. This phenomenon, known in the literature as label switching, arises because LDA is not an identifiable model.
The following plots show the 11 LDA results we created, one for each number of topics from 5 to 15. Hovering over a bar shows the top 10 words for that topic.
In [3]:
# Chart for the 5-topic LDA run (first entry appended by main()).
tls.embed(plots[0])
Out[3]:
In [4]:
# Chart for the 6-topic LDA run.
tls.embed(plots[1])
Out[4]:
In [5]:
# Chart for the 7-topic LDA run.
tls.embed(plots[2])
Out[5]:
In [6]:
# Chart for the 8-topic LDA run.
tls.embed(plots[3])
Out[6]:
In [7]:
# Chart for the 9-topic LDA run.
tls.embed(plots[4])
Out[7]:
In [8]:
# Chart for the 10-topic LDA run.
tls.embed(plots[5])
Out[8]:
In [9]:
# Chart for the 11-topic LDA run.
tls.embed(plots[6])
Out[9]:
In [10]:
# Chart for the 12-topic LDA run.
tls.embed(plots[7])
Out[10]:
In [11]:
# Chart for the 13-topic LDA run.
tls.embed(plots[8])
Out[11]:
In [12]:
# Chart for the 14-topic LDA run.
tls.embed(plots[9])
Out[12]:
In [13]:
# Chart for the 15-topic LDA run (last entry appended by main()).
tls.embed(plots[10])
Out[13]:
In [ ]: