viz_text_analysis_topic_assignments


Visualizing Topic Assignments

Imports


In [1]:
import numpy as np
import pandas as pd
import plotly.plotly as py
from plotly.graph_objs import *

Load


In [2]:
# Read the topic-count CSV (path is relative to this notebook) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../../results/topic_count_10.csv'
)

In [3]:
# Order rows by department, then by topic within each department.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is its replacement. Rebinding `df` (rather than using
# inplace=True) keeps the cell safe to re-run.
df = df.sort_values(['department_name', 'topic'])

Transform

Percentage Values (each topic's count as a fraction of its department's total count; values lie between 0 and 1)


In [4]:
# Sum `count` within each department and expose it as a two-column frame
# (department_name, total) so it can be merged back onto df.
totals = (
    df.groupby('department_name')['count']
    .sum()
    .to_frame('total')
    .reset_index()
)

In [5]:
# Attach each department's total to every row belonging to that department.
df = df.merge(totals, on='department_name')

In [6]:
# Share of the department's total count that falls in this row's topic.
share = df['count'] / df['total']
df['percentage'] = share

Matrix Form


In [7]:
# Reshape long -> wide: one row per `words` value, one column per department,
# cells holding the `percentage` computed for that pair.
matrix = df.pivot(
    index='words',
    columns='department_name',
    values='percentage'
)

In [8]:
# Display the pivoted frame. NaN cells are (words, department) pairs that have
# no row in df.
matrix


Out[8]:
department_name alpdc plant expenditures proj azcss general admin cchem instr ops cchem res research cdcdn bas mat mgmt stores ops cdcdn chem engineering svc ceeeg res research eeres microfab lab eeres res central eeres res projects ... iqbbb research jbssl ssl contracts grants mdnst ns research grants mepmb mb res grants ooebi depolymerization ooebi support centers physi ss atomic molecular physi ss condensed matter expe ukhds central housekeeping ukhds central maintenance
words
aldrich, sigma, technologies, life, england, biolabs, cell, vwr, international, reagent NaN 0.004376 0.134615 0.326571 0.085324 NaN 0.251889 0.064 0.259615 0.109890 ... 0.400000 0.013453 0.393881 0.250000 0.504918 0.163265 0.021053 0.069767 NaN NaN
dell, tips, lp, marketing, tip, scientific, rainin, digikey, pipet, usa 0.08 0.122538 0.050000 0.034142 0.027304 NaN 0.105793 0.040 0.168269 0.109890 ... 0.077437 0.542601 0.065010 0.122951 0.032787 0.102041 0.378947 0.139535 0.161850 0.004464
grainger, type, size, length, material, steel, head, width, dia, finish 0.12 0.019694 0.038462 0.016329 0.058020 0.813559 0.032746 0.208 0.009615 0.043956 ... 0.007799 0.022422 0.007648 0.012295 0.003279 0.004082 0.084211 0.116279 0.057803 0.750000
hp, max, office, cartridge, toner, packard, hewlett, black, yield, color 0.04 0.087527 0.076923 0.017318 0.030717 0.050847 0.030227 0.032 0.067308 0.043956 ... 0.023955 0.076233 0.011472 0.004098 0.003279 0.036735 0.010526 0.040698 0.069364 0.040179
laboratories, praxair, bio, spectrum, rad, chemicals, thorlabs, distribution, supply, sciences 0.08 0.024070 0.069231 0.117763 0.017065 0.118644 0.261965 0.028 0.216346 0.274725 ... 0.081894 0.107623 0.099426 0.061475 0.101639 0.089796 0.400000 0.296512 0.063584 0.040179
max, office, black, color, sanford, ink, size, point, fine, blue 0.16 0.157549 0.038462 0.021277 0.088737 NaN 0.015113 0.024 0.014423 0.021978 ... 0.026184 0.031390 0.034417 0.057377 0.013115 0.012245 0.021053 0.011628 0.011561 0.071429
office, max, color, size, label, 3m, private, white, tape, box 0.32 0.433260 0.065385 0.024245 0.122867 NaN 0.022670 0.076 0.067308 0.076923 ... 0.017270 0.076233 0.007648 0.036885 NaN 0.028571 NaN 0.046512 0.046243 0.071429
paper, office, white, max, boise, lb, cdw, color, carton, clark 0.20 0.146608 0.053846 0.023256 0.023891 NaN 0.045340 0.024 0.062500 0.131868 ... 0.029526 0.071749 0.028681 0.020492 0.003279 0.044898 0.063158 0.034884 0.456647 0.008929
scientific, fisher, thermo, chemical, glass, cs, bottle, cas, filter, fisherbrand NaN 0.002188 0.353846 0.326076 0.389078 NaN 0.151134 0.136 0.072115 0.109890 ... 0.171588 0.049327 0.187380 0.213115 0.288525 0.269388 0.010526 0.139535 0.005780 0.004464
vwr, international, scientific, label, private, fisher, cs, sterile, free, tube NaN 0.002188 0.119231 0.093023 0.156997 0.016949 0.083123 0.368 0.062500 0.076923 ... 0.164345 0.008969 0.164436 0.221311 0.049180 0.248980 0.010526 0.104651 0.127168 0.008929

10 rows × 25 columns


In [9]:
# Treat missing (words, department) pairs as a zero share.
matrix = matrix.fillna(value=0)

In [10]:
# Plain 2-D array copy of the frame's cell values (rows = matrix.index,
# columns = matrix.columns), used as the heatmap's z data.
matrix_vals = np.array(matrix.values)

Topics


In [11]:
# Row labels for the heatmap. They must be in the same order as the rows of
# `matrix` (and therefore of matrix_vals). Taking them from matrix.index
# guarantees that; df.words.unique() returns values in first-appearance order
# of df, which is not the row order produced by the pivot, so the labels
# could end up attached to the wrong rows.
topics = matrix.index.tolist()

Departments


In [12]:
# Column labels for the heatmap, in the same order as the columns of `matrix`.
# Converted to a plain list so both label sequences passed to the plot are
# ordinary Python lists read from the same frame as the z values.
depts = matrix.columns.tolist()

Heat Map


In [13]:
# One heatmap trace: z holds the cell values, y/x supply the row/column labels.
heatmap_trace = Heatmap(
    x=depts,
    y=topics,
    z=matrix_vals,
    colorscale='Greys',
    reversescale=True
)
data = Data([heatmap_trace])

# NOTE(review): both axes set an explicit `range` *and* autorange=True, and the
# x range upper bound (14.5) does not match the number of department columns
# shown above. Kept exactly as in the original -- confirm which is intended.
x_axis = XAxis(
    type='category',
    range=[-0.5, 14.5],
    autorange=True,
    autotick=False,
    showgrid=False,
    showline=False,
    showticklabels=True,
    ticklen=0,
    tickangle=-90,  # department names are drawn vertically
    tickfont=Font(size=8),
    titlefont=Font(size=10)
)

y_axis = YAxis(
    type='category',
    range=[-0.5, 9.5],
    autorange=True,
    showticklabels=True,
    ticklen=0,
    tickangle=0,
    tickfont=Font(size=8),
    titlefont=Font(size=10)
)

# Wide left / bottom margins leave room for the long tick labels.
plot_margin = Margin(l=350, r=0, b=150, t=50)

layout = Layout(
    title='Topic Assignment',
    xaxis=x_axis,
    yaxis=y_axis,
    margin=plot_margin
)

fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='basic-heatmap')


Out[13]:

In [ ]: