View CPT Results for experiment "{}"


In [ ]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Make the graphs a bit prettier.
# The 'display.mpl_style' option was deprecated in pandas 0.18 and removed in later
# releases; setting a missing option raises pandas' OptionError (a KeyError subclass).
# In that case fall back to one of matplotlib's own style sheets.
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    plt.style.use('ggplot')
# Set after the style so the style cannot override the figure size.
plt.rcParams['figure.figsize'] = (10, 7)

Topic word perplexity


In [ ]:
import os

def load_perplexity_results(results_dir):
    """Read the topic and opinion perplexity tables of an experiment.

    Parameters
        results_dir : str - directory containing perplexity_topic.csv and perplexity_opinion.csv

    Returns tuple (perplexity_topic, perplexity_opinion) of Pandas DataFrames; the first
    column of each csv file is used as the index.
    """
    tables = []
    # Topic table first, opinion table second: callers unpack the result in this order.
    for file_name in ('perplexity_topic.csv', 'perplexity_opinion.csv'):
        csv_path = os.path.join(results_dir, file_name)
        tables.append(pd.read_csv(csv_path, index_col=0))

    return tuple(tables)

In [ ]:
# Directory that holds the result csv files read by the loader functions in this notebook.
# NOTE(review): '{}' looks like a template placeholder that is filled in when the
# notebook is generated for a specific experiment - confirm.
results_dir = '{}'

In [ ]:
# Load both perplexity tables, then plot every column of the topic word perplexity table.
perplexity_topic, perplexity_opinion = load_perplexity_results(results_dir)
perplexity_topic.plot();

In [ ]:
# change intervals if needed
# These are column labels of the perplexity tables; they select the columns shown in the
# restricted perplexity plots further down.
intervals = ['20', '100', '180', '200']

In [ ]:
# Plot only the topic word perplexity columns listed in `intervals`.
perplexity_topic[intervals].plot();

Opinion word perplexity


In [ ]:
# Plot every column of the opinion word perplexity table.
perplexity_opinion.plot();

In [ ]:
# Plot only the opinion word perplexity columns listed in `intervals`.
perplexity_opinion[intervals].plot();

Results for topics and opinions


In [ ]:
# choose optimal number of topics based on perplexity results
# 0 is only a placeholder: the cells below read files whose names contain this number
# (e.g. theta_<nTopics>.csv), so set it to a value for which results exist in results_dir.
nTopics = 0

In [ ]:
# load results
import glob
import os
import re

def load_experiment_results(nTopics, results_dir):
    """Load the result tables of an experiment for a given number of topics.

    Parameters
        nTopics : int - number of topics; selects the files whose names end in _<nTopics>.csv
        results_dir : str - directory containing the result csv files

    Returns tuple (theta, phi_topic_t, phis_opinion_t)
        theta : Pandas DataFrame read from theta_<nTopics>.csv
        phi_topic_t : Pandas DataFrame read from topics_<nTopics>.csv
        phis_opinion_t : dict mapping perspective name to the Pandas DataFrame read from
            opinions_<perspective name>_<nTopics>.csv
    """
    # document topic matrix (theta)
    theta = pd.read_csv(os.path.join(results_dir, 'theta_{}.csv'.format(nTopics)), index_col=0)

    # phi topic (transposed)
    phi_topic_t = pd.read_csv(os.path.join(results_dir, 'topics_{}.csv'.format(nTopics)), index_col=0, encoding='utf-8')

    # phi opinion (transposed) per perspective
    prefix = 'opinions_'
    suffix = '_{}.csv'.format(nTopics)
    opinion_files = glob.glob(os.path.join(results_dir, '{}*{}'.format(prefix, suffix)))
    phis_opinion_t = {}
    for f in opinion_files:
        # The glob pattern guarantees that the file name starts with `prefix` and ends with
        # `suffix`, so slicing both off leaves exactly the perspective name. Working on the
        # base name keeps directory names out of the match, and slicing (instead of
        # str.replace) keeps perspective names that themselves contain '_<nTopics>' intact.
        name = os.path.basename(f)[len(prefix):-len(suffix)]
        phis_opinion_t[name] = pd.read_csv(f, index_col=0, encoding='utf-8')
    return theta, phi_topic_t, phis_opinion_t

In [ ]:
# Load theta, the topic word table and the per-perspective opinion word tables for the
# chosen number of topics; the cells below read these variables.
theta, phi_topic_t, phis_opinion_t = load_experiment_results(nTopics, results_dir)

'Weight' of the most important topic per document

Each topic has a certain weight in the document. We would like to see one (or a few) topics with high weight per document. Below are some basic statistics on the most important topic (highest weight) per document.


In [ ]:
def print_topic_weights(theta):
    """Print basic statistics of the highest topic weight of every document.

    Parameters
        theta : Pandas DataFrame - document topic matrix; the maximum is taken over each row

    Prints the mean, standard deviation, minimum, maximum and median of the row maxima.
    """
    # Highest weight per row. No sorting is needed: all statistics below are independent
    # of the order of the values.
    top_weights = theta.max(axis=1)

    # A parenthesised single argument prints the same text on Python 2 and Python 3.
    print('Mean weight of most important topic: {} (std: {})'.format(top_weights.mean(), top_weights.std()))
    print('Min and max of most important topic: min: {}, max: {}'.format(top_weights.min(), top_weights.max()))
    print('{} {}'.format('Median weight of most important topic: ', top_weights.median()))

# Print the statistics for the loaded document topic matrix.
print_topic_weights(theta)

Line graph of the sorted topic weights:


In [ ]:
def plot_topic_weights(theta):
    """Plot, for every document, its topic weights sorted from highest to lowest.

    Parameters
        theta : Pandas DataFrame - document topic matrix (one row per plotted line)

    Draws one line per row of theta, without a legend. theta itself is not modified.
    """
    # Sort every row in descending order with numpy. A row-wise DataFrame.apply that
    # returns a list yields a Series of lists on pandas >= 0.23, which cannot be plotted.
    sorted_weights = np.sort(theta.values, axis=1)[:, ::-1]
    # Keep theta's labels so the transposed frame has the same axes as theta.
    sorted_frame = pd.DataFrame(sorted_weights, index=theta.index, columns=theta.columns)
    sorted_frame.transpose().plot(legend=None)

# Draw the sorted topic weights for the loaded document topic matrix.
plot_topic_weights(theta)

Number of topics per document

The number of topics per document is calculated by counting the number of topics with a weight above a certain threshold. The number of topics per document should (roughly) be equivalent to the number of pm:topics per document (however, at the moment we do not have statistics on this).


In [ ]:
def display_number_of_topics_per_document(theta, threshold=0.05):
    """Print and plot how many topics per document have a weight above a threshold.

    Parameters
        theta : Pandas DataFrame - document topic matrix; topics are counted per row
        threshold : float - a topic is counted when its weight is strictly greater than this

    Prints how often each topic count occurs and draws a histogram of the counts with one
    bin per distinct count value.
    """
    # Vectorised comparison: number of weights above the threshold in every row.
    topic_counts = (theta > threshold).sum(axis=1)
    count_frequencies = topic_counts.value_counts(sort=False)
    # A parenthesised single argument prints the same text on Python 2 and Python 3.
    print(count_frequencies)
    topic_counts.hist(bins=len(count_frequencies), figsize=(5,3))

# Uses the default threshold of 0.05.
display_number_of_topics_per_document(theta)

Topics


In [ ]:
def show_topic(t, top=10):
    """Return dataframe containing <top> topic words, weights of topic words and per perspective opinion words and
    weights of opinion words.

    The word tables are not passed in: the function reads the module-level variables
    phi_topic_t (DataFrame) and phis_opinion_t (dict of perspective name -> DataFrame).

    Parameters
        t : str - index of topic number (a column label of the word tables)
        top : int - the number of words to store in the dataframe

    Returns Pandas DataFrame with columns topic_<t>, weights_topic_<t> and, for every
    perspective p, opinion_<t>_<p> and weights_opinion_<t>_<p>
    """
    def top_words(weights, labels):
        # sort_values returns a new Series, so the source table is left untouched;
        # reset_index turns the words (the index) into a regular column.
        best = weights.sort_values(ascending=False).iloc[:top].reset_index()
        best.columns = labels
        return best

    dfs = [top_words(phi_topic_t[t], ['topic_{}'.format(t), 'weights_topic_{}'.format(t)])]

    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
    for p, o in phis_opinion_t.items():
        dfs.append(top_words(o[t], ['opinion_{}_{}'.format(t, p), 'weights_opinion_{}_{}'.format(t, p)]))

    # Every part has a fresh 0..top-1 index, so the parts line up row by row.
    return pd.concat(dfs, axis=1)

def print_topics(phi_topic_t, phis_opinion_t, nTopics):
    """Print the top topic words and top opinion words (per perspective) of every topic.

    If the file jsd_<nTopics>.csv exists in the module-level results_dir, topics are printed
    in descending order of its 'jsd' column and the jsd value is printed with each topic;
    otherwise topics 0 .. nTopics-1 are printed in numerical order.

    Parameters
        phi_topic_t : Pandas DataFrame - not used by this function itself; show_topic reads
            the module-level variable of the same name
        phis_opinion_t : dict - perspective name -> DataFrame; its keys determine which
            opinion columns are printed
        nTopics : int - number of topics
    """
    jsd = None
    jsdFile = os.path.join(results_dir, 'jsd_{}.csv'.format(nTopics))
    # print('') instead of a bare print: it emits an empty line on Python 2 and Python 3.
    if os.path.isfile(jsdFile):
        print('Ordering topics by Jensen-Shannon divergence of the opinions')
        print('')
        jsd = pd.read_csv(jsdFile, index_col=0)
        ranked = jsd.sort_values('jsd', ascending=False)
        print('Min jsd: {}, max jsd: {}'.format(np.min(jsd['jsd']), np.max(jsd['jsd'])))
        print('Average jsd: {} (std: {}), median jsd: {}'.format(np.mean(jsd['jsd']), np.std(jsd['jsd']), np.median(jsd['jsd'])))
        print('')
        loop = ranked.index
    else:
        print('Ordering topics by topic number')
        print('(To order topics by Jensen-Shannon divergence of the opinions run the experiment_jsd_opinions script.)')
        print('')
        loop = range(nTopics)
    for i in loop:
        if jsd is not None:
            print('JSD: {}'.format(jsd.at[i, 'jsd']))
        # Only the word columns are shown; the weight columns of show_topic are left out.
        pLabels = ['opinion_{}_{}'.format(i, p) for p in phis_opinion_t]
        print(show_topic(str(i))[['topic_{}'.format(i)] + pLabels])
        print('')

In [ ]:
# Print all topics with their opinion words for the loaded results.
print_topics(phi_topic_t, phis_opinion_t, nTopics)