EDA for roletheme models


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the tab-separated metadata table, indexed by docid.
metadata_path = '../../metadata/filtered_fiction_plus_18c.tsv'
meta = pd.read_csv(metadata_path, sep = '\t', index_col = 'docid')

# Where a docid occurs more than once in the index, keep only its first row,
# so that meta.loc[docid, column] always yields a single value.
repeated_docid = meta.index.duplicated(keep = 'first')
meta = meta[~repeated_docid]

In [5]:
# Read the doctopic distributions.
#
# Each data row is tab-separated: fields[1] is a character id of the form
# "<docid>|...", and fields[3:] are parsed as that row's per-topic values
# (truncated to integers). Rows whose first field is 'book' are skipped
# (presumably the header row).

doctopic_path = '../roletheme/sixthmodelIII_doctopics.tsv'
charvectors = dict()        # charid -> int32 vector of topic values
yearcollections = dict()    # inferreddate of the volume -> list of those vectors

with open(doctopic_path, encoding = 'utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # A blank line (e.g. at the end of the file) splits to [''],
            # which has no charid field; skip it instead of raising IndexError.
            continue
        if fields[0] == 'book':
            continue

        charid = fields[1]
        docid = charid.split('|')[0]
        if docid not in meta.index:
            # Without a metadata row there is no date to file this character under.
            print('error: ', docid)
            continue

        yr = meta.loc[docid, 'inferreddate']

        vector = np.array([int(float(x)) for x in fields[3:]], dtype = 'int32')
        charvectors[charid] = vector
        yearcollections.setdefault(yr, []).append(vector)


error

In [8]:
# Collapse the per-character vectors into one topic distribution per year
# (yeardists), then pivot so that every topic has a year-indexed series
# (topicdists).

yeardists = dict()

for yr, collex in yearcollections.items():
    # Sum the vectors of all characters filed under this year and normalize,
    # so that each year's distribution sums to 1.
    charsum = np.sum(collex, axis = 0)
    dist = charsum / np.sum(charsum)
    yeardists[yr] = dist

if not yeardists:
    raise ValueError('yearcollections is empty: no character vectors to summarize')

yrmin = min(yeardists)
yrmax = max(yeardists)
yearspan = yrmax - yrmin

# `dist` still holds the last distribution computed above; only its length
# (the number of topics) matters here. A later cell also reads len(dist).
topicdists = dict()
for i in range(len(dist)):
    topicdists[i] = np.zeros(yearspan + 1)

# Walk the years that were actually observed rather than every integer in
# [yrmin, yrmax]: a year with no characters has no entry in yeardists, and
# looking it up would raise KeyError. Such years keep their initial 0.
for yr, yrdist in yeardists.items():
    for idx, value in enumerate(yrdist):
        topicdists[idx][yr - yrmin] = value

In [12]:
# Plot the yearly share of topics 80-87 across the whole date range.
xaxis = list(range(yrmin, yrmax + 1))

for i in range(80, 88):
    # Label each line so the topics can be told apart in the legend.
    plt.plot(xaxis, topicdists[i], label = 'topic ' + str(i))

plt.xlabel('inferred date')
plt.ylabel("share of the year's topic total")
plt.legend()
plt.show()



In [14]:
# topicdists holds exactly one series per topic, so its size is the topic count.
numtopics = len(topicdists)
numtopics


Out[14]:
240

In [13]:
# from Olivia Guest
# https://github.com/oliviaguest/gini

import numpy as np

def gini(array):
    """Calculate the Gini coefficient of a numpy array.

    The input is flattened, shifted so that no value is negative, offset by
    a tiny constant so that no value is exactly zero, and sorted ascending.
    The coefficient is then

        sum_i (2*i - n - 1) * x_i  /  (n * sum_i x_i),   i = 1..n

    Parameters
    ----------
    array : array-like of numbers
        Values of any shape and any real dtype (integers are accepted).
        The caller's data is never modified.

    Returns
    -------
    float
        Approximately 0 when all values are equal; approaches 1 as the total
        concentrates in a single element.

    Raises
    ------
    ValueError
        If the input contains no values.
    """
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d.
    # Work on a float64 copy: the in-place shifts below would raise a casting
    # error on an integer array, and the copy protects the caller's data.
    array = np.array(array, dtype = np.float64).flatten()
    if array.size == 0:
        raise ValueError('gini() requires at least one value')
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array += 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array)))

In [15]:
# One Gini coefficient per topic, measuring how unevenly that topic's yearly
# share is spread over the timeline.
ginis = [gini(topicdists[topic]) for topic in range(numtopics)]

In [16]:
# Mean Gini coefficient of the first 60 topics.
np.mean(ginis[:60])


Out[16]:
0.38929352277079848

In [17]:
# Mean Gini coefficient of the remaining topics (index 60 onward).
np.mean(ginis[60:])


Out[17]:
0.32898731529071001

In [18]:
# Spread (population standard deviation) of the Gini coefficients over all topics.
np.std(np.asarray(ginis))


Out[18]:
0.14075978899366212

In [20]:
from scipy.stats import ttest_ind

In [21]:
# Compare the two topic groups without assuming equal variances
# (equal_var = False selects Welch's t-test).
ttest_ind(
    ginis[:60],
    ginis[60:],
    equal_var = False,
)


Out[21]:
Ttest_indResult(statistic=2.5059210838104331, pvalue=0.014206687638479236)

In [22]:
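# So the themes (apparently topics 0-59, the group with the higher mean Gini
# coefficient) are significantly more concentrated along the x axis (time)
# than the roles (p = 0.014).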

In [24]:
# Overall size of each topic: its total over every character in every year,
# expressed as a share of the grand total.
topicsizes = np.zeros(numtopics)

for collex in yearcollections.values():
    topicsizes += np.sum(collex, axis = 0)

topicsizes /= topicsizes.sum()

In [30]:
# Topic size against temporal concentration, with the first 60 topics and
# the remaining topics drawn as separate, labelled groups.
plt.figure(figsize = (9, 9))
plt.scatter(topicsizes[0:60], ginis[0:60], marker = 'o', color = 'r', label = 'topics 0-59')
plt.scatter(topicsizes[60:], ginis[60:], marker = 'D', color = 'b', label = 'topics 60+')
# Only topics whose size falls inside this window are visible.
plt.xlim(0.001, 0.008)
plt.xlabel('topic size (share of all topic counts)')
plt.ylabel('Gini coefficient across years')
plt.legend()
plt.show()



In [ ]: