In [1]:
    
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from matplotlib import pyplot as plt
import matplotlib, random
%matplotlib inline
    
In [2]:
    
# Load the fic50 table: every line is split on tabs, the second field is kept
# as that row's label and all fields from the third onward are parsed as a
# float32 vector.
doctopicpath = '../fic50/fic50_vols.tsv'
with open(doctopicpath, encoding = 'utf-8') as f:
    rows = [line.strip().split('\t') for line in f]

labels = [row[1] for row in rows]
samples = np.array([np.array(row[2 : ], dtype = 'float32') for row in rows])
samples.shape
    
    Out[2]:
In [3]:
    
# t-SNE is stochastic; pinning random_state makes the embedding (and every
# figure derived from it) identical on each Restart & Run All.
visualizer = TSNE(init = "pca", perplexity = 15, random_state = 0)
    
In [4]:
    
# Fit the t-SNE model on the fic50 vectors and keep the embedded coordinates.
newarray = visualizer.fit_transform(X = samples)
    
In [5]:
    
# Display the dimensions of the embedding as the cell output.
np.shape(newarray)
    
    Out[5]:
In [6]:
    
# First embedding column -> horizontal axis, second column -> vertical axis.
x, y = newarray[ : , 0], newarray[ : , 1]
    
In [7]:
    
# Unstyled look at the embedding before any colouring is applied.
plt.scatter(x = x, y = y)
plt.show()
    
    
In [8]:
    
# Read the volume metadata indexed by docid; when a docid occurs more than
# once only its first row is kept.
metapath = '../../metadata/filtered_fiction_plus_18c.tsv'
meta = pd.read_csv(metapath, sep = '\t', index_col = 'docid')
isrepeat = meta.index.duplicated(keep = 'first')
meta = meta.loc[~isrepeat]
    
In [9]:
    
# Colour value per point: the 'inferreddate' of the metadata row whose key is
# the part of the character id before '|', min-max rescaled to the 0-1 range.
dates = [meta.loc[charid.split('|')[0], 'inferreddate'] for charid in labels]
minimum, maximum = min(dates), max(dates)
labelcolors = (np.array(dates) - minimum) / (maximum - minimum)
    
In [10]:
    
# Colour value per point: one random number in [0, 1] is drawn for each
# distinct id prefix (the text before '|'), in order of first appearance, so
# all points sharing a prefix share a colour.
# Note: this rebinds `labelcolors`, replacing any earlier colouring.
maxnum = 0
docdict = {}
for doc in (charid.split('|')[0] for charid in labels):
    if doc not in docdict:
        docdict[doc] = random.uniform(0, 1)
labelcolors = [docdict[charid.split('|')[0]] for charid in labels]
    
In [13]:
    
# Scatter the embedding with each 0-1 colour value mapped through the
# rainbow colormap; small markers (s = 3) keep dense regions legible.
cmap = plt.cm.rainbow
plt.figure(figsize = (10, 10))
plt.scatter(x, y, s = 3, color = cmap(labelcolors))
    
    Out[13]:
    
In [14]:
    
# Load the biofic50 table: every line is split on tabs, the second field is
# kept as that row's label and all fields from the third onward are parsed as
# a float32 vector.
doctopicpath = '../biofic50/biofic50_viz.tsv'
with open(doctopicpath, encoding = 'utf-8') as f:
    rows = [line.strip().split('\t') for line in f]

biolabels = [row[1] for row in rows]
biofic = np.array([np.array(row[2 : ], dtype = 'float32') for row in rows])
biofic.shape
    
    Out[14]:
In [15]:
    
# t-SNE is stochastic; pinning random_state makes the embedding reproducible
# across kernel restarts.
tsnetransformer = TSNE(perplexity = 40, init = 'pca', random_state = 0)
    
In [16]:
    
# Embed the biofic vectors with `tsnetransformer` (perplexity 40), the model
# configured for this dataset in the preceding cell. The original call reused
# `visualizer` (perplexity 15, set up for the fic50 data), which left
# `tsnetransformer` defined but never used.
bioarray = tsnetransformer.fit_transform(biofic)
    
In [88]:
    
# Prepare per-marker coordinates, RGBA colours and sizes for the biofic
# scatter plot.
#
# Pass 1 reads ../dataprep/biofic2take.tsv (first field: character id, second
# field: a label) and stores a (colour code, alpha, marker) triple for each
# character id in `labeldict`:
#   * marker: 'v' when the label ends in 'f', 'o' when it ends in 'm',
#     's' otherwise (presumably female / male / unknown -- confirm against
#     the file that produces the labels);
#   * colour code: 'b' for ids in `detectives`, 'g' for ids in `historical`,
#     'k' for remaining labels starting with 'bio', 'r' for everything else;
#   * alpha: characters 3-6 of the label parsed as an integer date and
#     mapped linearly so that 1000 -> 0 and 2100 -> 1.
#
# Pass 2 walks `biolabels` in the row order of `bioarray` and sorts every
# point into the list for its marker.
#
# Changes from the previous version of this cell:
#   * the colour arrays are built from the points actually plotted instead of
#     being preallocated from row counts of the .tsv file, so they always
#     have exactly one row per point even when the two files disagree;
#   * the duplicated `date < 1900` branches (which assigned the same colour as
#     their fallbacks) are collapsed;
#   * the three copy-pasted per-marker branches share one palette lookup, and
#     green ('g') highlights are now drawn large and green for every marker
#     rather than only for 'v';
#   * alpha is clamped to 0-1 so an out-of-range date cannot yield an invalid
#     RGBA value, and blank lines in the file are skipped.
labeldict = dict()
detectives = {'11031|Holmes', '11594|Holmes', 
              'mdp.39015001153629|Poirot', 'mdp.39015061868363|MissMarple',
             'inu.30000007060282|Wimsey', 'mdp.39015003474866|Spade', 
              'uc1.$b808866|Ed'}
historical = {'12985|Diana', '12449|Flora', '5854|Rowena', '12449|MissBradwardine', '5854|Rebecca',
             'osu.32435083797019|Venetia', 'mdp.39015047606440|Serena'}

# RGB triple for each colour code on 'o' and 's' markers.
BASE_RGB = {'r': (1, 0, 0), 'b': (0, 0, 1), 'g': (0, 1, 0), 'k': (0, 0, 0)}
# 'v' markers carry a .4 blue tint on the two non-highlight colours.
TRIANGLE_RGB = dict(BASE_RGB, r = (1, 0, .4), k = (0, 0, .4))
# Colour codes that are drawn with the large marker size.
HIGHLIGHT_CODES = {'b', 'g'}
BIG_SIZE, SMALL_SIZE = 27, 7

with open('../dataprep/biofic2take.tsv', encoding = 'utf-8') as f:
    for line in f:
        if not line.strip():
            continue    # tolerate blank lines instead of failing on fields[1]

        fields = line.strip().split('\t')
        charid = fields[0]
        label = fields[1]
        date = int(label[3:7])
        # Clamp so the value is always a legal alpha channel.
        alpha = min(max((date - 1000) / 1100, 0.0), 1.0)

        if label.endswith('f'):
            shape = 'v'
        elif label.endswith('m'):
            shape = 'o'
        else:
            shape = 's'

        if charid in detectives:
            color = 'b'
        elif charid in historical:
            color = 'g'
        elif label.startswith('bio'):
            color = 'k'
        else:
            color = 'r'

        labeldict[charid] = (color, alpha, shape)

x = bioarray[ : , 0]
y = bioarray[ : , 1]

# One (xs, ys, rgba rows, sizes) bundle per marker shape.
styled = {shape: ([], [], [], []) for shape in ('v', 'o', 's')}
for idx, charlabel in enumerate(biolabels):
    color, alpha, shape = labeldict[charlabel]
    palette = TRIANGLE_RGB if shape == 'v' else BASE_RGB
    xs, ys, rgba, sizes = styled[shape]
    xs.append(x[idx])
    ys.append(y[idx])
    rgba.append(palette[color] + (alpha,))
    sizes.append(BIG_SIZE if color in HIGHLIGHT_CODES else SMALL_SIZE)

galx, galy, galrgba, galsizes = styled['v']
dudex, dudey, dudergba, dudesizes = styled['o']
otherx, othery, otherrgba, othersizes = styled['s']

# reshape(-1, 4) keeps the (n, 4) layout even when a marker has no points.
galcolors = np.array(galrgba, dtype = float).reshape(-1, 4)
dudecolors = np.array(dudergba, dtype = float).reshape(-1, 4)
othercolors = np.array(otherrgba, dtype = float).reshape(-1, 4)

# Number of plotted points per marker.
galctr = len(galx)
dudectr = len(dudex)
otherctr = len(otherx)
    
In [80]:
    
# Build one RGBA row (`labelcolors`) and one marker size (`labelsizes`) per
# entry of `biolabels`.
#
# Pass 1 reads ../dataprep/biofic2take.tsv (first field: character id, second
# field: a label) and stores a (colour code, alpha) pair per character id in
# `labeldict`.  Colour code, first match wins:
#   'b' ids in `detectives`          'g' ids in `historical`
#   'r' label ends in 'f'            'g' label ends in 'm' and starts 'bio'
#   'c' label ends in 'm' and starts 'fic'
#   'k' anything else
# Alpha comes from characters 3-6 of the label parsed as an integer date,
# mapped linearly so that 1750 -> 0 and 2010 -> 1.
# NOTE(review): 'g' is shared by the `historical` set and by 'bio...m' labels,
# so those two groups are drawn identically -- confirm that is intended.
#
# This cell rebinds `labeldict`, `detectives` and `historical`.
#
# Changes from the previous version of this cell:
#   * `labelcolors` is sized from len(biolabels) -- the sequence that indexes
#     it -- rather than from the number of lines in the .tsv file, so it
#     always has exactly one row per plotted point;
#   * alpha is clamped to 0-1 (a date before 1750 used to give a negative
#     alpha, which is not a valid RGBA component);
#   * blank lines in the file are skipped.
labeldict = dict()
detectives = {'11031|Holmes', '11594|Holmes', 
              'mdp.39015001153629|Poirot', 'mdp.39015061868363|MissMarple',
             'inu.30000007060282|Wimsey', 'mdp.39015003474866|Spade', 
              'uc1.$b808866|Ed'}
historical = {'12985|Diana', '12449|Flora', '5854|Rowena', '12449|MissBradwardine', '5854|Rebecca',
             '10651|Elizabeth', '1516|Elinor', '10651|Mrs.Bennet'}

# RGB triple for each colour code.
CODE_RGB = {'r': (1, 0, 0), 'b': (0, 0, 1), 'g': (0, 1, 0),
            'c': (0, 0.5, 0.5), 'k': (0, 0, 0)}

ctr = 0    # number of rows parsed from the file
with open('../dataprep/biofic2take.tsv', encoding = 'utf-8') as f:
    for line in f:
        if not line.strip():
            continue    # tolerate blank lines instead of failing on fields[1]
        ctr += 1
        fields = line.strip().split('\t')
        charid = fields[0]
        label = fields[1]
        date = int(label[3:7])
        # Clamp so the value is always a legal alpha channel.
        alpha = min(max((date - 1750) / 260, 0.0), 1.0)
        if charid in detectives:
            color = 'b'
        elif charid in historical:
            color = 'g'
        elif label.endswith('f'):
            color = 'r'
        elif label.endswith('m') and label.startswith('bio'):
            color = 'g'
        elif label.endswith('m') and label.startswith('fic'):
            color = 'c'
        else:
            color = 'k'
        labeldict[charid] = (color, alpha)

labelcolors = np.zeros((len(biolabels), 4))
labelsizes = []
for idx, charlabel in enumerate(biolabels):
    color, alpha = labeldict[charlabel]
    labelcolors[idx, : 3] = CODE_RGB[color]
    labelcolors[idx, 3] = alpha
    # Only the detectives ('b') get the large marker.
    labelsizes.append(25 if color == 'b' else 2)
    
In [89]:
    
# Convert each coordinate sequence to an array with at least two dimensions
# (ndmin = 2), one x/y pair per marker group.
galx, galy = (np.array(coords, ndmin = 2) for coords in (galx, galy))
dudex, dudey = (np.array(coords, ndmin = 2) for coords in (dudex, dudey))
otherx, othery = (np.array(coords, ndmin = 2) for coords in (otherx, othery))
    
In [90]:
    
# Draw the three marker groups on one 14x14 figure, in the order
# triangles, circles, squares; each layer brings its own colours and sizes.
plt.figure(figsize = (14, 14))
layers = [
    (galx, galy, galcolors, galsizes, "v"),
    (dudex, dudey, dudecolors, dudesizes, "o"),
    (otherx, othery, othercolors, othersizes, "s"),
]
for xs, ys, rgba, sizes, marker in layers:
    plt.scatter(xs, ys, color = rgba, s = sizes, marker = marker)
plt.show()
    
    
In [92]:
    
# Pairwise cosine-distance matrix over the rows of `biofic`.
#
# pdist computes every i < j distance in compiled code and squareform expands
# the condensed result into the full symmetric matrix with a zero diagonal --
# the same matrix the earlier nested Python loop filled in one pair at a time
# (values may differ in the last floating-point digits).  Because the
# computation no longer takes long, the per-100-rows progress prints are gone.
# `cosine` stays imported so it remains available for one-off pair checks.
from scipy.spatial.distance import cosine, pdist, squareform
numrows = biofic.shape[0]
distmat = squareform(pdist(biofic, metric = 'cosine'))
print(distmat.shape)
    
    
In [99]:
    
# Multidimensional scaling on the precomputed cosine-distance matrix.
# MDS starts from random configurations, so random_state is pinned to make the
# layout reproducible across kernel restarts.
from sklearn.manifold import MDS
mdsviz = MDS(dissimilarity = 'precomputed', random_state = 0)
biomds = mdsviz.fit_transform(distmat)
    
In [100]:
    
# Plot the two MDS coordinates using the per-point RGBA rows and sizes.
x, y = biomds[ : , 0], biomds[ : , 1]
plt.figure(figsize = (10, 10))
plt.scatter(x, y, s = labelsizes, color = labelcolors)
plt.show()
    
    
In [ ]: