In [1]:
from gensim.models import AuthorTopicModel

# Load a saved AuthorTopicModel from 'model.atmodel' (path is relative to the
# notebook's working directory).
# NOTE(review): gensim's load() unpickles the file - only open model files
# that come from a trusted source.
model = AuthorTopicModel.load('model.atmodel')


C:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [2]:
# Display 20 topics; each entry is (topic_id, '<weight>*"<term>" + ...') listing
# the highest-weighted terms of that topic.
model.show_topics(num_topics=20)


Out[2]:
[(33,
  '0.006*"report" + 0.005*"made" + 0.003*"vocesendirecto" + 0.003*"fmi" + 0.003*"lipton" + 0.003*"weapons" + 0.003*"tech" + 0.002*"forest" + 0.002*"groundhog" + 0.002*"killing"'),
 (34,
  '0.008*"ElFinanciero_Mx Vacación" + 0.008*"Ernesto_MDiaz ManceraMiguelMX No va a decir" + 0.004*"nandorejas" + 0.004*"O sea" + 0.004*"enterado" + 0.004*"vacación" + 0.001*"CPEUM Menos" + 0.001*"cpeum" + 0.001*"JMarquezP Todo" + 0.001*"jmarquezp"'),
 (42,
  '0.089*"lomásyi" + 0.042*"notieress" + 0.014*"LoMásYI COLUMNA" + 0.010*"padremachorro" + 0.008*"México Entérate" + 0.008*"TU VOZ" + 0.008*"mazapereda" + 0.008*"libertadreligiosa" + 0.007*"siredingv" + 0.007*"COLUMNA   "'),
 (84,
  '0.007*"aclaró" + 0.005*"undato" + 0.005*"james" + 0.005*"engomado" + 0.005*"dm" + 0.005*"hoynocircula" + 0.005*"documentoíndigo" + 0.005*"‘Días de" + 0.004*"placas" + 0.004*"Opus …"'),
 (51,
  '0.052*"bbcmundo" + 0.018*"plumaje" + 0.013*"hoyenanimal" + 0.010*"eedienteanimal" + 0.007*"uanl" + 0.006*"pamelarogue" + 0.005*"elsabuesoap" + 0.004*"Observatorio del Feminicidio" + 0.004*"blogs" + 0.004*"austeridad"'),
 (13,
  '0.027*"arturozamora" + 0.024*"vrubiomarquez" + 0.020*"gdehoyoswalther" + 0.017*"fiorillo" + 0.017*"Carlos Fiorillo" + 0.014*"betoborge" + 0.006*"Crédito Público de SHCP_mx" + 0.006*"FedericoPatino_" + 0.005*"marthatagle" + 0.004*"curas"'),
 (57,
  '0.017*"puntosycomas" + 0.008*"Opinión “" + 0.008*"paezvarela" + 0.008*"Video Una" + 0.007*"columnadecolumnas" + 0.006*"lavan" + 0.006*"sinembargotv" + 0.006*"mundano" + 0.006*"giorgioromero" + 0.006*"cúpulas"'),
 (44,
  '0.000*"qaddafi" + 0.000*"Information Minister" + 0.000*"enfrentaremos" + 0.000*"davidfrum" + 0.000*"wrong" + 0.000*"w" + 0.000*"laying" + 0.000*"lines" + 0.000*"lot" + 0.000*"sounding"'),
 (43,
  '0.020*"pasedelista" + 0.017*"peñadebeserdespedidoya" + 0.016*"noalsilencio" + 0.015*"pasedelista_pm" + 0.007*"hekglez" + 0.005*"websanjuanamartinez" + 0.005*"rcanudasg" + 0.005*"esuncrimendeestado" + 0.004*"retweeted" + 0.004*"erendiritas"'),
 (29,
  '0.032*"astillero" + 0.009*"relevo" + 0.007*"Julio Astillero" + 0.006*"encabronados" + 0.006*"apocado" + 0.005*"unitario" + 0.005*"gatopardista" + 0.005*"obediente" + 0.004*"camisetas" + 0.004*"videocharla"'),
 (12,
  '0.036*"fernandozarates" + 0.009*"Rigo Salgado" + 0.009*"paobarquet" + 0.006*"Enrique Alfaro" + 0.006*"--&gt Escribe J_Fdz_Menendez" + 0.006*"profundizar" + 0.005*"estrenaron" + 0.005*"establezca" + 0.005*"cardoso" + 0.005*"elsillóndepensar"'),
 (35,
  '0.000*"qaddafi" + 0.000*"Information Minister" + 0.000*"enfrentaremos" + 0.000*"davidfrum" + 0.000*"wrong" + 0.000*"w" + 0.000*"laying" + 0.000*"lines" + 0.000*"lot" + 0.000*"sounding"'),
 (18,
  '0.058*"informaciónw" + 0.025*"asísopitas" + 0.022*"marchapormorelos" + 0.022*"marthadebayleenw" + 0.021*"deportesw" + 0.015*"elwesomx" + 0.013*"deporteswradio" + 0.008*"elfutbolentusoídos" + 0.007*"uaemorelos" + 0.006*"JC_Zuniga AlejandroGomezA Lechugol"'),
 (54,
  '0.013*"abrazos" + 0.010*"soriana" + 0.009*"ño" + 0.008*"Video La Banda MS" + 0.008*"resurge" + 0.006*"eléctricos" + 0.005*"riñas" + 0.005*"comipens" + 0.005*"sofocado" + 0.005*"ipc"'),
 (87,
  '0.053*"askmenlatam" + 0.053*"vía_askmenlatam" + 0.024*"autosrpmoficial" + 0.012*"empadron" + 0.006*"aficionado" + 0.005*"juanterrazas" + 0.005*"¡ATENCIÓN" + 0.004*"elhumordeterrazas" + 0.004*"porsche" + 0.004*"¡Siguen"'),
 (86,
  '0.031*"mileniohey" + 0.007*"collado" + 0.006*"ElFinanciero_mx Buen día" + 0.005*"sacc" + 0.004*" MilenioNegocios: ba_anderson y vivircomoreina" + 0.004*"azucenau" + 0.004*"handball" + 0.004*"la Ley de Vida Silvestre" + 0.004*"sacados" + 0.004*"apobación"'),
 (39,
  '0.047*"excélsiortv" + 0.025*"escribe" + 0.020*"excélsiorinforma" + 0.017*"lopezdoriga" + 0.014*"imagen_radio" + 0.013*"Imagen Radio" + 0.010*"másgrandequenunca" + 0.010*"escuchas" + 0.008*"excélsior" + 0.008*"registrarte"'),
 (62,
  '0.010*"williemty" + 0.006*"inédito" + 0.005*"Ciudad Universitaria de UNAM_MX" + 0.004*"excéntrico" + 0.004*"mileniohey" + 0.004*"Texto INÉDITO" + 0.004*"Marcelino Perelló No" + 0.004*"malia" + 0.004*"extravagante" + 0.004*"G…"'),
 (74,
  '0.011*"rivapa" + 0.009*"lt" + 0.009*"spinoso" + 0.007*"jairavalosl" + 0.005*"condicionó" + 0.005*"latentación" + 0.005*"ejecentral" + 0.004*"pilares" + 0.004*"flor" + 0.004*"presidida"'),
 (49,
  '0.013*"El Universal" + 0.008*"lacronicadehoy" + 0.008*"Portada Milenio" + 0.007*"Portada Reforma" + 0.005*"Portada Excelsior" + 0.003*"ManceraMiguelMX CDMX_Semovi Le…" + 0.003*"CDMX_Semovi Dejaría subir" + 0.003*"INEMexico Las" + 0.003*"desorganizado" + 0.003*"previsor"')]

In [ ]:
# Map every author name known to the model to that author's topic distribution.
aut_top = {
    author_name: model.get_author_topics(author_name)
    for author_name in model.id2author.values()
}

In [ ]:
# Display the full author -> topics mapping (one entry per author, so the
# output can be long).
aut_top

In [3]:
%%time
from sklearn.manifold import TSNE
# Authors with fewer documents than this are left out of the embedding.
smallest_author = 0
tsne = TSNE(n_components=2, random_state=0)

# Integer ids of the authors that pass the document-count threshold.
authors = [
    author_id
    for author_name, author_id in model.author2id.items()
    if len(model.author2doc[author_name]) >= smallest_author
]

# The return value is discarded on purpose; the 2-D coordinates are read
# from tsne.embedding_ afterwards.
_ = tsne.fit_transform(model.state.gamma[authors, :])


Wall time: 1.33 s

In [4]:
from bokeh.io import output_file
# Direct Bokeh output to the standalone HTML file 'grafica.html'.
output_file('grafica.html')

In [5]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

# 2-D t-SNE coordinates, one row per entry in `authors`.
x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]
author_names = [model.id2author[author_id] for author_id in authors]

# Marker radius is proportional to the number of documents of each author.
scale = 0.01
author_sizes = [len(model.author2doc[name]) for name in author_names]
radii = [n_docs * scale for n_docs in author_sizes]

source = ColumnDataSource(data={
    'x': x,
    'y': y,
    'author_names': author_names,
    'author_sizes': author_sizes,
    'radii': radii,
})

# Tooltip fields refer to the columns of `source` by name.
hover = HoverTool(tooltips=[
    ("author", "@author_names"),
    ("size", "@author_sizes"),
])

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [6]:
from gensim.similarities import MatrixSimilarity


# Build a similarity index from the model's output for the list of all author
# names.
# NOTE(review): `index` is not referenced by any of the cells visible here -
# confirm it is still needed.
index = MatrixSimilarity(model[list(model.id2author.values())])

In [7]:
from gensim import matutils
import pandas as pd


# Topic distribution of every author, in the iteration order of
# model.id2author.values(); get_sims() returns its scores in this same order.
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

def similarity(vec1, vec2):
    """Return a similarity score derived from the Hellinger distance.

    Both vectors are expanded to dense arrays of length ``model.num_topics``
    before the distance is computed.  The score is ``1 / (1 + distance)``:
    1.0 when the distance is zero, smaller as the distance grows.

    vec1, vec2 -- sparse topic vectors (presumably lists of
    ``(topic_id, weight)`` pairs as returned by ``model.get_author_topics``).
    """
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    return 1.0 / (1.0 + matutils.hellinger(dense1, dense2))

def get_sims(vec):
    """Return the similarity of ``vec`` to every vector in ``author_vecs``.

    The result is a list with one score per entry of ``author_vecs``, in the
    same order.
    """
    return [similarity(vec, other) for other in author_vecs]

def get_table(name, top_n=10, smallest_author=1):
    """Rank the model's authors by topic similarity to ``name``.

    Parameters
    ----------
    name : str
        Author to compare against; passed to ``model.get_author_topics``.
    top_n : int
        Maximum number of rows to return.
    smallest_author : int
        Authors with fewer documents than this are left out of the table.

    Returns
    -------
    pandas.DataFrame
        Columns ``Author``, ``Score`` and ``Size`` (the author's document
        count), sorted by ``Score`` in descending order and cut to the first
        ``top_n`` rows.  The queried author is not excluded from the table.
    """
    sims = get_sims(model.get_author_topics(name))

    # `author_vecs` (and therefore `sims`) follows the iteration order of
    # model.id2author.values(), so pair names with scores by zipping that same
    # sequence instead of treating the list position as an author id.
    table = []
    for author_name, sim in zip(model.id2author.values(), sims):
        author_size = len(model.author2doc[author_name])
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))

    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    # Explicit positional slice: keep the top_n best-scoring rows.
    return df.sort_values('Score', ascending=False).iloc[:top_n]

In [8]:
# Rank authors by similarity to 'Pajaropolitico'.  top_n=136 is larger than the
# 116 rows in the recorded output, so no row is cut off by top_n.
get_table('Pajaropolitico',top_n=136)


Out[8]:
Author Score Size
43 Pajaropolitico 1.000000 726
111 sdpnoticias 0.732948 53
108 revistaproceso 0.723680 386
31 LaRazon_mx 0.718128 624
39 NoticiasMVS 0.716460 1040
49 Reforma 0.715041 827
73 diario24horas 0.713279 1099
93 lacronicadehoy 0.713079 699
40 Notimex 0.711929 865
47 PublimetroMX 0.711838 749
38 NTelevisa_com 0.708287 1205
59 UnoNoticias 0.703522 441
96 lopezdoriga 0.703421 658
77 elsolde_mexico 0.702873 721
95 lasillarota 0.700213 949
25 Foro_TV 0.696832 1039
57 SinEmbargoMX 0.696292 935
106 politicomx 0.695946 352
35 MarioBeteta 0.694800 439
50 Reporte_Indigo 0.694313 641
37 Milenio 0.691894 803
101 mileniotv 0.690583 122
113 sopitas 0.689154 235
19 El_Universal_Mx 0.688992 1354
22 ExpansionMx 0.688389 1073
42 PabloHiriart 0.688014 27
76 eleconomista 0.685757 420
36 MexicanTimes 0.685367 353
94 lajornadaonline 0.684938 467
91 karlaiberia 0.683972 47
... ... ... ...
68 arturoangel20 0.628420 89
23 FedericoArreola 0.626903 153
100 maurimm 0.626082 1
107 puigcarlos 0.626004 15
11 DeniseDresserG 0.625112 91
78 epigmenioibarra 0.624345 236
71 carlosramirezh 0.622162 59
112 sergioaguayo 0.616861 14
5 AristeguiOnline 0.615096 604
46 Pizu 0.614866 42
105 panchogarfias 0.610419 2
32 LeonKrauze 0.608878 82
7 Canal22 0.607204 281
85 javieraparicio 0.606412 9
69 beltrandelrio 0.606350 103
114 soyfdelrincon 0.605954 21
13 DiegoEOsorno 0.603251 5
89 jshm00 0.596335 9
98 macariomx 0.596170 350
0 ADNPolitico 0.577771 313
4 AnaPOrdorica 0.570918 46
1 ActualidadRT 0.570749 842
72 caudillomx 0.570663 1
104 nytimes 0.546137 545
67 andreslajous 0.537447 30
66 alvaro_delgado 0.531203 29
2 Adela_Micha 0.527842 32
64 aguilarcamin 0.523403 28
3 Amsalazar 0.516466 30
65 ahope71 0.514726 55

116 rows × 3 columns


In [ ]: