In [1]:
# NOTE(review): %pylab is discouraged (it does `from pylab import *`, polluting
# the namespace); prefer `%matplotlib inline` plus explicit imports — TODO confirm
# nothing below relies on the pylab namespace before changing it.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

1.7.2 Finding concepts in texts - Hierarchical Dirichlet Process.


In [61]:
import nltk
from tethne.readers import zotero
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import gensim
import networkx as nx
import pandas as pd

# Shared text-normalization resources used by normalize_token/filter_token below.
# (The SnowballStemmer previously created here was never used anywhere in this
# notebook — normalization is done with the WordNet lemmatizer — so it is removed.)
wordnet = nltk.WordNetLemmatizer()
stoplist = stopwords.words('english')  # common English function words to discard

In [3]:
# Corpus location: plain-text fulltext files plus a Zotero export with metadata.
text_root = '../data/EmbryoProjectTexts/files'
zotero_export_path = '../data/EmbryoProjectTexts'

# Fulltext files are named by their source URL, hence the 'https.+' fileid pattern.
corpus = nltk.corpus.PlaintextCorpusReader(text_root, 'https.+')
# index_by='link' keys metadata records by URL so they align with corpus fileids.
metadata = zotero.read(zotero_export_path, index_by='link', follow_links=False)

In [4]:
def normalize_token(token):
    """
    Convert token to lowercase, and lemmatize using WordNet.

    Parameters
    ----------
    token : str

    Returns
    -------
    token : str
    """
    return wordnet.lemmatize(token.lower())

def filter_token(token):
    """
    Decide whether ``token`` should be retained.

    A token is kept when it is purely alphabetic, longer than two
    characters, and not an English stopword.

    Parameters
    ----------
    token : str

    Returns
    -------
    keep : bool
    """
    lowered = token.lower()
    if lowered in stoplist:
        return False
    if not lowered.isalpha():
        return False
    return len(lowered) > 2

We will represent our documents as a list of lists. Each sub-list contains tokens in the document.


In [5]:
# Tokenize each document, keeping only informative tokens and normalizing them.
# Each sub-list holds the processed tokens of one document.
documents = []
for fileid in corpus.fileids():
    processed = [normalize_token(token)
                 for token in corpus.words(fileids=[fileid])
                 if filter_token(token)]
    documents.append(processed)

In [55]:
# Publication year for each document, aligned index-for-index with corpus.fileids().
years = [metadata[fileid].date for fileid in corpus.fileids()]

Further filtering

Topic modeling in Python (whether LDA or HDP) is a bit computationally expensive, so anything we can do to cut down on "noise" will help. Let's take a look at wordcounts and documentcounts to see whether we can narrow in on more useful terms.


In [6]:
# Corpus-wide token frequencies: total occurrences of each token.
wordcounts = nltk.FreqDist([token for document in documents for token in document])

In [7]:
# Plot the 20 most frequent tokens to eyeball candidates for filtering.
wordcounts.plot(20)



In [8]:
# Document frequencies: number of distinct documents each token appears in
# (set() deduplicates tokens within a document).
documentcounts = nltk.FreqDist([token for document in documents for token in set(document)])

In [9]:
# Plot the 80 tokens with the highest document frequency.
documentcounts.plot(80)


Here we filter the tokens in each document, preserving the shape of the corpus.


In [10]:
# Pruning thresholds, chosen by inspecting the frequency plots above:
# drop extremely frequent tokens, hapax-like tokens, and near-ubiquitous tokens.
MAX_WORDCOUNT = 2000     # discard tokens occurring >= 2000 times overall
MIN_DOCUMENTCOUNT = 1    # discard tokens appearing in <= 1 document
MAX_DOCUMENTCOUNT = 350  # discard tokens appearing in >= 350 documents

# Filter tokens in each document while preserving the corpus shape
# (one sub-list per document, possibly empty).
filtered_documents = [[token for token in document
                       if wordcounts[token] < MAX_WORDCOUNT
                       and MIN_DOCUMENTCOUNT < documentcounts[token] < MAX_DOCUMENTCOUNT]
                      for document in documents]

It's easier to compute over integers, so we use a Dictionary to create a mapping between words and their integer/id representation.


In [11]:
# Map each token to a unique integer id for efficient computation.
dictionary = gensim.corpora.Dictionary(filtered_documents)

The doc2bow() method converts a document (a sequence of tokens) into a bag-of-words representation.


In [12]:
# Bag-of-words representation: each document becomes a list of (token_id, count) pairs.
documents_bow = [dictionary.doc2bow(document) for document in filtered_documents]

We're ready to fit the model! We pass our BOW-transformed documents and our dictionary. Unlike LDA, the Hierarchical Dirichlet Process infers the number of topics from the data, so we don't specify a topic count in advance.


In [14]:
# Fit the Hierarchical Dirichlet Process model. Unlike LDA, HDP infers the
# number of topics from the data, so no topic count is specified here.
hmodel = gensim.models.HdpModel(documents_bow, id2word=dictionary)

In [29]:
# Show the top 5 terms for up to 200 topics.
# Parenthesized print works under both Python 2 (single argument) and Python 3;
# the bare `print t` statement was Python-2-only.
for t in hmodel.show_topics(topics=200, topn=5, formatted=True):
    print(t)


topic 0: 0.004*gene + 0.004*theory + 0.004*www + 0.004*wilson + 0.004*scientist
topic 1: 0.007*gene + 0.005*dna + 0.004*kammerer + 0.003*egg + 0.003*called
topic 2: 0.008*fistula + 0.007*stem + 0.007*autism + 0.007*hamlin + 0.004*campbell
topic 3: 0.006*child + 0.004*fetus + 0.003*kanner + 0.003*tiedemann + 0.003*baby
topic 4: 0.003*eugenics + 0.003*frog + 0.003*kammerer + 0.003*toad + 0.003*egg
topic 5: 0.009*egg + 0.004*dinosaur + 0.003*system + 0.003*cope + 0.003*death
topic 6: 0.004*abortion + 0.004*thalidomide + 0.003*franklin + 0.003*woman + 0.003*pregnancy
topic 7: 0.005*gene + 0.003*network + 0.003*abortion + 0.003*form + 0.003*protein
topic 8: 0.007*colborn + 0.006*endocrine + 0.005*chemical + 0.003*effect + 0.003*wildlife
topic 9: 0.006*cocaine + 0.004*drug + 0.004*thalidomide + 0.003*born + 0.003*pregnancy
topic 10: 0.016*court + 0.012*preembryos + 0.006*ivf + 0.006*case + 0.006*agreement
topic 11: 0.005*mendel + 0.004*body + 0.003*cuvier + 0.003*anatomy + 0.003*von
topic 12: 0.003*court + 0.003*preembryos + 0.003*organism + 0.002*haeckel + 0.002*russell
topic 13: 0.006*genome + 0.004*edward + 0.003*jones + 0.003*fire + 0.002*vitro
topic 14: 0.006*nucleus + 0.006*gurdon + 0.004*egg + 0.004*nuclear + 0.003*king
topic 15: 0.004*loeb + 0.004*gaur + 0.004*egg + 0.003*stem + 0.002*researcher
topic 16: 0.004*court + 0.003*case + 0.002*law + 0.002*ferguson + 0.002*stone
topic 17: 0.003*stopes + 0.003*sex + 0.002*woman + 0.002*test + 0.002*rock
topic 18: 0.004*regeneration + 0.002*theory + 0.002*regenerative + 0.002*smith + 0.002*organism
topic 19: 0.004*stem + 0.003*bush + 0.003*brook + 0.003*grobstein + 0.002*president
topic 20: 0.005*rice + 0.003*sachs + 0.003*golden + 0.003*plant + 0.003*ziegler
topic 21: 0.004*growth + 0.004*thompson + 0.003*form + 0.003*ngf + 0.002*nucleus
topic 22: 0.005*carrel + 0.005*tissue + 0.004*culture + 0.003*loeb + 0.002*park
topic 23: 0.004*fetal + 0.003*surgery + 0.003*stem + 0.003*fetus + 0.002*woman
topic 24: 0.004*haeckel + 0.004*egg + 0.003*sand + 0.003*dollar + 0.002*sea
topic 25: 0.003*west + 0.002*leonardo + 0.002*fetus + 0.002*implantation + 0.001*stem
topic 26: 0.004*gene + 0.003*cftr + 0.003*cystic + 0.003*fibrosis + 0.002*loeb
topic 27: 0.003*sex + 0.003*temperature + 0.002*lewis + 0.002*reproductive + 0.002*gift
topic 28: 0.004*waddington + 0.002*node + 0.002*sperm + 0.002*hensen + 0.001*catenin
topic 29: 0.002*fabrici + 0.002*bonnet + 0.002*organizer + 0.002*preformationism + 0.001*embryology
topic 30: 0.002*egg + 0.002*morphogenesis + 0.002*hertwig + 0.001*loeb + 0.001*process
topic 31: 0.004*jones + 0.003*york + 0.002*court + 0.002*case + 0.002*institute
topic 32: 0.003*baby + 0.002*hospital + 0.002*case + 0.002*doe + 0.001*parent
topic 33: 0.002*kurt + 0.002*placenta + 0.001*lab + 0.001*biological + 0.001*san
topic 34: 0.003*leonardo + 0.002*swammerdam + 0.002*egg + 0.001*homunculus + 0.001*preformationism
topic 35: 0.002*abortion + 0.002*pope + 0.002*briggs + 0.002*paul + 0.001*john
topic 36: 0.004*hartman + 0.002*streeter + 0.002*sheep + 0.002*goat + 0.002*carl
topic 37: 0.003*library + 0.002*clomiphene + 0.001*mbl + 0.001*law + 0.001*woman
topic 38: 0.003*department + 0.002*embryology + 0.002*carnegie + 0.002*elli + 0.001*sexuality
topic 39: 0.003*ebert + 0.002*abortion + 0.002*quickening + 0.001*james + 0.001*embryology
topic 40: 0.002*egg + 0.002*image + 0.001*vitro + 0.001*fertilization + 0.001*ivf
topic 41: 0.005*nerve + 0.004*growth + 0.003*sarcoma + 0.003*ganglion + 0.003*tumor
topic 42: 0.003*carrel + 0.002*culture + 0.002*tissue + 0.002*graaf + 0.001*marriage
topic 43: 0.003*mouse + 0.002*hopkins + 0.002*mintz + 0.002*john + 0.002*fertility
topic 44: 0.002*trinkaus + 0.002*john + 0.002*fertilization + 0.001*chang + 0.001*egg
topic 45: 0.003*corner + 0.002*waddington + 0.001*medical + 0.001*chick + 0.001*keenan
topic 46: 0.002*sperm + 0.001*discoideum + 0.001*model + 0.001*organism + 0.001*mold
topic 47: 0.003*pope + 0.002*lecture + 0.001*abortion + 0.001*gregory + 0.001*sixtus
topic 48: 0.003*regeneration + 0.002*morgan + 0.002*plan + 0.001*part + 0.001*process
topic 49: 0.002*trembley + 0.001*polyp + 0.001*mother + 0.001*teresa + 0.001*baird
topic 50: 0.003*sperm + 0.001*egg + 0.001*body + 0.001*nova + 0.001*system
topic 51: 0.004*court + 0.002*case + 0.002*agreement + 0.002*preembryos + 0.002*wife
topic 52: 0.003*willier + 0.001*chick + 0.001*feather + 0.001*hormone + 0.001*sex
topic 53: 0.002*fertilization + 0.002*order + 0.002*stem + 0.002*baby + 0.001*tube
topic 54: 0.002*holtfreter + 0.002*theory + 0.002*organizer + 0.002*nerve + 0.001*tissue
topic 55: 0.002*waddington + 0.002*gene + 0.001*organizer + 0.001*court + 0.001*agreement
topic 56: 0.003*clinic + 0.002*stopes + 0.001*court + 0.001*death + 0.001*birth
topic 57: 0.002*artificial + 0.002*nerve + 0.002*harrison + 0.002*genetic + 0.002*fiber
topic 58: 0.002*steptoe + 0.002*ger + 0.002*pfl + 0.001*egg + 0.001*edward
topic 59: 0.003*von + 0.003*neumann + 0.001*endoscopy + 0.001*endoscope + 0.001*surgery
topic 60: 0.004*conklin + 0.001*wade + 0.001*abortion + 0.001*well + 0.001*roe
topic 61: 0.003*limb + 0.002*nerve + 0.002*harrison + 0.002*nerveless + 0.002*ipscs
topic 62: 0.003*stem + 0.002*treatment + 0.002*xii + 0.001*pope + 0.001*pius
topic 63: 0.002*church + 0.001*quickening + 0.001*mangold + 0.001*catholic + 0.001*abortion
topic 64: 0.003*fabrici + 0.002*fetus + 0.001*heart + 0.001*book + 0.001*uterus
topic 65: 0.003*pill + 0.001*birth + 0.001*woman + 0.001*control + 0.001*hormone
topic 66: 0.002*roux + 0.002*amniocentesis + 0.001*fetus + 0.001*procedure + 0.001*blastomere
topic 67: 0.003*russell + 0.002*theory + 0.002*interpretation + 0.002*heredity + 0.001*organism
topic 68: 0.002*shettles + 0.002*driesch + 0.001*photograph + 0.001*ovum + 0.001*egg
topic 69: 0.004*blood + 0.003*cord + 0.002*stem + 0.001*umbilical + 0.001*banking
topic 70: 0.003*bonner + 0.001*slime + 0.001*mold + 0.001*amoeba + 0.001*bailli
topic 71: 0.003*fate + 0.002*map + 0.001*organism + 0.001*developmental + 0.001*lineage
topic 72: 0.002*diversity + 0.002*russell + 0.001*evolutionary + 0.001*animal + 0.001*theory
topic 73: 0.002*pre + 0.001*litowitz + 0.001*court + 0.001*contract + 0.001*disposition
topic 74: 0.001*agreement + 0.001*contract + 0.001*jones + 0.001*york + 0.001*case
topic 75: 0.002*act + 0.001*stem + 0.001*cloning + 0.001*technology + 0.001*eukaryote
topic 76: 0.003*chimera + 0.002*musculus + 0.001*rossant + 0.001*mu + 0.001*tissue
topic 77: 0.005*stem + 0.001*tissue + 0.001*adult + 0.001*nih + 0.001*type
topic 78: 0.002*game + 0.002*counter + 0.001*pattern + 0.001*cellular + 0.001*conway
topic 79: 0.003*loeb + 0.002*mechanistic + 0.001*metaphysical + 0.001*romance + 0.001*jacques
topic 80: 0.003*wilmut + 0.001*dolly + 0.001*ian + 0.001*sheep + 0.001*animal
topic 81: 0.002*ramsey + 0.001*elizabeth + 0.001*carnegie + 0.001*yale + 0.001*harmonious
topic 82: 0.001*child + 0.001*viable + 0.001*injury + 0.001*fetus + 0.001*case
topic 83: 0.003*harvey + 0.001*william + 0.001*physician + 0.001*generation + 0.001*chapter
topic 84: 0.002*pope + 0.002*pius + 0.001*church + 0.001*abortion + 0.001*catholic
topic 85: 0.001*page + 0.001*scrapbook + 0.001*harvard + 0.001*woman + 0.001*housekeeper
topic 86: 0.002*streak + 0.002*primitive + 0.002*waddington + 0.002*experiment + 0.001*vitro
topic 87: 0.004*growth + 0.003*factor + 0.003*epidermal + 0.001*cohen + 0.001*tissue
topic 88: 0.001*child + 0.001*democrat + 0.001*born + 0.001*medical + 0.001*ivf
topic 89: 0.002*augustine + 0.001*church + 0.001*disposed + 0.001*writing + 0.001*displayed
topic 90: 0.002*laboratory + 0.001*jackson + 0.001*mouse + 0.001*little + 0.001*endometriosis
topic 91: 0.001*developmental + 0.001*crest + 0.001*hall + 0.001*evolutionary + 0.001*neural
topic 92: 0.002*hyman + 0.001*henrietta + 0.001*dolores + 0.001*libbie + 0.001*conscious
topic 93: 0.002*balmaceda + 0.001*clinic + 0.001*fertility + 0.001*chile + 0.001*california
topic 94: 0.001*aquinas + 0.001*summa + 0.001*theologica + 0.001*soul + 0.001*embryology
topic 95: 0.002*leonardo + 0.002*drawing + 0.002*fetus + 0.001*anatomy + 0.001*womb
topic 96: 0.001*statute + 0.001*case + 0.001*court + 0.001*ruling + 0.001*law
topic 97: 0.002*induction + 0.002*chemical + 0.001*experiment + 0.001*compound + 0.001*organizer
topic 98: 0.002*organism + 0.002*child + 0.001*gradient + 0.001*theory + 0.001*process
topic 99: 0.001*automobile + 0.001*quicker + 0.001*der + 0.000*needed + 0.000*zhu
topic 100: 0.001*fourteen + 0.001*atlantic + 0.001*dieser + 0.001*worthwhile + 0.001*janine
topic 101: 0.001*bonner + 0.001*mold + 0.001*slime + 0.001*film + 0.001*fly
topic 102: 0.001*souris + 0.001*ying + 0.001*baylis + 0.001*lectured + 0.001*dohrn
topic 103: 0.002*washington + 0.001*department + 0.001*hamburger + 0.001*louis + 0.001*hviden
topic 104: 0.001*articulated + 0.001*interplay + 0.001*mclean + 0.001*conklin + 0.001*rao
topic 105: 0.001*mccarthy + 0.001*duo + 0.001*lafranchi + 0.001*sandra + 0.001*spirit
topic 106: 0.001*cv + 0.001*villus + 0.001*procedure + 0.001*woman + 0.001*pregnancy
topic 107: 0.001*atra + 0.001*elliot + 0.001*leif + 0.001*framework + 0.001*micrognathia
topic 108: 0.001*cryopreserved + 0.001*holmiae + 0.001*vanderbilt + 0.001*hatch + 0.001*anglican
topic 109: 0.001*covering + 0.001*winther + 0.001*monophyletic + 0.001*thoracic + 0.001*denied
topic 110: 0.001*canterbury + 0.001*reinforces + 0.001*aff + 0.001*latourelle + 0.001*bird
topic 111: 0.001*cruelty + 0.001*manual + 0.001*timing + 0.001*unstudied + 0.000*programme
topic 112: 0.001*window + 0.001*reconsidered + 0.001*legislate + 0.001*definition + 0.001*bruce
topic 113: 0.001*radio + 0.001*verlaufe + 0.001*glossary + 0.001*probability + 0.001*mitochondriaf
topic 114: 0.001*abortion + 0.001*pope + 0.001*effraenatam + 0.001*sixtus + 0.001*document
topic 115: 0.001*overcoming + 0.001*exciting + 0.001*attitude + 0.001*cartesian + 0.001*concedes
topic 116: 0.001*persists + 0.001*quirinal + 0.001*minna + 0.001*crime + 0.001*presence
topic 117: 0.001*cum + 0.001*witnessed + 0.001*solidify + 0.001*producer + 0.001*bhl
topic 118: 0.001*scope + 0.001*convenient + 0.001*zellenstudien + 0.001*specie + 0.000*brenda
topic 119: 0.001*gosling + 0.001*russo + 0.001*ettore + 0.001*presumption + 0.001*mystery
topic 120: 0.001*placing + 0.001*donor + 0.001*theorie + 0.001*amphibien + 0.001*murdoch
topic 121: 0.001*translation + 0.001*tempo + 0.001*de + 0.001*phylogeny + 0.001*landacre
topic 122: 0.001*stria + 0.001*mayer + 0.001*land + 0.001*hyuk + 0.001*contact
topic 123: 0.001*update + 0.001*microphthalmia + 0.001*particle + 0.001*neuroscience + 0.001*heidenhain
topic 124: 0.001*duncan + 0.001*frederik + 0.001*list + 0.000*sloan + 0.000*daughter
topic 125: 0.001*popularly + 0.001*embalming + 0.001*avid + 0.001*prepared + 0.001*meckel
topic 126: 0.001*validity + 0.001*phocomelia + 0.001*duplicate + 0.001*resident + 0.001*ussr
topic 127: 0.001*deem + 0.001*automated + 0.001*unpublished + 0.001*testified + 0.001*poorly
topic 128: 0.001*neurotoxicology + 0.001*betterment + 0.001*estrogen + 0.001*fpetaqaamaaj + 0.001*takuya
topic 129: 0.001*rufus + 0.001*object + 0.001*kritik + 0.001*conception + 0.001*anhui
topic 130: 0.001*nasa + 0.001*scholar + 0.001*concretely + 0.001*putnam + 0.001*unconstitutional
topic 131: 0.001*smoke + 0.001*wooden + 0.001*saturday + 0.001*emerge + 0.001*revelation
topic 132: 0.001*mikhailov + 0.001*glasgow + 0.001*protozoology + 0.001*com + 0.001*intercalation
topic 133: 0.001*microphotography + 0.001*zellsubstanzen + 0.001*whitehead + 0.001*disagreement + 0.001*anti
topic 134: 0.001*augustine + 0.001*duan + 0.001*juke + 0.001*incredible + 0.001*shipped
topic 135: 0.001*philosophie + 0.001*wendy + 0.001*dystrophy + 0.001*holding + 0.001*great
topic 136: 0.001*island + 0.001*suit + 0.001*untersuchung + 0.001*possessed + 0.000*georgetown
topic 137: 0.001*fabrici + 0.001*aware + 0.001*capital + 0.001*tim + 0.001*transfered
topic 138: 0.001*transient + 0.001*uwe + 0.001*told + 0.001*reptile + 0.001*feebleminded
topic 139: 0.001*serve + 0.001*modeler + 0.001*carnccs + 0.001*inference + 0.001*defeat
topic 140: 0.001*incapacitated + 0.001*firmly + 0.001*cook + 0.001*express + 0.001*unifying
topic 141: 0.001*pharmacologist + 0.001*ullman + 0.001*vermont + 0.001*finer + 0.001*entwickelungsmechanik
topic 142: 0.001*decomposing + 0.001*cerebral + 0.001*udvikling + 0.001*hypotonic + 0.001*centrifugation
topic 143: 0.001*remnant + 0.001*reward + 0.001*hox + 0.001*prohibition + 0.001*sinclair
topic 144: 0.001*deafness + 0.001*researching + 0.001*exert + 0.001*familial + 0.001*repository
topic 145: 0.001*balinsky + 0.001*reclam + 0.001*keun + 0.001*constricted + 0.001*receptive
topic 146: 0.001*princess + 0.001*mccaffery + 0.001*microscopical + 0.001*ethic + 0.001*intervene
topic 147: 0.001*singled + 0.001*incontinence + 0.001*insensitivity + 0.001*bumpy + 0.001*jesuit
topic 148: 0.001*campbell + 0.001*sadava + 0.001*homogenized + 0.001*beobachtungen + 0.001*waller
topic 149: 0.001*violated + 0.001*maureen + 0.001*melzi + 0.001*diaphragm + 0.000*tong

In [16]:
# Transform each BOW document into its topic representation.
hdp_documents = hmodel[documents_bow]

In [20]:
# Topic mixture for document 5: a list of (topic_id, proportion) pairs.
hdp_documents[5]


Out[20]:
[(0, 0.47440333706986754), (44, 0.52362158604844466)]

Document graph


In [23]:
import networkx as nx
from collections import Counter
from itertools import combinations

In [25]:
# Count topic co-occurrence: two topics are linked whenever they appear in the
# same document; a topic's node weight is the number of documents it appears in.
edge_counts = Counter()
node_counts = Counter()
for document in hdp_documents:
    # Guard: `topics, _ = zip(*document)` on an empty topic list would raise
    # ValueError; skip documents with no assigned topics.
    if not document:
        continue
    topics = [topic for topic, _ in document]
    for u, v in combinations(topics, 2):
        # Sort so (u, v) and (v, u) count as the same undirected edge.
        edge_key = tuple(sorted([u, v]))
        edge_counts[edge_key] += 1.
    for u in topics:
        node_counts[u] += 1.

In [26]:
# Build a weighted, undirected topic co-occurrence graph.
# .items() (rather than Python-2-only .iteritems()) keeps this cell portable
# across Python versions; behavior is identical here.
graph = nx.Graph()
for node, count in node_counts.items():
    graph.add_node(node, weight=count)

for (source, target), count in edge_counts.items():
    graph.add_edge(source, target, weight=count)

In [27]:
# Number of nodes and edges in the topic graph.
graph.order(), graph.size()


Out[27]:
(103, 333)

In [34]:
# Human-readable node labels: the top 5 terms for each topic.
node_labels = {}
for i, topic in hmodel.show_topics(topics=200, topn=5, formatted=False):
    # `topic` is a list of pairs; take the first element of each pair, as the
    # original `zip(*topic)[0]` did — but without subscripting zip(), which is
    # Python-2-only (zip returns an iterator in Python 3).
    terms = [term for term, _ in topic]
    node_labels[i] = ', '.join(terms)

In [35]:
# Replace integer topic ids with their top-term labels for readability.
graph = nx.relabel_nodes(graph, node_labels)

In [36]:
# Export for visualization in an external tool (e.g. Gephi or Cytoscape).
nx.write_graphml(graph, 'hdp_graph.graphml')

In [54]:
# Shape of the model's topic-term matrix and the vocabulary size
# (m_W appears to be the number of terms — matches the second dimension).
hmodel.m_Elogbeta.shape, hmodel.m_W


Out[54]:
((150, 15115), 15115)

In [81]:
# m_rhot: step size used by gensim's online variational updates — see the
# gensim HdpModel implementation for its exact role.
hmodel.m_rhot


Out[81]:
0.014925373134328358

In [57]:
from collections import Counter, defaultdict
from itertools import combinations

In [59]:
# Count, per topic, how many documents from each year contain that topic.
# The topic's weight within the document is intentionally discarded (the
# original bound it to an unused `representation` variable).
topic_counts = defaultdict(Counter)
for year, document in zip(years, hdp_documents):
    for topic, _ in document:
        topic_counts[topic][year] += 1.

In [62]:
# Build a long-format table of document counts per (topic, year).
# Collecting all records first and constructing the DataFrame once is far
# faster than growing it row-by-row with .loc (quadratic), and .items()
# (rather than Python-2-only .iteritems()) keeps the cell portable.
records = [(topic, year, count)
           for topic, yearcounts in topic_counts.items()
           for year, count in yearcounts.items()]
topics_over_time = pd.DataFrame(records, columns=['Topic', 'Year', 'Count'])

In [67]:
hmodel.show_topics(topn=5)


Out[67]:
[u'topic 0: 0.004*gene + 0.004*theory + 0.004*www + 0.004*wilson + 0.004*scientist',
 u'topic 1: 0.007*gene + 0.005*dna + 0.004*kammerer + 0.003*egg + 0.003*called',
 u'topic 2: 0.008*fistula + 0.007*stem + 0.007*autism + 0.007*hamlin + 0.004*campbell',
 u'topic 3: 0.006*child + 0.004*fetus + 0.003*kanner + 0.003*tiedemann + 0.003*baby',
 u'topic 4: 0.003*eugenics + 0.003*frog + 0.003*kammerer + 0.003*toad + 0.003*egg',
 u'topic 5: 0.009*egg + 0.004*dinosaur + 0.003*system + 0.003*cope + 0.003*death',
 u'topic 6: 0.004*abortion + 0.004*thalidomide + 0.003*franklin + 0.003*woman + 0.003*pregnancy',
 u'topic 7: 0.005*gene + 0.003*network + 0.003*abortion + 0.003*form + 0.003*protein',
 u'topic 8: 0.007*colborn + 0.006*endocrine + 0.005*chemical + 0.003*effect + 0.003*wildlife',
 u'topic 9: 0.006*cocaine + 0.004*drug + 0.004*thalidomide + 0.003*born + 0.003*pregnancy',
 u'topic 10: 0.016*court + 0.012*preembryos + 0.006*ivf + 0.006*case + 0.006*agreement',
 u'topic 11: 0.005*mendel + 0.004*body + 0.003*cuvier + 0.003*anatomy + 0.003*von',
 u'topic 12: 0.003*court + 0.003*preembryos + 0.003*organism + 0.002*haeckel + 0.002*russell',
 u'topic 13: 0.006*genome + 0.004*edward + 0.003*jones + 0.003*fire + 0.002*vitro',
 u'topic 14: 0.006*nucleus + 0.006*gurdon + 0.004*egg + 0.004*nuclear + 0.003*king',
 u'topic 15: 0.004*loeb + 0.004*gaur + 0.004*egg + 0.003*stem + 0.002*researcher',
 u'topic 16: 0.004*court + 0.003*case + 0.002*law + 0.002*ferguson + 0.002*stone',
 u'topic 17: 0.003*stopes + 0.003*sex + 0.002*woman + 0.002*test + 0.002*rock',
 u'topic 18: 0.004*regeneration + 0.002*theory + 0.002*regenerative + 0.002*smith + 0.002*organism',
 u'topic 19: 0.004*stem + 0.003*bush + 0.003*brook + 0.003*grobstein + 0.002*president']

In [79]:
# Restrict to topic 1 for a single-topic time series.
topic_1_over_time = topics_over_time[topics_over_time.Topic == 1]

In [80]:
# Number of documents containing topic 1, per publication year.
# Title and x-axis label added so the figure stands alone when skimmed.
plt.bar(topic_1_over_time.Year, topic_1_over_time.Count)
plt.xlabel('Year')
plt.ylabel('Number of documents')
plt.title('Prevalence of topic 1 over time')
plt.show()



In [ ]: