In [4]:
# Enable inline matplotlib rendering for this notebook.
# NOTE(review): %pylab also star-imports numpy and matplotlib names into the
# interactive namespace (see the "Populating the interactive namespace" banner
# it prints), which hides where names come from. If no later cell relies on
# those bare names, prefer `%matplotlib inline` plus explicit imports.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [5]:
from tethne.readers import wos
import pandas as pd
import networkx as nx
import numpy as np
import nltk
from helpers import filter_token, normalize_token
import gensim

In [6]:
# Load the Web of Science export for the Plant Physiology corpus.
# The 'date' and 'abstract' fields and the 'authors' feature are requested
# for indexing up front, because later cells build on the abstracts.
# NOTE(review): streaming=True presumably avoids holding every record in
# memory at once -- confirm against the tethne documentation.
metadata = wos.read(
    '../data/Baldwin/PlantPhysiology',
    streaming=True,
    index_fields=['date', 'abstract'],
    index_features=['authors']
)

In [7]:
from tethne.model.corpus import mallet

MALLET


In [8]:
# Index the 'abstract' field as a feature, using NLTK's word tokenizer to
# split each abstract into tokens. The resulting 'abstract' featureset is the
# one the LDA model is pointed at (featureset_name='abstract') further down.
# NOTE(review): filter_token / normalize_token are imported from helpers but
# not applied here -- confirm whether tokens are meant to be cleaned first.
metadata.index_feature('abstract', nltk.tokenize.word_tokenize)

In [9]:
# LDA settings, named so they can be tuned in one place instead of being
# buried as bare numbers in the attribute assignments.
N_TOPICS = 20          # assigned to model.Z; the topic listing below runs 0-19
MAX_ITERATIONS = 500   # assigned to model.max_iter; presumably the sampling
                       # iteration budget -- TODO confirm against tethne docs

# Build the MALLET-backed LDA model over the tokenized 'abstract' featureset.
model = mallet.LDAModel(metadata, featureset_name='abstract')
model.Z = N_TOPICS
model.max_iter = MAX_ITERATIONS

In [10]:
# Run the fit for the model configured in the previous cell.
# NOTE(review): no random seed is set in any visible cell, so the topics
# listed below may not be reproduced exactly on a re-run -- check whether the
# model exposes a seed option.
model.fit()




In [11]:
# Show every topic index together with its list of (word, weight) pairs as
# returned by model.list_topics().
# The line is formatted into a single string before printing so the cell
# produces the same text under the Python 2 print statement and the Python 3
# print function (the bare `print a, b` form is a syntax error on Python 3).
for k, words in model.list_topics():
    print('topic %i %s' % (k, words))


topic 0 [(u'light', 0.027919939194324804), (u'chloroplast', 0.020293894096782367), (u'ii', 0.01801368127691918), (u'complex', 0.017000253356979984), (u'high', 0.014745376235115277), (u'chloroplasts', 0.012743856093235369), (u'photosynthetic', 0.011451735495312896), (u'plastid', 0.011325057005320497), (u'electron', 0.011299721307322017), (u'chlorophyll', 0.010970357233341778)]
topic 1 [(u'protein', 0.06608631682197155), (u'arabidopsis', 0.031040542749713913), (u'proteins', 0.028057054111492563), (u'domain', 0.021885728298185386), (u'binding', 0.02149746607814288), (u'kinase', 0.017839627268268758), (u'activity', 0.014283962726826875), (u'plant', 0.01399787477521661), (u'yeast', 0.012608304724538172), (u'function', 0.012506130456105935)]
topic 2 [(u'gene', 0.04243860900784792), (u'dna', 0.02803157579802536), (u'region', 0.018434558468159536), (u'rna', 0.016800533934777106), (u'sequence', 0.015442682561966353), (u'cdna', 0.011553243883915215), (u'soybean', 0.011507215023819935), (u'analysis', 0.011046926422867137), (u'silencing', 0.00895261328853191), (u'single', 0.008469310257531473)]
topic 3 [(u'aba', 0.03469288095288647), (u'auxin', 0.03276078048366207), (u'ethylene', 0.031699186819253065), (u'ga', 0.026603537230089812), (u'growth', 0.02656107348351345), (u'signaling', 0.022102380092995606), (u'acid', 0.021231873288180218), (u'response', 0.020000424637465763), (u'iaa', 0.012187095267415444), (u'germination', 0.010722096010531009)]
topic 4 [(u'acid', 0.03968271263265706), (u'biosynthesis', 0.021682756216086643), (u'acids', 0.01874087471943167), (u'amino', 0.016583494955218027), (u'pathway', 0.015973326941096995), (u'synthesis', 0.012922486870491839), (u'fatty', 0.012682778007801434), (u'enzyme', 0.011418858549979299), (u'enzymes', 0.010045980518206979), (u'cyp', 0.00954477107803613)]
topic 5 [(u'mutant', 0.05710967584197551), (u'mutants', 0.05181231026262709), (u'arabidopsis', 0.035268230068662004), (u'root', 0.030235732768281006), (u'development', 0.020924594038426274), (u'phenotype', 0.01752205537784479), (u'type', 0.015036368452150527), (u'thaliana', 0.014099142234265805), (u'mutation', 0.013671278091318432), (u'function', 0.01334528636335853)]
topic 6 [(u'light', 0.04044637321760694), (u'ca', 0.03199008059516429), (u'cells', 0.019094854308741475), (u'red', 0.014879107253564786), (u'induced', 0.014407935523868569), (u'dependent', 0.014407935523868569), (u'response', 0.012300061996280223), (u'stomatal', 0.012200867947923124), (u'membrane', 0.011729696218226907), (u'calcium', 0.009522628642281462)]
topic 7 [(u'activity', 0.03475043528728961), (u'beta', 0.027011994583091507), (u'enzyme', 0.024738827626233313), (u'starch', 0.019467014896498356), (u'synthase', 0.01883826658928226), (u'alpha', 0.018209518282066163), (u'endosperm', 0.012744244534726253), (u'suc', 0.01230895724511511), (u'synthesis', 0.012091313600309538), (u'enzymes', 0.011196556393886632)]
topic 8 [(u'resistance', 0.027060399313549036), (u'induced', 0.024674563643212924), (u'plant', 0.02103302498848939), (u'defense', 0.019400611108785736), (u'acid', 0.015675358921769703), (u'response', 0.013331380017579842), (u'responses', 0.012850027206897995), (u'pathogen', 0.011803608053241807), (u'death', 0.01165710937172994), (u'sa', 0.011594324222510569)]
topic 9 [(u'stress', 0.060223421614158404), (u'tolerance', 0.021639261955566712), (u'plants', 0.019580770679051087), (u'induced', 0.019455252918287938), (u'al', 0.017572486506840717), (u'temperature', 0.015915652064767166), (u'conditions', 0.01563951299108824), (u'salt', 0.014911509978661981), (u'drought', 0.014183506966235722), (u'heat', 0.013555918162419982)]
topic 10 [(u'root', 0.04190430736188999), (u'roots', 0.037537286719961815), (u'uptake', 0.020570337668535974), (u'transport', 0.019114664121226584), (u'high', 0.0159408185180766), (u'pi', 0.014580598973869466), (u'plant', 0.011931750387781888), (u'fe', 0.011239708865290538), (u'plants', 0.010571530843574753), (u'low', 0.010356759336594678)]
topic 11 [(u'genes', 0.04983467638149167), (u'arabidopsis', 0.029461262795256103), (u'gene', 0.02560041182410359), (u'family', 0.02017542122874057), (u'plant', 0.01732433128081256), (u'species', 0.015918585542598056), (u'genome', 0.015601797770606055), (u'identified', 0.014829627576375551), (u'sequence', 0.01431484744688855), (u'specific', 0.013701071138654049)]
topic 12 [(u'cell', 0.09979736575481256), (u'cells', 0.05083476498832651), (u'wall', 0.023985727500991146), (u'pollen', 0.019977093520109245), (u'growth', 0.013391480551517555), (u'division', 0.008347649883265055), (u'actin', 0.008237522576097969), (u'formation', 0.008105369807497468), (u'walls', 0.007841064270296462), (u'cellulose', 0.007422580503061539)]
topic 13 [(u'protein', 0.06182921659191824), (u'proteins', 0.049177767365062366), (u'membrane', 0.020907245018631324), (u'kd', 0.013209273266840708), (u'fluorescent', 0.012584510342057702), (u'localization', 0.012048999263672268), (u'localized', 0.010330901220518999), (u'cells', 0.008858245754959056), (u'mass', 0.008768993908561484), (u'green', 0.00874668094696209)]
topic 14 [(u'activity', 0.01981241437453894), (u'leaves', 0.01926441142375382), (u'metabolism', 0.01681947518178944), (u'carbon', 0.015807777426493835), (u'increased', 0.013004531562862262), (u'levels', 0.012856992306881653), (u'increase', 0.011571293076193487), (u'oxygen', 0.01112867530825166), (u'rate', 0.010348824955211297), (u'content', 0.01032774791864264)]
topic 15 [(u'rice', 0.029644425107779933), (u'maize', 0.02214927026064945), (u'data', 0.018510461574971323), (u'analysis', 0.016315310683067675), (u'plant', 0.01586045959735791), (u'model', 0.01210299410671202), (u'genetic', 0.01200411343590555), (u'study', 0.011272396471937665), (u'zea', 0.00967052960487284), (u'molecular', 0.008998141043388839)]
topic 16 [(u'expression', 0.07232357460892865), (u'genes', 0.057961306592064304), (u'arabidopsis', 0.03437881713227472), (u'gene', 0.028586626738642186), (u'transcription', 0.026380078017258363), (u'regulation', 0.02285354032861815), (u'regulated', 0.02247921509909768), (u'promoter', 0.020174159738366365), (u'factors', 0.015524646361164742), (u'flowering', 0.014401670672603334)]
topic 17 [(u'expression', 0.0628901683100611), (u'expressed', 0.024573710100809564), (u'gene', 0.022411929499481613), (u'tissues', 0.022014867756380563), (u'development', 0.021948690799197052), (u'genes', 0.020470738755432025), (u'fruit', 0.018176604239737056), (u'mrna', 0.017492775682174135), (u'transcripts', 0.017360421767807117), (u'transcript', 0.017007477996161736)]
topic 18 [(u'leaf', 0.05179455586738768), (u'water', 0.024551075820841033), (u'seed', 0.021288246971045244), (u'leaves', 0.01763752937686814), (u'xylem', 0.011385675496839848), (u'phloem', 0.011317224541949026), (u'seeds', 0.011043420722385743), (u'growth', 0.009788486549387364), (u'rate', 0.00908116001551555), (u'embryo', 0.008670454286170625)]
topic 19 [(u'plants', 0.13170211376450355), (u'type', 0.05568495983629754), (u'wild', 0.041709297516163445), (u'transgenic', 0.03824803535276575), (u'levels', 0.03365478807932604), (u'lines', 0.023075081089317977), (u'tobacco', 0.02277031586738359), (u'arabidopsis', 0.022334936978905894), (u'reduced', 0.019809739425735245), (u'showed', 0.016370246206761436)]

In [14]:
# Scratch inspection: display entry 0 of model.phi.features (2038 in the
# saved output).
# NOTE(review): the notebook does not say what this value represents; add a
# sentence of explanation or drop the cell. The execution count also jumps
# from 11 to 14, so re-run from a fresh kernel to confirm nothing depends on
# cells that were removed.
model.phi.features[0]


Out[14]:
2038

In [ ]: