In [1]:
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import text_analysis as txt

Import some example time series data


In [2]:
# Parse the saved search response from disk; the handle is closed automatically
# when the with-block exits.
with open('biotech500.json', 'rb') as json_file:
    data = json.load(json_file)

In [3]:
# Pull the list of article records out of the parsed search response.
articles_list = data['response']['docs']
articles = DataFrame(articles_list)

# Keep only the articles that have an abstract, and only the two columns used
# below. `.loc` replaces the deprecated `.ix` indexer, and `.copy()` gives an
# independent frame so the column assignments below do not write into a slice.
articles = articles.loc[articles['abstract'].notnull(),
                        ['abstract', 'publication_date']].copy()

# Tokenize every abstract with the project helper.
# NOTE(review): the original call was `.apply(txt.wordify, 3)`. The second
# positional parameter of Series.apply is `convert_dtype`, so the 3 was never
# forwarded to txt.wordify. It is dropped here, which keeps the behaviour
# unchanged. If 3 was meant as an argument for wordify, pass it with
# `args=(3,)` after confirming txt.wordify's signature.
articles['abstract'] = articles['abstract'].apply(txt.wordify)

# Discard rows whose abstract is null after tokenizing, then parse the dates.
articles = articles[articles['abstract'].notnull()].copy()
articles['publication_date'] = pd.to_datetime(articles['publication_date'])
articles.head()


Out[3]:
abstract publication_date
7 [objective, paper, assess, attitude, malaysian... 2014-01-29
16 [atrazine, atz, metolachlor, met, two, herbici... 2012-05-15
17 [due, environmental, persistence, biotoxicity,... 2013-08-05
34 [intensive, use, chlorpyrifos, resulted, ubiqu... 2012-10-08
35 [background, complex, characteristics, unclear... 2012-08-09

5 rows × 2 columns


In [4]:
# Report the earliest and latest publication dates, then the number of articles.
# Each print call receives exactly one argument, so the output is the same under
# the Python 2 print statement and the Python 3 print function.
print('{0} {1}'.format(articles.publication_date.min(),
                       articles.publication_date.max()))
print(len(articles))


2008-04-30 00:00:00 2014-04-11 00:00:00
57

The time series spans ~6 years (2008-04-30 to 2014-04-11) with only 57 data points.

We need to resample!

Something like a monthly aggregate set of words.

There are probably many ways to do this...


In [5]:
# Re-index the articles by publication date so that rows can be selected and
# resampled by time; set_index returns a new frame, `articles` is left as is.
articles_timed = articles.set_index('publication_date')
# Display the date-indexed frame.
articles_timed


Out[5]:
abstract
publication_date
2014-01-29 [objective, paper, assess, attitude, malaysian...
2012-05-15 [atrazine, atz, metolachlor, met, two, herbici...
2013-08-05 [due, environmental, persistence, biotoxicity,...
2012-10-08 [intensive, use, chlorpyrifos, resulted, ubiqu...
2012-08-09 [background, complex, characteristics, unclear...
2011-04-15 [reusing, filtering, facepiece, respirators, f...
2013-05-15 [study, focus, impact, senior, executives, ind...
2013-05-17 [hacek, organisms, haemophilus, species, aggre...
2012-12-28 [objectives, atrioventricular, block, avb, inf...
2011-07-13 [objective, assess, efficacy, video, assisted,...
2013-11-22 [energy, mining, mineral, processing, industri...
2011-09-14 [background, mesenchymal, stem, cells, msc, re...
2012-08-17 [objective, estimate, effectiveness, anterior,...
2013-06-21 [journal, policy, research, data, code, availa...
2013-02-28 [polycyclic, aromatic, hydrocarbons, pahs, tox...
2012-04-23 [article, uses, data, thomson, reuters, web, s...
2013-10-23 [animal, models, become, popular, platform, in...
2010-11-17 [sensory, analysis, studies, critical, develop...
2012-07-11 [trinitrotoluene, tnt, released, nature, manuf...
2012-10-24 [objectives, catheter, related, staphylococcus...
2014-03-18 [nanopods, extracellular, structures, arising,...
2012-08-15 [since, launched, plos, one, published, fifty,...
2011-05-03 [mark, van, ommeren, colleagues, describe, cho...
2012-07-19 [background, medical, devices, increasingly, d...
2010-01-05 [month, debate, examines, whether, current, pa...
2013-03-22 [introduction, although, individuals, lower, l...
2013-06-24 [background, suboptimal, left, ventricular, lv...
2012-07-31 [sanket, dhruva, rita, redberg, comment, resea...
2008-04-30 [according, world, health, organization, repor...
2014-01-22 [background, confirmation, diabetic, sensorimo...
2012-12-14 [oxidative, damage, microbial, hosts, often, o...
2011-08-05 [miniaturization, active, implantable, medical...
2014-03-27 [laboratory, soil, degradation, study, conduct...
2013-10-15 [optimised, reduction, dissolved, nutrient, lo...
2011-08-05 [background, field, synthetic, biology, promis...
2012-05-31 [environmentally, degradable, parameter, edk, ...
2014-04-11 [several, applications, tissue, engineering, r...
2013-06-14 [laccases, versatile, biocatalysts, bioremedia...
2012-04-25 [background, cervical, disc, arthroplasty, use...
2013-02-20 [embryonic, stem, es, cell, based, gene, manip...
2011-06-21 [presence, uniformly, small, collagen, fibrils...
2012-05-09 [biosorption, heavy, metals, using, dried, alg...
2013-03-21 [characterizing, quasi, stiffness, work, lower...
2013-05-16 [common, wheat, hexaploid, species, genes, pre...
2013-09-23 [study, two, strains, aspergillus, sp, lysinib...
2012-04-27 [escherichia, coli, chrr, enzyme, obligatory, ...
2014-03-07 [novosphingobium, pentaromativorans, halophili...
2012-12-12 [depending, speciation, environmental, contami...
2010-08-09 [background, several, materials, used, tissue,...
2012-12-11 [efforts, increase, affinity, design, new, the...
2011-10-05 [concerns, regarding, commercial, release, gen...
2012-09-25 [cassava, brown, streak, disease, cbsd, cassav...
2013-04-03 [phytate, major, storage, form, organic, phosp...
2013-04-01 [aim, study, isolate, identify, marine, derive...
2013-11-13 [introduction, composite, biomaterials, design...
2011-09-29 [previous, works, demonstrated, ligninolytic, ...
2014-04-02 [background, aim, study, evaluate, impact, qua...

57 rows × 1 columns

Using pandas time series resampling functions

These are geared towards numeric data. This is still pretty close to what we want, though.


In [6]:
# Aggregate the per-article word lists into one row per calendar month.
# Summing Python lists concatenates them, so a month with several articles
# ends up with the combined words of all of their abstracts.
articles_monthly = articles_timed.resample('M', how='sum', fill_method='ffill', kind='period')

# Turn the value 0 into NaN so it can be forward-filled below.
# NOTE(review): presumably 0 is what the sum yields for a month without any
# article -- confirm against the pandas version in use.
# Item assignment is used instead of attribute assignment, which only works
# when the column already exists.
articles_monthly['abstract'] = articles_monthly['abstract'].apply(
    lambda words: np.nan if words == 0 else words)

# Carry the most recent month's words forward into the missing months.
# The result is assigned explicitly rather than mutating with inplace=True.
articles_monthly = articles_monthly.fillna(method='ffill')
articles_monthly


Out[6]:
abstract
publication_date
2008-04 [according, world, health, organization, repor...
2008-05 [according, world, health, organization, repor...
2008-06 [according, world, health, organization, repor...
2008-07 [according, world, health, organization, repor...
2008-08 [according, world, health, organization, repor...
2008-09 [according, world, health, organization, repor...
2008-10 [according, world, health, organization, repor...
2008-11 [according, world, health, organization, repor...
2008-12 [according, world, health, organization, repor...
2009-01 [according, world, health, organization, repor...
2009-02 [according, world, health, organization, repor...
2009-03 [according, world, health, organization, repor...
2009-04 [according, world, health, organization, repor...
2009-05 [according, world, health, organization, repor...
2009-06 [according, world, health, organization, repor...
2009-07 [according, world, health, organization, repor...
2009-08 [according, world, health, organization, repor...
2009-09 [according, world, health, organization, repor...
2009-10 [according, world, health, organization, repor...
2009-11 [according, world, health, organization, repor...
2009-12 [according, world, health, organization, repor...
2010-01 [month, debate, examines, whether, current, pa...
2010-02 [month, debate, examines, whether, current, pa...
2010-03 [month, debate, examines, whether, current, pa...
2010-04 [month, debate, examines, whether, current, pa...
2010-05 [month, debate, examines, whether, current, pa...
2010-06 [month, debate, examines, whether, current, pa...
2010-07 [month, debate, examines, whether, current, pa...
2010-08 [background, several, materials, used, tissue,...
2010-09 [background, several, materials, used, tissue,...
2010-10 [background, several, materials, used, tissue,...
2010-11 [sensory, analysis, studies, critical, develop...
2010-12 [sensory, analysis, studies, critical, develop...
2011-01 [sensory, analysis, studies, critical, develop...
2011-02 [sensory, analysis, studies, critical, develop...
2011-03 [sensory, analysis, studies, critical, develop...
2011-04 [reusing, filtering, facepiece, respirators, f...
2011-05 [mark, van, ommeren, colleagues, describe, cho...
2011-06 [presence, uniformly, small, collagen, fibrils...
2011-07 [objective, assess, efficacy, video, assisted,...
2011-08 [background, field, synthetic, biology, promis...
2011-09 [background, mesenchymal, stem, cells, msc, re...
2011-10 [concerns, regarding, commercial, release, gen...
2011-11 [concerns, regarding, commercial, release, gen...
2011-12 [concerns, regarding, commercial, release, gen...
2012-01 [concerns, regarding, commercial, release, gen...
2012-02 [concerns, regarding, commercial, release, gen...
2012-03 [concerns, regarding, commercial, release, gen...
2012-04 [article, uses, data, thomson, reuters, web, s...
2012-05 [biosorption, heavy, metals, using, dried, alg...
2012-06 [biosorption, heavy, metals, using, dried, alg...
2012-07 [trinitrotoluene, tnt, released, nature, manuf...
2012-08 [background, complex, characteristics, unclear...
2012-09 [cassava, brown, streak, disease, cbsd, cassav...
2012-10 [intensive, use, chlorpyrifos, resulted, ubiqu...
2012-11 [intensive, use, chlorpyrifos, resulted, ubiqu...
2012-12 [efforts, increase, affinity, design, new, the...
2013-01 [efforts, increase, affinity, design, new, the...
2013-02 [embryonic, stem, es, cell, based, gene, manip...
2013-03 [characterizing, quasi, stiffness, work, lower...
...

73 rows × 1 columns

Using the sum aggregation method worked

This works because every value is a Python list, and summing lists concatenates them. For example, the three abstracts published in 2013-05 were concatenated into a single list of words (see below).


In [7]:
# Select the rows of the date-indexed frame that match the string '2013-05'.
articles_timed['2013-05']


Out[7]:
abstract
publication_date
2013-05-15 [study, focus, impact, senior, executives, ind...
2013-05-17 [hacek, organisms, haemophilus, species, aggre...
2013-05-16 [common, wheat, hexaploid, species, genes, pre...

3 rows × 1 columns


In [8]:
# Apply len to every cell of that selection: the number of words per abstract.
articles_timed['2013-05'].applymap(len)


Out[8]:
abstract
publication_date
2013-05-15 148
2013-05-17 147
2013-05-16 108

3 rows × 1 columns


In [9]:
# Total number of words across the selected abstracts (column-wise sum).
articles_timed['2013-05'].applymap(len).sum()


Out[9]:
abstract    403
dtype: int64

In [10]:
# Word count of the resampled 2013-05 row, for comparison with the total of
# the individual articles from that month.
articles_monthly['2013-05'].applymap(len)


Out[10]:
abstract
publication_date
2013-05 403

1 rows × 1 columns

Check what it would take to make a slider along the PeriodIndex


In [11]:
# Number of rows in the monthly frame, i.e. how many positions a slider over
# it has to cover.
len(articles_monthly)


Out[11]:
73

In [12]:
# Join the words of the first monthly row into one space-separated string.
# `.iloc[0]` states the positional lookup explicitly; the deprecated `.ix[0]`
# decides between label and position based on the index type.
' '.join(articles_monthly.iloc[0]['abstract'])


Out[12]:
u'according world health organization reports three quarters world population access medical imaging addition developing countries medical equipment available used sophisticated disrepair health personnel trained use goal study introduce demonstrate feasibility new concept medical imaging centered cellular phone technology may provide solution medical imaging underserved areas new system replaces conventional stand alone medical imaging device new medical imaging system made two independent components connected cellular phone technology independent units data acquisition device dad remote patient site simple limited controls image display capability advanced image reconstruction hardware control multiserver unit central site cellular phone technology transmits unprocessed raw data patient site dad receives displays processed image central site different conventional telemedicine image reconstruction control patient site telecommunication used transmit processed images patient site primary goal study demonstrate cellular phone technology function proposed mode feasibility concept demonstrated using new frequency division multiplexing electrical impedance tomography system developed dynamic medical imaging medical imaging modality system used image cellular phone simulation breast cancer tumors medical imaging diagnostic mode image minimally invasive tissue ablation irreversible electroporation medical imaging interventional mode'

Making the slider


In [13]:
from IPython.display import display, Image, HTML, clear_output
from IPython.html import widgets

In [14]:
from jinja2 import Template

def textbarf(t):
    """Display the words of one monthly row as a styled HTML block.

    Parameters
    ----------
    t : int
        Zero-based row position in the notebook-level ``articles_monthly``
        frame.

    Returns
    -------
    None
        The rendered HTML is shown with ``display`` as a side effect.
    """
    html_template = """
    <style>

    #textbarf {
        display: block;
        width: 666px;
        padding: 23px;
        background-color: #ddeeff;
    }

    </style>

    <div id="textbarf"> {{blargh}} </div>"""

    # `.iloc` makes the positional row lookup explicit; the deprecated `.ix`
    # chooses between label and position depending on the index type.
    blob = ' '.join(articles_monthly.iloc[t]['abstract'])

    # The words originate from external article data, so escape them when
    # rendering: characters such as '<' or '&' are then shown literally
    # instead of being interpreted as markup.
    html_src = Template(html_template, autoescape=True).render(blargh=blob)
    display(HTML(html_src))

In [15]:
# Hook textbarf up to an integer slider over the row positions of
# articles_monthly. The upper bound is derived from the frame instead of the
# hard-coded 72, so the slider stays in range when the number of months changes.
widgets.interact(textbarf,
                 t=widgets.IntSliderWidget(min=0, max=len(articles_monthly) - 1, step=1, value=0),
                )


according world health organization reports three quarters world population access medical imaging addition developing countries medical equipment available used sophisticated disrepair health personnel trained use goal study introduce demonstrate feasibility new concept medical imaging centered cellular phone technology may provide solution medical imaging underserved areas new system replaces conventional stand alone medical imaging device new medical imaging system made two independent components connected cellular phone technology independent units data acquisition device dad remote patient site simple limited controls image display capability advanced image reconstruction hardware control multiserver unit central site cellular phone technology transmits unprocessed raw data patient site dad receives displays processed image central site different conventional telemedicine image reconstruction control patient site telecommunication used transmit processed images patient site primary goal study demonstrate cellular phone technology function proposed mode feasibility concept demonstrated using new frequency division multiplexing electrical impedance tomography system developed dynamic medical imaging medical imaging modality system used image cellular phone simulation breast cancer tumors medical imaging diagnostic mode image minimally invasive tissue ablation irreversible electroporation medical imaging interventional mode