In [1]:
import sys
# Add a directory to the import search path; presumably this is a local
# checkout of tethne that the `tethne` imports below resolve to.
# NOTE(review): machine-specific absolute path -- must be changed on any other machine.
sys.path.append('/Users/erickpeirson/tethne')

In [2]:
import matplotlib.pyplot as plt

Social influence modeling with Topical Affinity Propagation

1. Create a corpus from a JSTOR DfR dataset

1.1. Load bibliographic data


In [3]:
from tethne.readers import dfr

In [4]:
# Directories of the JSTOR DfR datasets that the loading cells iterate over.
# NOTE(review): the second and third entries are identical
# ('2013.5.3.k2HUvXh9'), so that dataset is read twice -- confirm whether a
# different third dataset was intended or the duplicate should be dropped.
# NOTE(review): machine-specific absolute paths.
datapath = ['/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.cHrmED8A',
            '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9',
            '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9']

In [5]:
# Output and temporary directories handed to the model managers further down.
# NOTE(review): machine-specific absolute paths.
outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldaout'
temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldatemp'

In [6]:
# Read every dataset directory in turn and collect all of the resulting
# records into a single flat list.
papers = []
for path in datapath:
    papers.extend(dfr.read(path))

In [7]:
# Number of records loaded across all entries of `datapath`.
len(papers)


Out[7]:
880

1.2. Load wordcounts


In [8]:
# Merge the 'uni' n-gram data of every dataset directory into one mapping.
# dict.update means a key seen again replaces the earlier entry.
wordcounts = {}
for path in datapath:
    wordcounts.update(dfr.ngrams(path, 'uni'))

1.3. Load NLTK stoplist


In [9]:
from nltk.corpus import stopwords

In [10]:
# Stopword list later passed to DataCollection as `exclude`.
# NOTE(review): words() is called without a language argument; in NLTK this
# presumably returns the stopwords of every bundled language, not just
# English -- confirm that is intended (vs. stopwords.words('english')).
stoplist = stopwords.words()

1.4. Create a DataCollection


In [11]:
from tethne import DataCollection

In [12]:
# Assemble the corpus object: the loaded papers indexed by 'doi', the merged
# unigram counts attached as the 'wordcounts' featureset, and the stoplist
# supplied as `exclude`.
D = DataCollection(
    papers,
    features={'wordcounts': wordcounts},
    index_by='doi',
    exclude=stoplist
)

1.5. Filter words in wordcount featureset


In [13]:
def filt(s, C, DC):
    """Decide whether a feature passes the wordcount filter.

    Returns True only when ``C > 3``, ``DC > 1`` and ``len(s) > 3`` all hold;
    otherwise returns False.

    NOTE(review): presumably ``s`` is the word itself, ``C`` its overall
    count and ``DC`` the number of documents containing it -- confirm against
    ``DataCollection.filter_features``, which calls this function.
    """
    # The checks are chained with `and`, so len(s) is only evaluated once both
    # count thresholds have passed.
    return bool(C > 3 and DC > 1 and len(s) > 3)

In [14]:
# Run `filt` over the 'wordcounts' featureset; the result is stored under the
# name 'wordcounts_filtered' (both are read back from D.features in the next cell).
D.filter_features('wordcounts', 'wordcounts_filtered', filt)

In [15]:
# Index sizes of the unfiltered and the filtered featureset, side by side.
len(D.features['wordcounts']['index']), len(D.features['wordcounts_filtered']['index'])


Out[15]:
(122836, 27750)

1.6. Create a time-period index


In [16]:
# Slice the collection along 'date' with the 'time_period' method.
# presumably window_size=5 means five-year periods, and cumulative=True makes
# each period include the earlier ones -- confirm against DataCollection.slice.
D.slice('date', method='time_period', window_size=5, cumulative=True)

In [17]:
# Visual check of how the collection is distributed over the 'date' slices.
D.plot_distribution('date')


2. Create a collaborative network model

2.1. Build a coauthorship GraphCollection


In [18]:
from tethne import GraphCollection

In [19]:
# Start from an empty GraphCollection; it is filled by G.build in the next cell.
G = GraphCollection()

In [20]:
# Build the graphs from D along the 'date' axis.
# presumably 'authors' selects the node type and 'coauthors' the graph-building
# method -- confirm against GraphCollection.build.
G.build(D, 'date', 'authors', 'coauthors')


Out[20]:
<tethne.classes.graphcollection.GraphCollection at 0x10a036d90>

In [21]:
# Visual check of the node distribution across the graphs in G.
G.plot_node_distribution()


3. Build an LDA topic model


In [22]:
# Location of the MALLET installation, passed to both model managers below.
# NOTE(review): machine-specific absolute path.
mallet_path = '/Applications/mallet-2.0.7'

3.1. Instantiate a MALLETModelManager


In [23]:
from tethne.model.managers import MALLETModelManager

In [24]:
# Manager for the MALLET topic model: works on the filtered featureset of D and
# is given the output directory, temp directory and MALLET location.
M = MALLETModelManager(D, 'wordcounts_filtered', outpath, temppath, mallet_path)

3.2. Prepare the corpus for modeling


In [25]:
# Prepare the corpus for modeling, as a separate step before building.
M.prep()

3.3. Build the topic model


In [26]:
# Build the topic model; the call returns an LDAModel (see the cell output).
# presumably Z is the number of topics and max_iter the iteration cap -- confirm.
# NOTE(review): prep=True is passed although M.prep() was already called in the
# preceding cell; this may repeat the preparation step -- confirm whether one
# of the two is redundant.
M.build(Z=50, max_iter=300, prep=True)


Out[26]:
<tethne.model.corpus.ldamodel.LDAModel at 0x11111c490>

In [27]:
# Show the description of topic 0 as a quick sanity check of the model.
# Written as a call so the cell runs on both Python 2 and Python 3
# (the bare `print x` statement is a SyntaxError on Python 3).
print(M.print_topic(0))


opposed, terminates, trichinosis, cistus, acaule, staminate, carrion, endemism, thorn, bifurcation

4. Build the social influence model

4.1. Instantiate a TAPModelManager


In [28]:
from tethne.model.managers import TAPModelManager

In [29]:
# Manager for the Topical Affinity Propagation model: combines the corpus (D),
# the coauthorship graphs (G) and the topic model held by M, and reuses the
# same output, temp and MALLET locations as the topic-model manager.
T = TAPModelManager(
    D,
    G,
    M.model,
    outpath=outpath,
    temppath=temppath,
    mallet_path=mallet_path
)

4.2. Build the TAPModel


In [30]:
# Build the TAP model along the 'date' axis (the same axis used to slice D and build G).
T.build(axis='date')

4.3. Extract a GraphCollection containing the time-variant social influence graph


In [31]:
# Extract a GraphCollection from the TAP model for index 0.
# presumably 0 is the topic index (the export below is named topic0.graphml) -- confirm.
GC = T.graph_collection(0)

4.4. Visualize the social influence model in Cytoscape


In [41]:
from tethne.writers.graph import to_graphml

In [45]:
import os

# Export the graph stored under key 1976 (presumably one of the 'date' slices
# -- confirm) as GraphML, for viewing in Cytoscape.
# The destination is built from `outpath` rather than a path relative to the
# working directory, so the file lands in the configured output directory no
# matter where the notebook is launched from.
to_graphml(GC[1976], os.path.join(outpath, 'topic0.graphml'))


In [ ]: