In [1]:
import sys
# Put a local tethne checkout on the import path so the `tethne` imports below resolve.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
sys.path.append('/Users/erickpeirson/tethne')
In [2]:
import matplotlib.pyplot as plt
In [3]:
from tethne.readers import dfr
In [4]:
# JSTOR Data-for-Research dataset directories to load.
# Each directory must appear only once: every entry is passed to dfr.read()
# when `papers` is built, so a repeated entry loads the same papers twice.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
datapath = [
    '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.cHrmED8A',
    '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9',
]
In [5]:
# Directories passed to the model managers further down as their
# output (`outpath`) and temporary (`temppath`) locations.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldaout'
temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldatemp'
In [6]:
# Read every dataset directory and pool the results into one flat list.
papers = []
for path in datapath:
    papers.extend(dfr.read(path))
In [7]:
# Sanity check: total number of papers read across all dataset directories.
len(papers)
Out[7]:
In [8]:
# Merge the unigram ('uni') features of every dataset directory into one dict.
# dict.update() means a key seen in a later dataset replaces the earlier entry.
wordcounts = {}
for path in datapath:
    wordcounts.update(dfr.ngrams(path, 'uni'))
In [9]:
from nltk.corpus import stopwords
In [10]:
# Stopword list handed to DataCollection as `exclude` below.
# NOTE(review): words() is called without a language argument, which per the
# NLTK docs returns the stopwords of every language in the corpus -- confirm
# this is intended rather than e.g. stopwords.words('english').
stoplist = stopwords.words()
In [11]:
from tethne import DataCollection
In [12]:
# Bundle the papers with their unigram features, indexing by DOI and
# passing the stopword list as the exclusion list.
D = DataCollection(
    papers,
    features={'wordcounts': wordcounts},
    index_by='doi',
    exclude=stoplist
)
In [13]:
def filt(s, C, DC, min_count=3, min_docs=1, min_length=3):
    """
    Decide whether a feature should be kept when filtering a featureset.

    Intended to be passed to ``D.filter_features``, which is expected to
    call it with the first three positional arguments only; the remaining
    keyword arguments make the thresholds tunable without changing that call.

    Parameters
    ----------
    s : str
        The feature itself; only its length is inspected.
    C : int
        Presumably the overall count of the feature across the collection
        (TODO confirm against the tethne ``filter_features`` docs).
    DC : int
        Presumably the number of documents containing the feature
        (TODO confirm against the tethne ``filter_features`` docs).
    min_count : int
        ``C`` must be strictly greater than this (default 3).
    min_docs : int
        ``DC`` must be strictly greater than this (default 1).
    min_length : int
        ``len(s)`` must be strictly greater than this (default 3).

    Returns
    -------
    bool
        True if all three thresholds are exceeded, False otherwise.
    """
    # All comparisons are strict, matching the original `> 3`, `> 1`, `> 3`.
    return bool(C > min_count and DC > min_docs and len(s) > min_length)
In [14]:
# Run `filt` over the 'wordcounts' featureset; 'wordcounts_filtered' is the
# name read back in the next cell and later passed to MALLETModelManager.
D.filter_features('wordcounts', 'wordcounts_filtered', filt)
In [15]:
# Compare index sizes before and after filtering: (unfiltered, filtered).
tuple(len(D.features[name]['index']) for name in ('wordcounts', 'wordcounts_filtered'))
Out[15]:
In [16]:
# Slice the collection along 'date' with the 'time_period' method.
# NOTE(review): window_size=5 with cumulative=True presumably yields 5-year
# windows that each also include earlier papers -- confirm against tethne docs.
D.slice('date', method='time_period', window_size=5, cumulative=True)
In [17]:
# Plot the distribution along the 'date' axis sliced above
# (presumably papers per slice -- TODO confirm).
D.plot_distribution('date')
In [18]:
from tethne import GraphCollection
In [19]:
# Start with an empty GraphCollection; it is populated by G.build() below.
G = GraphCollection()
In [20]:
# Build graphs from D along the 'date' axis.
# NOTE(review): 'authors' and 'coauthors' presumably select the node type and
# the graph-building method (a coauthorship network per slice) -- confirm.
G.build(D, 'date', 'authors', 'coauthors')
Out[20]:
In [21]:
# Plot the node distribution of the graph collection
# (presumably node count per slice -- TODO confirm).
G.plot_node_distribution()
In [22]:
# Location of the MALLET installation, passed to both model managers below.
# NOTE(review): machine-specific path -- adjust before running elsewhere.
mallet_path = '/Applications/mallet-2.0.7'
In [23]:
from tethne.model.managers import MALLETModelManager
In [24]:
# Manager for a MALLET model over the filtered unigram featureset.
M = MALLETModelManager(
    D,
    'wordcounts_filtered',
    outpath,
    temppath,
    mallet_path
)
In [25]:
# Run the manager's preparation step before building the model.
# NOTE(review): what prep() produces is not visible from this notebook.
M.prep()
In [26]:
# Build the model with Z=50 and max_iter=300
# (presumably 50 topics and 300 iterations -- TODO confirm).
# NOTE(review): prep=True may repeat the preparation step that the explicit
# M.prep() call in the previous cell already ran -- confirm and drop one.
M.build(Z=50, max_iter=300, prep=True)
Out[26]:
In [27]:
# Show topic 0. The parenthesised form is valid on both Python 2 and Python 3;
# the bare `print expr` statement is a SyntaxError on Python 3.
print(M.print_topic(0))
In [28]:
from tethne.model.managers import TAPModelManager
In [29]:
# Manager for a TAP model built from the data collection, the graph
# collection, and the model held by the MALLET manager.
T = TAPModelManager(
    D,
    G,
    M.model,
    outpath=outpath,
    temppath=temppath,
    mallet_path=mallet_path
)
In [30]:
# Build the TAP model along the 'date' axis (the axis sliced earlier).
T.build(axis='date')
In [31]:
# Graph collection for index 0 (presumably topic 0 -- TODO confirm);
# it is indexed by slice key when written out below.
GC = T.graph_collection(0)
In [41]:
from tethne.writers.graph import to_graphml
In [45]:
# Write the graph stored under key 1976 to GraphML.
# NOTE(review): 1976 is presumably one of the 'date' slice keys -- confirm it exists.
# NOTE(review): this relative path depends on the kernel's working directory,
# whereas `outpath` is absolute -- confirm both refer to the same directory.
to_graphml(GC[1976], './sandbox/ldaout/topic0.graphml')
In [ ]: