In [1]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
# from itertools import groupby
from sqlalchemy import func

from tsa.lib import datetime_extra
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.models import Source, Document, create_session

In [2]:
DBSession = create_session()
def source_documents(source_name):
    return DBSession.query(Document).\
        join(Source, Source.id == Document.source_id).\
        filter(Source.name == source_name).all()

In [4]:
# print out database overview 
for source in DBSession.query(Source):
    labels = DBSession.query(Document.label, func.count(Document.label)).\
        filter(Document.source == source).\
        group_by(Document.label).all()
    df = pd.DataFrame.from_records(labels, index=['label'], columns=['label', 'count'])
    total = DBSession.query(Document).filter(Document.source == source).count()
    print 'source.name = {:s}, N = {:d}'.format(source.name, total)
    IPython.display.display(df)


source.name = sb5b, N = 106702
count
label
NaN 0
Not Applicable 571
For 2785
Broken Link 36
Against 10842
Neutral 149

6 rows × 1 columns

source.name = rt-polarity, N = 10662
count
label
neg 5331
pos 5331

2 rows × 1 columns

source.name = convote, N = 8121
count
label
Against 3853
For 4268

2 rows × 1 columns

source.name = stanford-politeness-wikipedia, N = 4353
count
label
Impolite 2062
Polite 2291

2 rows × 1 columns

source.name = stanford-politeness-stackexchange, N = 6603
count
label
Impolite 2858
Polite 3745

2 rows × 1 columns

source.name = twitter-sample, N = 128408
count
label
NaN 0

1 rows × 1 columns

source.name = debate08, N = 3238
count
label
Negative 1622
Neutral 560
Positive 1056

3 rows × 1 columns


In [46]:
times = np.array([doc.published for doc in source_documents('sb5b')]).astype('datetime64[s]')

In [48]:
plt.hist(times.astype(float))
axes = plt.gca()
axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
plt.gcf().set_size_inches(12, 5)



In [42]:
times.astype(float)


Out[42]:
array([  1.39437959e+09,   1.39437959e+09,   1.39437959e+09, ...,
         1.39439522e+09,   1.39439522e+09,   1.39439522e+09])

In [46]:
array = np.random.negative_binomial(1, .5, 100).reshape(10, -1)
np.apply_along_axis(np.count_nonzero, 0, array)


Out[46]:
array([[0, 0, 0, 3, 0, 1, 5, 0, 0, 0],
       [0, 2, 0, 0, 0, 0, 0, 1, 3, 0],
       [5, 0, 0, 1, 1, 0, 4, 3, 0, 1],
       [0, 0, 1, 0, 3, 0, 0, 0, 1, 4],
       [0, 0, 0, 4, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 4, 2, 1, 2],
       [2, 1, 4, 4, 0, 1, 0, 1, 1, 0],
       [0, 0, 1, 1, 1, 0, 4, 1, 1, 0],
       [6, 1, 0, 0, 0, 0, 0, 0, 1, 8],
       [2, 0, 0, 0, 1, 0, 0, 4, 0, 0]])