In [1]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
# from itertools import groupby
from sqlalchemy import func

from tsa.lib import datetime_extra
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.models import Source, Document, create_session

In [3]:
# Create a session via create_session() (imported from tsa.models).
# NOTE(review): presumably a database session used by later queries -- confirm;
# the analysis cells visible in this part of the notebook never reference it.
DBSession = create_session()

In [4]:
# Build the corpus from the 'twitter-sample' source: each document is labelled
# with its own `label` attribute and features are extracted from its
# `document` attribute with features.ngrams (ngram_max=2, min_df=2, max_df=1.0).
corpus = MulticlassCorpus(Source.from_name('twitter-sample'))
corpus.apply_labelfunc(lambda doc: doc.label)
corpus.extract_features(lambda doc: doc.document, features.ngrams,
    ngram_max=2, min_df=2, max_df=1.0)

# Column-compressed copy of the feature matrix.
# NOTE(review): presumably rows are documents and columns are n-gram
# features -- confirm against MulticlassCorpus.
array = corpus.X.tocsc()
print(array.shape)
nrows, ncols = array.shape

# Inverse document frequency: number of rows divided by the number of stored
# entries in each column. getnnz(axis=0) returns all per-column counts in one
# vectorized call instead of slicing out every column in a Python loop.
# NOTE(review): a column with no stored entries would yield inf here;
# presumably min_df=2 rules that out -- verify.
idf = float(nrows) / array.getnnz(axis=0)
log_idf = np.log(idf)
# Bind the return value so the cell does not echo the histogram's bin arrays.
_ = plt.hist(log_idf)


[ 1.02881294  1.51709712  1.58716583  1.68263215  1.75882094  1.81503301
  2.10480568  2.18729028  2.22346775  2.2776509   2.28366974  2.29342217
  2.32334295  2.35969587  2.63416309  2.63568318  2.65333231  2.77177913
  2.8443177   2.88154842  2.90020123  3.02856882  3.04757988  3.06662492
  3.08638073  3.10254061  3.10688278  3.13871571  3.15483779  3.20751607
  3.23068914  3.28784756  3.3034039   3.30446378  3.31705598  3.31985199
  3.34096497  3.36298298  3.41017784  3.41206552  3.41632588  3.43162255
  3.44349428  3.48301726  3.4837782   3.51287022  3.63970665  3.65826451
  3.68707434  3.70085039]
Out[4]:
array([u'rt', u'i', u'the', u'to', u'you', u'a', u'and', u'this', u'is',
       u'my', u'in', u'for', u'me', u'of', u'on', u'so', u'it', u'that',
       u'be', u'at', u'with', u'just', u"i'm", u'have', u'like', u'your',
       u'if', u'love', u'are', u'was', u'but', u'not', u'get', u'all',
       u'when', u'follow', u'up', u'now', u'can', u'do', u'we', u"don't",
       u'out', u'by', u'one', u'what', u'no', u'who', u'today', u'much'], 
      dtype='<U200')

In [ ]:
# Rank features from lowest to highest log-IDF, then show the first k values
# together with the matching feature names (the cell's displayed result).
k = 50
ordering = np.argsort(log_idf)
print(log_idf[ordering[:k]])
corpus.feature_names[ordering[:k]]

In [36]:
# Look at tokens rather than types: sum each column of the feature matrix
# instead of counting how many rows have an entry in it.
# NOTE(review): this is a token count only if the features are raw counts -- confirm.
frequencies = np.asarray(array.sum(axis=0)).ravel()
# NOTE: this rebinds `ordering`, replacing the log-IDF ordering computed earlier.
ordering = frequencies.argsort()
# NOTE(review): edge_and_median_indices presumably picks positions at both ends
# and around the median of the ordering -- confirm in tsa.science.numpy_ext.
selection = ordering[npx.edge_and_median_indices(ordering, 10)]
corpus.feature_names[selection]


Out[36]:
array([u'littlepulga10', u'marsha', u'mars is', u'marry this',
       u'marry that', u'marry men', u'marry is', u'married with',
       u'married or', u'married life', u'world those', u'myjaps',
       u'but treats', u'great new', u'christmas is', u'our private',
       u'gets http://pbs.twimg.com/media/bip9usdcaaeia6h.jpg',
       u'stay alive', u'seriously visit', u'great place', u'chrome steel',
       u'my', u'is', u'this', u'and', u'a', u'to', u'you', u'the', u'i',
       u'rt'], 
      dtype='<U200')