In [1]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
# from itertools import groupby
from sqlalchemy import func
from tsa.lib import datetime_extra
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.models import Source, Document, create_session
In [3]:
# Bind the object returned by the project's create_session() helper (imported
# from tsa.models) to a notebook-level name.
# NOTE(review): presumably a SQLAlchemy session, given the sqlalchemy import
# above — confirm. DBSession is not referenced by the cells visible here.
DBSession = create_session()
In [4]:
# Build the corpus from the 'twitter-sample' source, label every document with
# its own `label` attribute, and extract n-gram features from its `document`
# attribute (ngram_max=2, min_df=2, max_df=1.0).
corpus = MulticlassCorpus(Source.from_name('twitter-sample'))
corpus.apply_labelfunc(lambda doc: doc.label)
corpus.extract_features(lambda doc: doc.document, features.ngrams,
    ngram_max=2, min_df=2, max_df=1.0)

# Convert to CSC so that per-column information is cheap to read.
array = corpus.X.tocsc()
print(array.shape)
nrows, ncols = array.shape

# Document frequency of each feature = number of stored entries in its column.
# In CSC layout column j occupies data[indptr[j]:indptr[j + 1]], so the
# differences of `indptr` give every column's count in one vectorized step.
# This yields the same counts as slicing `array[:, col].nnz` column by column,
# without constructing one sparse slice per feature.
# NOTE(review): assumes corpus.X is a scipy.sparse matrix (it exposes .tocsc()).
doc_freq = np.diff(array.indptr)

# Inverse document frequency: total rows divided by rows containing the feature.
idf = float(nrows) / doc_freq
log_idf = np.log(idf)

# Trailing semicolon suppresses the return-value repr without rebinding `_`,
# which IPython uses for the previous cell's output.
plt.hist(log_idf);
Out[4]:
In [ ]:
# Rank features by ascending log-IDF and inspect the k lowest-scoring ones,
# i.e. the features whose columns have the most stored entries.
ordering = np.argsort(log_idf)
k = 50
lowest_k = ordering[:k]  # indices of the k smallest log-IDF values
print(log_idf[lowest_k])
# Last expression: displayed as the cell output.
corpus.feature_names[lowest_k]
In [36]:
# Switch from types (how many rows contain a feature) to tokens: sum each
# feature's column to get its total over the whole corpus.
# NOTE(review): this is a token count only if the matrix cells hold counts — confirm.
column_totals = array.sum(axis=0)
frequencies = np.ravel(column_totals)  # flatten the 1 x ncols result to 1-D

# Feature indices sorted from smallest to largest total.
ordering = np.argsort(frequencies)

# NOTE(review): edge_and_median_indices is a project helper; presumably it picks
# positions at both ends and around the middle of `ordering` — confirm.
sample_positions = npx.edge_and_median_indices(ordering, 10)
selection = ordering[sample_positions]

# Last expression: displayed as the cell output.
corpus.feature_names[selection]
Out[36]: