In [42]:
import re

import numpy as np
from scipy import sparse
from scipy.sparse import dok_matrix

from functools import reduce

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfTransformer

from nltk.stem.porter import PorterStemmer
from collections import defaultdict

In [20]:
# Shared Porter stemmer instance, used when building the stem matrix below.
stemmer = PorterStemmer()

In [28]:
def save(name, data, base_dir='../data/processed/'):
    """Pickle ``data`` to ``base_dir + name`` for caching.

    Parameters
    ----------
    name : str
        File name of the cache entry.
    data : object
        Any picklable object.
    base_dir : str, optional
        Directory prefix (including trailing separator) the cache file is
        written under.  Defaults to the project's processed-data folder.
    """
    import pickle

    # The 'with' statement already closes the file; the previous explicit
    # f.close() inside the block was redundant.
    with open(base_dir + name, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

def load(name, base_dir='../data/processed/'):
    """Unpickle and return a cached object previously written by save().

    Parameters
    ----------
    name : str
        File name of the cache entry.
    base_dir : str, optional
        Directory prefix (including trailing separator), matching save().

    Returns
    -------
    object
        The unpickled object.

    NOTE(review): pickle.load executes arbitrary code from the file; this is
    only safe because the cache directory is trusted, locally-written data.
    """
    import pickle

    with open(base_dir + name, 'rb') as f:
        data = pickle.load(f)

    return data

In [29]:
def make_stopword_matrix(words=None, stop_words=None):
    """Build a diagonal mask matrix that zeroes out stop-word columns.

    Multiplying a (documents x words) count matrix by this matrix on the
    right keeps non-stop-word columns unchanged and zeroes stop-word columns.

    Parameters
    ----------
    words : sequence of str, optional
        The vocabulary, in column order.  Defaults to the cached text
        vectorizer's feature names (original behavior).
    stop_words : container of str, optional
        Words to mask out.  Defaults to sklearn's ENGLISH_STOP_WORDS
        (original behavior).

    Returns
    -------
    scipy.sparse.csr_matrix of shape (len(words), len(words)).
    """
    if words is None:
        words = load('text_vectorizer').get_feature_names()
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # Start from identity; DOK format allows cheap single-entry updates.
    matrix = sparse.eye(len(words), format='dok')

    for word_id, word in enumerate(words):
        if word in stop_words:
            # Assigning 0 in a dok_matrix removes the diagonal entry.
            matrix[word_id, word_id] = 0

    return matrix.tocsr()

def make_stem_matrix(words=None, stem=None):
    """Build a (words x stems) indicator matrix that merges words by stem.

    Multiplying a (documents x words) count matrix by this matrix on the
    right sums the counts of all words sharing a stem into one stem column.

    Parameters
    ----------
    words : sequence of str, optional
        The vocabulary, in column order.  Defaults to the cached text
        vectorizer's feature names (original behavior).
    stem : callable(str) -> str, optional
        Stemming function.  Defaults to the module-level PorterStemmer
        (original behavior).

    Returns
    -------
    scipy.sparse.csr_matrix of shape (len(words), n_distinct_stems), with a
    1.0 at (word_id, stem_id) when the word maps to that stem.
    """
    if words is None:
        words = load('text_vectorizer').get_feature_names()
    if stem is None:
        stem = stemmer.stem

    # Group word column ids by their stem; dict preserves insertion order,
    # which fixes the stem column ordering below.
    stems = defaultdict(list)

    for word_id, word in enumerate(words):
        stems[stem(word)].append(word_id)

    # Fill the indicator matrix; DOK allows cheap single-entry assignment.
    matrix = dok_matrix((len(words), len(stems)))

    for stem_id, s in enumerate(stems):
        for word_id in stems[s]:
            matrix[word_id, stem_id] = 1.

    return matrix.tocsr()

In [30]:
# Precompute and cache both vocabulary-space transform matrices so that
# get_featureset() can load them instead of rebuilding them (stemming every
# vocabulary word is slow).
save('stopword_matrix', make_stopword_matrix())
save('stem_matrix', make_stem_matrix())

In [44]:
def get_featureset(name, stem=True, tf_idf=True, stopwords=False, norm='l2',
                   use_idf=1, smooth_idf=1, sublinear_tf=1, binary=False):
    """Build a document-term feature matrix from the expression ``name``.

    Parameters
    ----------
    name : str
        Feature expression understood by parse(), e.g. 'max(h1, title)'.
    stem : bool
        Collapse word columns onto stem columns via the cached stem matrix.
    tf_idf : bool
        Apply sklearn's TfidfTransformer with the idf/tf options below.
    stopwords : bool
        Zero out stop-word columns via the cached stop-word mask.
    norm : str or None
        Row normalization passed to sklearn normalize(); a falsy value
        skips the step.
    use_idf, smooth_idf, sublinear_tf :
        Forwarded unchanged to TfidfTransformer.
    binary : bool
        Replace every stored value with 1 (presence/absence features).

    Returns
    -------
    Sparse matrix of shape (n_documents, n_features).
    """

    data = parse(name)

    # Order matters: stop words are masked in word space *before* stemming
    # merges word columns into stem columns.
    if stopwords:
        data = data * load('stopword_matrix')
    if stem:
        data = data * load('stem_matrix')
    if tf_idf:
        data = TfidfTransformer(use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf).fit_transform(data)
    if norm:
        # normalize with copy=False modifies `data` in place; the return
        # value is intentionally discarded.
        normalize(data, norm, copy=False)
    if binary:
        # Overwrite only the stored (nonzero) entries of the sparse matrix.
        data.data[:] = 1

    return data

In [36]:
def parse(s):
    """Evaluate a feature expression, returning a defensive copy.

    Thin wrapper over _parse(): strips spaces first, then copies the result
    so callers can mutate it without corrupting the cached matrices.
    """
    # remove spaces if any (replace is a no-op when there are none)
    s = s.replace(' ', '')

    result = _parse(s)

    # Bug fix: _parse returns a plain float for purely numeric expressions,
    # and float has no .copy() — the old unconditional result.copy() raised
    # AttributeError.  Only copy objects that support it.
    return result.copy() if hasattr(result, 'copy') else result

def _apply(fun, items):
    assert len(items) > 0
    return reduce(fun, items[1:], items[0])

def _parse(s):
    """Recursively evaluate a feature-expression string.

    Supported forms:
      * a dataset name     -> the matching matrix from the 'text_features' cache
      * a numeric literal  -> float (usable as a scalar weight in '*')
      * 'a+b' / 'a*b'      -> element-wise sum / product of sub-expressions
      * 'sum(a,b,...)'     -> left-fold of + over the arguments
      * 'max(a,b,...)'     -> left-fold of maximum() over the arguments
      * 'f(all)'           -> f applied across every dataset in text_features

    Raises KeyError for an unknown function name or dataset name.
    """
    text = load('text_features')
    function = re.compile(r'^(\w*)\(([^)]*)\)$')
    
    plus = lambda x, y: x + y
    times = lambda x, y: x * y
    
    # replace some strings:
    # 'body' expands to the sum of the per-tag datasets, while a literal
    # 'other' resolves to the dataset stored under the key 'body'.
    # NOTE(review): asymmetric swap — presumably the 'body' entry in
    # text_features holds the leftover/"other" text; confirm against the
    # code that built the cache.
    if s == 'body':
        s = 'h1+h2+h3+img+a+other'
    elif s == 'other':
        s = 'body'
    
    # apply functions of the form name(arg1,arg2,...)
    if function.match(s):
        name, param = function.match(s).group(1, 2)
        
        # 'all' expands to a comma-separated list of every dataset name
        if param == 'all':
            param = ','.join(text)
        
        items = list(map(_parse, param.split(',')))
        
        return _apply({'max': maximum, 'sum': plus}[name], items)
    
    # addition and multiplication ('+' binds looser than '*' because it is
    # split first)
    if '+' in s:
        items = list(map(_parse, s.split('+')))
        return _apply(plus, items)
    
    if '*' in s:
        items = list(map(_parse, s.split('*')))
        return _apply(times, items)
    
    # try to parse any numbers
    try:
        return float(s)
    except ValueError:
        pass
    
    # return corresponding dataset
    return text[s]

In [38]:
def sparse_maximum(A, B):
    """Element-wise maximum of two same-shaped sparse matrices.

    Builds a 0/1 indicator sharing (A - B)'s sparsity pattern — 1 exactly
    where B's entry exceeds A's — then swaps B's values in at those
    positions while keeping A's everywhere else.
    """
    take_from_b = A - B
    take_from_b.data = np.where(take_from_b.data < 0, 1, 0)
    
    # Where the indicator is 1: A - A + B = B; elsewhere the entry stays A.
    return A - A.multiply(take_from_b) + B.multiply(take_from_b)

def maximum(A, B):
    """Element-wise maximum that handles both dense arrays and sparse matrices.

    Dense inputs go straight to np.maximum; if either operand is sparse,
    both are promoted to CSR and dispatched to sparse_maximum().
    """
    from scipy.sparse import issparse, csr_matrix
    
    if not (issparse(A) or issparse(B)):
        return np.maximum(A, B)
    
    return sparse_maximum(csr_matrix(A), csr_matrix(B))

In [47]:
# Example: element-wise max of the h1 and title count matrices, stemmed and
# L2-normalized but without tf-idf weighting (see Out below for the shape).
get_featureset('max(h1, title)', tf_idf=False)


Out[47]:
<10566x175221 sparse matrix of type '<class 'numpy.float64'>'
	with 111256 stored elements in Compressed Sparse Row format>

In [ ]: