In [42]:
import re
import numpy as np
from scipy import sparse
from scipy.sparse import dok_matrix
from functools import reduce
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
In [20]:
stemmer = PorterStemmer()
In [28]:
def save(name, data):
    """Pickle an object to disk for caching."""
    import pickle
    with open('../data/processed/' + name, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

def load(name):
    """Load a previously cached object from disk."""
    import pickle
    with open('../data/processed/' + name, 'rb') as f:
        return pickle.load(f)
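A quick round-trip sanity check (a minimal sketch; it assumes the ../data/processed/ directory exists and is writable, and the name 'cache_demo' is only for illustration):
In [ ]:
# cache a small object and read it back
save('cache_demo', {'answer': 42})
load('cache_demo')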
In [29]:
def make_stopword_matrix():
    """Diagonal 0/1 matrix that zeroes out stop-word columns on multiplication."""
    words = load('text_vectorizer').get_feature_names()
    matrix = sparse.eye(len(words), format='dok')
    for word_id, word in enumerate(words):
        if word in ENGLISH_STOP_WORDS:
            matrix[word_id, word_id] = 0
    return matrix.tocsr()

def make_stem_matrix():
    """Binary (n_words x n_stems) matrix mapping each word column to its stem column."""
    words = load('text_vectorizer').get_feature_names()
    # group word ids by their Porter stem
    stems = defaultdict(list)
    for word_id, word in enumerate(words):
        stems[stemmer.stem(word)].append(word_id)
    # build the word -> stem indicator matrix
    matrix = dok_matrix((len(words), len(stems)))
    for stem_id, s in enumerate(stems):
        for word_id in stems[s]:
            matrix[word_id, stem_id] = 1.
    return matrix.tocsr()
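Multiplying a document-term count matrix by the stem matrix sums the columns of words that share a stem. A self-contained toy example (the vocabulary and counts are made up for illustration):
In [ ]:
# toy vocabulary: 'run' and 'running' share the Porter stem 'run'
toy_words = ['run', 'running', 'title']
toy_stems = defaultdict(list)
for word_id, word in enumerate(toy_words):
    toy_stems[stemmer.stem(word)].append(word_id)
toy_map = dok_matrix((len(toy_words), len(toy_stems)))
for stem_id, s in enumerate(toy_stems):
    for word_id in toy_stems[s]:
        toy_map[word_id, stem_id] = 1.
# one document with counts [1, 2, 3] -> stemmed counts [3, 3]
counts = sparse.csr_matrix([[1, 2, 3]])
(counts * toy_map.tocsr()).toarray()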
In [30]:
save('stopword_matrix', make_stopword_matrix())
save('stem_matrix', make_stem_matrix())
In [44]:
def get_featureset(name, stem=True, tf_idf=True, stopwords=False, norm='l2',
                   use_idf=1, smooth_idf=1, sublinear_tf=1, binary=False):
    """Evaluate a feature expression and apply the requested transformations."""
    data = parse(name)
    if stopwords:
        # zero out stop-word columns
        data = data * load('stopword_matrix')
    if stem:
        # merge word columns into stem columns
        data = data * load('stem_matrix')
    if tf_idf:
        data = TfidfTransformer(use_idf=use_idf, smooth_idf=smooth_idf,
                                sublinear_tf=sublinear_tf).fit_transform(data)
    if norm:
        data = normalize(data, norm, copy=False)
    if binary:
        # keep only presence/absence of each feature
        data.data[:] = 1
    return data
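Typical calls look like the following (a sketch; it assumes the cached matrices built above and the 'text_features' cache already exist on disk):
In [ ]:
# raw stemmed counts for the title field
X_title = get_featureset('title', tf_idf=False)
# tf-idf over headings and title combined, with stop words removed
X_head = get_featureset('h1+h2+h3+title', stopwords=True)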
In [36]:
def parse(s):
    """Evaluate a feature expression like 'max(h1,title)' or 'title*2+h1'."""
    # remove spaces if any
    if ' ' in s:
        s = s.replace(' ', '')
    # copy so callers cannot mutate the cached matrices
    return _parse(s).copy()

def _apply(fun, items):
    assert len(items) > 0
    return reduce(fun, items[1:], items[0])

def _parse(s):
    text = load('text_features')
    function = re.compile(r'^(\w*)\(([^)]*)\)$')
    plus = lambda x, y: x + y
    times = lambda x, y: x * y
    # the cached 'body' dataset holds the leftover text, so in expressions
    # 'body' expands to the sum of all parts while 'other' names that dataset
    if s == 'body':
        s = 'h1+h2+h3+img+a+other'
    elif s == 'other':
        s = 'body'
    # apply functions: max(...) and sum(...); 'all' expands to every feature
    if function.match(s):
        name, param = function.match(s).group(1, 2)
        if param == 'all':
            param = ','.join(text)
        items = list(map(_parse, param.split(',')))
        return _apply({'max': maximum, 'sum': plus}[name], items)
    # addition binds looser than multiplication, so split on '+' first
    if '+' in s:
        items = list(map(_parse, s.split('+')))
        return _apply(plus, items)
    if '*' in s:
        items = list(map(_parse, s.split('*')))
        return _apply(times, items)
    # numeric literals act as scalar weights
    try:
        return float(s)
    except ValueError:
        pass
    # otherwise the token names a cached dataset
    return text[s]
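The grammar therefore supports weighted combinations of the cached features, for example (assuming the 'text_features' cache is available):
In [ ]:
# a title counted twice plus h1, i.e. text['title']*2. + text['h1']
parse('title*2+h1')
# elementwise maximum across every cached text feature
parse('max(all)')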
In [38]:
def sparse_maximum(A, B):
    """Elementwise maximum of two sparse matrices."""
    # indicator of the entries where B is larger than A
    BisBigger = A - B
    BisBigger.data = np.where(BisBigger.data < 0, 1, 0)
    # keep A everywhere except where B wins
    return A - A.multiply(BisBigger) + B.multiply(BisBigger)

def maximum(A, B):
    from scipy.sparse import issparse, csr_matrix
    if issparse(A) or issparse(B):
        return sparse_maximum(csr_matrix(A), csr_matrix(B))
    else:
        return np.maximum(A, B)
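A quick self-contained check of the sparse path against the dense result:
In [ ]:
rng = np.random.RandomState(0)
A = sparse.random(5, 4, density=0.4, random_state=rng, format='csr')
B = sparse.random(5, 4, density=0.4, random_state=rng, format='csr')
# should print True: sparse maximum matches np.maximum on the dense arrays
np.allclose(maximum(A, B).toarray(), np.maximum(A.toarray(), B.toarray()))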
In [47]:
get_featureset('max(h1, title)', tf_idf=False)
Out[47]: