In [1]:
%load_ext vimception
In [2]:
%load_ext autoreload
%autoreload 2
For certain tasks it might make more sense to tokenize the input strings first and then extract features on the resulting token lists, rather than on the original character sequences.
To demonstrate this I'll take some example strings from highered and learn models using both feature extraction techniques.
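As a quick illustration (a hypothetical cell, not part of the original run), splitting on spaces turns each string of characters into a much shorter sequence of word tokens:
In [ ]:
# Character-level vs. token-level view of the same string (illustrative sketch)
s = u'el valor teddy bear 3'
print(list(s)[:8])    # character sequence: [u'e', u'l', u' ', u'v', u'a', u'l', u'o', u'r']
print(s.split(' '))   # token sequence: [u'el', u'valor', u'teddy', u'bear', u'3']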
In [5]:
X = [(u'caring hands a step ahead', u'el valor little tykes ii'),
     (u'dulles', u"chicago public schools o'keeffe, isabell c."),
     (u'erie neighborhood house fcch-carmen l. vega site',
      u'erie neighborhood house fcch-servia galva site'),
     (u'chicago public schools dvorak math & science tech academy, anton',
      u'chicago public schools perez, manuel'),
     (u'v & j day care center', u"henry booth house granny's day care center"),
     (u'home of life community dev. corp. - home of life just for you',
      u'urban family and community centers'),
     (u'carole robertson center for learning fcch-ileana gonzalez',
      u'carole robertson center for learning fcch-rhonda culverson'),
     (u'bethel new life bethel child development',
      u'mary crane league mary crane center (lake & pulaski)'),
     (u'easter seals society of metropolitan chicago - stepping stones early/childhood lear',
      u"marcy newberry association kenyatta's day care"),
     (u'westside holistic family services westside holistic family services',
      u'childserv lawndale'),
     (u'higgins', u'higgins'),
     (u'ymca south side', u'ymca of metropolitan chicago - south side ymca'),
     (u'chicago commons association paulo freire',
      u'chicago commons association paulo freire'),
     (u'fresh start daycare, inc.',
      u'easter seals society of metropolitan chicago fresh start day care center'),
     (u'el valor teddy bear 3', u'teddy bear 3'),
     (u'chicago child care society chicago child care society',
      u'chicago child care society-child and family dev center'),
     (u'hull house - uptown', u'uptown family care center')]
Y = [u'distinct',
     u'distinct',
     u'distinct',
     u'distinct',
     u'distinct',
     u'distinct',
     u'distinct',
     u'distinct',
     u'distinct',
     u'distinct',
     u'match',
     u'match',
     u'match',
     u'match',
     u'match',
     u'match',
     u'match']
In [6]:
from pyhacrf import StringPairFeatureExtractor, Hacrf
from scipy.optimize import fmin_l_bfgs_b
import numpy as np
In [7]:
# Extract features
feature_extractor = StringPairFeatureExtractor(match=True, numeric=True)
X_extracted = feature_extractor.fit_transform(X)
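Assuming pyhacrf's convention (an assumption worth checking against the installed version), each element of X_extracted is a feature lattice: one ndarray of shape (len(string1), len(string2), n_features) per pair. A quick sketch to confirm:
In [ ]:
# Inspect the first extracted pair: expected shape is
# (length of first string, length of second string, number of features)
print(len(X_extracted))
print(X_extracted[0].shape)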
In [9]:
%%time
# Train model. (%%time rather than %%timeit here: %%timeit discards the cell's
# assignments, so `model` would not be available in the next cell.)
model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 10})
model.fit(X_extracted, Y, verbosity=1)
In [10]:
%%timeit -n1 -r1
# Evaluate
from sklearn.metrics import confusion_matrix
predictions = model.predict(X_extracted)
print(confusion_matrix(Y, predictions))
print(model.predict_proba(X_extracted))
In [14]:
from pyhacrf import PairFeatureExtractor
In [15]:
tokX = [[sentence.split(' ') for sentence in pair] for pair in X]
In [16]:
real = [
    lambda i, j, s1, s2: 1.0,                                                  # bias
    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,                       # exact token match
    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] and len(s1[i]) >= 6 else 0.0,   # match on a long token
    lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,  # matching numbers
    lambda i, j, s1, s2: 1.0 if s1[i].isalpha() and s2[j].isalpha() and s1[i] == s2[j] else 0.0,  # matching alphabetic tokens
    lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0  # both tokens non-alphabetic
]
# Other ideas: look up whether words are dictionary words,
# longest common subsequence, standard edit distance
feature_extractor = PairFeatureExtractor(real=real)
X_extracted = feature_extractor.fit_transform(tokX)
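Each function in real is evaluated at every grid position (i, j) over the two token lists, so every function contributes one feature plane to the lattice. A hand evaluation at a single position makes that concrete (sketch; tokX[10] is the ('higgins', 'higgins') pair):
In [ ]:
# Evaluate every feature function by hand at grid position (0, 0)
s1, s2 = tokX[10]  # ([u'higgins'], [u'higgins'])
print([f(0, 0, s1, s2) for f in real])
# bias plus the three match features fire: [1.0, 1.0, 1.0, 0.0, 1.0, 0.0]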
In [17]:
#%%timeit -n1 -r1
# Train model
model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})
model.fit(X_extracted, Y, verbosity=10)
Out[17]:
In [18]:
%%timeit -n1 -r1
# Evaluate
from sklearn.metrics import confusion_matrix
predictions = model.predict(X_extracted)
print(confusion_matrix(Y, predictions))
print(model.predict_proba(X_extracted))
In [19]:
import editdistance
In [20]:
editdistance.eval('cheese', 'kaas')
Out[20]:
5
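For reference, the same number falls out of the classic dynamic-programming recurrence that an edit distance library computes; a self-contained pure-Python sketch:
In [ ]:
def levenshtein(a, b):
    # Rolling single-row DP over the standard edit distance recurrence
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

levenshtein('cheese', 'kaas')  # 5, agreeing with editdistance.eval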
In [48]:
real = [
    lambda i, j, s1, s2: 1.0,                                                  # bias
    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,                       # exact token match
    lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,  # matching numbers
    lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0,  # both tokens non-alphabetic
    lambda i, j, s1, s2: editdistance.eval(s1[i], s2[j]),                      # raw edit distance
    lambda i, j, s1, s2: np.log(editdistance.eval(s1[i], s2[j]) + 1),          # log edit distance
    # float() guards against Python 2 integer division, which would truncate
    # the normalized distances to 0 or 1
    lambda i, j, s1, s2: float(editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j])),       # normalized distance
    lambda i, j, s1, s2: 1.0 - float(editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j]))  # normalized similarity
]
# Other ideas: look up whether words are dictionary words,
# longest common subsequence
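The last two features are bounded: the normalized distance lives in [0, 1] (0 for identical tokens, since edit distance never exceeds the longer token's length), and its complement acts as a similarity. A quick sanity check (sketch):
In [ ]:
# Identical tokens give distance 0.0 / similarity 1.0;
# unrelated tokens push towards 1.0 / 0.0
print real[6](0, 0, [u'higgins'], [u'higgins']), real[7](0, 0, [u'higgins'], [u'higgins'])
print real[6](0, 0, [u'higgins'], [u'dulles']), real[7](0, 0, [u'higgins'], [u'dulles'])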
In [46]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split
In [51]:
# Train model
errors_val = []
errors_train = []
for i, featureset in enumerate([[0, 1],
                                [0, 1, 2],
                                [0, 1, 2, 3],
                                [0, 4],
                                [0, 1, 4],
                                [0, 1, 2, 3, 4],
                                [0, 5],
                                [0, 1, 5],
                                [0, 1, 2, 3, 5],
                                [0, 6],
                                [0, 1, 6],
                                [0, 1, 2, 3, 6],
                                [0, 7],
                                [0, 1, 7],
                                [0, 1, 2, 3, 7]]):
    print '{:4}{:18}'.format(i, featureset),
    errs_val = []
    errs_train = []
    for repeat in xrange(15):
        x_train, x_val, y_train, y_val = train_test_split(tokX, Y, test_size=0.2)
        feature_extractor = PairFeatureExtractor(real=[real[f] for f in featureset])
        X_extracted = feature_extractor.fit_transform(x_train)
        model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})
        model.fit(X_extracted, y_train)
        predictions = model.predict(X_extracted)
        err_train = 1.0 - accuracy_score(y_train, predictions)
        X_extracted = feature_extractor.transform(x_val)
        predictions = model.predict(X_extracted)
        err_val = 1.0 - accuracy_score(y_val, predictions)
        if repeat % 10 == 0:
            print '{:.2f}'.format(err_train),
            print '{:.2f}'.format(err_val),
        errs_val.append(err_val)
        errs_train.append(err_train)
    print ' => {:.2f} +- {:.2f} | {:.2f} +- {:.2f}'.format(np.average(errs_train),
                                                           np.std(errs_train),
                                                           np.average(errs_val),
                                                           np.std(errs_val))
    errors_train.append(errs_train)
    errors_val.append(errs_val)
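With the error samples per feature set collected, the winner by mean validation error can be read off directly (sketch):
In [ ]:
# Rank the candidate feature sets by mean validation error
mean_val = [np.average(errs) for errs in errors_val]
best = int(np.argmin(mean_val))
print 'best featureset index: {}, mean val error: {:.2f}'.format(best, mean_val[best])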
In [11]:
from time import sleep
In [21]:
from IPython import parallel
c = parallel.Client()
view = c.load_balanced_view()
In [23]:
def k():
    sleep(8)
    print 'kaas'
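k can now be shipped to an engine through the load-balanced view, following the standard IPython.parallel AsyncResult pattern (a sketch; the engines need sleep imported on their side first, e.g. with %px from time import sleep):
In [ ]:
# Submit k without blocking; the AsyncResult tracks completion
ar = view.apply_async(k)
print ar.ready()    # False while the engine is still sleeping
ar.get()            # blocks until done; engine-side stdout ends up on the result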
In [37]:
%%px --noblock
from time import sleep
sleep(15)
print 'kaas'
a=4
Out[37]:
In [38]:
1+1
Out[38]:
2
In [39]:
%pxresult