Model19: NE(Named Entity) with adjust result


In [29]:
from utils import load_buzz, select, write_result
from features import featurize, get_pos
from containers import Questions, Users, Categories
from nlp import extract_entities

GMM

Classifying questions

features: avg_pos, accuracy rate


In [30]:
%matplotlib inline
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture


def plot_gmm(X, models, n_components, covariance_type='diag', n_iter=100,
            figsize=(10, 20), suptitle=None, xlabel=None, ylabel=None):
    color_iter = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'gray', 'pink', 'lime']
    plt.figure(figsize=figsize)
    plt.suptitle(suptitle, fontsize=20)

    for i, model in enumerate(models):
        mm = getattr(mixture, model)(n_components=n_components,
                                     covariance_type=covariance_type,
                                     n_iter=n_iter)
        mm.fit(X_pos_qid)
        Y = mm.predict(X_pos_qid)

        plt.subplot(len(models), 1, 1 + i)
        for i, color in enumerate(color_iter):
            plt.scatter(X_pos_qid[Y == i, 0], X_pos_qid[Y == i, 1], .7, color=color)
        plt.title(model, fontsize=15)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.grid()

    plt.show()

In [31]:
users = Users(load_buzz())
questions = Questions(load_buzz())
X_pos_uid = users.select(['ave_pos_uid', 'acc_ratio_uid'])
X_pos_qid = questions.select(['ave_pos_qid', 'acc_ratio_qid'])

In [32]:
plot_gmm(X_pos_uid,
         models=['GMM', 'VBGMM', 'DPGMM'],
         n_components=8,
         covariance_type='diag',
         figsize=(10, 20),
         suptitle='Classifying users',
         xlabel='abs(position)',
         ylabel='accuracy ratio')



In [33]:
plot_gmm(X_pos_qid,
         models=['GMM', 'VBGMM', 'DPGMM'],
         n_components=8,
         covariance_type='diag',
         figsize=(10, 20),
         suptitle='Classifying questions',
         xlabel='abs(position)',
         ylabel='accuracy ratio')



In [34]:
# Question category
n_components = 8
gmm = mixture.DPGMM(n_components=n_components, covariance_type='diag', n_iter=10**10)
gmm.fit(X_pos_qid)
pred_cat_qid = gmm.predict(X_pos_qid)

plt.hist(pred_cat_qid, bins=50, facecolor='g', alpha=0.75)
plt.xlabel("Category number")
plt.ylabel("Count")
plt.title("Question Category: " + str(n_components) + " categories")
plt.grid(True)
plt.show()



In [35]:
# User category
n_components = 8
gmm = mixture.DPGMM(n_components=n_components, covariance_type='diag', n_iter=10**10)
gmm.fit(X_pos_uid)
pred_cat_uid = gmm.predict(X_pos_uid)

plt.hist(pred_cat_uid, bins=50, facecolor='g', alpha=0.75)
plt.xlabel("Category number")
plt.ylabel("Count")
plt.title("User Category: " + str(n_components) + " categories")
plt.grid(True)
plt.show()



In [36]:
from collections import Counter


users.sub_append('cat_uid', [str(x) for x in pred_cat_uid])
questions.sub_append('cat_qid', [str(x) for x in pred_cat_qid])

# to get most frequent cat for some test data which do not have ids in train set
most_pred_cat_uid = Counter(pred_cat_uid).most_common(1)[0][0]
most_pred_cat_qid = Counter(pred_cat_qid).most_common(1)[0][0]

print(most_pred_cat_uid)
print(most_pred_cat_qid)


2
0

In [37]:
print(users[1])
print(questions[1])


{'cat_uid': '2', 'ave_pos_uid': 96.724899598393577, 'acc_ratio_uid': 0.6465863453815262}
{'answer': 'thomas cole', 'question': "This painter's indulgence of visual fantasy, and appreciation of different historic architectural styles can be seen in his 1840 Architect's Dream. After a series of paintings on The Last of the Mohicans, he made a three year trip to Europe in 1829, but he is better known for a trip four years earlier in which he journeyed up the Hudson River to the Catskill Mountains. FTP, name this painter of The Oxbow and The Voyage of Life series.", 'ave_pos_qid': 70.5, 'pos_token': {0: '', 1: 'painters', 2: 'indulgence', 4: 'visual', 5: 'fantasy', 68: 'this_painter', 7: 'appreciation', 64: 'mountains', 9: 'different', 10: 'historic', 11: 'architectural', 12: 'styles', 66: 'name', 77: 'series', 15: 'seen', 18: '1840', 19: 'architects', 20: 'dream', 23: 'series', 25: 'paintings', 28: 'last', 31: 'mohicans', 33: 'made', 35: 'three', 36: 'year', 37: 'trip', 65: 'ftp', 39: 'europe', 41: '1829', 71: 'oxbow', 45: 'better', 46: 'known', 76: 'life', 49: 'trip', 50: 'four', 51: 'years', 52: 'earlier', 56: 'journeyed', 59: 'hudson', 60: 'river', 74: 'voyage', 63: 'catskill'}, 'group': 'test', 'cat_qid': '0', 'acc_ratio_qid': 0.875, 'category': 'Fine Arts'}

B. Modeling


In [38]:
regression_keys = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_uid', 'avg_pos_qid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['sign_val', 'avg_pos'])
X_train = select(X_train, regression_keys)

In [39]:
categories = Categories(load_buzz())

In [40]:
for item in X_train:
    for key in categories[item['category']].keys():
        item[key] = categories[item['category']][key]

In [41]:
X_train[1]


Out[41]:
{'acc_ratio_cat': 0.7560975609756098,
 'answer': 'thomas cole',
 'ave_pos_cat': 43.640185830429736,
 'avg_pos_qid': 51.0,
 'avg_pos_uid': 30.973895582329316,
 'category': 'fine arts',
 'q_length': 78,
 'qid': 1,
 'uid': 1}

In [42]:
extract_entities(questions[1]['question'], all=False, verbose=False)


Out[42]:
[['CD', '1840', 20],
 ['PERSON', 'Architect', 21],
 ['CD', 'three', 40],
 ['GPE', 'Europe', 44],
 ['CD', '1829', 46],
 ['CD', 'four', 56],
 ['ORGANIZATION', 'Hudson', 65],
 ['ORGANIZATION', 'Catskill', 69],
 ['ORGANIZATION', 'FTP', 72],
 ['ORGANIZATION', 'Oxbow', 79],
 ['ORGANIZATION', 'Voyage', 82],
 ['GPE', 'Life', 84]]

In [43]:
import pickle


with open('ne_count02.pkl', 'rb') as f:
    nes = pickle.load(f)

In [44]:
nes[1]


Out[44]:
[['CD', '1840', 20],
 ['PERSON', 'Architect', 21],
 ['CD', 'three', 40],
 ['GPE', 'Europe', 44],
 ['CD', '1829', 46],
 ['CD', 'four', 56],
 ['ORGANIZATION', 'Hudson', 65],
 ['ORGANIZATION', 'Catskill', 69],
 ['ORGANIZATION', 'FTP', 72],
 ['ORGANIZATION', 'Oxbow', 79],
 ['ORGANIZATION', 'Voyage', 82],
 ['GPE', 'Life', 84]]

In [45]:
pos = list(zip(*nes[1]))[-1]
print(pos)


(20, 21, 40, 44, 46, 56, 65, 69, 72, 79, 82, 84)

In [46]:
import numpy as np


mean = sum(pos) / len(pos)
nor_mean = (sum(pos)/len(pos)) / len(questions[1]['question'].split())
median = np.median(pos)
mod = max(set(pos), key=pos.count)
print(mean, nor_mean, median, mod)


56.5 0.7243589743589743 60.5 65

In [47]:
import numpy as np

"""
This is sort of temporary function for adding some features.
"""
def transform(X):
    for index, item in enumerate(X):
        uid = int(item['uid'])
        qid = int(item['qid'])

        # uid
        if int(uid) in users:
            item['acc_ratio_uid'] = users[uid]['acc_ratio_uid']
            item['cat_uid'] = users[uid]['cat_uid']
        else:
            acc = users.select(['acc_ratio_uid'])
            item['acc_ratio_uid'] = sum(acc) / float(len(acc))
            item['cat_uid'] = most_pred_cat_uid

        # qid
        if int(qid) in questions:
            item['acc_ratio_qid'] = questions[qid]['acc_ratio_qid']
            item['cat_qid'] = questions[qid]['cat_qid']
            # For Named Entity
            try:
                ne = list(zip(*nes[qid]))[-1]
                item['ne_count'] = len(ne)
                mean = sum(ne) / float(len(ne))
                nor_mean = (sum(ne)/len(ne)) / len(questions[qid]['question'].split())
                median = np.median(ne)
                mod = max(set(ne), key=ne.count)
            except:
                # qid 8216 does could not parsed well by ne
                # so it does not have ne
                print("WARN: qid", qid, "doesn't have NE so it's mean, nor_mean, median, mod will be 0.")
                mean = 0
                nor_mean = 0
                median = 0
                mod = 0
            item['ne_mean'] = mean
            item['ne_nor_mean'] = nor_mean
            item['ne_median'] = median
            item['ne_mod'] = mod
        else:
            acc = questions.select(['acc_ratio_qid'])
            item['acc_ratio_qid'] = sum(acc) / float(len(acc))
            item['cat_qid'] = most_pred_cat_qid

        item['uid'] = str(uid)
        item['qid'] = str(qid)

In [48]:
transform(X_train)
X_train[1]


WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
Out[48]:
{'acc_ratio_cat': 0.7560975609756098,
 'acc_ratio_qid': 0.875,
 'acc_ratio_uid': 0.6465863453815262,
 'answer': 'thomas cole',
 'ave_pos_cat': 43.640185830429736,
 'avg_pos_qid': 51.0,
 'avg_pos_uid': 30.973895582329316,
 'cat_qid': '0',
 'cat_uid': '2',
 'category': 'fine arts',
 'ne_count': 12,
 'ne_mean': 56.5,
 'ne_median': 60.5,
 'ne_mod': 65,
 'ne_nor_mean': 0.7243589743589743,
 'q_length': 78,
 'qid': '1',
 'uid': '1'}

In [49]:
from sklearn.feature_extraction import DictVectorizer


vec = DictVectorizer()
X_train_dict_vec = vec.fit_transform(X_train)

In [50]:
import multiprocessing
from sklearn import linear_model
from sklearn.cross_validation import train_test_split, cross_val_score
import math
from numpy import abs, sqrt


regressor_names = """
ElasticNetCV
"""
for l1 in [0.72]:
    print ("=== ElasticNetCV RMSE", "with", l1)
    for regressor in regressor_names.split():
        scores = cross_val_score(getattr(linear_model, regressor)(n_jobs=3, normalize=True, l1_ratio = l1),
                                 X_train_dict_vec, y_train,
                                 cv=2,
                                 scoring='mean_squared_error'
                                )
        print (regressor, sqrt(abs(scores)).mean())


=== ElasticNetCV RMSE with 0.72
ElasticNetCV 75.8577470756

Training and testing model


In [51]:
regression_keys = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_uid', 'avg_pos_qid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['avg_pos'])
X_train = select(X_train, regression_keys)
X_test = featurize(load_buzz(), group='test', sign_val=None, extra=['avg_pos'])
X_test = select(X_test, regression_keys)

transform(X_train)
transform(X_test)

for item in X_train:
    for key in categories[item['category']].keys():
        item[key] = categories[item['category']][key]

for item in X_test:
    for key in categories[item['category']].keys():
        item[key] = categories[item['category']][key]


WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.
WARN: qid 8216 doesn't have NE so it's mean, nor_mean, median, mod will be 0.

In [52]:
X_train[1]


Out[52]:
{'acc_ratio_cat': 0.7560975609756098,
 'acc_ratio_qid': 0.875,
 'acc_ratio_uid': 0.6465863453815262,
 'answer': 'thomas cole',
 'ave_pos_cat': 43.640185830429736,
 'avg_pos_qid': 51.0,
 'avg_pos_uid': 30.973895582329316,
 'cat_qid': '0',
 'cat_uid': '2',
 'category': 'fine arts',
 'ne_count': 12,
 'ne_mean': 56.5,
 'ne_median': 60.5,
 'ne_mod': 65,
 'ne_nor_mean': 0.7243589743589743,
 'q_length': 78,
 'qid': '1',
 'uid': '1'}

In [53]:
X_test[1]


Out[53]:
{'acc_ratio_cat': 0.65625,
 'acc_ratio_qid': 0.6428571428571429,
 'acc_ratio_uid': 0.6712328767123288,
 'answer': 'david hilbert',
 'ave_pos_cat': 18.712499999999999,
 'avg_pos_qid': 15.571428571428571,
 'avg_pos_uid': 36.31506849315068,
 'cat_qid': '0',
 'cat_uid': '2',
 'category': 'mathematics',
 'ne_count': 9,
 'ne_mean': 85.11111111111111,
 'ne_median': 97.0,
 'ne_mod': 97,
 'ne_nor_mean': 0.8105820105820106,
 'q_length': 105,
 'qid': '9967',
 'uid': '131'}

In [54]:
vec = DictVectorizer()
vec.fit(X_train + X_test)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)

In [57]:
for l1_ratio in [0.68, 0.7, 0.72, 0.74]:
    print('=== l1_ratio:', l1_ratio)
    regressor = linear_model.ElasticNetCV(n_jobs=3, normalize=True, l1_ratio=l1_ratio)
    regressor.fit(X_train, y_train)
    print(regressor.coef_)
    print(regressor.alpha_)
    predictions = regressor.predict(X_test)
    write_result(load_buzz()['test'], predictions, file_name=str(l1_ratio)+'guess_adj.csv', adj=True)


=== l1_ratio: 0.68
[ 15.14488498  23.83604421  21.41813993 ...,   7.09162876   1.21096304
   4.57498261]
0.000416103968317
** Adjust results ***
** tid qid uid: pred_pos, q_length, diff
32193 123840 15 : 34.77270571077522 , 33 , 1.7727057107752202
32186 123840 28 : 35.71590142993018 , 33 , 2.7159014299301774
32081 123757 1 : 33.033652892919015 , 29 , 4.033652892919015
32067 123750 43 : 26.727746086504027 , 19 , 7.727746086504027
32025 123718 39 : 21.291084978907215 , 13 , 8.291084978907215
32018 123718 13 : 22.616800970347164 , 13 , 9.616800970347164
31997 123709 52 : 30.11708100723822 , 17 , 13.11708100723822
13083 106213 9 : 35.003631438929546 , 21 , 14.003631438929546
12908 106060 124 : 38.48615256163723 , 37 , 1.4861525616372333
24990 106373 62 : 29.89832874387028 , 23 , 6.89832874387028

** diff_tot 69.6630858210581
=== l1_ratio: 0.7
[ 15.34685785  25.0317335   22.47447183 ...,   7.5497235    1.26061527
   4.79521339]
0.000404215283508
** Adjust results ***
** tid qid uid: pred_pos, q_length, diff
32193 123840 15 : 34.40738918392179 , 33 , 1.4073891839217865
32186 123840 28 : 35.40096551042483 , 33 , 2.4009655104248324
32081 123757 1 : 32.80159956126448 , 29 , 3.8015995612644815
32067 123750 43 : 26.303731492304358 , 19 , 7.303731492304358
32025 123718 39 : 20.395336542591508 , 13 , 7.395336542591508
32018 123718 13 : 21.75137992214553 , 13 , 8.751379922145532
31997 123709 52 : 29.70125509781417 , 17 , 12.701255097814169
13083 106213 9 : 34.70134651465847 , 21 , 13.701346514658468
12908 106060 124 : 38.68605702260954 , 37 , 1.686057022609539
24990 106373 62 : 29.555889202259372 , 23 , 6.555889202259372

** diff_tot 65.70495004999404
=== l1_ratio: 0.72
[ 15.51447944  26.28185669  23.57646017 ...,   8.04093899   1.31192072
   5.02255209]
0.000392987081188
** Adjust results ***
** tid qid uid: pred_pos, q_length, diff
32193 123840 15 : 34.01239991794262 , 33 , 1.0123999179426235
32186 123840 28 : 35.05958276602927 , 33 , 2.0595827660292727
32081 123757 1 : 32.553925793345165 , 29 , 3.553925793345165
32067 123750 43 : 25.870854561676204 , 19 , 6.870854561676204
32025 123718 39 : 19.45366362709145 , 13 , 6.453663627091451
32018 123718 13 : 20.83547900073465 , 13 , 7.835479000734651
31997 123709 52 : 29.255418863884458 , 17 , 12.255418863884458
13083 106213 9 : 34.37005541907167 , 21 , 13.370055419071669
12908 106060 124 : 38.91196710242918 , 37 , 1.9119671024291804
1232 674 161 : 48.252633181699515 , 48 , 0.2526331816995153
24990 106373 62 : 29.195659819451024 , 23 , 6.1956598194510235

** diff_tot 61.77164005335521
=== l1_ratio: 0.74
[ 15.6432852   27.59158088  24.72669354 ...,   8.56873904   1.36513986
   5.25718447]
0.000382365808723
** Adjust results ***
** tid qid uid: pred_pos, q_length, diff
32193 123840 15 : 33.58413447458954 , 33 , 0.5841344745895398
32186 123840 28 : 34.68864621363657 , 33 , 1.6886462136365665
32081 123757 1 : 32.287218272353776 , 29 , 3.287218272353776
32067 123750 43 : 25.427631555476943 , 19 , 6.4276315554769425
32025 123718 39 : 18.461097698805624 , 13 , 5.461097698805624
32018 123718 13 : 19.862779874023268 , 13 , 6.862779874023268
31997 123709 52 : 28.774453273283896 , 17 , 11.774453273283896
14665 8933 56 : 58.898557564906476 , 58 , 0.8985575649064756
13083 106213 9 : 34.00547810824876 , 21 , 13.00547810824876
12908 106060 124 : 39.16657163092544 , 37 , 2.1665716309254393
1232 674 161 : 48.49122019917019 , 48 , 0.49122019917019344
24990 106373 62 : 28.81423139677917 , 23 , 5.814231396779171

** diff_tot 58.46202026219965