Model12: GMM remove uid qid

A. Functions

There are four groups of functions.

  • Data reader: reads data from file.
  • Feature functions (private): functions that extract features live here. If you write a new feature function, add it to this group.
  • Feature function (public): the single function we call for feature extraction.
  • Utility functions: everything not covered by the groups above goes here.

Data reader


In [55]:
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign


"""
Load buzz data as a dictionary.
You can give parameter for data so that you will get what you need only.
"""
def load_buzz(root='../data', data=['train', 'test', 'questions'], format='pklz'):
    buzz_data = {}
    for ii in data:
        file_path = path.join(root, ii + "." + format)
        with gzip.open(file_path, "rb") as fp:
          buzz_data[ii] = pickle.load(fp)
        
    return buzz_data
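
For example (a minimal sketch, assuming the pickled files live under ../data), a single split can be loaded on its own:

# Load only the training split; the returned dict is keyed by the names passed in `data`.
train_only = load_buzz(data=['train'])
train_set = train_only['train']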

Feature functions (private)


In [56]:
from numpy import sign, abs


def _feat_basic(bd, group):
    """Attach question length, category, and answer to each record."""
    X = []
    for item in bd[group].items():
        qid = item[1]['qid']
        q = bd['questions'][qid]
        #item[1]['q_length'] = max(q['pos_token'].keys())
        item[1]['q_length'] = len(q['question'].split())
        item[1]['category'] = q['category'].lower()
        item[1]['answer'] = q['answer'].lower()
        X.append(item[1])
        
    return X
        
        
def _feat_sign_val(data):
    for item in data:
        item['sign_val'] = sign(item['position'])

def _get_pos(bd, sign_val=None):
    # Note: bd here is bd['train'], not the full buzz-data dict
    unwanted_index = []
    pos_uid = defaultdict(list)
    pos_qid = defaultdict(list)
    
    for index, key in enumerate(bd):
        if sign_val and sign(bd[key]['position']) != sign_val:
            unwanted_index.append(index)
        else:
            pos_uid[bd[key]['uid']].append(bd[key]['position'])
            pos_qid[bd[key]['qid']].append(bd[key]['position'])
    
    return pos_uid, pos_qid, unwanted_index


def _get_avg_pos(bd, sign_val=None):
    pos_uid, pos_qid, unwanted_index = _get_pos(bd, sign_val)

    avg_pos_uid = {}
    avg_pos_qid = {}
    
    if not sign_val:
        sign_val = 1

    for key in pos_uid:
        pos = pos_uid[key]
        avg_pos_uid[key] = sign_val * (sum(pos) / len(pos))

    for key in pos_qid:
        pos = pos_qid[key]
        avg_pos_qid[key] = sign_val * (sum(pos) / len(pos))
    
    return avg_pos_uid, avg_pos_qid, unwanted_index

        
def _feat_avg_pos(data, bd, group, sign_val):
    avg_pos_uid, avg_pos_qid, unwanted_index = _get_avg_pos(bd['train'], sign_val=sign_val)
    
    if group == 'train':
        for index in sorted(unwanted_index, reverse=True):
            del data[index]
    
    for item in data:
        if item['uid'] in avg_pos_uid:
            item['avg_pos_uid'] = avg_pos_uid[item['uid']]
        else:
            vals = avg_pos_uid.values()
            item['avg_pos_uid'] = sum(vals) / float(len(vals))
              
        if item['qid'] in avg_pos_qid:
            item['avg_pos_qid'] = avg_pos_qid[item['qid']]
        else:
            vals = avg_pos_qid.values()
            item['avg_pos_qid'] = sum(vals) / float(len(vals))
        
        # A response position can exceed the question length, so clamp it
        if item['avg_pos_uid'] > item['q_length']:
            item['avg_pos_uid'] = item['q_length']
        
        if item['avg_pos_qid'] > item['q_length']:
            item['avg_pos_qid'] = item['q_length']
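
As a toy illustration of the helpers above (not part of the pipeline): with sign_val=1 only correct buzzes are kept, and the remaining positions are averaged per uid and per qid.

# Hypothetical two-record training dict, for illustration only.
toy_train = {
    1: {'uid': 7, 'qid': 100, 'position': 50.0},   # correct buzz at word 50
    2: {'uid': 7, 'qid': 101, 'position': -30.0},  # wrong buzz at word 30
}
avg_uid, avg_qid, dropped = _get_avg_pos(toy_train, sign_val=1)
# avg_uid == {7: 50.0}; the index of the wrong buzz is returned in `dropped`
# so _feat_avg_pos can delete it from the training data.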

Feature function (public)


In [57]:
def featurize(bd, group, sign_val=None, extra=None):
    # Basic features:
    # 'qid' (string), 'uid' (string), 'position' (float), 'answer' (string)
    X = _feat_basic(bd, group=group)
    
    # Some extra features
    if extra:
        for func_name in extra:
            func_name = '_feat_' + func_name
            if func_name in ['_feat_avg_pos']:
                globals()[func_name](X, bd, group=group, sign_val=sign_val)
            else:
                globals()[func_name](X)
    
    if group == 'train':
        y = []
        for item in X:
            y.append(item['position'])
            del item['position']

        return X, y
    elif group == 'test':
        return X
    else:
        raise ValueError(group, 'is not a valid group')
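
A typical call looks like the one used in section B below (sketch only; the extra feature names map to the private _feat_* functions above):

X, y = featurize(load_buzz(), group='train', sign_val=None, extra=['sign_val', 'avg_pos'])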

Utility functions


In [58]:
import csv


def select(data, keys):
    """Keep only `keys` in each record (modifies the records in place)."""
    unwanted = data[0].keys() - keys
    for item in data:
        for unwanted_key in unwanted:
            del item[unwanted_key]
    return data


def write_result(test_set, predictions, file_name='guess.csv'):
    # Pair each test id with its predicted position, sorted by id.
    rows = sorted([[key, predictions[index]] for index, key in enumerate(test_set.keys())])
    rows.insert(0, ["id", "position"])
    with open(file_name, "w") as fp:
        writer = csv.writer(fp, delimiter=',')
        writer.writerows(rows)
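
The resulting guess.csv is a plain two-column file of record ids and predicted positions, roughly like this (values are illustrative only):

id,position
7,41.2
9,38.7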

GMM

Classifying users and questions

Features: average position, accuracy ratio


In [59]:
%matplotlib inline
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture


def plot_gmm(X, models, n_components, covariance_type='diag',
             figsize=(10, 20), suptitle=None, xlabel=None, ylabel=None):
    color_iter = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'gray', 'pink', 'lime']
    plt.figure(figsize=figsize)
    plt.suptitle(suptitle, fontsize=20)

    for i, model in enumerate(models):
        # Fit the requested mixture model on the data passed in, not a global.
        mm = getattr(mixture, model)(n_components=n_components,
                                     covariance_type=covariance_type)
        mm.fit(X)
        Y = mm.predict(X)

        plt.subplot(len(models), 1, 1 + i)
        for j, color in enumerate(color_iter):
            plt.scatter(X[Y == j, 0], X[Y == j, 1], .7, color=color)
        plt.title(model, fontsize=15)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.grid()

    plt.show()

In [60]:
from collections import UserDict
import numpy as np


class DictDict(UserDict):
    def __init__(self, bd):
        UserDict.__init__(self)
        self._set_bd(bd)
        
    def sub_keys(self):
        return self[list(self.keys())[0]].keys()
            
    def select(self, sub_keys):
        vals = []
        for key in self:
            vals.append([self[key][sub_key] for sub_key in sub_keys])
        return np.array(vals)
    
    def sub_append(self, sub_key, values):
        for index, key in enumerate(self):
            self[key][sub_key] = values[index]

    
class Users(DictDict):
    def _set_bd(self, bd):
        pos_uid, _, _ = _get_pos(bd['train'], sign_val=None)
        for key in pos_uid:
            u = np.array(pos_uid[key])
            ave_pos_uid = sum(abs(u)) / float(len(u))
            acc_ratio_uid = len(u[u > 0]) / float(len(u))
            self[key] = {'ave_pos_uid': ave_pos_uid,
                         'acc_ratio_uid': acc_ratio_uid}

            
class Questions(DictDict):
    def _set_bd(self, bd):
        _, pos_qid, _ = _get_pos(bd['train'], sign_val=None)
        
        for key in pos_qid:
            u = np.array(pos_qid[key])
            ave_pos_qid = sum(abs(u)) / float(len(u))
            acc_ratio_qid = len(u[u > 0]) / float(len(u))
            self[key] = bd['questions'][key]
            self[key]['ave_pos_qid'] = ave_pos_qid
            self[key]['acc_ratio_qid'] = acc_ratio_qid
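
A quick sketch of how these containers are used (the same pattern as the next cell): select returns one row per key in dict-iteration order, and sub_append writes values back in that same order.

users = Users(load_buzz())
X = users.select(['ave_pos_uid', 'acc_ratio_uid'])   # shape: (n_users, 2)
users.sub_append('cat_uid', ['0'] * len(users))      # one value per user, same order as select()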

In [61]:
users = Users(load_buzz())
questions = Questions(load_buzz())
X_pos_uid = users.select(['ave_pos_uid', 'acc_ratio_uid'])
X_pos_qid = questions.select(['ave_pos_qid', 'acc_ratio_qid'])

In [62]:
plot_gmm(X_pos_uid,
         models=['GMM', 'VBGMM', 'DPGMM'],
         n_components=10,
         covariance_type='diag',
         figsize=(10, 20),
         suptitle='Classifying users',
         xlabel='abs(position)',
         ylabel='accuracy ratio')



In [63]:
plot_gmm(X_pos_qid,
         models=['GMM', 'VBGMM', 'DPGMM'],
         n_components=10,
         covariance_type='diag',
         figsize=(10, 20),
         suptitle='Classifying questions',
         xlabel='abs(position)',
         ylabel='accuracy ratio')



In [64]:
# Question category
n_components = 8
gmm = mixture.GMM(n_components=n_components, covariance_type='diag')
gmm.fit(X_pos_qid)
pred_cat_qid = gmm.predict(X_pos_qid)

plt.hist(pred_cat_qid, bins=50, facecolor='g', alpha=0.75)
plt.xlabel("Category number")
plt.ylabel("Count")
plt.title("Question Category: " + str(n_components) + " categories")
plt.grid(True)
plt.show()



In [65]:
# User category
n_components = 8
gmm = mixture.GMM(n_components=n_components, covariance_type='diag')
gmm.fit(X_pos_uid)
pred_cat_uid = gmm.predict(X_pos_uid)

plt.hist(pred_cat_uid, bins=50, facecolor='g', alpha=0.75)
plt.xlabel("Category number")
plt.ylabel("Count")
plt.title("User Category: " + str(n_components) + " categories")
plt.grid(True)
plt.show()



In [66]:
from collections import Counter


users.sub_append('cat_uid', [str(x) for x in pred_cat_uid])
questions.sub_append('cat_qid', [str(x) for x in pred_cat_qid])

# Most frequent categories, used as a fallback for test records whose ids are not in the train set
most_pred_cat_uid = Counter(pred_cat_uid).most_common(1)[0][0]
most_pred_cat_qid = Counter(pred_cat_qid).most_common(1)[0][0]

print(most_pred_cat_uid)
print(most_pred_cat_qid)


4
0

In [67]:
print(users[1])
print(questions[1])


{'ave_pos_uid': 96.724899598393577, 'cat_uid': '4', 'acc_ratio_uid': 0.6465863453815262}
{'ave_pos_qid': 70.5, 'answer': 'thomas cole', 'cat_qid': '3', 'group': 'test', 'acc_ratio_qid': 0.875, 'question': "This painter's indulgence of visual fantasy, and appreciation of different historic architectural styles can be seen in his 1840 Architect's Dream. After a series of paintings on The Last of the Mohicans, he made a three year trip to Europe in 1829, but he is better known for a trip four years earlier in which he journeyed up the Hudson River to the Catskill Mountains. FTP, name this painter of The Oxbow and The Voyage of Life series.", 'category': 'Fine Arts', 'pos_token': {0: '', 1: 'painters', 2: 'indulgence', 4: 'visual', 5: 'fantasy', 68: 'this_painter', 7: 'appreciation', 64: 'mountains', 9: 'different', 10: 'historic', 11: 'architectural', 12: 'styles', 66: 'name', 77: 'series', 15: 'seen', 18: '1840', 19: 'architects', 20: 'dream', 23: 'series', 25: 'paintings', 28: 'last', 31: 'mohicans', 33: 'made', 35: 'three', 36: 'year', 37: 'trip', 65: 'ftp', 39: 'europe', 41: '1829', 71: 'oxbow', 45: 'better', 46: 'known', 76: 'life', 49: 'trip', 50: 'four', 51: 'years', 52: 'earlier', 56: 'journeyed', 59: 'hudson', 60: 'river', 74: 'voyage', 63: 'catskill'}}

B. Modeling

Select model


In [68]:
regression_keys_step1 = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_uid', 'avg_pos_qid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['sign_val', 'avg_pos'])
X_train = select(X_train, regression_keys_step1)

In [69]:
def transform(X):
    """Attach per-user and per-question accuracy ratios and GMM categories.

    Records whose uid/qid never appear in the training set fall back to the
    global mean accuracy ratio and the most frequent category.
    """
    for item in X:
        uid = int(item['uid'])
        qid = int(item['qid'])
        
        # uid
        if uid in users:
            item['acc_ratio_uid'] = users[uid]['acc_ratio_uid']
            item['cat_uid'] = users[uid]['cat_uid']
        else:
            print('Not found uid:', uid)
            acc = users.select(['acc_ratio_uid'])
            item['acc_ratio_uid'] = float(acc.mean())
            item['cat_uid'] = most_pred_cat_uid

        # qid
        if qid in questions:
            item['acc_ratio_qid'] = questions[qid]['acc_ratio_qid']
            item['cat_qid'] = questions[qid]['cat_qid']
        else:
            print('Not found qid:', qid)
            acc = questions.select(['acc_ratio_qid'])
            item['acc_ratio_qid'] = float(acc.mean())
            item['cat_qid'] = most_pred_cat_qid
        
        item['uid'] = str(uid)
        item['qid'] = str(qid)
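
For records whose uid or qid never appears in the training set, transform falls back to the global mean accuracy ratio and the most frequent GMM category. A toy sketch (uid 999999 is hypothetical and assumed unseen):

toy = [{'uid': '999999', 'qid': '1'}]
transform(toy)   # prints 'Not found uid: 999999'
# toy[0]['acc_ratio_uid'] is now the mean accuracy ratio over all training users,
# and toy[0]['cat_uid'] == most_pred_cat_uid.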

In [70]:
transform(X_train)
regression_keys_step2 = ['category', 'q_length', 'answer', 'avg_pos_uid', 'avg_pos_qid',
                   'acc_ratio_qid', 'acc_ratio_uid',
                   'cat_qid', 'cat_uid'
                  ]
X_train = select(X_train, regression_keys_step2)
X_train[1]


Out[70]:
{'acc_ratio_qid': 0.875,
 'acc_ratio_uid': 0.6465863453815262,
 'answer': 'thomas cole',
 'avg_pos_qid': 51.0,
 'avg_pos_uid': 30.973895582329316,
 'cat_qid': '3',
 'cat_uid': '4',
 'category': 'fine arts',
 'q_length': 78}

In [71]:
from sklearn.feature_extraction import DictVectorizer


vec = DictVectorizer()
X_train_dict_vec = vec.fit_transform(X_train)
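
DictVectorizer one-hot encodes the string-valued fields (category, answer, cat_uid, cat_qid) and passes numeric fields through unchanged. A minimal sketch of the idea on toy data (not the actual feature matrix):

toy_vec = DictVectorizer()
toy_X = toy_vec.fit_transform([{'category': 'fine arts', 'q_length': 78},
                               {'category': 'mathematics', 'q_length': 105}])
# toy_vec.get_feature_names() -> ['category=fine arts', 'category=mathematics', 'q_length']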

In [72]:
import multiprocessing
from sklearn import linear_model
from sklearn.cross_validation import train_test_split, cross_val_score
import math
from numpy import abs, sqrt


regressor_names = """
LinearRegression
LassoCV
ElasticNetCV
"""
print("=== Linear Cross validation RMSE scores:")
for regressor in regressor_names.split():
    scores = cross_val_score(getattr(linear_model, regressor)(normalize=True, n_jobs=multiprocessing.cpu_count()-1),
                             X_train_dict_vec, y_train,
                             cv=2,
                             scoring='mean_squared_error'
                            )
    # scoring='mean_squared_error' returns negated MSE values, so take sqrt(abs(.)) for per-fold RMSE
    print(regressor, sqrt(abs(scores)).mean())


=== Linear Cross validation RMSE scores:
LinearRegression 69.1603431086
LassoCV 69.0388663148
ElasticNetCV 77.185149749

Training and testing model


In [73]:
regression_keys_step1 = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_uid', 'avg_pos_qid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['avg_pos'])
X_train = select(X_train, regression_keys_step1)
X_test = featurize(load_buzz(), group='test', sign_val=None, extra=['avg_pos'])
regression_keys_step2 = ['category', 'q_length', 'answer', 'avg_pos_uid', 'avg_pos_qid',
                   'acc_ratio_qid', 'acc_ratio_uid',
                   'cat_qid', 'cat_uid'
                  ]
X_test = select(X_test, regression_keys_step1)

transform(X_train)
transform(X_test)
X_train = select(X_train, regression_keys_step2)
X_test = select(X_test, regression_keys_step2)


Not found qid: 103709
Not found qid: 9987
Not found qid: 113762
Not found qid: 113768
Not found qid: 108381
Not found qid: 108438
Not found qid: 113864
Not found qid: 113871
Not found qid: 113895
Not found qid: 10225
Not found qid: 108579
Not found qid: 109267
Not found qid: 108620
Not found qid: 5644
Not found uid: 381
Not found qid: 108668
Not found qid: 359
Not found qid: 114135
Not found qid: 6416
Not found qid: 5753
Not found qid: 5779
Not found qid: 10420
Not found qid: 114834
Not found qid: 114952
Not found qid: 120233
Not found qid: 115234
Not found uid: 369
Not found uid: 373
Not found uid: 381
Not found qid: 106613
Not found qid: 104241
Not found uid: 415
Not found uid: 422
Not found qid: 5999
Not found qid: 115684
Not found qid: 10645
Not found qid: 108976
Not found qid: 115775
Not found qid: 10769
Not found qid: 116015
Not found qid: 10795
Not found qid: 109128
Not found qid: 116504
Not found qid: 116539
Not found qid: 109177
Not found qid: 116608
Not found qid: 116634
Not found qid: 10930
Not found qid: 117050
Not found qid: 117168
Not found qid: 109401
Not found qid: 109425
Not found qid: 109432
Not found qid: 104702
Not found qid: 6417
Not found qid: 109569
Not found qid: 109615
Not found qid: 6511
Not found qid: 1620
Not found qid: 104844
Not found qid: 109706
Not found qid: 2073
Not found qid: 117973
Not found qid: 118039
Not found qid: 109894
Not found qid: 100711
Not found qid: 7227
Not found qid: 118830
Not found qid: 105067
Not found uid: 267
Not found uid: 268
Not found uid: 269
Not found qid: 11774
Not found qid: 119000
Not found qid: 11839
Not found qid: 105175
Not found qid: 11917
Not found qid: 119415
Not found qid: 101077
Not found qid: 12078
Not found qid: 119928
Not found qid: 119937
Not found qid: 3123
Not found qid: 120002
Not found qid: 105544
Not found qid: 3313
Not found qid: 3318
Not found qid: 12197
Not found qid: 7302
Not found qid: 110448
Not found qid: 105678
Not found qid: 120194
Not found qid: 120216
Not found qid: 101808
Not found qid: 105948
Not found qid: 106003
Not found qid: 3681
Not found uid: 397
Not found qid: 7963
Not found uid: 452
Not found uid: 309
Not found uid: 318
Not found qid: 102056
Not found qid: 106360
Not found qid: 121391
Not found qid: 3852
Not found qid: 121418
Not found qid: 8105
Not found qid: 3858
Not found qid: 110893
Not found qid: 12596
Not found uid: 397
Not found qid: 4087
Not found qid: 121630
Not found uid: 324
Not found uid: 326
Not found qid: 106548
Not found qid: 121652
Not found qid: 121688
Not found qid: 4168
Not found uid: 343
Not found qid: 12720
Not found qid: 111055
Not found qid: 8420
Not found qid: 106806
Not found qid: 121923
Not found qid: 8548
Not found qid: 121996
Not found qid: 4383
Not found qid: 122101
Not found qid: 106982
Not found qid: 102857
Not found qid: 13056
Not found qid: 13072
Not found qid: 13083
Not found qid: 102941
Not found qid: 13110
Not found qid: 8961
Not found qid: 122867
Not found qid: 122925
Not found qid: 4731
Not found qid: 9069
Not found qid: 9084
Not found qid: 13258
Not found qid: 13291
Not found qid: 123265
Not found qid: 111651
Not found qid: 123315
Not found qid: 9638
Not found qid: 4932
Not found uid: 381
Not found qid: 4968
Not found qid: 103277
Not found qid: 4223
Not found qid: 112066
Not found qid: 107687
Not found qid: 112091
Not found qid: 123734
Not found qid: 103354
Not found qid: 103397
Not found qid: 9488
Not found qid: 112475
Not found qid: 9546
Not found qid: 112621
Not found qid: 108050
Not found qid: 111097
Not found qid: 7931
Not found qid: 113182
Not found qid: 113578
Not found qid: 108246

In [74]:
X_train[1]


Out[74]:
{'acc_ratio_qid': 0.875,
 'acc_ratio_uid': 0.6465863453815262,
 'answer': 'thomas cole',
 'avg_pos_qid': 51.0,
 'avg_pos_uid': 30.973895582329316,
 'cat_qid': '3',
 'cat_uid': '4',
 'category': 'fine arts',
 'q_length': 78}

In [75]:
X_test[1]


Out[75]:
{'acc_ratio_qid': 0.6428571428571429,
 'acc_ratio_uid': 0.6712328767123288,
 'answer': 'david hilbert',
 'avg_pos_qid': 15.571428571428571,
 'avg_pos_uid': 36.31506849315068,
 'cat_qid': '3',
 'cat_uid': '4',
 'category': 'mathematics',
 'q_length': 105}

In [76]:
vec = DictVectorizer()
# Fit on train + test together so both are mapped to the same feature columns.
vec.fit(X_train + X_test)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)

In [77]:
regressor = linear_model.ElasticNetCV(n_jobs=3, normalize=True)
regressor.fit(X_train, y_train)


Out[77]:
ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=3, normalize=True,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)

In [78]:
print(regressor.coef_)
print(regressor.alpha_)


[ 14.87941141  13.65933995   7.15099351 ...,  -0.80502526  -0.53521217
   0.02611121]
0.000565901396911

In [79]:
predictions = regressor.predict(X_test)

Writing result


In [80]:
write_result(load_buzz()['test'], predictions)

This submission scores

  • 81.94899: normalize=True