Model08: Refactoring functions

In this model, we refactor all the functions we have used so far, especially the functions related to feature extraction. This is purely a refactoring, so it does not improve prediction accuracy.

Just read what has been changed.

A. Functions

There are four different groups of functions.

  • Data reader: Read data from file.
  • Feature functions (private): Functions that extract features are placed here. If you write a new feature function, add it to this group.
  • Feature function(public): We can use only this function for feature extraction.
  • Utility functions: All functions other than those mentioned above should be placed here.

Data reader


In [218]:
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign


def load_buzz(root='../data', data=('train', 'test', 'questions'), format='pklz'):
    """Load buzz data sets from gzip-compressed pickle files into a dictionary.

    Each name in ``data`` is read from ``<root>/<name>.<format>``, so pass a
    subset of names to load only what you need.

    Args:
        root: Directory that contains the data files.
        data: Iterable of data set names to load. The default is an
            immutable tuple so it cannot be modified between calls.
        format: File extension (without the leading dot) of the data files.

    Returns:
        A dict mapping each requested name to the object unpickled from
        its file.
    """
    buzz_data = {}
    for name in data:
        file_path = path.join(root, name + "." + format)
        with gzip.open(file_path, "rb") as fp:
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            buzz_data[name] = pickle.load(fp)

    return buzz_data

Feature functions(private)


In [219]:
def _feat_basic(bd, group):
    X = []
    for item in bd[group].items():
        qid = item[1]['qid']
        q = bd['questions'][qid]
        item[1]['q_length'] = max(q['pos_token'].keys())
        item[1]['category'] = q['category'].lower()
        item[1]['answer'] = q['answer'].lower()
        X.append(item[1])
        
    return X
        
        
def _feat_sign_val(data):
    for item in data:
        item['sign_val'] = sign(item['position'])

        
def _feat_avg_pos(data, sign_val):
    unwanted_index = []
    pos_uid = defaultdict(list)
    pos_qid = defaultdict(list)
    
    for index, item in enumerate(data):
        if sign_val and sign(item['position']) != sign_val:
            unwanted_index.append(index)
        else:
            pos_uid[item['uid']].append(item['position'])
            pos_qid[item['qid']].append(item['position'])

    avg_pos_uid = {}
    avg_pos_qid = {}

    for key in pos_uid:
        avg_pos_uid[key] = sum(pos_uid[key]) / len(pos_uid[key])

    for key in pos_qid:
        avg_pos_qid[key] = sum(pos_qid[key]) / len(pos_qid[key])
    
    for index in sorted(unwanted_index, reverse=True):
        del data[index]
    
    for item in data:
        item['avg_pos_uid'] = avg_pos_uid[item['uid']]
        item['avg_pos_qid'] = avg_pos_qid[item['qid']]

        
def _feat_train(bd, sign_val=None, extra=None):
    """Build the training feature dicts and the matching target list.

    Starts from the basic features of the ``train`` group, then applies each
    extra feature function named in ``extra`` (the ``_feat_`` prefix is added
    here and the function is looked up in the module globals). Only
    ``_feat_avg_pos`` receives ``sign_val``.

    Returns:
        ``(X, y)`` where ``y`` holds each record's ``position`` and that key
        has been removed from the corresponding dict in ``X``.
    """
    # Basic features: qid, uid, position, q_length, category, answer.
    X = _feat_basic(bd, group='train')

    # Extra feature functions that take the sign_val keyword.
    takes_sign_val = ('_feat_avg_pos',)
    for short_name in extra or ():
        func_name = '_feat_' + short_name
        feature_func = globals()[func_name]
        if func_name in takes_sign_val:
            feature_func(X, sign_val=sign_val)
        else:
            feature_func(X)

    # Split the target off the feature dicts.
    y = [item.pop('position') for item in X]

    return X, y

Feature function(public)


In [220]:
def featurize(bd, group='train', sign_val=None, extra=None):
    """Extract features for the requested group of ``bd``.

    For ``'train'`` this returns the ``(X, y)`` pair from ``_feat_train``;
    for ``'test'`` it returns only the basic feature dicts (``sign_val`` and
    ``extra`` are ignored). Any other group raises ``ValueError``.
    """
    if group == 'test':
        return _feat_basic(bd, group='test')
    if group == 'train':
        return _feat_train(bd, sign_val=sign_val, extra=extra)
    raise ValueError(group, 'is not the proper type')

Utility functions


In [221]:
import csv


def select(data, keys):
    """Strip every key not listed in ``keys`` from each dict of ``data``.

    The dicts are modified in place and the same list is returned. The keys
    to drop are computed per item, so an empty list is returned unchanged
    and items with differing key sets are handled without ``KeyError``.

    Args:
        data: List of dicts to filter.
        keys: Iterable of keys to keep.

    Returns:
        ``data`` itself, with each dict reduced to the wanted keys.
    """
    wanted = set(keys)
    for item in data:
        # ``keys() - wanted`` builds a new set, so deleting while looping is safe.
        for unwanted_key in item.keys() - wanted:
            del item[unwanted_key]
    return data


def write_result(test_set, predictions, file_name='guess.csv'):
    """Write predictions to a CSV file with an ``id,position`` header.

    Args:
        test_set: Mapping whose keys are the example ids; the i-th key (in
            iteration order) is paired with ``predictions[i]``.
        predictions: Indexable sequence of predicted positions.
        file_name: Path of the CSV file to create or overwrite.

    Rows are written sorted by id.
    """
    rows = sorted(
        [example_id, predictions[index]]
        for index, example_id in enumerate(test_set.keys())
    )
    # newline="" lets the csv module control line endings; without it,
    # platforms that translate "\n" produce blank lines between rows.
    with open(file_name, "w", newline="") as fp:
        writer = csv.writer(fp, delimiter=',')
        writer.writerow(["id", "position"])
        writer.writerows(rows)

B. Modeling

Select model


In [222]:
import multiprocessing
from sklearn import linear_model
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction import DictVectorizer
import math
from numpy import abs, sqrt


# Compare several linear regressors by 10-fold cross-validated RMSE.
bd = load_buzz()
X_train, y_train = featurize(bd, group='train', sign_val=None, extra=['sign_val', 'avg_pos'])
regression_keys = ['category', 'q_length', 'sign_val', 'qid', 'uid', 'answer']
X_train = select(X_train, regression_keys)
X_test = featurize(bd, group='test')

vec = DictVectorizer()
X_train = vec.fit_transform(X_train)

regressor_names = """
LinearRegression
Ridge
Lasso
ElasticNet
"""

# Leave one core free, but never ask for zero workers: on a single-core
# machine cpu_count() - 1 is 0, which is not a valid n_jobs value.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# NOTE(review): the 'mean_squared_error' scoring name (and the
# sklearn.cross_validation module this cell imports from) only exist in old
# scikit-learn releases; newer ones use 'neg_mean_squared_error' and
# sklearn.model_selection. Confirm the installed version before upgrading.
print ("=== Linear Cross validation RMSE scores:")
for regressor in regressor_names.split():
    scores = cross_val_score(getattr(linear_model, regressor)(),
                             X_train, y_train,
                             cv=10,
                             scoring='mean_squared_error',
                             n_jobs=n_workers
                            )
    # Scores may be negated MSE values, hence abs() before the square root.
    print (regressor, sqrt(abs(scores)).mean())


=== Linear Cross validation RMSE scores:
LinearRegression 35.3976636587
Ridge 85.4215957267
Lasso 34.8350153225
ElasticNet 46.015917262

Training and testing model


In [223]:
# Reload the data: featurize() modifies the records of bd in place (it adds
# feature keys and removes 'position' from the training records), so the
# dictionary left over from the previous cell is no longer pristine.
bd = load_buzz()
X_train, y_train = featurize(bd, group='train', sign_val=None, extra=['sign_val'])
regression_keys = ['category', 'q_length', 'sign_val', 'qid', 'uid', 'answer']
X_train = select(X_train, regression_keys)
# NOTE(review): 'sign_val' is computed from the training 'position' (the
# target) and is never added for the test group, so the test records lack
# this feature -- confirm this is intended.
X_test = featurize(bd, group='test')
X_test = select(X_test, regression_keys)

# Fit the vectorizer on train and test records together so that both are
# transformed with the same feature vocabulary.
vec = DictVectorizer()
vec.fit(X_train + X_test)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)

regressor = linear_model.LassoCV()
regressor.fit(X_train, y_train)


Out[223]:
LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [224]:
# Inspect the fitted model: its coefficient vector and its alpha_ attribute.
print(regressor.coef_)
print(regressor.alpha_)


[  0.00000000e+00   0.00000000e+00  -0.00000000e+00 ...,   5.00200833e-05
   0.00000000e+00  -1.92318230e-02]
146.866398809

In [225]:
# Predict the target for the vectorized test records.
predictions = regressor.predict(X_test)

Writing result


In [226]:
# Pair the predictions with the ids of bd['test'] and write them to the
# default output file ('guess.csv').
write_result(bd['test'], predictions)

This submission scores 84.38453 on Kaggle.