Model 24: adding per-user accuracy features for each category


In [128]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from utils import load_buzz, select, write_result
from features import featurize, get_pos
from containers import Questions, Users, Categories
from nlp import extract_entities

In [ ]:
import pickle


# Load the cached containers written by earlier notebooks.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they were produced by this project's own notebooks.
# The `with` blocks close each file handle as soon as its object is read.
with open('questions01.pkl', 'rb') as f:
    questions = pickle.load(f)
with open('users01.pkl', 'rb') as f:
    users = pickle.load(f)
with open('categories01.pkl', 'rb') as f:
    categories = pickle.load(f)

In [119]:
from collections import defaultdict
from numpy import sign

def get_cat_pos_per_uid(bd):
    """Group 'position' values by user id and lower-cased question category.

    Args:
        bd: Mapping whose values are records exposing 'uid', 'qid' and
            'position' entries.

    Returns:
        A nested defaultdict where ``result[uid][category]`` is the list of
        positions recorded for that user in that category, in the iteration
        order of ``bd``.

    The category of each record is looked up in the notebook-level
    ``questions`` mapping via the record's 'qid'.
    """
    grouped = defaultdict(lambda: defaultdict(list))
    for key in bd:
        record = bd[key]
        uid = record['uid']
        category = questions[record['qid']]['category'].lower()
        grouped[uid][category].append(record['position'])
    return grouped


def cal_acc_cat_per_uid(bd):
    """Compute, per user and category, the share of positive positions.

    Args:
        bd: Passed unchanged to ``get_cat_pos_per_uid`` to obtain the
            positions grouped by user id and category.

    Returns:
        A nested defaultdict where ``result[uid][category]`` is the number
        of positions whose sign is +1 divided by the number of positions in
        that group. Missing inner keys default to 0.0.

    NOTE(review): presumably a positive position marks a correct answer and
    a negative one a wrong answer -- confirm against the data loader.
    """
    positions_by_user = get_cat_pos_per_uid(bd)
    accuracy = defaultdict(lambda: defaultdict(float))
    for uid, positions_by_cat in positions_by_user.items():
        for cat, positions in positions_by_cat.items():
            signs = sign(positions)
            # Zero positions have sign 0 and therefore do not count as hits.
            n_positive = sum(1 for s in signs if s == 1)
            accuracy[uid][cat] = n_positive / float(len(signs))
    return accuracy

def get_acc_cat_per_uid(bd):
    """Per-user, per-category accuracy with a fallback for unseen categories.

    The table is computed from ``bd`` on every call via
    ``cal_acc_cat_per_uid``, so the function does not depend on any
    intermediate notebook variable and works on a fresh kernel.

    Args:
        bd: Records passed to ``cal_acc_cat_per_uid``.

    Returns:
        The nested mapping produced by ``cal_acc_cat_per_uid(bd)`` in which
        every user present has an entry for every key of the notebook-level
        ``categories`` mapping. Categories the user has no value for are
        filled with ``categories[cat]['acc_ratio_cat']`` (presumably the
        category-wide accuracy ratio -- confirm where that field is built).
    """
    acc_cat_per_uid = cal_acc_cat_per_uid(bd)
    for uid in acc_cat_per_uid:
        per_cat = acc_cat_per_uid[uid]
        for cat in categories:
            # Keep the user's own value when present; only fill the gaps.
            if cat not in per_cat:
                per_cat[cat] = categories[cat]['acc_ratio_cat']

    return acc_cat_per_uid

In [120]:
# Pass the 'train' entry of load_buzz() to get_acc_cat_per_uid and keep the
# returned per-user, per-category table for the feature-attachment cell below.
acc_cat_per_uid = get_acc_cat_per_uid(load_buzz()['train'])

In [124]:
# Attach one 'acc_cat_<category>' entry per category to every user record.
# NOTE(review): if acc_cat_per_uid is a nested defaultdict(float), users that
# are missing from it silently receive 0.0 for every category -- confirm that
# this is the intended value for users without any recorded answers.
for uid in users:
    record = users[uid]
    for cat in categories.keys():
        feature_key = 'acc_cat_' + cat
        record[feature_key] = acc_cat_per_uid[uid][cat]

In [126]:
# Spot-check a single user record (key 100) to see the attached 'acc_cat_*' entries.
users[100]


Out[126]:
{'acc_cat_astronomy': 0.6634615384615384,
 'acc_cat_biology': 0.3333333333333333,
 'acc_cat_chemistry': 0.5,
 'acc_cat_earth science': 0.7358490566037735,
 'acc_cat_fine arts': 0.7272727272727273,
 'acc_cat_history': 0.3333333333333333,
 'acc_cat_literature': 0.9411764705882353,
 'acc_cat_mathematics': 0.65625,
 'acc_cat_other': 1.0,
 'acc_cat_physics': 0.5714285714285714,
 'acc_cat_social studies': 0.6363636363636364,
 'acc_ratio_uid': 0.6833333333333333,
 'ave_pos_uid': 80.416666666666671,
 'cat_uid': '19'}

In [127]:
import pickle

# Persist the updated notebook-level objects, one '<name>01.pkl' file each.
# NOTE: 'users01.pkl' is also the file the users were loaded from, so this
# overwrites that pickle in place.
for ii in ['users']:
    file_name = ii + '01.pkl'
    with open(file_name, 'wb') as f:
        # pickle.dump returns None, so its result is not bound to a name.
        # protocol=2 keeps the file readable from Python 2 as well.
        pickle.dump(globals()[ii], f, protocol=2)