Model26: adding average position for categories per uid


In [178]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from utils import load_buzz, select, write_result
from features import featurize, get_pos
from containers import Questions, Users, Categories
from nlp import extract_entities

In [179]:
import pickle


# Load the cached feature tables produced by earlier notebooks.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that were produced by this project.
# Context managers make sure each file handle is closed right after reading
# instead of being left open until garbage collection.
with open('questions01.pkl', 'rb') as f:
    questions = pickle.load(f)
with open('users01.pkl', 'rb') as f:
    users = pickle.load(f)
with open('categories01.pkl', 'rb') as f:
    categories = pickle.load(f)

In [180]:
from collections import defaultdict
from numpy import sign

def get_cat_pos_per_uid(bd, question_lookup=None):
    """Group buzz positions by user and by question category.

    Parameters
    ----------
    bd : mapping
        Buzz records; every value must provide the keys 'uid', 'qid' and
        'position'.
    question_lookup : mapping, optional
        Maps a qid to a record with a 'category' string. When omitted, the
        notebook-level ``questions`` table is used, which keeps existing
        one-argument calls working while letting the function run without
        relying on kernel state.

    Returns
    -------
    defaultdict
        ``result[uid][category]`` is the list of positions recorded for that
        user in that category, in the iteration order of ``bd``. Category
        names are lower-cased.
    """
    if question_lookup is None:
        # Fall back to the table loaded at the top of the notebook.
        question_lookup = questions

    cat_pos_uid = defaultdict(lambda: defaultdict(list))
    for record in bd.values():
        # Lower-case the category so it lines up with lower-cased category keys.
        cat = question_lookup[record['qid']]['category'].lower()
        cat_pos_uid[record['uid']][cat].append(record['position'])
    return cat_pos_uid


def cal_avg_pos_cat_per_uid(bd):
    """Average the buzz positions of every user within each category.

    The positions are first grouped with ``get_cat_pos_per_uid(bd)``; each
    group is then reduced to its arithmetic mean.

    Returns a nested defaultdict where ``result[uid][category]`` is the mean
    position (a float) for the categories that user has records in.
    """
    positions_by_uid = get_cat_pos_per_uid(bd)
    means = defaultdict(lambda: defaultdict(float))
    for user_id, per_category in positions_by_uid.items():
        for category, pos_list in per_category.items():
            # float() keeps the division true division under Python 2 as well.
            means[user_id][category] = sum(pos_list) / float(len(pos_list))
    return means

def get_avg_pos_cat_per_uid(bd):
    """Return per-user category averages with every category filled in.

    Starts from ``cal_avg_pos_cat_per_uid(bd)`` and, for each user present in
    that result, fills any category the user has no average for with the
    category-wide value ``categories[cat]['ave_pos_cat']``.

    Only users that appear in ``bd`` receive entries.
    """
    observed = cal_avg_pos_cat_per_uid(bd)
    # Shallow copy: the outer mapping is new, the per-user mappings are shared.
    filled = observed.copy()
    for per_category in filled.values():
        for cat_name in categories:
            if cat_name in per_category:
                continue
            # No personal average for this category: use the category-wide one.
            per_category[cat_name] = categories[cat_name]['ave_pos_cat']
    return filled

In [181]:
# Build the per-user, per-category average-position table from the 'train'
# split returned by load_buzz().
avg_pos_cat_per_uid = get_avg_pos_cat_per_uid(load_buzz()['train'])

In [183]:
# Spot-check: show the per-category averages stored for uid 1.
avg_pos_cat_per_uid[1]


Out[183]:
defaultdict(<class 'float'>, {'fine arts': 39.054794520547944, 'literature': 51.4812030075188, 'chemistry': 22.47826086956522, 'other': 59.333333333333336, 'earth science': 110.0, 'mathematics': 7.333333333333333, 'astronomy': -76.66666666666667, 'biology': 4.423076923076923, 'social studies': 14.974137931034482, 'physics': 40.25, 'history': 20.676056338028168})

In [184]:
# Attach one 'avg_pos_cat_<category>' feature per category to every user.
#
# avg_pos_cat_per_uid only has entries for uids seen in the training buzzes.
# Indexing the nested defaultdict with any other uid would silently return 0.0
# for every category and insert an empty entry into the table. Use .get() and
# fall back to the category-wide average instead, which is the same policy
# get_avg_pos_cat_per_uid applies to categories a known user never answered.
# NOTE(review): whether `users` actually contains uids missing from the train
# split is not visible here; for uids that are present the result is unchanged.
for uid in users:
    user_avgs = avg_pos_cat_per_uid.get(uid, {})
    for cat in categories.keys():
        if cat in user_avgs:
            avg_pos = user_avgs[cat]
        else:
            avg_pos = categories[cat]['ave_pos_cat']
        users[uid]['avg_pos_cat_' + cat] = avg_pos

In [185]:
# Spot-check: show one user record, including its 'avg_pos_cat_*' entries.
users[125]


Out[185]:
{'acc_cat_astronomy': 0.6634615384615384,
 'acc_cat_biology': 0.0,
 'acc_cat_chemistry': 0.687099725526075,
 'acc_cat_earth science': 0.7358490566037735,
 'acc_cat_fine arts': 0.3333333333333333,
 'acc_cat_history': 0.5,
 'acc_cat_literature': 0.5555555555555556,
 'acc_cat_mathematics': 0.65625,
 'acc_cat_other': 0.7508305647840532,
 'acc_cat_physics': 0.0,
 'acc_cat_social studies': 0.45454545454545453,
 'acc_ratio_uid': 0.4523809523809524,
 'ave_pos_uid': 62.928571428571431,
 'avg_pos_cat_astronomy': 24.798076923076923,
 'avg_pos_cat_biology': -56.0,
 'avg_pos_cat_chemistry': 28.218664226898444,
 'avg_pos_cat_earth science': 36.509433962264154,
 'avg_pos_cat_fine arts': -3.0,
 'avg_pos_cat_history': 7.25,
 'avg_pos_cat_literature': 25.444444444444443,
 'avg_pos_cat_mathematics': 18.712499999999999,
 'avg_pos_cat_other': 42.993355481727576,
 'avg_pos_cat_physics': -33.5,
 'avg_pos_cat_social studies': -4.090909090909091,
 'cat_uid': '19'}

In [186]:
import pickle

# Persist the enriched `users` table. The target name is '<table>01.pkl', i.e.
# 'users01.pkl' -- the same file `users` was loaded from, so it is overwritten.
# pickle.dump returns None, so its result is deliberately not bound to a name
# (the previous `nes = ...` only created a junk global set to None).
for ii in ['users']:
    file_name = ii + '01.pkl'
    with open(file_name, 'wb') as f:
        # protocol=2 is the newest protocol that Python 2 can also read.
        pickle.dump(globals()[ii], f, protocol=2)

In [192]:
# Spot-check: show the aggregate values stored for the 'other' category.
categories['other']


Out[192]:
{'acc_ratio_cat': 0.7508305647840532, 'ave_pos_cat': 42.993355481727576}

In [196]:
# Build `questions_new`: every question record plus two category-level
# features looked up from `categories` by the lower-cased category name.
#
# A plain `questions.copy()` is shallow, so adding keys to its records would
# also modify the records inside `questions`. Copy each record before adding
# keys so the source table is left untouched and this cell can be re-run
# safely. Nested values (lists/dicts inside a record) are still shared, which
# is fine because only top-level keys are added here.
questions_new = {}
for qid in questions:
    record = dict(questions[qid])
    cat = record['category'].lower()
    record['q_acc_ratio_cat'] = categories[cat]['acc_ratio_cat']
    record['q_ave_pos_cat'] = categories[cat]['ave_pos_cat']
    questions_new[qid] = record

In [197]:
# Spot-check: show question 1, including 'q_acc_ratio_cat' and 'q_ave_pos_cat'.
questions_new[1]


Out[197]:
{'acc_ratio_qid': 0.875,
 'answer': 'thomas cole',
 'ave_pos_qid': 70.5,
 'cat_qid': '11',
 'category': 'Fine Arts',
 'group': 'test',
 'ne_count': 12,
 'ne_mean': 56.5,
 'ne_median': 60.5,
 'ne_mod': 65,
 'ne_nor_mean': 0.7243589743589743,
 'ne_tags': [['CD', '1840', 20],
  ['PERSON', 'Architect', 21],
  ['CD', 'three', 40],
  ['GPE', 'Europe', 44],
  ['CD', '1829', 46],
  ['CD', 'four', 56],
  ['ORGANIZATION', 'Hudson', 65],
  ['ORGANIZATION', 'Catskill', 69],
  ['ORGANIZATION', 'FTP', 72],
  ['ORGANIZATION', 'Oxbow', 79],
  ['ORGANIZATION', 'Voyage', 82],
  ['GPE', 'Life', 84]],
 'pos_token': {0: '',
  1: 'painters',
  2: 'indulgence',
  4: 'visual',
  5: 'fantasy',
  7: 'appreciation',
  9: 'different',
  10: 'historic',
  11: 'architectural',
  12: 'styles',
  15: 'seen',
  18: '1840',
  19: 'architects',
  20: 'dream',
  23: 'series',
  25: 'paintings',
  28: 'last',
  31: 'mohicans',
  33: 'made',
  35: 'three',
  36: 'year',
  37: 'trip',
  39: 'europe',
  41: '1829',
  45: 'better',
  46: 'known',
  49: 'trip',
  50: 'four',
  51: 'years',
  52: 'earlier',
  56: 'journeyed',
  59: 'hudson',
  60: 'river',
  63: 'catskill',
  64: 'mountains',
  65: 'ftp',
  66: 'name',
  68: 'this_painter',
  71: 'oxbow',
  74: 'voyage',
  76: 'life',
  77: 'series'},
 'q_acc_ratio_cat': 0.7560975609756098,
 'q_ave_pos_cat': 43.640185830429736,
 'question': "This painter's indulgence of visual fantasy, and appreciation of different historic architectural styles can be seen in his 1840 Architect's Dream. After a series of paintings on The Last of the Mohicans, he made a three year trip to Europe in 1829, but he is better known for a trip four years earlier in which he journeyed up the Hudson River to the Catskill Mountains. FTP, name this painter of The Oxbow and The Voyage of Life series."}

In [ ]: