This notebook is a code-cleanup pass only: model06 produces exactly the same results as model05. However, this version should be a better starting point for the next steps than models 01–05.
In [138]:
import gzip
import cPickle as pickle
# Load the gzipped, pickled train/test/question data produced upstream.
# NOTE(review): pickle.load can execute arbitrary code on untrusted files —
# these are assumed to be locally produced artifacts.
with gzip.open("../data/train.pklz", "rb") as train_file:
train_set = pickle.load(train_file)
with gzip.open("../data/test.pklz", "rb") as test_file:
test_set = pickle.load(test_file)
with gzip.open("../data/questions.pklz", "rb") as questions_file:
questions = pickle.load(questions_file)
In [139]:
# Peek at one record from each data set to see the schema.
print "* train_set:", train_set[1]
print "* test_set:", test_set[7]
print "* question keys:", questions[1].keys()
# Bare trailing expression: the tuple is rendered as this cell's Out[] value.
"* question contents:", questions[1]
Out[139]:
In [140]:
from collections import defaultdict
def get_avg_pos(data):
    """Calculate the average position (response time) per user and question.

    Args:
        data: dict mapping record key -> dict with at least
            'uid', 'qid' and 'position' entries.

    Returns:
        [avg_pos_uid, avg_pos_qid]: two dicts mapping uid -> mean position
        and qid -> mean position.
    """
    pos_uid = defaultdict(list)
    pos_qid = defaultdict(list)
    for key in data:
        record = data[key]
        pos_uid[record['uid']].append(record['position'])
        pos_qid[record['qid']].append(record['position'])
    # Divide by float(len(...)): under Python 2, sum/len on ints truncates,
    # silently degrading the averages. featurize() already uses float
    # division for the same statistic — this makes the two consistent.
    avg_pos_uid = {}
    avg_pos_qid = {}
    for uid in pos_uid:
        avg_pos_uid[uid] = sum(pos_uid[uid]) / float(len(pos_uid[uid]))
    for qid in pos_qid:
        avg_pos_qid[qid] = sum(pos_qid[qid]) / float(len(pos_qid[qid]))
    return [avg_pos_uid, avg_pos_qid]
def featurize(data, avg_pos, questions=None):
    """Make one feature dict per record in `data`.

    Args:
        data: dict mapping record key -> dict with 'uid' and 'qid'.
        avg_pos: [avg_pos_uid, avg_pos_qid] as returned by get_avg_pos().
        questions: optional qid -> metadata dict ({'pos_token', 'category',
            'answer', ...}); defaults to the notebook-global `questions`
            for backward compatibility.

    Returns:
        List of feature dicts suitable for DictVectorizer.
    """
    if questions is None:
        # Backward compatible: fall back to the notebook-global dict.
        questions = globals()['questions']
    avg_pos_uid, avg_pos_qid = avg_pos
    # Hoist the fallback averages out of the loop: they are loop-invariant
    # and each is an O(n) reduction over all users/questions, so computing
    # them per record was accidentally quadratic.
    global_avg_uid = (sum(avg_pos_uid.values()) / float(len(avg_pos_uid))
                      if avg_pos_uid else 0.0)
    global_avg_qid = (sum(avg_pos_qid.values()) / float(len(avg_pos_qid))
                      if avg_pos_qid else 0.0)
    X = []
    for key in data:
        uid = data[key]['uid']
        qid = data[key]['qid']
        question = questions[qid]
        # Question length = largest token offset recorded for the question.
        q_length = max(question['pos_token'].keys())
        feat = {"uid": str(uid),
                "qid": str(qid),
                "q_length": q_length,
                "category": question['category'].lower(),
                "answer": question['answer'].lower(),
                # Unseen uid/qid falls back to the global mean position.
                "avg_pos_uid": avg_pos_uid.get(uid, global_avg_uid),
                "avg_pos_qid": avg_pos_qid.get(qid, global_avg_qid)
                }
        X.append(feat)
    return X
def get_positions(data):
    """Collect each record's 'position' as a one-element list.

    Returns a list of single-element lists (one row per record), so the
    result can be used directly as a regression target matrix.
    """
    return [[data[key]['position']] for key in data]
Let's look at the resulting feature vectors.
In [141]:
# Build feature dicts and regression targets from the training set.
X_train = featurize(train_set, get_avg_pos(train_set))
Y_train = get_positions(train_set)
# Sanity check: one feature dict per target row.
print len(X_train)
print len(Y_train)
print X_train[0], Y_train[0]
In [142]:
from sklearn.feature_extraction import DictVectorizer
# One-hot encode the string features (uid, qid, category, answer);
# numeric features pass through unchanged. Produces a sparse matrix.
vec = DictVectorizer()
X_train = vec.fit_transform(X_train)
print X_train[0]
In [143]:
from sklearn import linear_model
# NOTE(review): sklearn.cross_validation was deprecated and later removed;
# in modern scikit-learn these names live in sklearn.model_selection.
from sklearn.cross_validation import train_test_split, cross_val_score
import math
from numpy import abs, sqrt
# Candidate regressors, looked up by name on sklearn.linear_model below.
regressor_names = """
LinearRegression
Ridge
Lasso
ElasticNet
"""
print "=== Linear Cross validation RMSE scores:"
for regressor in regressor_names.split():
# The 'mean_squared_error' scorer reports negated MSE (higher is better),
# hence abs() before sqrt() to get an RMSE.
scores = cross_val_score(getattr(linear_model, regressor)(), X_train, Y_train, cv=10,\
scoring='mean_squared_error')
print regressor, sqrt(abs(scores)).mean()
In [149]:
# Re-featurize both sets using averages computed from TRAIN only,
# so no test-set position information leaks into the features.
X_train = featurize(train_set, get_avg_pos(train_set))
X_test = featurize(test_set, get_avg_pos(train_set))
for x in X_test[:10]:
print x
# Vectorize train+test together so both share one column space,
# then slice the sparse matrix back apart by row count.
X_train_length = len(X_train)
X = vec.fit_transform(X_train + X_test)
X_train = X[:X_train_length]
X_test = X[X_train_length:]
In [150]:
# Fit the final model. `Lasso` is never imported as a bare name in this
# notebook — only `linear_model` is — so qualify it to avoid a NameError
# on a fresh kernel (Restart & Run All).
regressor = linear_model.Lasso()
regressor.fit(X_train, Y_train)
Out[150]:
In [133]:
predictions = regressor.predict(X_test)
predictions = sorted([[id, predictions[index]] for index, id in enumerate(test_set.keys())])
print len(predictions)
predictions[:5]
Out[133]:
In [134]:
import csv
# Prepend the header row required by the submission format.
predictions.insert(0,["id", "position"])
# 'wb' is correct for the csv module on Python 2; on Python 3 this would
# need open('guess.csv', 'w', newline='') instead.
with open('guess.csv', 'wb') as fp:
writer = csv.writer(fp, delimiter=',')
writer.writerows(predictions)
The resulting submission scores 85.85977.