In [13]:
import gzip
import cPickle as pickle
In [14]:
with gzip.open("../data/train.pklz", "rb") as train_file:
train_set = pickle.load(train_file)
with gzip.open("../data/test.pklz", "rb") as test_file:
test_set = pickle.load(test_file)
with gzip.open("../data/questions.pklz", "rb") as questions_file:
questions = pickle.load(questions_file)
In [15]:
print train_set[1]
print questions[1].keys()
In [16]:
X = []
Y = []
avg_time_per_user = {}
avg_time_per_que = {}
for key in train_set:
    # We only care about the positive cases at this time
    #if train_set[key]['position'] < 0:
    #    continue
    uid = train_set[key]['uid']
    qid = train_set[key]['qid']
    pos = train_set[key]['position']
    q_length = max(questions[qid]['pos_token'].keys())
    category = questions[qid]['category'].lower()
    answer = questions[qid]['answer'].lower()
    # Calculate the average response time per user (only once per new uid)
    if uid not in avg_time_per_user:
        temp = 0; num = 0
        for keysubset in train_set:
            if train_set[keysubset]['uid'] == uid:
                temp += train_set[keysubset]['position']
                num += 1
        avg_time_per_user[uid] = float(temp) / num
    # Calculate the average response time per question (only once per new qid)
    if qid not in avg_time_per_que:
        temp = 0; num = 0
        for keysubset in train_set:
            if train_set[keysubset]['qid'] == qid:
                temp += train_set[keysubset]['position']
                num += 1
        avg_time_per_que[qid] = float(temp) / num
    feat = {"uid": str(uid), "qid": str(qid), "q_length": q_length,
            "category": category, "answer": answer,
            "avg_per_uid": avg_time_per_user[uid],
            "avg_per_qid": avg_time_per_que[qid]}
    X.append(feat)
    Y.append([pos])
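The nested scans above walk the entire training set again every time a new uid or qid shows up, which is quadratic in the number of records. As a sketch (assuming the same train_set layout as above), the same averages could be built in a single pass with running sums and counts:

# One-pass sketch of the same per-user / per-question averages (not the code used above)
from collections import defaultdict
user_sum, user_cnt = defaultdict(float), defaultdict(int)
que_sum, que_cnt = defaultdict(float), defaultdict(int)
for key in train_set:
    row = train_set[key]
    user_sum[row['uid']] += row['position']; user_cnt[row['uid']] += 1
    que_sum[row['qid']] += row['position']; que_cnt[row['qid']] += 1
avg_time_per_user = {u: user_sum[u] / user_cnt[u] for u in user_sum}
avg_time_per_que = {q: que_sum[q] / que_cnt[q] for q in que_sum}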
In [17]:
print len(X)
print len(Y)
print X[0], Y[0]
In [18]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
X = vec.fit_transform(X)
print X[0]
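DictVectorizer one-hot encodes the string-valued features (uid, qid, category, answer) into binary indicator columns and passes numeric features such as q_length and the two averages through unchanged. A tiny illustration on toy dicts (not our data):

# Toy example of DictVectorizer's one-hot behaviour
from sklearn.feature_extraction import DictVectorizer
toy = [{"category": "history", "q_length": 75},
       {"category": "science", "q_length": 40}]
toy_vec = DictVectorizer()
print toy_vec.fit_transform(toy).toarray()
print toy_vec.get_feature_names()
# "category=history" and "category=science" become separate binary columns,
# while q_length stays a single numeric column.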
In [19]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.cross_validation import train_test_split, cross_val_score
import math
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
regressor = LinearRegression()
scores = cross_val_score(regressor, X, Y, cv=10, scoring='mean_squared_error')
# Flip the sign of the (negated) MSE scores and take the square root to get RMSE.
for ii in xrange(len(scores)):
    scores[ii] = math.sqrt(-1 * scores[ii])
print 'Linear Cross validation RMSE scores:', scores.mean()
print scores
regressor = Ridge()
scores = cross_val_score(regressor, X, Y, cv=10, scoring='mean_squared_error')
# Flip the sign of the (negated) MSE scores and take the square root to get RMSE.
for ii in xrange(len(scores)):
    scores[ii] = math.sqrt(-1 * scores[ii])
print 'Ridge Cross validation RMSE scores:', scores.mean()
print scores
regressor = Lasso()
scores = cross_val_score(regressor, X, Y, cv=10, scoring='mean_squared_error')
# Flip the sign of the (negated) MSE scores and take the square root to get RMSE.
for ii in xrange(len(scores)):
    scores[ii] = math.sqrt(-1 * scores[ii])
print 'Lasso Cross validation RMSE scores:', scores.mean()
print scores
regressor = ElasticNet()
scores = cross_val_score(regressor, X, Y, cv=10, scoring='mean_squared_error')
# Flip the sign of the (negated) MSE scores and take the square root to get RMSE.
for ii in xrange(len(scores)):
    scores[ii] = math.sqrt(-1 * scores[ii])
print 'ElasticNet Cross validation RMSE scores:', scores.mean()
print scores
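The four blocks above repeat the same RMSE computation. A compact sketch of the same comparison (same data, same 10-fold CV, assuming this version of scikit-learn, where the 'mean_squared_error' scorer returns negated values):

# Sketch: loop over the regressors instead of copy-pasting the CV block
import numpy as np

def cv_rmse(model, X, Y, folds=10):
    # The scorer returns negated MSE; flip the sign and take the square root.
    mse = cross_val_score(model, X, Y, cv=folds, scoring='mean_squared_error')
    return np.sqrt(-mse)

for name, model in [('Linear', LinearRegression()), ('Ridge', Ridge()),
                    ('Lasso', Lasso()), ('ElasticNet', ElasticNet())]:
    print name, 'cross validation RMSE:', cv_rmse(model, X, Y).mean()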
In [20]:
# Sanity check: concatenating two lists and slicing at len(a) recovers the
# original pieces. The same trick is used below to vectorize train and test together.
a = [{1: 2}, {2: 3}]
b = [{3: 2}, {4: 3}]
c = a + b
print c[:len(a)]
print c[len(a):]
In [21]:
X_train = []
Y_train = []
for key in train_set:
    # We only care about the positive cases at this time
    #if train_set[key]['position'] < 0:
    #    continue
    uid = train_set[key]['uid']
    qid = train_set[key]['qid']
    pos = train_set[key]['position']
    q_length = max(questions[qid]['pos_token'].keys())
    category = questions[qid]['category'].lower()
    answer = questions[qid]['answer'].lower()
    feat = {"uid": str(uid), "qid": str(qid), "q_length": q_length,
            "category": category, "answer": answer}
    X_train.append(feat)
    Y_train.append(pos)
X_test = []
Y_test = []
for key in test_set:
    uid = test_set[key]['uid']
    qid = test_set[key]['qid']
    q_length = max(questions[qid]['pos_token'].keys())
    category = questions[qid]['category'].lower()
    answer = questions[qid]['answer'].lower()
    feat = {"uid": str(uid), "qid": str(qid), "q_length": q_length,
            "category": category, "answer": answer}
    X_test.append(feat)
    # Keep the test record id so we can pair it with its prediction later
    Y_test.append(key)
print "Before transform: ", len(X_test)
X_train_length = len(X_train)
# Vectorize train and test together so both matrices share the same columns,
# then split them back apart at the original boundary.
X = vec.fit_transform(X_train + X_test)
X_train = X[:X_train_length]
X_test = X[X_train_length:]
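An equivalent sketch fits the vectorizer on the training dicts only and reuses the learned mapping for the test dicts; feature values never seen in training simply get all-zero columns. Here train_dicts and test_dicts are hypothetical names for the raw lists of feature dicts before vectorization:

vec = DictVectorizer()
X_train = vec.fit_transform(train_dicts)  # train_dicts: hypothetical name for the list of train feature dicts
X_test = vec.transform(test_dicts)        # test_dicts: hypothetical name for the list of test feature dicts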
In [28]:
regressor = Ridge()
regressor.fit(X_train, Y_train)
Out[28]:
In [29]:
predictions = regressor.predict(X_test)
predictions = sorted([[key, predictions[index]] for index, key in enumerate(Y_test)])
print len(predictions)
predictions[:5]
Out[29]:
In [30]:
import csv
predictions.insert(0, ["id", "position"])
with open('guess.csv', 'wb') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerows(predictions)
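A quick sanity check of the file we just wrote (reads back the header and the first couple of rows; assumes guess.csv sits in the working directory):

# Print the header row plus the first two prediction rows
with open('guess.csv', 'rb') as fp:
    for row in list(csv.reader(fp))[:3]:
        print row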
All right. Let's submit!