In [1]:
import gzip
import cPickle as pickle
In [6]:
def _load_gzipped_pickle(path):
    """Return the object stored in the gzip-compressed pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with gzip.open(path, "rb") as handle:
        return pickle.load(handle)


# Load the three data files used throughout the notebook.
train_set = _load_gzipped_pickle("../data/train.pklz")
test_set = _load_gzipped_pickle("../data/test.pklz")
questions = _load_gzipped_pickle("../data/questions.pklz")
Let's take a look at the loaded data sets.
In [8]:
# Report how many entries each loaded collection holds.
for label, dataset in (("train_set: ", train_set),
                       ("test_set: ", test_set),
                       ("questions: ", questions)):
    print("%s %d" % (label, len(dataset)))
In [21]:
# Peek at the training set: its ten smallest keys, then one record and that record's keys.
print(sorted(train_set)[:10])
train_example = train_set[1]
print(train_example)
print(train_example.keys())
In [24]:
# Peek at the test set: its ten smallest keys, then one record and that record's keys.
print(sorted(test_set)[:10])
test_example = test_set[7]
print(test_example)
print(test_example.keys())
In [25]:
# Peek at the questions: the ten smallest keys, then one record and that record's keys.
print(sorted(questions)[:10])
question_example = questions[1]
print(question_example)
print(question_example.keys())
In [43]:
# Build the training matrices: one [uid, qid, q_length] feature row and one
# [position] target row per training example with a non-negative position.
X_train = []
Y_train = []
for key in train_set:
    example = train_set[key]
    pos = example['position']
    # We only care about the positive case at this time.
    if pos < 0:
        continue
    qid = example['qid']
    # q_length is the largest key of the question's 'pos_token' mapping
    # (presumably the index of its last token -- verify against the data).
    q_length = max(questions[qid]['pos_token'])
    X_train.append([example['uid'], qid, q_length])
    Y_train.append([pos])
In [44]:
# Feature rows and target rows are appended together, so the two counts should match.
for rows in (X_train, Y_train):
    print(len(rows))
In [45]:
# Show the first feature row next to its target row.
print("%s %s" % (X_train[0], Y_train[0]))
In [77]:
# Fit a linear regression from the [uid, qid, q_length] feature rows in X_train
# to the one-element [position] targets in Y_train.
# NOTE(review): uid and qid look like identifiers being used as numeric
# features -- confirm this is intended.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Left as a bare expression so the notebook displays the fitted estimator.
model.fit(X_train, Y_train)
Out[77]:
Let's build the test set for prediction.
In [78]:
# Build the test feature rows in the same [uid, qid, q_length] layout used for
# training, remembering each example's key so predictions can be matched back.
X_test = []
test_id = []
for key in test_set:
    example = test_set[key]
    qid = example['qid']
    # Largest key of the question's 'pos_token' mapping, as in the training features.
    q_length = max(questions[qid]['pos_token'])
    test_id.append(key)
    X_test.append([example['uid'], qid, q_length])
In [81]:
# Predict a position for every test example and pair it with the example's key.
# The raw model output keeps its own name so `predictions` always refers to
# the sorted list of [key, value] pairs.
raw_predictions = model.predict(X_test)
# test_id and X_test were filled in lockstep, so zip() pairs each key with its
# own prediction row. Each row is a one-element sequence because the model was
# fit on one-element targets. A generator expression is used so no loop
# variable leaks into the notebook namespace (the original list comprehension
# rebound the builtin `id` under Python 2).
predictions = sorted(
    [test_key, row[0]] for test_key, row in zip(test_id, raw_predictions)
)
print(len(predictions))
predictions[:5]
Out[81]:
In [82]:
import csv

# Write the submission file: a header row followed by one [id, position] row
# per prediction. The header is written directly instead of being inserted
# into `predictions`, so re-running this cell neither duplicates the header in
# the file nor leaves a non-numeric row inside `predictions`.
# 'wb' is the mode the Python 2 csv module expects for output files.
with open('guess.csv', 'wb') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(["id", "position"])
    writer.writerows(predictions)
All right. Let's submit!
And... we ranked 5th. That's worse than the first submission. Let's think about why.
5 new CU_K-ml_Stars 97.07613 1 Sun, 05 Apr 2015 23:15:52
In [ ]: