In [23]:
import gzip
import cPickle as pickle
In [24]:
def _load_gzipped_pickle(path):
    """Return the object unpickled from the gzip-compressed file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with gzip.open(path, "rb") as handle:
        return pickle.load(handle)


# Each dataset lives in its own gzip-compressed pickle under ../data.
train_set = _load_gzipped_pickle("../data/train.pklz")
test_set = _load_gzipped_pickle("../data/test.pklz")
questions = _load_gzipped_pickle("../data/questions.pklz")
In [25]:
# Peek at the data: entry 1 of the training set, and the field names
# available on entry 1 of the questions.
print(train_set[1])
print(questions[1].keys())
In [26]:
# Build one feature dict (X) and one single-element target list (Y) for every
# training record whose 'position' is not negative.
X, Y = [], []
for key in train_set:
    record = train_set[key]
    pos = record['position']
    if pos < 0:
        # We only care about positive case at this time
        continue
    qid = record['qid']
    question = questions[qid]
    X.append({
        "uid": str(record['uid']),
        "qid": str(qid),
        # Largest key of pos_token; presumably the index of the last
        # token in the question -- TODO confirm.
        "q_length": max(question['pos_token'].keys()),
        "category": question['category'].lower(),
        "answer": question['answer'].lower(),
    })
    # The target is wrapped in a list, so Y is a list of one-element lists.
    Y.append([pos])
In [27]:
# Sanity check: row counts of features and targets, then the first pair.
print(len(X))
print(len(Y))
print("%s %s" % (X[0], Y[0]))
In [28]:
from sklearn.feature_extraction import DictVectorizer
# Turn the list of feature dicts into the matrix form consumed by the
# estimators in the following cells.
vec = DictVectorizer()
vec.fit(X)
# NOTE: X is rebound from the list of dicts to the vectorized matrix, so
# re-running this cell on its own would feed the matrix back into the
# vectorizer; rebuild X first.
X = vec.transform(X)
In [29]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.cross_validation import train_test_split, cross_val_score
# NOTE: this hold-out split is not used by the cross-validation in this cell,
# which scores every model on the full X, Y.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Score each candidate linear model with 10-fold cross-validation and print
# the mean of the fold scores followed by the individual fold scores.
for regressor in (LinearRegression(), Ridge(), Lasso(), ElasticNet()):
    scores = cross_val_score(regressor, X, Y, cv=10)
    print('%s %s' % ('Cross validation r-squared scores:', scores.mean()))
    print(scores)
In [30]:
# Scratch check: a concatenated list can be split back into its two parts by
# slicing at the length of the first part.
a = [{1: 2}, {2: 3}]
b = [{3: 2}, {4: 3}]
c = list(a)
c.extend(b)
print(c[:len(a)])
print(c[len(a):])
In [35]:
def _feature_dict(uid, qid):
    """Return the feature dict for user ``uid`` on question ``qid``.

    Question metadata is read from the module-level ``questions``.
    """
    question = questions[qid]
    return {
        "uid": str(uid),
        "qid": str(qid),
        # Largest key of pos_token; presumably the index of the last
        # token in the question -- TODO confirm.
        "q_length": max(question['pos_token'].keys()),
        "category": question['category'].lower(),
        "answer": question['answer'].lower(),
    }


# Training rows: feature dicts plus the 'position' target.
X_train = []
Y_train = []
for key in train_set:
    record = train_set[key]
    # We only care about positive case at this time
    if record['position'] < 0:
        continue
    X_train.append(_feature_dict(record['uid'], record['qid']))
    Y_train.append(record['position'])

# Test rows: no target is read from test_set, so Y_test holds the test-set
# keys instead; a later cell pairs them with the predictions as ids.
X_test = []
Y_test = []
for key in test_set:
    record = test_set[key]
    X_test.append(_feature_dict(record['uid'], record['qid']))
    Y_test.append(key)

print("%s %s" % ("Before transform: ", len(X_test)))

# Fit the vectorizer on the training rows only and reuse that feature space
# for the test rows (feature values seen only in the test set are ignored by
# transform). This avoids fitting on test data and leaves the earlier X
# untouched, so it still lines up with Y.
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)
In [44]:
# Fit the model whose predictions are written to the submission file.
# The commented-out line is an alternative estimator kept for quick swapping.
# regressor = LinearRegression()
regressor = Ridge()
# Left as the cell's last expression so the notebook displays the result of fit().
regressor.fit(X_train, Y_train)
Out[44]:
In [45]:
predictions = regressor.predict(X_test)
# Pair every test-set key (stored in Y_test) with its predicted position and
# order the [key, prediction] rows by key. A generator expression over zip()
# is used so that no loop variable shadows the builtin `id` or lingers in the
# notebook namespace.
predictions = sorted(
    [test_key, predicted] for test_key, predicted in zip(Y_test, predictions)
)
print(len(predictions))
# Last expression of the cell: show the first five rows.
predictions[:5]
Out[45]:
In [46]:
import csv
# Write the submission file: a header row followed by one [id, position] row
# per prediction. The header is written directly rather than inserted into
# `predictions`, so re-running this cell does not duplicate it.
# Binary mode ('wb') is what the Python 2 csv module expects.
with open('guess.csv', 'wb') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(["id", "position"])
    writer.writerows(predictions)
All right. Let's submit!
In [ ]: