In [1]:
import gzip
import cPickle as pickle
In [2]:
with gzip.open("../data/train.pklz", "rb") as train_file:
train_set = pickle.load(train_file)
with gzip.open("../data/test.pklz", "rb") as test_file:
test_set = pickle.load(test_file)
with gzip.open("../data/questions.pklz", "rb") as questions_file:
questions = pickle.load(questions_file)
In [3]:
X = []
Y = []
for key in train_set:
# We only care about positive case at this time
if train_set[key]['position'] < 0:
continue
uid = train_set[key]['uid']
qid = train_set[key]['qid']
pos = train_set[key]['position']
q_length = max(questions[qid]['pos_token'].keys())
feat = [uid, qid, q_length]
X.append(feat)
Y.append([pos])
In [4]:
print len(X)
print len(Y)
print X[0], Y[0]
In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.cross_validation import train_test_split, cross_val_score
X_train, X_test, Y_train, Y_test = train_test_split (X, Y)
regressor = LinearRegression()
scores = cross_val_score(regressor, X, Y, cv=10)
print 'Cross validation r-squared scores:', scores.mean()
print scores
regressor = Ridge()
scores = cross_val_score(regressor, X, Y, cv=10)
print 'Cross validation r-squared scores:', scores.mean()
print scores
regressor = Lasso()
scores = cross_val_score(regressor, X, Y, cv=10)
print 'Cross validation r-squared scores:', scores.mean()
print scores
regressor = ElasticNet()
scores = cross_val_score(regressor, X, Y, cv=10)
print 'Cross validation r-squared scores:', scores.mean()
print scores
In [182]:
from sklearn.linear_model import SGDRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
X_scaler = StandardScaler()
Y_scaler = StandardScaler()
X_train, X_test, Y_train, Y_test = train_test_split (X, Y)
X_train = X_scaler.fit_transform(X_train)
Y_train = Y_scaler.fit_transform(Y_train)
X_test = X_scaler.fit_transform(X_test)
Y_test = Y_scaler.fit_transform(Y_test)
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
There has four loss-function. ‘squared_loss’, ‘huber’, ‘epsilon_insensitive’, or ‘squared_epsilon_insensitive’. Among those, squared_loss is the best in this case.
In [153]:
regressor = SGDRegressor(loss='squared_loss', penalty='l1')
scores = cross_val_score(regressor, X_train, Y_train, cv=10)
print 'Cross validation r-squared scores:', scores.mean()
print scores
In [154]:
X_test = []
test_id = []
for key in test_set:
test_id.append(key)
uid = test_set[key]['uid']
qid = test_set[key]['qid']
q_length = max(questions[qid]['pos_token'].keys())
feat = [uid, qid, q_length]
X_test.append(feat)
X_scaler = StandardScaler()
Y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X)
Y_train = Y_scaler.fit_transform(Y)
X_test = X_scaler.fit_transform(X_test)
In [155]:
regressor.fit(X_train, Y_train)
predictions = regressor.predict(X_test)
predictions = Y_scaler.inverse_transform(predictions)
predictions = sorted([[id, predictions[index]] for index, id in enumerate(test_id)])
print len(predictions)
predictions[:5]
Out[155]:
In [156]:
import csv
predictions.insert(0,["id", "position"])
with open('guess.csv', 'wb') as fp:
writer = csv.writer(fp, delimiter=',')
writer.writerows(predictions)
All right. Let's submit!
And... we got... 5th ranked. It's worse than the first submission. Let's think about why.
5 new CU_K-ml_Stars 96.50206 2 Mon, 06 Apr 2015 21:13:50
In [ ]: