In [40]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from utils import load_buzz, select, write_result
from features import featurize, get_pos
from containers import Questions, Users, Categories
from nlp import extract_entities
In [41]:
import pickle
questions = pickle.load(open('questions01.pkl', 'rb'))
users = pickle.load(open('users01.pkl', 'rb'))
categories = pickle.load(open('categories01.pkl', 'rb'))
In [42]:
set(users[0].keys()) - set(['cat_uid'])
Out[42]:
In [43]:
from sklearn.preprocessing import normalize
wanted_user_items = list(set(users[0].keys()) - set(['cat_uid']))
X_pos_uid = users.select(wanted_user_items)
X_pos_qid = questions.select(['ave_pos_qid', 'acc_ratio_qid', 'ne_nor_mean', 'ne_mean', 'ne_median'])
X_pos_uid = normalize(X_pos_uid, norm='l1')
X_pos_qid = normalize(X_pos_qid, norm='l1')
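Each feature row is L1-normalized so that the k-means distances below are driven by relative feature proportions rather than raw scales. A minimal standalone sketch of what normalize(..., norm='l1') does, on a toy array rather than this notebook's data:

import numpy as np
from sklearn.preprocessing import normalize

toy = np.array([[3.0, 1.0],
                [2.0, 8.0]])
toy_l1 = normalize(toy, norm='l1')  # divide each row by its L1 norm
print(toy_l1)                       # [[0.75 0.25]
                                    #  [0.2  0.8 ]]
print(toy_l1.sum(axis=1))           # rows now sum to 1: [1. 1.]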
In [44]:
print(X_pos_qid[0])
print(X_pos_uid[0])
In [45]:
from sklearn.cluster import KMeans
# Question category
n_components = 27
est = KMeans(n_clusters=n_components)
est.fit(X_pos_qid)
pred_cat_qid = est.predict(X_pos_qid)
plt.hist(pred_cat_qid, bins=50, facecolor='g', alpha=0.75)
plt.xlabel("Category number")
plt.ylabel("Count")
plt.title("Question Category: " + str(n_components) + " categories")
plt.grid(True)
plt.show()
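The number of clusters is fixed at 27 here. If one wanted to sanity-check that choice, a silhouette-score sweep over k is a common approach; a hedged sketch on synthetic data (the names below are illustrative, not from this notebook):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X_demo, _ = make_blobs(n_samples=500, centers=6, random_state=0)
for k in (4, 6, 8):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(X_demo)
    print(k, silhouette_score(X_demo, labels))  # higher is better; peaks near the true k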
In [46]:
# User category
n_components = 27
est = KMeans(n_clusters=n_components)
est.fit(X_pos_uid)
pred_cat_uid = est.predict(X_pos_uid)
plt.hist(pred_cat_uid, bins=50, facecolor='g', alpha=0.75)
plt.xlabel("Category number")
plt.ylabel("Count")
plt.title("User Category: " + str(n_components) + " categories")
plt.grid(True)
plt.show()
In [47]:
from collections import Counter
users.sub_append('cat_uid', {key: str(pred_cat_uid[i]) for i, key in enumerate(users.keys())})
questions.sub_append('cat_qid', {key: str(pred_cat_qid[i]) for i, key in enumerate(questions.keys())})
# most frequent cluster, used as a fallback for test ids that do not appear in the train set
most_pred_cat_uid = Counter(pred_cat_uid).most_common(1)[0][0]
most_pred_cat_qid = Counter(pred_cat_qid).most_common(1)[0][0]
print(most_pred_cat_uid)
print(most_pred_cat_qid)
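Counter.most_common(1) returns a one-element list holding a (value, count) pair, so the [0][0] indexing above extracts just the most frequent cluster label:

from collections import Counter

labels = [3, 1, 3, 2, 3]
print(Counter(labels).most_common(1))        # [(3, 3)]: label 3 occurs 3 times
print(Counter(labels).most_common(1)[0][0])  # 3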
In [48]:
def add_features(X):
    for item in X:
        # category features
        for key in categories[item['category']].keys():
            item[key] = categories[item['category']][key]
        uid = int(item['uid'])
        qid = int(item['qid'])
        # uid: use per-user features when the user was seen in training
        if uid in users:
            item.update(users[uid])
        else:
            acc = users.select(['acc_ratio_uid'])
            item['acc_ratio_uid'] = sum(acc) / float(len(acc))
            item['cat_uid'] = most_pred_cat_uid
        # qid: same idea for questions
        if qid in questions:
            item.update(questions[qid])
        else:
            # assumed symmetric fallback for unseen qids (mirrors the uid branch;
            # this is what most_pred_cat_qid is computed for)
            acc = questions.select(['acc_ratio_qid'])
            item['acc_ratio_qid'] = sum(acc) / float(len(acc))
            item['cat_qid'] = most_pred_cat_qid
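This is the cold-start handling: a test item whose uid (or qid) never appeared in training gets the global mean accuracy ratio and the most frequent cluster instead of per-id features. An illustrative rerun of the uid branch with plain dicts instead of the Users container (all values below are made up):

users_toy = {7: {'acc_ratio_uid': 0.9, 'cat_uid': '3'}}
item = {'uid': '42'}                  # uid 42 was never seen in training
if int(item['uid']) in users_toy:
    item.update(users_toy[int(item['uid'])])
else:
    item['acc_ratio_uid'] = 0.75      # stand-in for the global mean
    item['cat_uid'] = '5'             # stand-in for most_pred_cat_uid
print(item)                           # {'uid': '42', 'acc_ratio_uid': 0.75, 'cat_uid': '5'}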
In [49]:
import pickle
questions = pickle.load(open('questions01.pkl', 'rb'))
users = pickle.load(open('users01.pkl', 'rb'))
categories = pickle.load(open('categories01.pkl', 'rb'))
In [50]:
from utils import load_buzz, select, write_result
from features import featurize, get_pos
from containers import Questions, Users, Categories
from nlp import extract_entities
import math
from collections import Counter
from numpy import abs, sqrt
from sklearn.linear_model import ElasticNetCV
from sklearn.cross_validation import ShuffleSplit, cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
wanted_user_items = list(set(users[0].keys()) - set(['cat_uid']))
X_pos_uid = users.select(wanted_user_items)
X_pos_qid = questions.select(['ave_pos_qid', 'acc_ratio_qid', 'ne_nor_mean', 'ne_mean', 'ne_median'])
X_pos_uid = normalize(X_pos_uid, norm='l1')
X_pos_qid = normalize(X_pos_qid, norm='l1')
tu = ('l1', 'n_uid_clust', 'n_qid_clust', 'rmse')
print('=== Bench with ElasticNetCV: {0}, {1}, {2}, {3}'.format(*tu))
for ii in [27]:
    n_uid_clu = ii
    n_qid_clu = ii
    # clustering for uid
    uid_est = KMeans(n_clusters=n_uid_clu)
    uid_est.fit(X_pos_uid)
    pred_cat_uid = uid_est.predict(X_pos_uid)
    # clustering for qid
    qid_est = KMeans(n_clusters=n_qid_clu)
    qid_est.fit(X_pos_qid)
    pred_cat_qid = qid_est.predict(X_pos_qid)
    users.sub_append('cat_uid', {key: str(pred_cat_uid[i]) for i, key in enumerate(users.keys())})
    questions.sub_append('cat_qid', {key: str(pred_cat_qid[i]) for i, key in enumerate(questions.keys())})
    # most frequent cluster, used as a fallback for test ids that do not appear in the train set
    most_pred_cat_uid = Counter(pred_cat_uid).most_common(1)[0][0]
    most_pred_cat_qid = Counter(pred_cat_qid).most_common(1)[0][0]
    X_train, y_train = featurize(load_buzz(), group='train',
                                 sign_val=None, extra=['sign_val', 'avg_pos'])
    add_features(X_train)
    unwanted_features = ['ne_tags', 'pos_token', 'question', 'sign_val', 'group']
    wanted_features = list(set(X_train[1].keys()) - set(unwanted_features))
    X_train = select(X_train, wanted_features)
    vec = DictVectorizer()
    X_train_dict_vec = vec.fit_transform(X_train)
    X_new = X_train_dict_vec
    #X_new = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=50).fit_transform(X_train_dict_vec, y_train)
    n_samples = X_new.shape[0]
    cv = ShuffleSplit(n_samples, n_iter=5, test_size=0.2, random_state=50)
    print("L1-based feature selection:", X_train_dict_vec.shape, X_new.shape)
    for l1 in [0.7]:
        scores = cross_val_score(ElasticNetCV(n_jobs=3, normalize=True, l1_ratio=l1),
                                 X_new, y_train,
                                 cv=cv, scoring='mean_squared_error')
        rmse = sqrt(abs(scores)).mean()
        print('{0}, {1}, {2}, {3}'.format(l1, n_uid_clu, n_qid_clu, rmse))
Original
=== Bench with ElasticNetCV: l1, n_uid_clust, n_qid_clust, rmse
L1-based feature selection: (28494, 1112) (28494, 1112)
0.7, 27, 27, 74.88480204218828
Without user features for regression
=== Bench with ElasticNetCV: l1, n_uid_clust, n_qid_clust, rmse
L1-based feature selection: (28494, 1112) (28494, 1112)
0.7, 27, 27, 74.94733641570902
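A note on the sqrt(abs(scores)) line above: with this version of scikit-learn, the 'mean_squared_error' scorer follows the greater-is-better convention and reports negative MSE per fold, which is why the cell takes the absolute value before converting to RMSE. The arithmetic in miniature:

import numpy as np

scores = np.array([-25.0, -16.0, -36.0])  # negative MSE from cross_val_score
rmse = np.sqrt(np.abs(scores)).mean()     # per-fold RMSE, then averaged
print(rmse)                               # (5 + 4 + 6) / 3 = 5.0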
In [51]:
X_test = featurize(load_buzz(), group='test', sign_val=None, extra=['avg_pos'])
add_features(X_test)
X_test = select(X_test, wanted_features)
In [52]:
unwanted_features = ['ne_tags', 'pos_token', 'question', 'sign_val', 'group']
wanted_features = list(set(X_train[1].keys()) - set(unwanted_features))
X_train = select(X_train, wanted_features)
X_train[0]
Out[52]:
In [53]:
users[131]
Out[53]:
In [54]:
categories['astronomy']
Out[54]:
In [55]:
X_test[1]
Out[55]:
In [56]:
vec = DictVectorizer()
vec.fit(X_train + X_test)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)
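DictVectorizer is deliberately fit on the concatenation of train and test so both matrices share one feature-to-column mapping; fitting separately would misalign the one-hot columns. A toy illustration:

from sklearn.feature_extraction import DictVectorizer

train_toy = [{'cat': 'science', 'pos': 40.0}]
test_toy = [{'cat': 'history', 'pos': 55.0}]
vec_toy = DictVectorizer()
vec_toy.fit(train_toy + test_toy)       # shared vocabulary across both splits
print(vec_toy.get_feature_names())      # ['cat=history', 'cat=science', 'pos']
print(vec_toy.transform(test_toy).toarray())  # [[ 1.  0. 55.]]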
In [58]:
for l1_ratio in [0.7]:
    print('=== l1_ratio:', l1_ratio)
    regressor = ElasticNetCV(n_jobs=3, normalize=True, l1_ratio=l1_ratio, random_state=50)
    regressor.fit(X_train, y_train)
    print(regressor.coef_)
    print(regressor.alpha_)
    predictions = regressor.predict(X_test)
    write_result(load_buzz()['test'], predictions, file_name=str(l1_ratio) + 'guess_adj.csv', adj=True)
80.97372
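For context on l1_ratio: ElasticNetCV mixes L1 (lasso) and L2 (ridge) penalties, with l1_ratio=0.7 weighting the penalty 70% toward L1, and it selects the regularization strength alpha_ by internal cross-validation. A self-contained sketch on synthetic data (not the notebook's features):

import numpy as np
from sklearn.linear_model import ElasticNetCV

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
y_demo = 3.0 * X_demo[:, 0] + 0.1 * rng.randn(100)  # only feature 0 matters
model = ElasticNetCV(l1_ratio=0.7)
model.fit(X_demo, y_demo)
print(model.alpha_)  # CV-selected regularization strength
print(model.coef_)   # the L1 component pushes irrelevant coefficients toward zero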