This is a "magic" (leaky) feature that exploits the patterns in question co-occurrence graph (based on the kernel by @zfturbo).
This utility package imports numpy, pandas, matplotlib, and a helper `kg` module into the root namespace.
In [1]:
from pygoose import *
In [2]:
import hashlib
Automatically discover the paths to various data folders and compose the project structure.
In [3]:
# Auto-detect the project layout (data/cache/feature directories) on disk.
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [4]:
# Identifier under which the generated features are saved and later referenced.
feature_list_id = 'magic_pagerank'
Original question datasets.
In [5]:
# Load the raw question pairs. NaN questions are replaced with empty strings
# so the downstream .encode('utf-8') calls don't blow up on float NaN.
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')
Generate a graph of questions and their neighbors.
In [6]:
def generate_qid_graph_table(row, graph=None):
    """Insert one question pair into the co-occurrence adjacency dict.

    Questions are keyed by the MD5 hex digest of their raw text; each
    question's entry accumulates the hashes of every question it was
    paired with (duplicates included — list, not set).

    Args:
        row: A mapping with string fields 'question1' and 'question2'
            (e.g. a DataFrame row passed in by ``df.apply(..., axis=1)``).
        graph: Adjacency dict to update in place. Defaults to the
            notebook-global ``qid_graph`` for backward compatibility.

    Returns:
        None — the graph is mutated in place.
    """
    if graph is None:
        graph = qid_graph  # notebook-global accumulator (defined below)
    hash_key1 = hashlib.md5(row['question1'].encode('utf-8')).hexdigest()
    hash_key2 = hashlib.md5(row['question2'].encode('utf-8')).hexdigest()
    # Record the pairing in both directions (undirected edge).
    graph.setdefault(hash_key1, []).append(hash_key2)
    graph.setdefault(hash_key2, []).append(hash_key1)
In [7]:
# Adjacency list: MD5(question text) -> list of hashes of paired questions.
# Built from BOTH train and test pairs — this test-set usage is what makes
# the feature "magic"/leaky.
qid_graph = {}
_ = df_train.apply(generate_qid_graph_table, axis=1)
_ = df_test.apply(generate_qid_graph_table, axis=1)
Compute PageRank.
In [8]:
def pagerank(graph=None, max_iter=20, d=0.85):
    """Compute PageRank scores for every node in the question graph.

    Scores are updated in place within each sweep (Gauss–Seidel style),
    matching the original kernel's behavior, rather than from a frozen
    copy of the previous iteration.

    Args:
        graph: Adjacency dict (node -> list of neighbor nodes). Every
            neighbor must itself be a key of the dict. Defaults to the
            notebook-global ``qid_graph``.
        max_iter: Number of full sweeps over the graph.
        d: Damping factor (probability of following a link).

    Returns:
        Dict mapping each node to its PageRank score.
    """
    if graph is None:
        graph = qid_graph
    num_nodes = len(graph)
    # Initialization: every node starts with a uniform share of the mass.
    pagerank_dict = {node: 1 / num_nodes for node in graph}
    for _ in range(max_iter):  # '_' avoids shadowing the builtin `iter`
        for node in graph:
            incoming = 0
            # Each neighbor contributes its score split evenly across
            # its own outgoing links.
            for neighbor in graph[node]:
                incoming += pagerank_dict[neighbor] / len(graph[neighbor])
            pagerank_dict[node] = (1 - d) / num_nodes + d * incoming
    return pagerank_dict
In [9]:
# Hash -> PageRank score over the combined train+test question graph.
pagerank_dict = pagerank()
In [10]:
def get_pagerank_value(pair, pr=None):
    """Look up the PageRank scores for both questions of a pair.

    Args:
        pair: Sequence of two question strings (question1, question2).
        pr: Mapping MD5-hex-digest -> PageRank score. Defaults to the
            notebook-global ``pagerank_dict``.

    Returns:
        Two-element list: [score of question1, score of question2].

    Raises:
        KeyError: If either question was not present in the graph.
    """
    if pr is None:
        pr = pagerank_dict
    q1 = hashlib.md5(pair[0].encode('utf-8')).hexdigest()
    q2 = hashlib.md5(pair[1].encode('utf-8')).hexdigest()
    return [pr[q1], pr[q2]]
In [11]:
# Map every train pair to its two PageRank scores, in parallel batches.
pagerank_train = kg.jobs.map_batch_parallel(
    # .values instead of .as_matrix(): as_matrix was deprecated in
    # pandas 0.23 and removed in 1.0; .values works on all versions.
    df_train[['question1', 'question2']].values,
    item_mapper=get_pagerank_value,
    batch_size=1000,
)
In [12]:
# Map every test pair to its two PageRank scores, in parallel batches.
pagerank_test = kg.jobs.map_batch_parallel(
    # .values instead of .as_matrix(): as_matrix was deprecated in
    # pandas 0.23 and removed in 1.0; .values works on all versions.
    df_test[['question1', 'question2']].values,
    item_mapper=get_pagerank_value,
    batch_size=1000,
)
In [13]:
# Scale the tiny PageRank probabilities by 1000 so the feature values are
# in a more convenient numeric range for downstream models.
X_train = np.array(pagerank_train) * 1000
X_test = np.array(pagerank_test) * 1000
In [14]:
# Sanity check: both matrices should have one row per pair and 2 columns.
print('X train:', X_train.shape)
print('X test: ', X_test.shape)
In [15]:
# Column names for the two generated features (one score per question).
feature_names = [
'pagerank_q1',
'pagerank_q2',
]
In [16]:
# Persist both feature matrices to disk under the `magic_pagerank` identifier.
project.save_features(X_train, X_test, feature_names, feature_list_id)