In [ ]:
# hide
# Notebook housekeeping: load IPython's autoreload extension and set it to
# mode 2, so imported modules are re-imported before each cell execution and
# edits to the library source take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
Connect to the application and define a query model.
In [ ]:
from vespa.application import Vespa
from vespa.query import Query, RankProfile, OR
# Handle to the Vespa application reachable at the CORD-19 API endpoint.
app = Vespa(url="https://api.cord19.vespa.ai")

# Query model used for every request below: an OR match phase combined with
# the "bm25" rank profile. `list_features=True` is passed to the rank profile
# (presumably so that rank features are returned with each hit -- confirm
# against the pyvespa documentation).
query_model = Query(
    match_phase=OR(),
    rank_profile=RankProfile(name="bm25", list_features=True),
)
Define some labelled data.
In [ ]:
# Labelled examples: one entry per query, each carrying its id, the query
# text, and the documents judged relevant for it (document id plus score).
labelled_data = [
    {
        "query_id": 0,
        "query": "Intrauterine virus infections and congenital heart disease",
        "relevant_docs": [
            {"id": 0, "score": 1},
            {"id": 3, "score": 1},
        ],
    },
    {
        "query_id": 1,
        "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        "relevant_docs": [
            {"id": 1, "score": 1},
            {"id": 5, "score": 1},
        ],
    },
]
In [ ]:
# Collect training data for all labelled queries in one call, using the query
# model defined earlier. Documents are identified through the "id" field, and
# `number_additional_docs=2` is forwarded as-is (presumably the number of
# extra documents gathered alongside each relevant one -- confirm against the
# pyvespa documentation).
training_data_batch = app.collect_training_data(
    labelled_data=labelled_data,
    id_field="id",
    query_model=query_model,
    number_additional_docs=2,
)

# Bare expression so the notebook renders the result as the cell output.
training_data_batch
Out[ ]:
In [ ]:
from pandas import DataFrame, concat

# Assemble the training set one (query, relevant document) pair at a time
# with the point-level API. This appears to be the step-by-step counterpart
# of the `collect_training_data` call in the previous cell.
training_data = []
for query_data in labelled_data:
    for doc_data in query_data["relevant_docs"]:
        training_data_point = app.collect_training_data_point(
            query=query_data["query"],
            query_id=query_data["query_id"],
            relevant_id=doc_data["id"],
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
        )
        # Each call returns an iterable of records; append them all so the
        # accumulator stays a flat list.
        training_data += training_data_point

# Convert the accumulated records into a DataFrame and display it.
training_data = DataFrame.from_records(training_data)
training_data
Out[ ]: