
Vespa - collect training data

Collect training data to analyse and/or improve ranking functions.

Example setup

Connect to the application and define a query model.


In [ ]:
from vespa.application import Vespa
from vespa.query import Query, RankProfile, OR

# Connect to the public CORD-19 Vespa application
app = Vespa(url="https://api.cord19.vespa.ai")

# Match documents sharing at least one term with the query and rank them with
# the bm25 rank profile; list_features=True returns the computed rank features
# with each hit
query_model = Query(
    match_phase=OR(),
    rank_profile=RankProfile(name="bm25", list_features=True)
)
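
Optionally, sanity-check the query model with a single query before collecting data. A minimal sketch, assuming the application is reachable and the schema exposes a title field; extra keyword arguments such as hits are forwarded to the Vespa query body:


In [ ]:
# Quick sanity check of the query model (assumption: hits is forwarded
# to the Vespa query body)
result = app.query(
    query="Intrauterine virus infections and congenital heart disease",
    query_model=query_model,
    hits=2
)
[hit["fields"]["title"] for hit in result.hits]  # assumes a title field in the schema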

Define some labelled data. Each data point holds a query_id, the query string, and the ids of the documents known to be relevant to that query, each with a relevance score.


In [ ]:
labelled_data = [
    {
        "query_id": 0, 
        "query": "Intrauterine virus infections and congenital heart disease",
        "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}]
    },
    {
        "query_id": 1, 
        "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}]
    }
]
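
If your relevance judgements live in qrels-style mappings instead, the same structure can be built programmatically. A sketch with hypothetical queries and qrels inputs (neither is part of the original example):


In [ ]:
# Hypothetical inputs: query strings and qrels-style judgements keyed by query id
queries = {
    0: "Intrauterine virus infections and congenital heart disease",
    1: "Clinical and immunologic studies in identical twins discordant "
       "for systemic lupus erythematosus",
}
qrels = {0: {0: 1, 3: 1}, 1: {1: 1, 5: 1}}  # query_id -> {doc_id: score}

labelled_data = [
    {
        "query_id": qid,
        "query": queries[qid],
        "relevant_docs": [
            {"id": doc_id, "score": score} for doc_id, score in qrels[qid].items()
        ],
    }
    for qid in queries
]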

Collect training data in batch

The collect_training_data method retrieves, for each relevant document in labelled_data, the rank features of that document together with number_additional_docs additional documents returned by the query model; the additional documents get relevant = 0 and serve as negative examples.


In [ ]:
training_data_batch = app.collect_training_data(
    labelled_data=labelled_data,
    id_field="id",              # document field holding the ids used in labelled_data
    query_model=query_model,
    number_additional_docs=2    # extra documents to collect per relevant document
)
training_data_batch


Out[ ]:
    attributeMatch(authors.first)  ...  textSimilarity(title).fieldCoverage  textSimilarity(title).order  textSimilarity(title).proximity  textSimilarity(title).queryCoverage  textSimilarity(title).score  document_id  query_id  relevant
0                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            0         0         1
1                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
2                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
3                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            3         0         1
4                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
5                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
6                             0.0  ...                             0.071429                          0.0                         0.000000                             0.083333                     0.039286            1         1         1
7                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
8                             0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0
9                             0.0  ...                             0.058824                          0.0                         0.000000                             0.083333                     0.036765            5         1         1
10                            0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
11                            0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0

12 rows × 984 columns
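
The relevant column is a label and the remaining feature columns are model inputs, so the frame can be used directly for learning to rank. A minimal sketch, assuming scikit-learn is installed; LogisticRegression is only an illustrative choice:


In [ ]:
from sklearn.linear_model import LogisticRegression

# Split the frame into rank features and the relevance label
feature_columns = [
    col for col in training_data_batch.columns
    if col not in {"document_id", "query_id", "relevant"}
]
X = training_data_batch[feature_columns]
y = training_data_batch["relevant"]

# Fit a simple relevance classifier on the collected features
model = LogisticRegression(max_iter=1000).fit(X, y)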

Collect training data point

For finer control over the collection loop, use the collect_training_data_point method, which collects data for a single (query, relevant document) pair at a time.


In [ ]:
from pandas import DataFrame

training_data = []
for query_data in labelled_data:
    for doc_data in query_data["relevant_docs"]:
        # Collect rank features for one relevant document plus
        # number_additional_docs extra documents for the same query
        training_data_point = app.collect_training_data_point(
            query=query_data["query"],
            query_id=query_data["query_id"],
            relevant_id=doc_data["id"],
            id_field="id",
            query_model=query_model,
            number_additional_docs=2
        )
        training_data.extend(training_data_point)
training_data = DataFrame.from_records(training_data)
training_data


Out[ ]:
    attributeMatch(authors.first)  ...  textSimilarity(title).fieldCoverage  textSimilarity(title).order  textSimilarity(title).proximity  textSimilarity(title).queryCoverage  textSimilarity(title).score  document_id  query_id  relevant
0                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            0         0         1
1                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
2                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
3                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            3         0         1
4                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
5                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
6                             0.0  ...                             0.071429                          0.0                         0.000000                             0.083333                     0.039286            1         1         1
7                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
8                             0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0
9                             0.0  ...                             0.058824                          0.0                         0.000000                             0.083333                     0.036765            5         1         1
10                            0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
11                            0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0

12 rows × 984 columns
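
In both cases the result is an ordinary pandas DataFrame, so it can be persisted for later analysis. A quick sketch; the file name is just an example:


In [ ]:
# Save the collected rank features and labels for offline experiments
training_data.to_csv("training_data.csv", index=False)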