
Vespa - collect training data

Collect training data to analyse and/or improve ranking functions.

Example setup

Connect to the application and define a query model.


In [ ]:
from vespa.application import Vespa
from vespa.query import Query, RankProfile, OR

# Connect to the public CORD-19 Vespa application
app = Vespa(url="https://api.cord19.vespa.ai")

# Match documents sharing at least one term with the query and rank them with
# the bm25 rank profile; list_features=True returns the computed rank features
# with each hit
query_model = Query(
    match_phase=OR(),
    rank_profile=RankProfile(name="bm25", list_features=True)
)
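
Optionally, sanity-check the query model with a single query before collecting data. A minimal sketch, assuming the application is reachable and the schema exposes a title field; extra keyword arguments such as hits are forwarded to the Vespa query body:


In [ ]:
# Quick sanity check of the query model (assumption: hits is forwarded
# to the Vespa query body)
result = app.query(
    query="Intrauterine virus infections and congenital heart disease",
    query_model=query_model,
    hits=2
)
[hit["fields"]["title"] for hit in result.hits]  # assumes a title field in the schema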

Define some labelled data. Each data point holds a query_id, the query string, and the ids of the documents known to be relevant to that query, each with a relevance score.


In [ ]:
labelled_data = [
    {
        "query_id": 0, 
        "query": "Intrauterine virus infections and congenital heart disease",
        "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}]
    },
    {
        "query_id": 1, 
        "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}]
    }
]
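
If your relevance judgements live in qrels-style mappings instead, the same structure can be built programmatically. A sketch with hypothetical queries and qrels inputs (neither is part of the original example):


In [ ]:
# Hypothetical inputs: query strings and qrels-style judgements keyed by query id
queries = {
    0: "Intrauterine virus infections and congenital heart disease",
    1: "Clinical and immunologic studies in identical twins discordant "
       "for systemic lupus erythematosus",
}
qrels = {0: {0: 1, 3: 1}, 1: {1: 1, 5: 1}}  # query_id -> {doc_id: score}

labelled_data = [
    {
        "query_id": qid,
        "query": queries[qid],
        "relevant_docs": [
            {"id": doc_id, "score": score} for doc_id, score in qrels[qid].items()
        ],
    }
    for qid in queries
]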

Collect training data in batch

The collect_training_data method retrieves, for each relevant document in labelled_data, the rank features of that document together with number_additional_docs additional documents returned by the query model; the additional documents get relevant = 0 and serve as negative examples.


In [ ]:
training_data_batch = app.collect_training_data(
    labelled_data=labelled_data,
    id_field="id",              # document field holding the ids used in labelled_data
    query_model=query_model,
    number_additional_docs=2    # extra documents to collect per relevant document
)
training_data_batch


Out[ ]:
    attributeMatch(authors.first)  ...  textSimilarity(title).fieldCoverage  textSimilarity(title).order  textSimilarity(title).proximity  textSimilarity(title).queryCoverage  textSimilarity(title).score  document_id  query_id  relevant
0                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            0         0         1
1                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
2                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
3                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            3         0         1
4                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
5                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
6                             0.0  ...                             0.071429                          0.0                         0.000000                             0.083333                     0.039286            1         1         1
7                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
8                             0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0
9                             0.0  ...                             0.058824                          0.0                         0.000000                             0.083333                     0.036765            5         1         1
10                            0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
11                            0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0

12 rows × 984 columns
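
The relevant column is a label and the remaining feature columns are model inputs, so the frame can be used directly for learning to rank. A minimal sketch, assuming scikit-learn is installed; LogisticRegression is only an illustrative choice:


In [ ]:
from sklearn.linear_model import LogisticRegression

# Split the frame into rank features and the relevance label
feature_columns = [
    col for col in training_data_batch.columns
    if col not in {"document_id", "query_id", "relevant"}
]
X = training_data_batch[feature_columns]
y = training_data_batch["relevant"]

# Fit a simple relevance classifier on the collected features
model = LogisticRegression(max_iter=1000).fit(X, y)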

Collect training data point

For finer control over the collection loop, use the collect_training_data_point method, which collects data for a single (query, relevant document) pair at a time.


In [ ]:
from pandas import DataFrame

training_data = []
for query_data in labelled_data:
    for doc_data in query_data["relevant_docs"]:
        # Collect rank features for one relevant document plus
        # number_additional_docs extra documents for the same query
        training_data_point = app.collect_training_data_point(
            query=query_data["query"],
            query_id=query_data["query_id"],
            relevant_id=doc_data["id"],
            id_field="id",
            query_model=query_model,
            number_additional_docs=2
        )
        training_data.extend(training_data_point)
training_data = DataFrame.from_records(training_data)
training_data


Out[ ]:
    attributeMatch(authors.first)  ...  textSimilarity(title).fieldCoverage  textSimilarity(title).order  textSimilarity(title).proximity  textSimilarity(title).queryCoverage  textSimilarity(title).score  document_id  query_id  relevant
0                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            0         0         1
1                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
2                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
3                             0.0  ...                             0.000000                          0.0                         0.000000                             0.000000                     0.000000            3         0         1
4                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        56212         0         0
5                             0.0  ...                             0.187500                          0.5                         0.617188                             0.428571                     0.457087        34026         0         0
6                             0.0  ...                             0.071429                          0.0                         0.000000                             0.083333                     0.039286            1         1         1
7                             0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
8                             0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0
9                             0.0  ...                             0.058824                          0.0                         0.000000                             0.083333                     0.036765            5         1         1
10                            0.0  ...                             1.000000                          1.0                         1.000000                             1.000000                     1.000000        29774         1         0
11                            0.0  ...                             0.500000                          1.0                         1.000000                             0.333333                     0.700000        22787         1         0

12 rows × 984 columns
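
In both cases the result is an ordinary pandas DataFrame, so it can be persisted for later analysis. A quick sketch; the file name is just an example:


In [ ]:
# Save the collected rank features and labels for offline experiments
training_data.to_csv("training_data.csv", index=False)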