
Categorization Example [REST API]

An example illustrating binary categorization with FreeDiscovery
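
Before running this example, a FreeDiscovery server must be listening on the URL used below (http://localhost:5001 by default). The cell below is an optional sanity check, assuming that default address; it only verifies that the server is reachable.

In [ ]:
import requests

# optional sanity check: verify that a FreeDiscovery server is reachable
# (the address is an assumption matching BASE_URL used below)
try:
    requests.get("http://localhost:5001/api/v0", timeout=5)
    print("FreeDiscovery server is reachable")
except requests.exceptions.RequestException:
    print("could not reach a FreeDiscovery server on localhost:5001")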


In [ ]:
from __future__ import print_function

from time import time, sleep
import os.path
from multiprocessing import Process
import requests
import pandas as pd

pd.options.display.float_format = '{:,.3f}'.format
pd.options.display.expand_frame_repr = False

dataset_name = "treclegal09_2k_subset"     # see list of available datasets

BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL

if __name__ == '__main__':

    print(" 0. Load the example dataset")
    url = BASE_URL + '/example-dataset/{}'.format(dataset_name)
    print(" GET", url)
    input_ds = requests.get(url, json={'train_set_fields': ['document_id']}).json()
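    # the response contains the dataset metadata together with a
    # 'training_set' built from the 'document_id' field requested above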


    # create a custom dataset definition for ingestion
    data_dir = input_ds['metadata']['data_dir']
    dataset_definition = [{'document_id': row['document_id'],
                           'file_path': os.path.join(data_dir, row['file_path'])}
                          for row in input_ds['dataset']]
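    # each entry maps an external document_id to an absolute path on the
    # server, e.g. (illustrative values only):
    #   {'document_id': 0, 'file_path': '/<data_dir>/<file>.txt'}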

    # 1. Feature extraction

    print("\n1.a Load dataset and initalize feature extraction")
    url = BASE_URL + '/feature-extraction'
    print(" POST", url)
    res = requests.post(url, json={'dataset_definition': dataset_definition,
                                   'use_hashing': True}).json()

    dsid = res['id']
    print("   => received {}".format(list(res.keys())))
    print("   => dsid = {}".format(dsid))

    print("\n1.b Start feature extraction (in the background)")

    # Run the POST in a background process so that the extraction progress
    # can be polled from the main process below
    url = BASE_URL+'/feature-extraction/{}'.format(dsid)
    print(" POST", url)
    p = Process(target=requests.post, args=(url,))
    p.start()
    sleep(5.0) # wait a bit for the processing to start

    print('\n1.c Monitor feature extraction progress')
    url = BASE_URL+'/feature-extraction/{}'.format(dsid)
    print(" GET", url)

    t0 = time()
    while True:
        res = requests.get(url)
        if res.status_code == 520:
            p.terminate()
            raise ValueError('Processing did not start')
        elif res.status_code == 200:
            break # processing finished
        data = res.json()
        print('     ... {}k/{}k files processed in {:.1f} min'.format(
                    data['n_samples_processed']//1000, data['n_samples']//1000, (time() - t0)/60.))
        sleep(15.0)

    p.terminate()  # just in case, should not be necessary


    print("\n1.d. check the parameters of the extracted features")
    url = BASE_URL + '/feature-extraction/{}'.format(dsid)
    print(' GET', url)
    res = requests.get(url).json()

    print('\n'.join(['     - {}: {}'.format(key, val) for key, val in res.items() \
                                                      if "filenames" not in key]))


    # 2. Compute the LSI decomposition (used by the NearestNeighbor method)

    print("\n2. Calculate LSI")

    url = BASE_URL + '/lsi/'
    print("POST", url)

    n_components = 100
    res = requests.post(url,
                        json={'n_components': n_components,
                              'parent_id': dsid
                              }).json()

    lsi_id = res['id']
    print('  => LSI model id = {}'.format(lsi_id))
    print('  => SVD decomposition with {} dimensions explaining {:.2f}% of the variability of the data'.format(
                            n_components, res['explained_variance']*100))


    # 3. Document categorization

    print("\n3.a. Train the categorization model")
    print("   {} positive, {} negative files".format(pd.DataFrame(input_ds['training_set']).groupby('category'), 0))

    for method, use_lsi in [('LinearSVC', False),
                            ('NearestNeighbor', True)]:

        print('='*80, '\n', ' '*10,
              method, " + LSI" if use_lsi else ' ', '\n', '='*80)
        if use_lsi:
            # Categorization with the previously created LSI model
            parent_id = lsi_id
        else:
            # Categorization with original text features
            parent_id = dsid

        url = BASE_URL + '/categorization/'
        print(" POST", url)
        print(' Training...')

        res = requests.post(url,
                            json={'parent_id': parent_id,
                                  'data': input_ds['training_set'],
                                  'method': method,  # one of "LinearSVC", "LogisticRegression", "NearestNeighbor", "xgboost"
                                  }).json()

        mid = res['id']
        print("     => model id = {}".format(mid))
        print('    => Training scores: MAP = {average_precision:.3f}, ROC-AUC = {roc_auc:.3f}, F1 = {f1:.3f}'.format(**res))
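        # note: these scores are computed on the training set itself and
        # are therefore typically optimistic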

        print("\n3.b. Check the parameters used in the categorization model")
        url = BASE_URL + '/categorization/{}'.format(mid)
        print(" GET", url)
        res = requests.get(url).json()

        print('\n'.join(['     - {}: {}'.format(key, val) for key, val in res.items() \
                                                          if key not in ['index', 'category']]))

        print("\n3.c Categorize the complete dataset with this model")
        url = BASE_URL + '/categorization/{}/predict'.format(mid)
        print(" GET", url)
        res = requests.get(url).json()

        data = res['data']

        df = pd.DataFrame(data).set_index('internal_id')
        # for the NearestNeighbor method, the predictions additionally
        # include per-category nearest-neighbor columns such as
        # 'nn_positive__distance', 'nn_positive__document_id',
        # 'nn_negative__distance' and 'nn_negative__document_id'

        print(df)
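
        # a quick look at the highest-scoring documents (the predictions
        # include a 'score' column; purely illustrative)
        print(df.sort_values('score', ascending=False).head(5))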

        #print("\n3.d Compute the categorization scores")
        #url = BASE_URL + '/metrics/categorization'
        #print(" GET", url)
        #res = requests.post(url, json={'y_true': ground_truth_y,
        #                              'y_pred': df.score.values.tolist(),
        #                             } ).json()


        #print('    => Test scores: MAP = {average_precision:.3f}, ROC-AUC = {roc_auc:.3f}'.format(**res))

    # 4. Cleaning
    print("\n5.a Delete the extracted features (and LSI decomposition)")
    url = BASE_URL + '/feature-extraction/{}'.format(dsid)
    print(" DELETE", url)
    requests.delete(url)
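
    # optional: verify the deletion; a follow-up GET on the same URL
    # should no longer return 200 (the exact error code may vary)
    res = requests.get(url)
    print("   cleanup check: GET returned status", res.status_code)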