In [1]:
!conda info -e
In [2]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
from time import time, sleep
import os
from multiprocessing import Process
import requests
In [4]:
#import freediscovery.tests as ft
#import freediscovery as ft
In [5]:
def _parent_dir(path, n=0):
path = os.path.abspath(path)
if n==0:
return path
else:
return os.path.dirname(_parent_dir(path, n=n-1))
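# Quick illustration with hypothetical paths (not part of the original example):
#   _parent_dir("/a/b/c/script.py", n=0)  -> "/a/b/c/script.py"
#   _parent_dir("/a/b/c/script.py", n=2)  -> "/a/b"
# i.e. os.path.dirname applied n times to the absolute path.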
In [6]:
def _print_url(op, url):
print(' '*1, op, url)
In [11]:
# normal setup
data_dir_l = _parent_dir(__file__, n=3)
data_dir_l = os.path.join(data_dir_l, "freediscovery_shared", "tar_fd_benchmark")
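# Note: __file__ is typically not defined inside a notebook session; when running
# these cells interactively, replace it with an explicit path to the shared data directory.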
In [10]:
# docker setup
data_dir_d = "/freediscovery_shared/tar_fd_benchmark"
In [ ]:
print(
"""## =========================================================== ##
## ##
## FREEDiscovery categorization example (Python) ##
## ##
## Note that this only illustrates part of the implemented API.##
## =========================================================== ##\n
"""
)
print(""" This example uses a 37k documents subset of the TREC 2009 CORPUS (201)""")
#BASE_URL = "http://localhost:5001" # FREEDiscovery server URL
BASE_URL = "http://52.38.241.62:5001" # FREEDiscovery server URL
BASE_URL += '/api/v0'
dsid = None # set the dataset id here,
# otherwise a new feature extraction will be run
#dsid = "bdfa3179d0d24f9788144241973b16d1"
if 'localhost' in BASE_URL:
data_dir_d = data_dir_l
data_dir = data_dir_d
if dsid is None:
# 1. Feature extraction
print("\n1.a Load dataset and initialize feature extraction")
url = BASE_URL + '/feature-extraction'
_print_url("POST", url)
res = requests.post(url,
json={'data_dir': os.path.join(data_dir, "data"),
'n_features': 100000, 'analyzer': 'word',
'ngram_range': (1, 1), 'stop_words': 'english',
'chunk_size': 5000, 'n_jobs': 4, 'use_idf': 1,
'sublinear_tf': 0, 'binary': 0})
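# Optional defensive check (an addition, not in the original script): fail fast
# with a readable error if the server rejected the feature-extraction request.
res.raise_for_status()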
dsid = res.json()['id']
print(" => received {}".format(list(res.json().keys())))
print(" => dsid = {}".format(dsid))
print("\n1.b Start feature extraction (non blocking)")
# Make this call non blocking
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
_print_url("POST", url)
p = Process(target=requests.post, args=(url,))
p.start()
sleep(5.0) # wait a bit for the processing to start
print('\n1.c Monitor feature extraction progress')
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
_print_url("GET", url)
t0 = time()
while True:
res = requests.get(url)
if res.status_code == 520:
p.terminate()
raise ValueError('Processing did not start')
elif res.status_code == 200:
break # processing finished
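# any other status code is treated as "still in progress": report the
# current counts below, wait a bit, and poll again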
data = res.json()
print(' ... {}k/{}k files processed in {:.1f} min'.format(
data['n_samples_processed']//1000, data['n_samples']//1000, (time() - t0)/60.))
sleep(15.0)
p.terminate() # just in case, should not be necessary
print("\n1.d. check the parameters of the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
_print_url('GET', url)
res = requests.get(url)
data = res.json()
for key, val in data.items():
if key!='filenames':
print(' - {}: {}'.format(key, val))
print("\n2.a. Load relevant & non relevant seed file list")
with open(os.path.join(data_dir_l,'seed_relevant.txt'), 'rt') as fh:
relevant_files = [el.strip() for el in fh.readlines()]
with open(os.path.join(data_dir_l,'seed_non_relevant.txt'), 'rt') as fh:
non_relevant_files = [el.strip() for el in fh.readlines()]
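# Each seed file is expected to contain one document filename per line; the
# stripped lines are sent to the server below as the relevant / non-relevant
# training examples.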
print("\n3.b. Train the categorization model")
print(" {} relevant, {} non-relevant files".format(
len(relevant_files), len(non_relevant_files)))
url = BASE_URL + '/categorization-model/'
_print_url("POST", url)
res = requests.post(url,
json={'relevant_filenames': relevant_files,
'non_relevant_filenames': non_relevant_files,
'dataset_id': dsid,
'method': 'LogisticRegression', # one of "LinearSVC", "LogisticRegression", "LogisticRegressionCV", "SGDClassifier"
'training_scores': 1
})
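# 'training_scores': 1 asks the server to also report scores on the training
# set; these are printed below (F1, recall, precision).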
data = res.json()
mid = data['id']
print(" => model id = {}".format(mid))
print(' => Training scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print("\n3.c. Check the parameters used in the categorization model")
url = BASE_URL + '/categorization-model/{}'.format(mid)
_print_url("GET", url)
res = requests.get(url)
data = res.json()
for key, val in data.items():
if "filenames" not in key:
print(' - {}: {}'.format(key, val))
threshold = 0.0
print("\n3.d Categorize the complete dataset with this model")
url = BASE_URL + '/categorization-model/{}/predict'.format(mid)
_print_url("GET", url)
res = requests.get(url)
prediction = res.json()['prediction']
print(" => Predicting {} relevant and {} non relevant documents".format(
len(list(filter(lambda x: x>threshold, prediction))),
len(list(filter(lambda x: x<threshold, prediction)))))
print("\n3.e Test categorization accuracy")
gtfile = os.path.join(data_dir, "ground_truth_file.txt")
print(" using {}".format(gtfile))
url = BASE_URL + '/categorization-model/{}/test'.format(mid)
_print_url("POST", url)
res = requests.post(url,
json={'ground_truth_filename': gtfile})
data = res.json()
print(' => Test scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print("\n4.a. Calculate LSI")
url = BASE_URL + '/lsi/'
_print_url("POST", url)
n_components = 100
res = requests.post(url,
json={ 'n_components': n_components,
'dataset_id': dsid,
})
data = res.json()
lid = data['id']
print(' => LSI model id = {}'.format(lid))
print(' => SVD decomposition with {} dimensions explaining {:.2f}% of the variability in the data'.format(
n_components, data['explained_variance']*100))
print("\n4.b. Predict categorization with LSI")
url = BASE_URL + '/lsi/{}/predict'.format(lid)
_print_url("POST", url)
res = requests.post(url,
json={'relevant_filenames': relevant_files,
'non_relevant_filenames': non_relevant_files,
'threshold': 'auto'
})
data = res.json()
prediction = data['prediction']
print(' => Training scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print(" => Predicting {} relevant and {} non relevant documents".format(
len(list(filter(lambda x: x>threshold, prediction))),
len(list(filter(lambda x: x<threshold, prediction)))))
print("\n4.c. Test categorization with LSI")
url = BASE_URL + '/lsi/{}/test'.format(lid)
_print_url("POST", url)
res = requests.post(url,
json={'relevant_filenames': relevant_files,
'non_relevant_filenames': non_relevant_files,
'threshold': 'auto',
'ground_truth_filename': gtfile
})
data = res.json()
print(' => Test scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print("\n5.a Delete the extracted features (not actually calling this)")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
_print_url("DELETE", url)