In [1]:
!conda info -e
In [2]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
from time import time, sleep
import os
from multiprocessing import Process
import requests
In [4]:
#import freediscovery.tests as ft
#import freediscovery as ft
In [5]:
def _parent_dir(path, n=0):
path = os.path.abspath(path)
if n==0:
return path
else:
return os.path.dirname(_parent_dir(path, n=n-1))
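# Quick illustration with hypothetical paths (not part of the original example):
#   _parent_dir("/a/b/c/script.py", n=0)  -> "/a/b/c/script.py"
#   _parent_dir("/a/b/c/script.py", n=2)  -> "/a/b"
# i.e. os.path.dirname applied n times to the absolute path.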
In [6]:
def _print_url(op, url):
print(' '*1, op, url)
In [11]:
# normal setup
data_dir_l = _parent_dir(__file__, n=3)
data_dir_l = os.path.join(data_dir_l, "freediscovery_shared", "tar_fd_benchmark")
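# Note: __file__ is typically not defined inside a notebook session; when running
# these cells interactively, replace it with an explicit path to the shared data directory.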
In [10]:
# docker setup
data_dir_d = "/freediscovery_shared/tar_fd_benchmark"
In [ ]:
print(
"""## =========================================================== ##
## ##
## FREEDiscovery categorization example (Python) ##
## ##
## Note that this only illustrates part of the implemented API.##
## =========================================================== ##\n
"""
)
print(""" This example uses a 37k documents subset of the TREC 2009 CORPUS (201)""")
#BASE_URL = "http://localhost:5001" # FREEDiscovery server URL
BASE_URL = "http://52.38.241.62:5001" # FREEDiscovery server URL
BASE_URL += '/api/v0'
dsid = None # set the dataset id here,
# otherwise a new feature extraction will be run
#dsid = "bdfa3179d0d24f9788144241973b16d1"
if 'localhost' in BASE_URL:
data_dir_d = data_dir_l
data_dir = data_dir_d
if dsid is None:
# 1. Feature extraction
print("\n1.a Load dataset and initialize feature extraction")
url = BASE_URL + '/feature-extraction'
_print_url("POST", url)
res = requests.post(url,
json={'data_dir': os.path.join(data_dir, "data"),
'n_features': 100000, 'analyzer': 'word',
'ngram_range': (1, 1), 'stop_words': 'english',
'chunk_size': 5000, 'n_jobs': 4, 'use_idf': 1,
'sublinear_tf': 0, 'binary': 0})
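# Optional defensive check (an addition, not in the original script): fail fast
# with a readable error if the server rejected the feature-extraction request.
res.raise_for_status()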
dsid = res.json()['id']
print(" => received {}".format(list(res.json().keys())))
print(" => dsid = {}".format(dsid))
print("\n1.b Start feature extraction (non blocking)")
# Make this call non blocking
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
_print_url("POST", url)
p = Process(target=requests.post, args=(url,))
p.start()
sleep(5.0) # wait a bit for the processing to start
print('\n1.c Monitor feature extraction progress')
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
_print_url("GET", url)
t0 = time()
while True:
res = requests.get(url)
if res.status_code == 520:
p.terminate()
raise ValueError('Processing did not start')
elif res.status_code == 200:
break # processing finished
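# any other status code is treated as "still in progress": report the
# current counts below, wait a bit, and poll again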
data = res.json()
print(' ... {}k/{}k files processed in {:.1f} min'.format(
data['n_samples_processed']//1000, data['n_samples']//1000, (time() - t0)/60.))
sleep(15.0)
p.terminate() # just in case, should not be necessary
print("\n1.d. check the parameters of the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
_print_url('GET', url)
res = requests.get(url)
data = res.json()
for key, val in data.items():
if key!='filenames':
print(' - {}: {}'.format(key, val))
print("\n2.a. Load relevant & non relevant seed file list")
with open(os.path.join(data_dir_l,'seed_relevant.txt'), 'rt') as fh:
relevant_files = [el.strip() for el in fh.readlines()]
with open(os.path.join(data_dir_l,'seed_non_relevant.txt'), 'rt') as fh:
non_relevant_files = [el.strip() for el in fh.readlines()]
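# Each seed file is expected to contain one document filename per line; the
# stripped lines are sent to the server below as the relevant / non-relevant
# training examples.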
print("\n3.b. Train the categorization model")
print(" {} relevant, {} non-relevant files".format(
len(relevant_files), len(non_relevant_files)))
url = BASE_URL + '/categorization-model/'
_print_url("POST", url)
res = requests.post(url,
json={'relevant_filenames': relevant_files,
'non_relevant_filenames': non_relevant_files,
'dataset_id': dsid,
'method': 'LogisticRegression', # one of "LinearSVC", "LogisticRegression", "LogisticRegressionCV", "SGDClassifier"
'training_scores': 1
})
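# 'training_scores': 1 asks the server to also report scores on the training
# set; these are printed below (F1, recall, precision).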
data = res.json()
mid = data['id']
print(" => model id = {}".format(mid))
print(' => Training scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print("\n3.c. Check the parameters used in the categorization model")
url = BASE_URL + '/categorization-model/{}'.format(mid)
_print_url("GET", url)
res = requests.get(url)
data = res.json()
for key, val in data.items():
if "filenames" not in key:
print(' - {}: {}'.format(key, val))
threshold = 0.0
print("\n3.d Categorize the complete dataset with this model")
url = BASE_URL + '/categorization-model/{}/predict'.format(mid)
_print_url("GET", url)
res = requests.get(url)
prediction = res.json()['prediction']
print(" => Predicting {} relevant and {} non relevant documents".format(
len(list(filter(lambda x: x>threshold, prediction))),
len(list(filter(lambda x: x<threshold, prediction)))))
print("\n3.e Test categorization accuracy")
gtfile = os.path.join(data_dir, "ground_truth_file.txt")
print(" using {}".format(gtfile))
url = BASE_URL + '/categorization-model/{}/test'.format(mid)
_print_url("POST", url)
res = requests.post(url,
json={'ground_truth_filename': gtfile})
data = res.json()
print(' => Test scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print("\n4.a. Calculate LSI")
url = BASE_URL + '/lsi/'
_print_url("POST", url)
n_components = 100
res = requests.post(url,
json={ 'n_components': n_components,
'dataset_id': dsid,
})
data = res.json()
lid = data['id']
print(' => LSI model id = {}'.format(lid))
print(' => SVD decomposition with {} dimensions explaining {:.2f}% of the variability in the data'.format(
n_components, data['explained_variance']*100))
print("\n4.b. Predict categorization with LSI")
url = BASE_URL + '/lsi/{}/predict'.format(lid)
_print_url("POST", url)
res = requests.post(url,
json={'relevant_filenames': relevant_files,
'non_relevant_filenames': non_relevant_files,
'threshold': 'auto'
})
data = res.json()
prediction = data['prediction']
print(' => Training scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print(" => Predicting {} relevant and {} non relevant documents".format(
len(list(filter(lambda x: x>threshold, prediction))),
len(list(filter(lambda x: x<threshold, prediction)))))
print("\n4.c. Test categorization with LSI")
url = BASE_URL + '/lsi/{}/test'.format(lid)
_print_url("POST", url)
res = requests.post(url,
json={'relevant_filenames': relevant_files,
'non_relevant_filenames': non_relevant_files,
'threshold': 'auto',
'ground_truth_filename': gtfile
})
data = res.json()
print(' => Test scores: F1 score = {:.2f}, recall = {:.2f}, precision = {:.2f}'.format(
data['F1_score'], data['recall_score'], data['precision_score'], ))
print("\n5.a Delete the extracted features (not actually calling this)")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
_print_url("DELETE", url)