In [1]:
%matplotlib inline

Clustering Example [REST API]

Cluster documents into groups of similar documents using the FreeDiscovery REST API (this example assumes a FreeDiscovery server running at http://localhost:5001).


In [2]:
import numpy as np
import pandas as pd
from time import time
import requests

pd.options.display.float_format = '{:,.3f}'.format


def repr_clustering(labels, terms):
    """Summarize a clustering: for each cluster, list its top terms
    and the number of documents assigned to it."""
    out = []
    for ridx, row in enumerate(terms):
        out.append({'cluster_names': row, 'N_documents': (labels == ridx).sum()})
    out = pd.DataFrame(out).sort_values('N_documents', ascending=False)
    return out
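
# e.g. repr_clustering(np.array([0, 0, 1]), [['enron', 'energy'], ['test', 'oct']])
# yields a two-row DataFrame with N_documents equal to 2 and 1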

dataset_name = "treclegal09_2k_subset"     # see list of available datasets

BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL

print(" 0. Load the test dataset")
url = BASE_URL + '/datasets/{}'.format(dataset_name)
print(" POST", url)
res = requests.get(url).json()

# To use a custom dataset, simply specify the following variables
data_dir = res['data_dir']

# # 1. Feature extraction (non-hashed)

print("\n1.a Load dataset and initalize feature extraction")
url = BASE_URL + '/feature-extraction'
print(" POST", url)
fe_opts = {'data_dir': data_dir,
           'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
           'use_idf': 1, 'sublinear_tf': 1, 'binary': 0, 'n_features': 30001,
           'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2",
           'use_hashing': False,  # hashing should be disabled for clustering
           'min_df': 4, 'max_df': 0.75
           }
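# (a likely reason: the hashing trick keeps no vocabulary, so hashed feature
#  indices cannot be mapped back to the human-readable cluster terms in step 2.b)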
res = requests.post(url, json=fe_opts).json()

dsid = res['id']
print("   => received {}".format(list(res.keys())))
print("   => dsid = {}".format(dsid))


print("\n1.b Run feature extraction")
# progress status is available for the hashed version only
url = BASE_URL+'/feature-extraction/{}'.format(dsid)
print(" POST", url)
res = requests.post(url)
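res.raise_for_status()  # optional: fail fast if the extraction request was rejected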

print("\n1.d. check the parameters of the extracted features")
url = BASE_URL + '/feature-extraction/{}'.format(dsid)
print(' GET', url)
res = requests.get(url).json()

print('\n'.join(['     - {}: {}'.format(key, val) for key, val in res.items() \
                                                  if "filenames" not in key]))


# # 2. Document Clustering (LSI + K-Means)

print("\n2.a. Document clustering (LSI + K-means)")

url = BASE_URL + '/clustering/k-mean/'
print(" POST", url)
t0 = time()
res = requests.post(url,
                    json={'dataset_id': dsid,
                          'n_clusters': 10,
                          'lsi_components': 50
                          }).json()

mid = res['id']
print("     => model id = {}".format(mid))

print("\n2.b. Computing cluster labels")
url = BASE_URL + '/clustering/k-mean/{}'.format(mid)
print(" POST", url)
res = requests.get(url,
                   json={'n_top_words': 6
                         }).json()
t1 = time()

print('    .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(np.array(res['labels']), res['cluster_terms']))
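
# For intuition, the server-side "LSI + K-means" pipeline roughly corresponds to the
# scikit-learn sketch below: a local approximation on a toy corpus, assuming
# scikit-learn is installed locally; parameters are illustrative and this is not
# FreeDiscovery's exact implementation. Documents are TF-IDF vectorized, projected
# onto a low-rank LSI (truncated SVD) space, re-normalized, and clustered with K-means.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans

toy_docs = ["energy trading agreement", "power market services",
            "outlook migration meeting", "server migration team"]
X_toy = TfidfVectorizer(stop_words='english').fit_transform(toy_docs)
lsi = make_pipeline(TruncatedSVD(n_components=2), Normalizer(copy=False))
toy_labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(
    lsi.fit_transform(X_toy))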


# # 3. Document Clustering (LSI + Ward Hierarchical Clustering)

print("\n2.a. Document clustering (LSI + Ward HC)")

url = BASE_URL + '/clustering/ward_hc/'
print(" POST", url)
t0 = time()
res = requests.post(url,
                    json={'dataset_id': dsid,
                          'n_clusters': 10,
                          'lsi_components': 50,
                          'n_neighbors': 5  # connectivity constraint: number of nearest neighbors used to build the connectivity graph
                          }).json()

mid = res['id']
print("     => model id = {}".format(mid))

print("\n2.b. Computing cluster labels")
url = BASE_URL + '/clustering/ward_hc/{}'.format(mid)
print("POST", url)
res = requests.get(url,
                   json={'n_top_words': 6
                         }).json()
t1 = time()

print('    .. computed in {:.1f}s'.format(t1 - t0))
print(repr_clustering(np.array(res['labels']), res['cluster_terms']))
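
# Similarly, the Ward hierarchical clustering step roughly corresponds to this
# scikit-learn sketch (again a local, illustrative approximation, reusing the toy
# LSI projection from the K-means sketch above): the 'n_neighbors' parameter sent
# to the server plays the role of the k-nearest-neighbors connectivity graph that
# restricts which documents may be merged.
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import AgglomerativeClustering

X_lsi_toy = lsi.fit_transform(X_toy)
connectivity = kneighbors_graph(X_lsi_toy, n_neighbors=2, include_self=False)
ward_labels = AgglomerativeClustering(n_clusters=2, linkage='ward',
                                      connectivity=connectivity).fit_predict(X_lsi_toy)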


 0. Load the test dataset
 GET http://localhost:5001/api/v0/datasets/treclegal09_2k_subset

1.a Load dataset and initialize feature extraction
 POST http://localhost:5001/api/v0/feature-extraction
   => received ['filenames', 'id']
   => dsid = 906d6a5e4b634fb882bd64d0d975e66f

1.b Run feature extraction
 POST http://localhost:5001/api/v0/feature-extraction/906d6a5e4b634fb882bd64d0d975e66f

1.c. Check the parameters of the extracted features
 GET http://localhost:5001/api/v0/feature-extraction/906d6a5e4b634fb882bd64d0d975e66f
     - n_features: 30001
     - max_df: 0.75
     - ngram_range: [1, 1]
     - chunk_size: 2000
     - sublinear_tf: True
     - n_samples_processed: 2465
     - analyzer: word
     - stop_words: english
     - binary: False
     - norm: l2
     - use_hashing: False
     - n_samples: 2465
     - use_idf: True
     - n_jobs: -1
     - data_dir: /shared/code/wking_code/freediscovery_shared/treclegal09_2k_subset/data
     - min_df: 4.0

2.a. Document clustering (LSI + K-means)
 POST http://localhost:5001/api/v0/clustering/k-mean/
     => model id = 7f19bf164a4a47408519e3bebcc3e964

2.b. Computing cluster labels
 GET http://localhost:5001/api/v0/clustering/k-mean/7f19bf164a4a47408519e3bebcc3e964
    .. computed in 2.1s
   N_documents                                      cluster_names
4          486  ['enron', 'energy', 'trading', 'services', 'co...
3          482  ['shackleton', 'test', 'recipients', 'group', ...
2          425      ['tenet', 'test', 'oct', 'nov', 'tue', 'wed']
5          311  ['ect', 'hou', 'nemec', 'shackleton', 'enron_d...
1          225  ['ect', 'recipients', 'group', 'haedicke', 'ad...
9          178  ['teneo', 'recipients', 'administrative', 'ric...
0          135  ['shall', 'party', 'agreement', 'transaction',...
7          102  ['sanders', 'nov', 'ect', 'test', 'meeting', '...
8           64  ['migration', 'outlook', 'team', 'mtg', 'oct',...
6           57  ['rewrite', 'server', 'address', 'smtp', 'mail...

3.a. Document clustering (LSI + Ward HC)
 POST http://localhost:5001/api/v0/clustering/ward_hc/
     => model id = 1cbfeea563c7431d8c17072f8e65b84a

3.b. Computing cluster labels
 GET http://localhost:5001/api/v0/clustering/ward_hc/1cbfeea563c7431d8c17072f8e65b84a
    .. computed in 3.4s
   N_documents                                      cluster_names
5          443      ['tenet', 'test', 'oct', 'nov', 'tue', 'mon']
1          423  ['recipients', 'administrative', 'group', 'tes...
2          398  ['enron', 'energy', 'power', 'trade', 'market'...
0          393  ['ect', 'hou', 'tana', 'group', 'recipients', ...
6          342  ['shackleton', 'ect', 'test', 'group', 'recipi...
4          166  ['shall', 'party', 'agreement', 'transaction',...
8           95  ['sanders', 'nov', 'ect', 'test', 'lunch', 'me...
3           85  ['enron_development', 'ect', 'shackleton', 'ho...
9           64  ['migration', 'outlook', 'team', 'mtg', 'oct',...
7           56  ['rewrite', 'server', 'address', 'smtp', 'mail...

In [ ]: