In [ ]:
import h2o
# Called once, before any H2OFrame is created in the cells below.
h2o.init()
In [3]:
from collections import OrderedDict
# Three short sample texts on unrelated topics; each list entry is one document.
documents = [
    'H2O is an in-memory platform for distributed, scalable machine learning. H2O uses familiar interfaces like R, Python, Scala, Java, JSON and the Flow notebook/web interface, and works seamlessly with big data technologies like Hadoop and Spark.',
    'Ice hockey is a contact team sport played on ice, usually in a rink, in which two teams of skaters use their sticks to shoot a vulcanized rubber puck into their opponent\'s net to score goals. The sport is known to be fast-paced and physical.',
    'An antibody (Ab), also known as an immunoglobulin (Ig), is a large, Y-shaped protein produced mainly by plasma cells that is used by the immune system to neutralize pathogens such as pathogenic bacteria and viruses.'
]
# Document IDs are simply the positions in `documents`: 0 .. len(documents) - 1.
doc_ids = list(range(len(documents)))

# Two-column frame; the names and value lists are zipped into an ordered mapping
# so the column order stays DocID, Document.
input_frame = h2o.H2OFrame(
    OrderedDict(zip(('DocID', 'Document'), (doc_ids, documents))),
    column_types=['numeric', 'string']
)
input_frame.head()
Out[3]:
In [4]:
from h2o.information_retrieval.tf_idf import tf_idf
# Both optional switches are turned off for this run and are named explicitly.
# NOTE(review): preprocess=False is applied here to a frame holding whole documents,
# whereas the later cell that passes preprocess=False feeds one word per row --
# confirm that skipping pre-processing is intended for this input.
tf_idf_out = tf_idf(
    input_frame,
    "DocID",
    "Document",
    preprocess=False,
    case_sensitive=False
)
tf_idf_out.head()
Out[4]:
In [5]:
from IPython.display import DisplayObject, display
VALUES_CNT_TO_SHOW = 3


def tf_idf_output_summary(tf_idf_out, document_ids=None, values_cnt=None):
    """Print and display the highest and lowest TF-IDF rows of every document.

    For each document ID, the rows of ``tf_idf_out`` whose ``DocID`` equals that ID
    are sorted by the ``TF-IDF`` column. The tail of the sorted frame is shown as the
    highest values and the head as the lowest values.

    :param tf_idf_out: frame with at least ``DocID`` and ``TF-IDF`` columns. It must
        support boolean-mask indexing, ``sort(by=...)``, ``head(n)`` and ``tail(n)``
        (presumably the frame returned by ``tf_idf`` -- verify against caller).
    :param document_ids: iterable of document IDs to summarise. When omitted, the
        notebook-level ``doc_ids`` list is used, which keeps the one-argument call
        working exactly as before.
    :param values_cnt: number of rows shown at each end. When omitted,
        ``VALUES_CNT_TO_SHOW`` is used.
    :return: None; all output is printed or passed to ``display``.
    """
    # Resolve the fall-backs at call time so later changes to the notebook-level
    # names are picked up, just as the original global look-ups were.
    if document_ids is None:
        document_ids = doc_ids
    if values_cnt is None:
        values_cnt = VALUES_CNT_TO_SHOW

    for doc_id in document_ids:
        rows_of_document = tf_idf_out[tf_idf_out['DocID'] == doc_id]
        # The sort is relied on to be ascending, so tail() holds the largest values
        # and head() the smallest -- TODO confirm the default sort direction.
        sorted_doc_tf_idfs = rows_of_document.sort(by='TF-IDF')

        print('The highest TF-IDF values for document ' + str(doc_id) +':')
        display(sorted_doc_tf_idfs.tail(values_cnt))

        print('The lowest TF-IDF values for document ' + str(doc_id) +':')
        display(sorted_doc_tf_idfs.head(values_cnt))

        print('\n')
In [6]:
# Show the highest and lowest TF-IDF rows per document for the frame computed in the previous cell.
tf_idf_output_summary(tf_idf_out)
In [7]:
# Split every document by hand: one (document id, word) pair per
# whitespace-separated token, in document order.
preprocessed_data = [
    (doc_index, token)
    for doc_index, text in enumerate(documents)
    for token in text.split()
]

# Same column names and types as the raw-document frame, but the text column
# now carries a single word per row.
preprocessed_input_frame = h2o.H2OFrame(
    preprocessed_data,
    column_names=['DocID', 'Document'],
    column_types=['numeric', 'string']
)
preprocessed_input_frame.head()
Out[7]:
In [8]:
# The input frame already holds one word per row, so preprocess=False is passed.
# Note that this rebinds tf_idf_out, replacing the result of the earlier run.
tf_idf_out = tf_idf(preprocessed_input_frame, 'DocID', 'Document', preprocess=False)
tf_idf_out.head()
Out[8]:
In [9]:
# Summarise the TF-IDF frame computed from the word-per-row input.
tf_idf_output_summary(tf_idf_out)
In [10]:
# Build the raw-document frame again, with the same two columns and column
# types as the first input frame.
input_frame = h2o.H2OFrame(
    OrderedDict(zip(('DocID', 'Document'), (doc_ids, documents))),
    column_types=['numeric', 'string']
)
In [11]:
# Raw documents again, this time with case_sensitive=False; preprocess is not
# passed and therefore keeps its default. tf_idf_out is rebound once more.
tf_idf_out = tf_idf(input_frame, 'DocID', 'Document', case_sensitive=False)
tf_idf_out.head()
Out[11]:
In [12]:
# Summarise the TF-IDF frame computed with case_sensitive=False.
tf_idf_output_summary(tf_idf_out)