In [1]:
import h2o

In [2]:
h2o.init(max_mem_size = 2)


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.7.0_80"; Java(TM) SE Runtime Environment (build 1.7.0_80-b15); Java HotSpot(TM) 64-Bit Server VM (build 24.80-b11, mixed mode)
  Starting server from /usr/local/lib/python2.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp3rsYDP
  JVM stdout: /tmp/tmp3rsYDP/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp3rsYDP/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
Warning: Your H2O cluster version is too old (7 months and 24 days)! Please download and install the latest version from http://h2o.ai/download/
H2O cluster uptime: 03 secs
H2O cluster version: 3.10.3.4
H2O cluster version age: 7 months and 24 days !!!
H2O cluster name: H2O_from_python_unknownUser_0s0jkt
H2O cluster total nodes: 1
H2O cluster free memory: 1.778 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
Python version: 2.7.6 final

In [3]:
h2o.remove_all()

In [4]:
help(h2o)


Help on package h2o:

NAME
    h2o - :mod:`h2o` -- module for using H2O services.

FILE
    /usr/local/lib/python2.7/dist-packages/h2o/__init__.py

DESCRIPTION
    (please add description).

PACKAGE CONTENTS
    assembly
    astfun
    backend (package)
    cross_validation
    demos
    display
    estimators (package)
    exceptions
    expr
    frame
    grid (package)
    group_by
    h2o
    job
    model (package)
    schemas (package)
    transforms (package)
    two_dim_table
    utils (package)

SUBMODULES
    __init__

FUNCTIONS
    api(endpoint, data=None, json=None, filename=None, save_to=None)
        Perform a REST API request to a previously connected server.
        
        This function is mostly for internal purposes, but may occasionally be useful for direct access to
        the backend H2O server. It has same parameters as :meth:`H2OConnection.request <h2o.backend.H2OConnection.request>`.
    
    as_list(data, use_pandas=True, header=True)
        Convert an H2O data object into a python-specific object.
        
        WARNING! This will pull all data local!
        
        If Pandas is available (and use_pandas is True), then pandas will be used to parse the
        data frame. Otherwise, a list-of-lists populated by character data will be returned (so
        the types of data will all be str).
        
        :param data: an H2O data object.
        :param use_pandas: If True, try to use pandas for reading in the data.
        :param header: If True, return column names as first element in list
        
        :returns: List of lists (Rows x Columns).
    
    assign(data, xid)
        (internal) Assign new id to the frame.
        
        :param data: an H2OFrame whose id should be changed
        :param xid: new id for the frame.
        :returns: the passed frame.
    
    cluster()
        Return :class:`H2OCluster` object describing the backend H2O cloud.
    
    cluster_info(*args, **kwargs)
        Deprecated, use ``h2o.cluster().show_status()``.
    
    cluster_status(*args, **kwargs)
        Deprecated, use ``h2o.cluster().show_status(True)``.
    
    connect(server=None, url=None, ip=None, port=None, https=None, verify_ssl_certificates=None, auth=None, proxy=None, cluster_id=None, cookies=None, verbose=True, config=None)
        Connect to an existing H2O server, remote or local.
        
        There are two ways to connect to a server: either pass a `server` parameter containing an instance of
        an H2OLocalServer, or specify `ip` and `port` of the server that you want to connect to.
        
        :param server: An H2OLocalServer instance to connect to (optional).
        :param url: Full URL of the server to connect to (can be used instead of `ip` + `port` + `https`).
        :param ip: The ip address (or host name) of the server where H2O is running.
        :param port: Port number that H2O service is listening to.
        :param https: Set to True to connect via https:// instead of http://.
        :param verify_ssl_certificates: When using https, setting this to False will disable SSL certificates verification.
        :param auth: Either a (username, password) pair for basic authentication, or one of the requests.auth
                     authenticator objects.
        :param proxy: Proxy server address.
        :param cluster_id: Name of the H2O cluster to connect to. This option is used from Steam only.
        :param cookies: Cookie (or list of) to add to request
        :param verbose: Set to False to disable printing connection status messages.
        :param connection_conf: Connection configuration object encapsulating connection parameters.
        :returns: the new :class:`H2OConnection` object.
    
    connection()
        Return the current :class:`H2OConnection` handler.
    
    create_frame(frame_id=None, rows=10000, cols=10, randomize=True, real_fraction=None, categorical_fraction=None, integer_fraction=None, binary_fraction=None, time_fraction=None, string_fraction=None, value=0, real_range=100, factors=100, integer_range=100, binary_ones_fraction=0.02, missing_fraction=0.01, has_response=False, response_factors=2, positive_response=False, seed=None, seed_for_column_types=None)
        Create a new frame with random data.
        
        Creates a data frame in H2O with real-valued, categorical, integer, and binary columns specified by the user.
        
        :param frame_id: the destination key. If empty, this will be auto-generated.
        :param rows: the number of rows of data to generate.
        :param cols: the number of columns of data to generate. Excludes the response column if has_response is True.
        :param randomize: If True, data values will be randomly generated. This must be True if either
            categorical_fraction or integer_fraction is non-zero.
        :param value: if randomize is False, then all real-valued entries will be set to this value.
        :param real_range: the range of randomly generated real values.
        :param real_fraction: the fraction of columns that are real-valued.
        :param categorical_fraction: the fraction of total columns that are categorical.
        :param factors: the number of (unique) factor levels in each categorical column.
        :param integer_fraction: the fraction of total columns that are integer-valued.
        :param integer_range: the range of randomly generated integer values.
        :param binary_fraction: the fraction of total columns that are binary-valued.
        :param binary_ones_fraction: the fraction of values in a binary column that are set to 1.
        :param time_fraction: the fraction of randomly created date/time columns.
        :param string_fraction: the fraction of randomly created string columns.
        :param missing_fraction: the fraction of total entries in the data frame that are set to NA.
        :param has_response: A logical value indicating whether an additional response column should be prepended to the
            final H2O data frame. If set to True, the total number of columns will be ``cols + 1``.
        :param response_factors: if has_response is True, then this variable controls the type of the "response" column:
            setting response_factors to 1 will generate real-valued response, any value greater or equal than 2 will
            create categorical response with that many categories.
        :param positive_reponse: when response variable is present and of real type, this will control whether it
            contains positive values only, or both positive and negative.
        :param seed: a seed used to generate random values when ``randomize`` is True.
        :param seed_for_column_types: a seed used to generate random column types when ``randomize`` is True.
        
        :returns: an :class:`H2OFrame` object
    
    deep_copy(data, xid)
        Create a deep clone of the frame ``data``.
        
        :param data: an H2OFrame to be cloned
        :param xid: (internal) id to be assigned to the new frame.
        :returns: new :class:`H2OFrame` which is the clone of the passed frame.
    
    demo(funcname, interactive=True, echo=True, test=False)
        H2O built-in demo facility.
        
        :param funcname: A string that identifies the h2o python function to demonstrate.
        :param interactive: If True, the user will be prompted to continue the demonstration after every segment.
        :param echo: If True, the python commands that are executed will be displayed.
        :param test: If True, `h2o.init()` will not be called (used for pyunit testing).
        
        :example:
            >>> import h2o
            >>> h2o.demo("gbm")
    
    download_all_logs(dirname=u'.', filename=None)
        Download H2O log files to disk.
        
        :param dirname: a character string indicating the directory that the log file should be saved in.
        :param filename: a string indicating the name that the CSV file should be.
        
        :returns: path of logs written.
    
    download_csv(data, filename)
        Download an H2O data set to a CSV file on the local disk.
        
        Warning: Files located on the H2O server may be very large! Make sure you have enough
        hard drive space to accommodate the entire file.
        
        :param data: an H2OFrame object to be downloaded.
        :param filename: name for the CSV file where the data should be saved to.
    
    download_pojo(model, path=u'', get_jar=True)
        Download the POJO for this model to the directory specified by path; if path is "", then dump to screen.
        
        :param model: the model whose scoring POJO should be retrieved.
        :param path: an absolute path to the directory where POJO should be saved.
        :param get_jar: retrieve the h2o-genmodel.jar also (will be saved to the same folder ``path``).
        :returns: location of the downloaded POJO file.
    
    export_file(frame, path, force=False, parts=1)
        Export a given H2OFrame to a path on the machine this python session is currently connected to.
        
        :param frame: the Frame to save to disk.
        :param path: the path to the save point on disk.
        :param force: if True, overwrite any preexisting file with the same path
        :param parts: enables export to multiple 'part' files instead of just a single file.
            Convenient for large datasets that take too long to store in a single file.
            Use parts=-1 to instruct H2O to determine the optimal number of part files or
            specify your desired maximum number of part files. Path needs to be a directory
            when exporting to multiple files, also that directory must be empty.
            Default is ``parts = 1``, which is to export to a single file.
    
    frame(frame_id)
        Retrieve metadata for an id that points to a Frame.
        
        :param frame_id: the key of a Frame in H2O.
        
        :returns: dict containing the frame meta-information.
    
    frames()
        Retrieve all the Frames.
        
        :returns: Meta information on the frames
    
    get_frame(frame_id)
        Obtain a handle to the frame in H2O with the frame_id key.
        
        :param str frame_id: id of the frame to retrieve.
        :returns: an :class:`H2OFrame` object
    
    get_grid(grid_id)
        Return the specified grid.
        
        :param grid_id: The grid identification in h2o
        
        :returns: an :class:`H2OGridSearch` instance.
    
    get_model(model_id)
        Load a model from the server.
        
        :param model_id: The model identification in H2O
        
        :returns: Model object, a subclass of H2OEstimator
    
    get_timezone(*args, **kwargs)
        Deprecated, use ``h2o.cluster().timezone``.
    
    import_file(path=None, destination_frame=None, parse=True, header=0, sep=None, col_names=None, col_types=None, na_strings=None, pattern=None)
        Import a dataset that is already on the cluster.
        
        The path to the data must be a valid path for each node in the H2O cluster. If some node in the H2O cluster
        cannot see the file, then an exception will be thrown by the H2O cluster. Does a parallel/distributed
        multi-threaded pull of the data. The main difference between this method and :func:`upload_file` is that
        the latter works with local files, whereas this method imports remote files (i.e. files local to the server).
        If you running H2O server on your own maching, then both methods behave the same.
        
        :param path: path(s) specifying the location of the data to import or a path to a directory of files to import
        :param destination_frame: The unique hex key assigned to the imported file. If none is given, a key will be
            automatically generated.
        :param parse: If True, the file should be parsed after import.
        :param header: -1 means the first line is data, 0 means guess, 1 means first line is header.
        :param sep: The field separator character. Values on each line of the file are separated by
            this character. If not provided, the parser will automatically detect the separator.
        :param col_names: A list of column names for the file.
        :param col_types: A list of types or a dictionary of column names to types to specify whether columns
            should be forced to a certain type upon import parsing. If a list, the types for elements that are
            one will be guessed. The possible types a column may have are:
        
            - "unknown" - this will force the column to be parsed as all NA
            - "uuid"    - the values in the column must be true UUID or will be parsed as NA
            - "string"  - force the column to be parsed as a string
            - "numeric" - force the column to be parsed as numeric. H2O will handle the compression of the numeric
              data in the optimal manner.
            - "enum"    - force the column to be parsed as a categorical column.
            - "time"    - force the column to be parsed as a time column. H2O will attempt to parse the following
              list of date time formats: (date) "yyyy-MM-dd", "yyyy MM dd", "dd-MMM-yy", "dd MMM yy", (time)
              "HH:mm:ss", "HH:mm:ss:SSS", "HH:mm:ss:SSSnnnnnn", "HH.mm.ss" "HH.mm.ss.SSS", "HH.mm.ss.SSSnnnnnn".
              Times can also contain "AM" or "PM".
        :param na_strings: A list of strings, or a list of lists of strings (one list per column), or a dictionary
            of column names to strings which are to be interpreted as missing values.
        :param pattern: Character string containing a regular expression to match file(s) in the folder if `path` is a
            directory.
        
        :returns: a new :class:`H2OFrame` instance.
        
        :examples:
            >>> # Single file import
            >>> iris = import_file("h2o-3/smalldata/iris.csv")
            >>> # Return all files in the folder iris/ matching the regex r"iris_.*\.csv"
            >>> iris_pattern = h2o.import_file(path = "h2o-3/smalldata/iris",
            ...                                pattern = "iris_.*\.csv")
    
    import_sql_select(connection_url, select_query, username, password, optimize=True)
        Import the SQL table that is the result of the specified SQL query to H2OFrame in memory.
        
        Creates a temporary SQL table from the specified sql_query.
        Runs multiple SELECT SQL queries on the temporary table concurrently for parallel ingestion, then drops the table.
        Be sure to start the h2o.jar in the terminal with your downloaded JDBC driver in the classpath::
        
          java -cp <path_to_h2o_jar>:<path_to_jdbc_driver_jar> water.H2OApp
        
        Also see h2o.import_sql_table. Currently supported SQL databases are MySQL, PostgreSQL, and MariaDB. Support
        for Oracle 12g and Microsoft SQL Server is forthcoming.
        
        :param connection_url: URL of the SQL database connection as specified by the Java Database Connectivity (JDBC)
            Driver. For example, "jdbc:mysql://localhost:3306/menagerie?&useSSL=false"
        :param select_query: SQL query starting with `SELECT` that returns rows from one or more database tables.
        :param username: username for SQL server
        :param password: password for SQL server
        :param optimize: optimize import of SQL table for faster imports. Experimental.
        
        :returns: an :class:`H2OFrame` containing data of the specified SQL query.
        
        :examples:
            >>> conn_url = "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false"
            >>> select_query = "SELECT bikeid from citibike20k"
            >>> username = "root"
            >>> password = "abc123"
            >>> my_citibike_data = h2o.import_sql_select(conn_url, select_query,
            ...                                          username, password)
    
    import_sql_table(connection_url, table, username, password, columns=None, optimize=True)
        Import SQL table to H2OFrame in memory.
        
        Assumes that the SQL table is not being updated and is stable.
        Runs multiple SELECT SQL queries concurrently for parallel ingestion.
        Be sure to start the h2o.jar in the terminal with your downloaded JDBC driver in the classpath::
        
            java -cp <path_to_h2o_jar>:<path_to_jdbc_driver_jar> water.H2OApp
        
        Also see :func:`import_sql_select`.
        Currently supported SQL databases are MySQL, PostgreSQL, and MariaDB. Support for Oracle 12g and Microsoft SQL
        Server is forthcoming.
        
        :param connection_url: URL of the SQL database connection as specified by the Java Database Connectivity (JDBC)
            Driver. For example, "jdbc:mysql://localhost:3306/menagerie?&useSSL=false"
        :param table: name of SQL table
        :param columns: a list of column names to import from SQL table. Default is to import all columns.
        :param username: username for SQL server
        :param password: password for SQL server
        :param optimize: optimize import of SQL table for faster imports. Experimental.
        
        :returns: an :class:`H2OFrame` containing data of the specified SQL table.
        
        :examples:
            >>> conn_url = "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false"
            >>> table = "citibike20k"
            >>> username = "root"
            >>> password = "abc123"
            >>> my_citibike_data = h2o.import_sql_table(conn_url, table, username, password)
    
    init(url=None, ip=None, port=None, https=None, insecure=None, username=None, password=None, cluster_id=None, cookies=None, proxy=None, start_h2o=True, nthreads=-1, ice_root=None, enable_assertions=True, max_mem_size=None, min_mem_size=None, strict_version_check=None, **kwargs)
        Attempt to connect to a local server, or if not successful start a new server and connect to it.
        
        :param url: Full URL of the server to connect to (can be used instead of `ip` + `port` + `https`).
        :param ip: The ip address (or host name) of the server where H2O is running.
        :param port: Port number that H2O service is listening to.
        :param https: Set to True to connect via https:// instead of http://.
        :param insecure: When using https, setting this to True will disable SSL certificates verification.
        :param username: Username and
        :param password: Password for basic authentication.
        :param cluster_id: Name of the H2O cluster to connect to. This option is used from Steam only.
        :param cookies: Cookie (or list of) to add to each request.
        :param proxy: Proxy server address.
        :param start_h2o: If False, do not attempt to start an h2o server when connection to an existing one failed.
        :param nthreads: "Number of threads" option when launching a new h2o server.
        :param ice_root: Directory for temporary files for the new h2o server.
        :param enable_assertions: Enable assertions in Java for the new h2o server.
        :param max_mem_size: Maximum memory to use for the new h2o server.
        :param min_mem_size: Minimum memory to use for the new h2o server.
        :param strict_version_check: If True, an error will be raised if the client and server versions don't match.
        :param kwargs: (all other deprecated attributes)
    
    interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None)
        Categorical Interaction Feature Creation in H2O.
        
        Creates a frame in H2O with n-th order interaction features between categorical columns, as specified by
        the user.
        
        :param data: the H2OFrame that holds the target categorical columns.
        :param factors: factor columns (either indices or column names).
        :param pairwise: If True, create pairwise interactions between factors (otherwise create one
            higher-order interaction). Only applicable if there are 3 or more factors.
        :param max_factors: Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
            catch-all factor will be made).
        :param min_occurrence: Min. occurrence threshold for factor levels in pair-wise interaction terms
        :param destination_frame: a string indicating the destination key. If empty, this will be auto-generated by H2O.
        
        :returns: :class:`H2OFrame`
    
    lazy_import(path, pattern=None)
        Import a single file or collection of files.
        
        :param path: A path to a data file (remote or local).
        :param pattern: Character string containing a regular expression to match file(s) in the folder.
        :returns: either a :class:`H2OFrame` with the content of the provided file, or a list of such frames if
            importing multiple files.
    
    list_timezones(*args, **kwargs)
        Deprecated, use ``h2o.cluster().list_timezones()``.
    
    load_dataset(relative_path)
        Imports a data file within the 'h2o_data' folder.
    
    load_model(path)
        Load a saved H2O model from disk.
        
        :param path: the full path of the H2O Model to be imported.
        
        :returns: an :class:`H2OEstimator` object
        
        :examples:
            >>> path = h2o.save_mode(my_model, dir=my_path)
            >>> h2o.load_model(path)
    
    log_and_echo(message=u'')
        Log a message on the server-side logs.
        
        This is helpful when running several pieces of work one after the other on a single H2O
        cluster and you want to make a notation in the H2O server side log where one piece of
        work ends and the next piece of work begins.
        
        Sends a message to H2O for logging. Generally used for debugging purposes.
        
        :param message: message to write to the log.
    
    ls()
        List keys on an H2O Cluster.
    
    make_metrics(predicted, actual, domain=None, distribution=None)
        Create Model Metrics from predicted and actual values in H2O.
        
        :param H2OFrame predicted: an H2OFrame containing predictions.
        :param H2OFrame actuals: an H2OFrame containing actual values.
        :param domain: list of response factors for classification.
        :param distribution: distribution for regression.
    
    network_test(*args, **kwargs)
        Deprecated, use ``h2o.cluster().network_test()``.
    
    no_progress()
        Disable the progress bar from flushing to stdout.
        
        The completed progress bar is printed when a job is complete so as to demarcate a log file.
    
    parse_raw(setup, id=None, first_line_is_header=0)
        Parse dataset using the parse setup structure.
        
        :param setup: Result of ``h2o.parse_setup()``
        :param id: an id for the frame.
        :param first_line_is_header: -1, 0, 1 if the first line is to be used as the header
        
        :returns: an :class:`H2OFrame` object.
    
    parse_setup(raw_frames, destination_frame=None, header=0, separator=None, column_names=None, column_types=None, na_strings=None)
        Retrieve H2O's best guess as to what the structure of the data file is.
        
        During parse setup, the H2O cluster will make several guesses about the attributes of
        the data. This method allows a user to perform corrective measures by updating the
        returning dictionary from this method. This dictionary is then fed into `parse_raw` to
        produce the H2OFrame instance.
        
        :param raw_frames: a collection of imported file frames
        :param destination_frame: The unique hex key assigned to the imported file. If none is given, a key will
            automatically be generated.
        :param header: -1 means the first line is data, 0 means guess, 1 means first line is header.
        :param separator: The field separator character. Values on each line of the file are separated by
            this character. If not provided, the parser will automatically detect the separator.
        :param column_names: A list of column names for the file.
        :param column_types: A list of types or a dictionary of column names to types to specify whether columns
            should be forced to a certain type upon import parsing. If a list, the types for elements that are
            one will be guessed. The possible types a column may have are:
        
            - "unknown" - this will force the column to be parsed as all NA
            - "uuid"    - the values in the column must be true UUID or will be parsed as NA
            - "string"  - force the column to be parsed as a string
            - "numeric" - force the column to be parsed as numeric. H2O will handle the compression of the numeric
              data in the optimal manner.
            - "enum"    - force the column to be parsed as a categorical column.
            - "time"    - force the column to be parsed as a time column. H2O will attempt to parse the following
              list of date time formats: (date) "yyyy-MM-dd", "yyyy MM dd", "dd-MMM-yy", "dd MMM yy", (time)
              "HH:mm:ss", "HH:mm:ss:SSS", "HH:mm:ss:SSSnnnnnn", "HH.mm.ss" "HH.mm.ss.SSS", "HH.mm.ss.SSSnnnnnn".
              Times can also contain "AM" or "PM".
        
        :param na_strings: A list of strings, or a list of lists of strings (one list per column), or a dictionary
            of column names to strings which are to be interpreted as missing values.
        
        :returns: a dictionary containing parse parameters guessed by the H2O backend.
    
    rapids(expr)
        Execute a Rapids expression.
        
        :param expr: The rapids expression (ascii string).
        
        :returns: The JSON response (as a python dictionary) of the Rapids execution.
    
    remove(x)
        Remove object(s) from H2O.
        
        :param x: H2OFrame, H2OEstimator, or string, or a list of those things: the object(s) or unique id(s)
            pointing to the object(s) to be removed.
    
    remove_all()
        Remove all objects from H2O.
    
    save_model(model, path=u'', force=False)
        Save an H2O Model object to disk.
        
        :param model: The model object to save.
        :param path: a path to save the model at (hdfs, s3, local)
        :param force: if True overwrite destination directory in case it exists, or throw exception if set to False.
        
        :returns: the path of the saved model
    
    set_timezone(*args, **kwargs)
        Deprecated, set ``h2o.cluster().timezone`` instead.
    
    show_progress()
        Enable the progress bar (it is enabled by default).
    
    shutdown(*args, **kwargs)
        Deprecated, use ``h2o.cluster().shutdown()``.
    
    upload_file(path, destination_frame=None, header=0, sep=None, col_names=None, col_types=None, na_strings=None)
        Upload a dataset from the provided local path to the H2O cluster.
        
        Does a single-threaded push to H2O. Also see :meth:`import_file`.
        
        :param path: A path specifying the location of the data to upload.
        :param destination_frame:  The unique hex key assigned to the imported file. If none is given, a key will
            be automatically generated.
        :param header: -1 means the first line is data, 0 means guess, 1 means first line is header.
        :param sep: The field separator character. Values on each line of the file are separated by
            this character. If not provided, the parser will automatically detect the separator.
        :param col_names: A list of column names for the file.
        :param col_types: A list of types or a dictionary of column names to types to specify whether columns
            should be forced to a certain type upon import parsing. If a list, the types for elements that are
            one will be guessed. The possible types a column may have are:
        
            - "unknown" - this will force the column to be parsed as all NA
            - "uuid"    - the values in the column must be true UUID or will be parsed as NA
            - "string"  - force the column to be parsed as a string
            - "numeric" - force the column to be parsed as numeric. H2O will handle the compression of the numeric
              data in the optimal manner.
            - "enum"    - force the column to be parsed as a categorical column.
            - "time"    - force the column to be parsed as a time column. H2O will attempt to parse the following
              list of date time formats: (date) "yyyy-MM-dd", "yyyy MM dd", "dd-MMM-yy", "dd MMM yy", (time)
              "HH:mm:ss", "HH:mm:ss:SSS", "HH:mm:ss:SSSnnnnnn", "HH.mm.ss" "HH.mm.ss.SSS", "HH.mm.ss.SSSnnnnnn".
              Times can also contain "AM" or "PM".
        :param na_strings: A list of strings, or a list of lists of strings (one list per column), or a dictionary
            of column names to strings which are to be interpreted as missing values.
        
        :returns: a new :class:`H2OFrame` instance.
        
        :examples:
            >>> frame = h2o.upload_file("/path/to/local/data")

DATA
    __all__ = (u'connect', u'init', u'api', u'connection', u'upload_file',...
    __version__ = u'3.10.3.4'

VERSION
    3.10.3.4



In [5]:
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
help(H2ODeepLearningEstimator)
help(h2o.import_file)


Help on class H2ODeepLearningEstimator in module h2o.estimators.deeplearning:

class H2ODeepLearningEstimator(h2o.estimators.estimator_base.H2OEstimator)
 |  Deep Learning
 |  
 |  Build a Deep Neural Network model using CPUs
 |  Builds a feed-forward multilayer artificial neural network on an H2OFrame
 |  
 |  Examples
 |  --------
 |    >>> import h2o
 |    >>> from h2o.estimators.deeplearning import H2ODeepLearningEstimator
 |    >>> h2o.connect()
 |    >>> rows = [[1,2,3,4,0], [2,1,2,4,1], [2,1,4,2,1], [0,1,2,34,1], [2,3,4,1,0]] * 50
 |    >>> fr = h2o.H2OFrame(rows)
 |    >>> fr[4] = fr[4].asfactor()
 |    >>> model = H2ODeepLearningEstimator()
 |    >>> model.train(x=range(4), y=4, training_frame=fr)
 |  
 |  Method resolution order:
 |      H2ODeepLearningEstimator
 |      h2o.estimators.estimator_base.H2OEstimator
 |      h2o.model.model_base.ModelBase
 |      h2o.utils.backward_compatibility.BackwardsCompatibleBase
 |      __builtin__.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, **kwargs)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  activation
 |      Activation function.
 |      
 |      One of: ``"tanh"``, ``"tanh_with_dropout"``, ``"rectifier"``, ``"rectifier_with_dropout"``, ``"maxout"``,
 |      ``"maxout_with_dropout"``  (default: ``"rectifier"``).
 |  
 |  adaptive_rate
 |      Adaptive learning rate.
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  autoencoder
 |      Auto-Encoder.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  average_activation
 |      Average activation for sparse auto-encoder. #Experimental
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  balance_classes
 |      Balance training data class counts via over/under-sampling (for imbalanced data).
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  categorical_encoding
 |      Encoding scheme for categorical features
 |      
 |      One of: ``"auto"``, ``"enum"``, ``"one_hot_internal"``, ``"one_hot_explicit"``, ``"binary"``, ``"eigen"``
 |      (default: ``"auto"``).
 |  
 |  checkpoint
 |      Model checkpoint to resume training with.
 |      
 |      Type: ``str``.
 |  
 |  class_sampling_factors
 |      Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
 |      be automatically computed to obtain class balance during training. Requires balance_classes.
 |      
 |      Type: ``List[float]``.
 |  
 |  classification_stop
 |      Stopping criterion for classification error fraction on training data (-1 to disable).
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  col_major
 |      #DEPRECATED Use a column major weight matrix for input layer. Can speed up forward propagation, but might slow
 |      down backpropagation.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  diagnostics
 |      Enable diagnostics for hidden layers.
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  distribution
 |      Distribution function
 |      
 |      One of: ``"auto"``, ``"bernoulli"``, ``"multinomial"``, ``"gaussian"``, ``"poisson"``, ``"gamma"``,
 |      ``"tweedie"``, ``"laplace"``, ``"quantile"``, ``"huber"``  (default: ``"auto"``).
 |  
 |  elastic_averaging
 |      Elastic averaging between compute nodes can improve distributed model convergence. #Experimental
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  elastic_averaging_moving_rate
 |      Elastic averaging moving rate (only if elastic averaging is enabled).
 |      
 |      Type: ``float``  (default: ``0.9``).
 |  
 |  elastic_averaging_regularization
 |      Elastic averaging regularization strength (only if elastic averaging is enabled).
 |      
 |      Type: ``float``  (default: ``0.001``).
 |  
 |  epochs
 |      How many times the dataset should be iterated (streamed), can be fractional.
 |      
 |      Type: ``float``  (default: ``10``).
 |  
 |  epsilon
 |      Adaptive learning rate smoothing factor (to avoid divisions by zero and allow progress).
 |      
 |      Type: ``float``  (default: ``1e-08``).
 |  
 |  export_weights_and_biases
 |      Whether to export Neural Network weights and biases to H2O Frames.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  fast_mode
 |      Enable fast mode (minor approximation in back-propagation).
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  fold_assignment
 |      Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
 |      the folds based on the response variable, for classification problems.
 |      
 |      One of: ``"auto"``, ``"random"``, ``"modulo"``, ``"stratified"``  (default: ``"auto"``).
 |  
 |  fold_column
 |      Column with cross-validation fold index assignment per observation.
 |      
 |      Type: ``str``.
 |  
 |  force_load_balance
 |      Force extra load balancing to increase training speed for small datasets (to keep all cores busy).
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  hidden
 |      Hidden layer sizes (e.g. [100, 100]).
 |      
 |      Type: ``List[int]``  (default: ``[200, 200]``).
 |  
 |  hidden_dropout_ratios
 |      Hidden layer dropout ratios (can improve generalization), specify one value per hidden layer, defaults to 0.5.
 |      
 |      Type: ``List[float]``.
 |  
 |  huber_alpha
 |      Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must be between 0 and 1).
 |      
 |      Type: ``float``  (default: ``0.9``).
 |  
 |  ignore_const_cols
 |      Ignore constant columns.
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  ignored_columns
 |      Names of columns to ignore for training.
 |      
 |      Type: ``List[str]``.
 |  
 |  initial_biases
 |      A list of H2OFrame ids to initialize the bias vectors of this model with.
 |      
 |      Type: ``List[str]``.
 |  
 |  initial_weight_distribution
 |      Initial weight distribution.
 |      
 |      One of: ``"uniform_adaptive"``, ``"uniform"``, ``"normal"``  (default: ``"uniform_adaptive"``).
 |  
 |  initial_weight_scale
 |      Uniform: -value...value, Normal: stddev.
 |      
 |      Type: ``float``  (default: ``1``).
 |  
 |  initial_weights
 |      A list of H2OFrame ids to initialize the weight matrices of this model with.
 |      
 |      Type: ``List[str]``.
 |  
 |  input_dropout_ratio
 |      Input layer dropout ratio (can improve generalization, try 0.1 or 0.2).
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  keep_cross_validation_fold_assignment
 |      Whether to keep the cross-validation fold assignment.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  keep_cross_validation_predictions
 |      Whether to keep the predictions of the cross-validation models.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  l1
 |      L1 regularization (can add stability and improve generalization, causes many weights to become 0).
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  l2
 |      L2 regularization (can add stability and improve generalization, causes many weights to be small.
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  loss
 |      Loss function.
 |      
 |      One of: ``"automatic"``, ``"cross_entropy"``, ``"quadratic"``, ``"huber"``, ``"absolute"``, ``"quantile"``
 |      (default: ``"automatic"``).
 |  
 |  max_after_balance_size
 |      Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
 |      balance_classes.
 |      
 |      Type: ``float``  (default: ``5``).
 |  
 |  max_categorical_features
 |      Max. number of categorical features, enforced via hashing. #Experimental
 |      
 |      Type: ``int``  (default: ``2147483647``).
 |  
 |  max_confusion_matrix_size
 |      [Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs.
 |      
 |      Type: ``int``  (default: ``20``).
 |  
 |  max_hit_ratio_k
 |      Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable).
 |      
 |      Type: ``int``  (default: ``0``).
 |  
 |  max_runtime_secs
 |      Maximum allowed runtime in seconds for model training. Use 0 to disable.
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  max_w2
 |      Constraint for squared sum of incoming weights per unit (e.g. for Rectifier).
 |      
 |      Type: ``float``  (default: ``3.4028235e+38``).
 |  
 |  mini_batch_size
 |      Mini-batch size (smaller leads to better fit, larger can speed up and generalize better).
 |      
 |      Type: ``int``  (default: ``1``).
 |  
 |  missing_values_handling
 |      Handling of missing values. Either MeanImputation or Skip.
 |      
 |      One of: ``"mean_imputation"``, ``"skip"``  (default: ``"mean_imputation"``).
 |  
 |  momentum_ramp
 |      Number of training samples for which momentum increases.
 |      
 |      Type: ``float``  (default: ``1000000``).
 |  
 |  momentum_stable
 |      Final momentum after the ramp is over (try 0.99).
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  momentum_start
 |      Initial momentum at the beginning of training (try 0.5).
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  nesterov_accelerated_gradient
 |      Use Nesterov accelerated gradient (recommended).
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  nfolds
 |      Number of folds for N-fold cross-validation (0 to disable or >= 2).
 |      
 |      Type: ``int``  (default: ``0``).
 |  
 |  offset_column
 |      Offset column. This will be added to the combination of columns before applying the link function.
 |      
 |      Type: ``str``.
 |  
 |  overwrite_with_best_model
 |      If enabled, override the final model with the best model found during training.
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  pretrained_autoencoder
 |      Pretrained autoencoder model to initialize this model with.
 |      
 |      Type: ``str``.
 |  
 |  quantile_alpha
 |      Desired quantile for Quantile regression, must be between 0 and 1.
 |      
 |      Type: ``float``  (default: ``0.5``).
 |  
 |  quiet_mode
 |      Enable quiet mode for less output to standard output.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  rate
 |      Learning rate (higher => less stable, lower => slower convergence).
 |      
 |      Type: ``float``  (default: ``0.005``).
 |  
 |  rate_annealing
 |      Learning rate annealing: rate / (1 + rate_annealing * samples).
 |      
 |      Type: ``float``  (default: ``1e-06``).
 |  
 |  rate_decay
 |      Learning rate decay factor between layers (N-th layer: rate * rate_decay ^ (n - 1).
 |      
 |      Type: ``float``  (default: ``1``).
 |  
 |  regression_stop
 |      Stopping criterion for regression error (MSE) on training data (-1 to disable).
 |      
 |      Type: ``float``  (default: ``1e-06``).
 |  
 |  replicate_training_data
 |      Replicate the entire training dataset onto every node for faster training on small datasets.
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  reproducible
 |      Force reproducibility on small data (will be slow - only uses 1 thread).
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  response_column
 |      Response variable column.
 |      
 |      Type: ``str``.
 |  
 |  rho
 |      Adaptive learning rate time decay factor (similarity to prior updates).
 |      
 |      Type: ``float``  (default: ``0.99``).
 |  
 |  score_duty_cycle
 |      Maximum duty cycle fraction for scoring (lower: more training, higher: more scoring).
 |      
 |      Type: ``float``  (default: ``0.1``).
 |  
 |  score_each_iteration
 |      Whether to score during each iteration of model training.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  score_interval
 |      Shortest time interval (in seconds) between model scoring.
 |      
 |      Type: ``float``  (default: ``5``).
 |  
 |  score_training_samples
 |      Number of training set samples for scoring (0 for all).
 |      
 |      Type: ``int``  (default: ``10000``).
 |  
 |  score_validation_samples
 |      Number of validation set samples for scoring (0 for all).
 |      
 |      Type: ``int``  (default: ``0``).
 |  
 |  score_validation_sampling
 |      Method used to sample validation dataset for scoring.
 |      
 |      One of: ``"uniform"``, ``"stratified"``  (default: ``"uniform"``).
 |  
 |  seed
 |      Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded.
 |      
 |      Type: ``int``  (default: ``-1``).
 |  
 |  shuffle_training_data
 |      Enable shuffling of training data (recommended if training data is replicated and train_samples_per_iteration is
 |      close to #nodes x #rows, of if using balance_classes).
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  single_node_mode
 |      Run on a single node for fine-tuning of model parameters.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  sparse
 |      Sparse data handling (more efficient for data with lots of 0 values).
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  sparsity_beta
 |      Sparsity regularization. #Experimental
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  standardize
 |      If enabled, automatically standardize the data. If disabled, the user must provide properly scaled input data.
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  stopping_metric
 |      Metric to use for early stopping (AUTO: logloss for classification, deviance for regression)
 |      
 |      One of: ``"auto"``, ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``,
 |      ``"lift_top_group"``, ``"misclassification"``, ``"mean_per_class_error"``  (default: ``"auto"``).
 |  
 |  stopping_rounds
 |      Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the
 |      stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable)
 |      
 |      Type: ``int``  (default: ``5``).
 |  
 |  stopping_tolerance
 |      Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much)
 |      
 |      Type: ``float``  (default: ``0``).
 |  
 |  target_ratio_comm_to_comp
 |      Target ratio of communication overhead to computation. Only for multi-node operation and
 |      train_samples_per_iteration = -2 (auto-tuning).
 |      
 |      Type: ``float``  (default: ``0.05``).
 |  
 |  train_samples_per_iteration
 |      Number of training samples (globally) per MapReduce iteration. Special values are 0: one epoch, -1: all
 |      available data (e.g., replicated training data), -2: automatic.
 |      
 |      Type: ``int``  (default: ``-2``).
 |  
 |  training_frame
 |      Id of the training data frame (Not required, to allow initial validation of model parameters).
 |      
 |      Type: ``str``.
 |  
 |  tweedie_power
 |      Tweedie power for Tweedie regression, must be between 1 and 2.
 |      
 |      Type: ``float``  (default: ``1.5``).
 |  
 |  use_all_factor_levels
 |      Use all factor levels of categorical variables. Otherwise, the first factor level is omitted (without loss of
 |      accuracy). Useful for variable importances and auto-enabled for autoencoder.
 |      
 |      Type: ``bool``  (default: ``True``).
 |  
 |  validation_frame
 |      Id of the validation data frame.
 |      
 |      Type: ``str``.
 |  
 |  variable_importances
 |      Compute variable importances for input features (Gedeon method) - can be slow for large networks.
 |      
 |      Type: ``bool``  (default: ``False``).
 |  
 |  weights_column
 |      Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
 |      dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
 |      weights are not allowed.
 |      
 |      Type: ``str``.
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  algo = u'deeplearning'
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from h2o.estimators.estimator_base.H2OEstimator:
 |  
 |  fit(self, x, y=None, **params)
 |      Fit an H2O model as part of a scikit-learn pipeline or grid search.
 |      
 |      A warning will be issued if a caller other than sklearn attempts to use this method.
 |      
 |      :param H2OFrame x: An H2OFrame consisting of the predictor variables.
 |      :param H2OFrame y: An H2OFrame consisting of the response variable.
 |      :param params: Extra arguments.
 |      :returns: The current instance of H2OEstimator for method chaining.
 |  
 |  get_params(self, deep=True)
 |      Obtain parameters for this estimator.
 |      
 |      Used primarily for sklearn Pipelines and sklearn grid search.
 |      
 |      :param deep: If True, return parameters of all sub-objects that are estimators.
 |      
 |      :returns: A dict of parameters
 |  
 |  join(self)
 |      Wait until job's completion.
 |  
 |  set_params(self, **parms)
 |      Used by sklearn for updating parameters during grid search.
 |      
 |      :param parms: A dictionary of parameters that will be set on this model.
 |      :returns: self, the current estimator object with the parameters all set as desired.
 |  
 |  start(self, x, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, **params)
 |      Train the model asynchronously (to block for results call :meth:`join`).
 |      
 |      :param x: A list of column names or indices indicating the predictor columns.
 |      :param y: An index or a column name indicating the response column.
 |      :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
 |          additional columns specified by fold, offset, and weights).
 |      :param offset_column: The name or index of the column in training_frame that holds the offsets.
 |      :param fold_column: The name or index of the column in training_frame that holds the per-row fold
 |          assignments.
 |      :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
 |      :param validation_frame: H2OFrame with validation data to be scored on while training.
 |  
 |  train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None, model_id=None)
 |      Train the H2O model.
 |      
 |      :param x: A list of column names or indices indicating the predictor columns.
 |      :param y: An index or a column name indicating the response column.
 |      :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
 |          additional columns specified by fold, offset, and weights).
 |      :param offset_column: The name or index of the column in training_frame that holds the offsets.
 |      :param fold_column: The name or index of the column in training_frame that holds the per-row fold
 |          assignments.
 |      :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
 |      :param validation_frame: H2OFrame with validation data to be scored on while training.
 |      :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods inherited from h2o.estimators.estimator_base.H2OEstimator:
 |  
 |  mixin(obj, cls)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from h2o.model.model_base.ModelBase:
 |  
 |  __repr__(self)
 |  
 |  aic(self, train=False, valid=False, xval=False)
 |      Get the AIC (Akaike Information Criterium).
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the AIC value for the training data.
 |      :param bool valid: If valid is True, then return the AIC value for the validation data.
 |      :param bool xval:  If xval is True, then return the AIC value for the validation data.
 |      
 |      :returns: The AIC.
 |  
 |  auc(self, train=False, valid=False, xval=False)
 |      Get the AUC (Area Under Curve).
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the AUC value for the training data.
 |      :param bool valid: If valid is True, then return the AUC value for the validation data.
 |      :param bool xval:  If xval is True, then return the AUC value for the validation data.
 |      
 |      :returns: The AUC.
 |  
 |  biases(self, vector_id=0)
 |      Return the frame for the respective bias vector.
 |      
 |      :param: vector_id: an integer, ranging from 0 to number of layers, that specifies the bias vector to return.
 |      
 |      :returns: an H2OFrame which represents the bias vector identified by vector_id
 |  
 |  catoffsets(self)
 |      Categorical offsets for one-hot encoding.
 |  
 |  coef(self)
 |      Return the coefficients which can be applied to the non-standardized data.
 |      
 |      Note: standardize = True by default, if set to False then coef() return the coefficients which are fit directly.
 |  
 |  coef_norm(self)
 |      Return coefficients fitted on the standardized data (requires standardize = True, which is on by default).
 |      
 |      These coefficients can be used to evaluate variable importance.
 |  
 |  cross_validation_fold_assignment(self)
 |      Obtain the cross-validation fold assignment for all rows in the training data.
 |      
 |      :returns: H2OFrame
 |  
 |  cross_validation_holdout_predictions(self)
 |      Obtain the (out-of-sample) holdout predictions of all cross-validation models on the training data.
 |      
 |      This is equivalent to summing up all H2OFrames returned by cross_validation_predictions.
 |      
 |      :returns: H2OFrame
 |  
 |  cross_validation_metrics_summary(self)
 |      Retrieve Cross-Validation Metrics Summary.
 |      
 |      :returns: The cross-validation metrics summary as an H2OTwoDimTable
 |  
 |  cross_validation_models(self)
 |      Obtain a list of cross-validation models.
 |      
 |      :returns: list of H2OModel objects.
 |  
 |  cross_validation_predictions(self)
 |      Obtain the (out-of-sample) holdout predictions of all cross-validation models on their holdout data.
 |      
 |      Note that the predictions are expanded to the full number of rows of the training data, with 0 fill-in.
 |      
 |      :returns: list of H2OFrame objects.
 |  
 |  deepfeatures(self, test_data, layer)
 |      Return hidden layer details.
 |      
 |      :param test_data: Data to create a feature space on
 |      :param layer: 0 index hidden layer
 |  
 |  download_mojo(self, path=u'.', get_genmodel_jar=False)
 |      Download the model in MOJO format.
 |      
 |      :param path: the path where MOJO file should be saved.
 |      :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
 |      :returns: name of the MOJO file written.
 |  
 |  download_pojo(self, path=u'', get_genmodel_jar=False)
 |      Download the POJO for this model to the directory specified by path.
 |      
 |      If path is an empty string, then dump the output to screen.
 |      
 |      :param path:  An absolute path to the directory where POJO should be saved.
 |      :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
 |      :returns: name of the POJO file written.
 |  
 |  get_xval_models(self, key=None)
 |      Return a Model object.
 |      
 |      :param key: If None, return all cross-validated models; otherwise return the model that key points to.
 |      
 |      :returns: A model or list of models.
 |  
 |  gini(self, train=False, valid=False, xval=False)
 |      Get the Gini coefficient.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval"
 |      
 |      :param bool train: If train is True, then return the Gini Coefficient value for the training data.
 |      :param bool valid: If valid is True, then return the Gini Coefficient value for the validation data.
 |      :param bool xval:  If xval is True, then return the Gini Coefficient value for the cross validation data.
 |      
 |      :returns: The Gini Coefficient for this binomial model.
 |  
 |  is_cross_validated(self)
 |      Return True if the model was cross-validated.
 |  
 |  logloss(self, train=False, valid=False, xval=False)
 |      Get the Log Loss.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the log loss value for the training data.
 |      :param bool valid: If valid is True, then return the log loss value for the validation data.
 |      :param bool xval:  If xval is True, then return the log loss value for the cross validation data.
 |      
 |      :returns: The log loss for this regression model.
 |  
 |  mae(self, train=False, valid=False, xval=False)
 |      Get the Mean Absolute Error.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the MAE value for the training data.
 |      :param bool valid: If valid is True, then return the MAE value for the validation data.
 |      :param bool xval:  If xval is True, then return the MAE value for the cross validation data.
 |      
 |      :returns: The MAE for this regression model.
 |  
 |  mean_residual_deviance(self, train=False, valid=False, xval=False)
 |      Get the Mean Residual Deviances.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the Mean Residual Deviance value for the training data.
 |      :param bool valid: If valid is True, then return the Mean Residual Deviance value for the validation data.
 |      :param bool xval:  If xval is True, then return the Mean Residual Deviance value for the cross validation data.
 |      
 |      :returns: The Mean Residual Deviance for this regression model.
 |  
 |  metalearner(self)
 |      Print the metalearner for the model, if any.  Currently only used by H2OStackedEnsembleEstimator.
 |  
 |  model_performance(self, test_data=None, train=False, valid=False, xval=False)
 |      Generate model metrics for this model on test_data.
 |      
 |      :param H2OFrame test_data: Data set for which model metrics shall be computed against. All three of train,
 |          valid and xval arguments are ignored if test_data is not None.
 |      :param bool train: Report the training metrics for the model.
 |      :param bool valid: Report the validation metrics for the model.
 |      :param bool xval: Report the cross-validation metrics for the model. If train and valid are True, then it
 |          defaults to True.
 |      
 |      :returns: An object of class H2OModelMetrics.
 |  
 |  mse(self, train=False, valid=False, xval=False)
 |      Get the Mean Square Error.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the MSE value for the training data.
 |      :param bool valid: If valid is True, then return the MSE value for the validation data.
 |      :param bool xval:  If xval is True, then return the MSE value for the cross validation data.
 |      
 |      :returns: The MSE for this regression model.
 |  
 |  normmul(self)
 |      Normalization/Standardization multipliers for numeric predictors.
 |  
 |  normsub(self)
 |      Normalization/Standardization offsets for numeric predictors.
 |  
 |  null_degrees_of_freedom(self, train=False, valid=False, xval=False)
 |      Retreive the null degress of freedom if this model has the attribute, or None otherwise.
 |      
 |      :param bool train: Get the null dof for the training set. If both train and valid are False, then train is
 |          selected by default.
 |      :param bool valid: Get the null dof for the validation set. If both train and valid are True, then train is
 |          selected by default.
 |      
 |      :returns: Return the null dof, or None if it is not present.
 |  
 |  null_deviance(self, train=False, valid=False, xval=False)
 |      Retreive the null deviance if this model has the attribute, or None otherwise.
 |      
 |      :param bool train: Get the null deviance for the training set. If both train and valid are False, then train
 |          is selected by default.
 |      :param bool valid: Get the null deviance for the validation set. If both train and valid are True, then train
 |          is selected by default.
 |      
 |      :returns: Return the null deviance, or None if it is not present.
 |  
 |  partial_plot(self, data, cols, destination_key=None, nbins=20, plot=True, figsize=(7, 10), server=False)
 |      Create partial dependence plot which gives a graphical depiction of the marginal effect of a variable on the
 |      response. The effect of a variable is measured in change in the mean response.
 |      
 |      :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
 |      :param cols: Feature(s) for which partial dependence will be calculated.
 |      :param destination_key: An key reference to the created partial dependence tables in H2O.
 |      :param nbins: Number of bins used. For categorical columns make sure the number of bins exceed the level count.
 |      :param plot: A boolean specifying whether to plot partial dependence table.
 |      :param figsize: Dimension/size of the returning plots, adjust to fit your output cells.
 |      :param server: ?
 |      :returns: Plot and list of calculated mean response tables for each feature requested.
 |  
 |  pprint_coef(self)
 |      Pretty print the coefficents table (includes normalized coefficients).
 |  
 |  predict(self, test_data)
 |      Predict on a dataset.
 |      
 |      :param H2OFrame test_data: Data on which to make predictions.
 |      
 |      :returns: A new H2OFrame of predictions.
 |  
 |  predict_leaf_node_assignment(self, test_data)
 |      Predict on a dataset and return the leaf node assignment (only for tree-based models).
 |      
 |      :param H2OFrame test_data: Data on which to make predictions.
 |      
 |      :returns: A new H2OFrame of predictions.
 |  
 |  r2(self, train=False, valid=False, xval=False)
 |      Return the R squared for this regression model.
 |      
 |      Will return R^2 for GLM Models and will return NaN otherwise.
 |      
 |      The R^2 value is defined to be 1 - MSE/var, where var is computed as sigma*sigma.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the R^2 value for the training data.
 |      :param bool valid: If valid is True, then return the R^2 value for the validation data.
 |      :param bool xval:  If xval is True, then return the R^2 value for the cross validation data.
 |      
 |      :returns: The R squared for this regression model.
 |  
 |  residual_degrees_of_freedom(self, train=False, valid=False, xval=False)
 |      Retreive the residual degress of freedom if this model has the attribute, or None otherwise.
 |      
 |      :param bool train: Get the residual dof for the training set. If both train and valid are False, then train
 |          is selected by default.
 |      :param bool valid: Get the residual dof for the validation set. If both train and valid are True, then train
 |          is selected by default.
 |      
 |      :returns: Return the residual dof, or None if it is not present.
 |  
 |  residual_deviance(self, train=False, valid=False, xval=None)
 |      Retreive the residual deviance if this model has the attribute, or None otherwise.
 |      
 |      :param bool train: Get the residual deviance for the training set. If both train and valid are False, then
 |          train is selected by default.
 |      :param bool valid: Get the residual deviance for the validation set. If both train and valid are True, then
 |          train is selected by default.
 |      
 |      :returns: Return the residual deviance, or None if it is not present.
 |  
 |  respmul(self)
 |      Normalization/Standardization multipliers for numeric response.
 |  
 |  respsub(self)
 |      Normalization/Standardization offsets for numeric response.
 |  
 |  rmse(self, train=False, valid=False, xval=False)
 |      Get the Root Mean Square Error.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the RMSE value for the training data.
 |      :param bool valid: If valid is True, then return the RMSE value for the validation data.
 |      :param bool xval:  If xval is True, then return the RMSE value for the cross validation data.
 |      
 |      :returns: The RMSE for this regression model.
 |  
 |  rmsle(self, train=False, valid=False, xval=False)
 |      Get the Root Mean Squared Logarithmic Error.
 |      
 |      If all are False (default), then return the training metric value.
 |      If more than one options is set to True, then return a dictionary of metrics where the keys are "train",
 |      "valid", and "xval".
 |      
 |      :param bool train: If train is True, then return the RMSLE value for the training data.
 |      :param bool valid: If valid is True, then return the RMSLE value for the validation data.
 |      :param bool xval:  If xval is True, then return the RMSLE value for the cross validation data.
 |      
 |      :returns: The RMSLE for this regression model.
 |  
 |  save_mojo(self, path=u'', force=False)
 |      Save an H2O Model as MOJO (Model Object, Optimized) to disk.
 |      
 |      :param model: The model object to save.
 |      :param path: a path to save the model at (hdfs, s3, local)
 |      :param force: if True overwrite destination directory in case it exists, or throw exception if set to False.
 |      
 |      :returns str: the path of the saved model
 |  
 |  score_history(self)
 |      DEPRECATED. Use :meth:`scoring_history` instead.
 |  
 |  scoring_history(self)
 |      Retrieve Model Score History.
 |      
 |      :returns: The score history as an H2OTwoDimTable or a Pandas DataFrame.
 |  
 |  show(self)
 |      Print innards of model, without regards to type.
 |  
 |  std_coef_plot(self, num_of_features=None, server=False)
 |      Plot a GLM model"s standardized coefficient magnitudes.
 |      
 |      :param num_of_features: the number of features shown in the plot.
 |      :param server: ?
 |      
 |      :returns: None.
 |  
 |  summary(self)
 |      Print a detailed summary of the model.
 |  
 |  varimp(self, use_pandas=False)
 |      Pretty print the variable importances, or return them in a list.
 |      
 |      :param use_pandas: If True, then the variable importances will be returned as a pandas data frame.
 |      
 |      :returns: A list or Pandas DataFrame.
 |  
 |  varimp_plot(self, num_of_features=None, server=False)
 |      Plot the variable importance for a trained model.
 |      
 |      :param num_of_features: the number of features shown in the plot.
 |      :param server: ?
 |      
 |      :returns: None.
 |  
 |  weights(self, matrix_id=0)
 |      Return the frame for the respective weight matrix.
 |      
 |      :param: matrix_id: an integer, ranging from 0 to number of layers, that specifies the weight matrix to return.
 |      
 |      :returns: an H2OFrame which represents the weight matrix identified by matrix_id
 |  
 |  xval_keys(self)
 |      Return model keys for the cross-validated model.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from h2o.model.model_base.ModelBase:
 |  
 |  actual_params
 |      Dictionary of actual parameters of the model.
 |  
 |  default_params
 |      Dictionary of the default parameters of the model.
 |  
 |  full_parameters
 |      Dictionary of the full specification of all parameters.
 |  
 |  model_id
 |      Model identifier.
 |  
 |  params
 |      Get the parameters and the actual/default values only.
 |      
 |      :returns: A dictionary of parameters used to build this model.
 |  
 |  type
 |      The type of model built: ``"classifier"`` or ``"regressor"`` or ``"unsupervised"``
 |  
 |  xvals
 |      Return a list of the cross-validated models.
 |      
 |      :returns: A list of models.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from h2o.utils.backward_compatibility.BackwardsCompatibleBase:
 |  
 |  __getattr__(self, item)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from h2o.utils.backward_compatibility.BackwardsCompatibleBase:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)

Help on function import_file in module h2o.h2o:

import_file(path=None, destination_frame=None, parse=True, header=0, sep=None, col_names=None, col_types=None, na_strings=None, pattern=None)
    Import a dataset that is already on the cluster.
    
    The path to the data must be a valid path for each node in the H2O cluster. If some node in the H2O cluster
    cannot see the file, then an exception will be thrown by the H2O cluster. Does a parallel/distributed
    multi-threaded pull of the data. The main difference between this method and :func:`upload_file` is that
    the latter works with local files, whereas this method imports remote files (i.e. files local to the server).
    If you running H2O server on your own maching, then both methods behave the same.
    
    :param path: path(s) specifying the location of the data to import or a path to a directory of files to import
    :param destination_frame: The unique hex key assigned to the imported file. If none is given, a key will be
        automatically generated.
    :param parse: If True, the file should be parsed after import.
    :param header: -1 means the first line is data, 0 means guess, 1 means first line is header.
    :param sep: The field separator character. Values on each line of the file are separated by
        this character. If not provided, the parser will automatically detect the separator.
    :param col_names: A list of column names for the file.
    :param col_types: A list of types or a dictionary of column names to types to specify whether columns
        should be forced to a certain type upon import parsing. If a list, the types for elements that are
        one will be guessed. The possible types a column may have are:
    
        - "unknown" - this will force the column to be parsed as all NA
        - "uuid"    - the values in the column must be true UUID or will be parsed as NA
        - "string"  - force the column to be parsed as a string
        - "numeric" - force the column to be parsed as numeric. H2O will handle the compression of the numeric
          data in the optimal manner.
        - "enum"    - force the column to be parsed as a categorical column.
        - "time"    - force the column to be parsed as a time column. H2O will attempt to parse the following
          list of date time formats: (date) "yyyy-MM-dd", "yyyy MM dd", "dd-MMM-yy", "dd MMM yy", (time)
          "HH:mm:ss", "HH:mm:ss:SSS", "HH:mm:ss:SSSnnnnnn", "HH.mm.ss" "HH.mm.ss.SSS", "HH.mm.ss.SSSnnnnnn".
          Times can also contain "AM" or "PM".
    :param na_strings: A list of strings, or a list of lists of strings (one list per column), or a dictionary
        of column names to strings which are to be interpreted as missing values.
    :param pattern: Character string containing a regular expression to match file(s) in the folder if `path` is a
        directory.
    
    :returns: a new :class:`H2OFrame` instance.
    
    :examples:
        >>> # Single file import
        >>> iris = import_file("h2o-3/smalldata/iris.csv")
        >>> # Return all files in the folder iris/ matching the regex r"iris_.*\.csv"
        >>> iris_pattern = h2o.import_file(path = "h2o-3/smalldata/iris",
        ...                                pattern = "iris_.*\.csv")


In [6]:
get_ipython().magic(u'matplotlib inline')

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [9]:
import os
spiral = h2o.import_file(path = os.path.realpath("..//data/spiral.csv"))
grid  = h2o.import_file(path = os.path.realpath("../data/grid.csv"))


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

In [32]:
print(type(spiral))


<class 'h2o.frame.H2OFrame'>

In [14]:
spiral_df = spiral.as_data_frame(use_pandas=True)
grid_df = grid.as_data_frame(use_pandas=True)
print(spiral_df.head())
print(grid_df.head())
print(spiral_df.shape)
print(grid_df.shape)
grid_x, grid_y = grid_df.x.values.reshape(201,201), grid_df.y.values.reshape(201,201)
spiral_r = spiral_df[spiral_df.color == "Red"]
spiral_k = spiral_df[spiral_df.color == "Black"]

spiral_xr, spiral_yr = spiral_r[spiral_r.columns[0]], spiral_r[spiral_r.columns[1]]
spiral_xk, spiral_yk = spiral_k[spiral_k.columns[0]], spiral_k[spiral_k.columns[1]]


          x         y  color
0  0.371268 -0.012075  Black
1  0.281894  0.044798  Black
2  0.508074 -0.438310    Red
3  0.362393  0.863403    Red
4  0.381005  0.062693  Black
       x    y
0 -1.500 -1.5
1 -1.485 -1.5
2 -1.470 -1.5
3 -1.455 -1.5
4 -1.440 -1.5
(200, 3)
(40401, 2)

In [15]:
markersize_ = 7**2
plt.figure(figsize = (5,5))
plt.scatter(spiral_xr, spiral_yr, c = 'r', s=markersize_)
plt.scatter(spiral_xk, spiral_yk, c = 'k', s=markersize_)
plt.axis([-1.5, 1.5, -1.5, 1.5])
plt.title("Spiral");



In [30]:
X = spiral.col_names[0:2]
print(X)
print(type(X))


[u'x', u'y']
<type 'list'>

In [18]:
y = spiral.col_names[2]
dl_model = H2ODeepLearningEstimator(epochs=1000, )
dl_model.train(X, y, spiral)


deeplearning Model Build progress: |██████████████████████████████████████| 100%

In [19]:
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(X, y, spiral)


gbm Model Build progress: |███████████████████████████████████████████████| 100%

In [20]:
drf_model = H2ORandomForestEstimator()
drf_model.train(X, y, spiral)


drf Model Build progress: |███████████████████████████████████████████████| 100%

In [23]:
glm_model = H2OGeneralizedLinearEstimator(family="binomial")
glm_model.fit(spiral[X], spiral[y])


/usr/local/lib/python2.7/dist-packages/ipykernel_launcher.py:2: UserWarning: 

	`fit` is not recommended outside of the sklearn framework. Use `train` instead.
  
glm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
=============
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_model_python_1506545386805_212


ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.23097408708
RMSE: 0.480597635325
LogLoss: 0.653747738037
Null degrees of freedom: 199
Residual degrees of freedom: 197
Null deviance: 277.258872224
Residual deviance: 261.499095215
AIC: 267.499095215
AUC: 0.6552
Gini: 0.3104
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.278896720355: 
Black Red Error Rate
Black 11.0 89.0 0.89 (89.0/100.0)
Red 0.0 100.0 0.0 (0.0/100.0)
Total 11.0 189.0 0.445 (89.0/200.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.2788967 0.6920415 188.0
max f2 0.2788967 0.8488964 188.0
max f0point5 0.5101145 0.6659836 96.0
max accuracy 0.5101145 0.665 96.0
max precision 0.7588010 1.0 0.0
max recall 0.2788967 1.0 188.0
max specificity 0.7588010 1.0 0.0
max absolute_mcc 0.5101145 0.3301486 96.0
max min_per_class_accuracy 0.4995604 0.66 99.0
max mean_per_class_accuracy 0.5101145 0.665 96.0
Gains/Lift Table: Avg response rate: 50.00 %

group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate cumulative_response_rate capture_rate cumulative_capture_rate gain cumulative_gain
1 0.01 0.7513066 2.0 2.0 1.0 1.0 0.02 0.02 100.0 100.0
2 0.02 0.7466065 2.0 2.0 1.0 1.0 0.02 0.04 100.0 100.0
3 0.03 0.7324228 2.0 2.0 1.0 1.0 0.02 0.06 100.0 100.0
4 0.04 0.7237541 2.0 2.0 1.0 1.0 0.02 0.08 100.0 100.0
5 0.05 0.7191567 2.0 2.0 1.0 1.0 0.02 0.1 100.0 100.0
6 0.1 0.6817086 1.4 1.7 0.7 0.85 0.07 0.17 40.0 70.0
7 0.15 0.6640298 0.4 1.2666667 0.2 0.6333333 0.02 0.19 -60.0 26.6666667
8 0.2 0.6449746 0.8 1.15 0.4 0.575 0.04 0.23 -20.0 15.0
9 0.3 0.5963479 1.4 1.2333333 0.7 0.6166667 0.14 0.37 40.0 23.3333333
10 0.4 0.5503725 1.6 1.325 0.8 0.6625 0.16 0.53 60.0 32.5
11 0.5 0.4989615 1.3 1.32 0.65 0.66 0.13 0.66 30.0 32.0
12 0.6 0.4481084 0.5 1.1833333 0.25 0.5916667 0.05 0.71 -50.0 18.3333333
13 0.7 0.4083163 0.7 1.1142857 0.35 0.5571429 0.07 0.78 -30.0 11.4285714
14 0.8 0.3672248 0.5 1.0375 0.25 0.51875 0.05 0.83 -50.0 3.75
15 0.9 0.3098007 1.2 1.0555556 0.6 0.5277778 0.12 0.95 20.0 5.5555556
16 1.0 0.2431265 0.5 1.0 0.25 0.5 0.05 1.0 -50.0 0.0
Scoring History: 
timestamp duration iteration negative_log_likelihood objective
2017-09-27 21:38:30 0.000 sec 0 138.6294361 0.6931472
2017-09-27 21:38:30 0.003 sec 1 130.7709163 0.6539560
2017-09-27 21:38:30 0.004 sec 2 130.7495618 0.6538560
2017-09-27 21:38:30 0.005 sec 3 130.7495476 0.6538560
Out[23]:


In [24]:
models = [dl_model, gbm_model, drf_model, glm_model]
m_names = ["Deep Learning", "Gradient Boosted Method", "Distributed Random Forest", "Generalized Linear Model"]

In [25]:
def plot_spirals(models, model_names):
    fig, ax = plt.subplots(2,2, figsize=(12,12))
    for k, subplot in enumerate(ax.flatten()):
        subplot.scatter(spiral_xr, spiral_yr, c = 'r', s=markersize_)
        subplot.scatter(spiral_xk, spiral_yk, c = 'k', s=markersize_)
        subplot.axis([-1.5, 1.5, -1.5, 1.5])
        subplot.set_title(model_names[k])
        subplot.set_xlabel('x')
        subplot.set_ylabel('y')
        pred_z = models[k].predict(grid).as_data_frame(True)
        subplot.contour(grid_x, grid_y, (pred_z['predict'] == 'Black').astype(np.int).reshape(201,201), colors='b')

In [26]:
plot_spirals(models, m_names)


deeplearning prediction progress: |███████████████████████████████████████| 100%
/usr/local/lib/python2.7/dist-packages/ipykernel_launcher.py:11: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  # This is added back by InteractiveShellApp.init_path()
gbm prediction progress: |████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%

1974 Motor Trend US magazine dataset


In [36]:
mtcars = h2o.import_file(path = os.path.realpath("../data/mtcars.csv"))


Parse progress: |█████████████████████████████████████████████████████████| 100%

In [37]:
mtcars.head()


model mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.852.32 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.083.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.153.44 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.763.46 20.22 1 0 3 1
Duster 360 14.3 8 360 245 3.213.57 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.693.19 20 1 0 4 2
Merc 230 22.8 4 140.8 95 3.923.15 22.9 1 0 4 2
Merc 280 19.2 6 167.6 123 3.923.44 18.3 1 0 4 4
Out[37]:


In [40]:
mtcars_filtered = mtcars[1:12]
mtcars_filtered.head()


mpg cyl disp hp drat wt qsec vs am gear carb
21 6 160 110 3.9 2.62 16.46 0 1 4 4
21 6 160 110 3.9 2.875 17.02 0 1 4 4
22.8 4 108 93 3.852.32 18.61 1 1 4 1
21.4 6 258 110 3.083.215 19.44 1 0 3 1
18.7 8 360 175 3.153.44 17.02 0 0 3 2
18.1 6 225 105 2.763.46 20.22 1 0 3 1
14.3 8 360 245 3.213.57 15.84 0 0 3 4
24.4 4 146.7 62 3.693.19 20 1 0 4 2
22.8 4 140.8 95 3.923.15 22.9 1 0 4 2
19.2 6 167.6 123 3.923.44 18.3 1 0 4 4
Out[40]:


In [41]:
model = H2OAutoEncoderEstimator(activation="Tanh", hidden=[50, 50, 50],
                                ignore_const_cols=False, epochs=100)

In [43]:
model.train(x=mtcars_filtered.col_names, training_frame=mtcars_filtered)


deeplearning Model Build progress: |██████████████████████████████████████| 100%

In [46]:
model._model_json['output']


Status of Neuron Layers: auto-encoder, gaussian distribution, Quadratic loss, 6,261 weights/biases, 79.8 KB, 3,200 training samples, mini-batch size 1

layer units type dropout l1 l2 mean_rate rate_rms momentum mean_weight weight_rms mean_bias bias_rms
1 11 Input 0.0
2 50 Tanh 0.0 0.0 0.0 0.0326085 0.0117436 0.0 0.0027799 0.1899648 0.0034350 0.0180388
3 50 Tanh 0.0 0.0 0.0 0.0696627 0.0308341 0.0 0.0042777 0.1403785 -0.0019408 0.0266095
4 50 Tanh 0.0 0.0 0.0 0.0601408 0.0310974 0.0 -0.0064000 0.1367821 -0.0016912 0.0249806
5 11 Tanh 0.0 0.0 0.0286151 0.0123612 0.0 0.0081929 0.1643521 0.0111823 0.0243638
Scoring History: 
timestamp duration training_speed epochs iterations samples training_rmse training_mse
2017-09-27 23:20:31 0.033 sec 0.00000 obs/sec 0.0 0 0.0 0.4082370 0.1666575
2017-09-27 23:20:31 0.198 sec 19393 obs/sec 100.0 10 3200.0 0.0348218 0.0012126
ModelMetricsAutoEncoder: deeplearning
** Reported on train data. **

MSE: 0.00121255960698
RMSE: 0.0348218265888
Out[46]:
{u'__meta': {u'schema_name': u'DeepLearningModelOutputV3',
  u'schema_type': u'DeepLearningModelOutput',
  u'schema_version': 3},
 u'biases': None,
 u'catoffsets': None,
 u'cross_validation_fold_assignment_frame_id': None,
 u'cross_validation_holdout_predictions_frame_id': None,
 u'cross_validation_metrics': None,
 u'cross_validation_metrics_summary': None,
 u'cross_validation_models': None,
 u'cross_validation_predictions': None,
 u'domains': [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 u'end_time': 1506554431784,
 u'help': {u'__meta': u'Metadata on this schema instance, to make it self-describing.',
  u'biases': u'Frame keys for bias vectors',
  u'catoffsets': u'Categorical offsets for one-hot encoding',
  u'cross_validation_fold_assignment_frame_id': u'Cross-validation fold assignment (each row is assigned to one holdout fold)',
  u'cross_validation_holdout_predictions_frame_id': u'Cross-validation holdout predictions (full out-of-sample predictions on training data)',
  u'cross_validation_metrics': u'Cross-validation model metrics',
  u'cross_validation_metrics_summary': u'Cross-validation model metrics summary',
  u'cross_validation_models': u'Cross-validation models (model ids)',
  u'cross_validation_predictions': u'Cross-validation predictions, one per cv model (deprecated, use cross_validation_holdout_predictions_frame_id instead)',
  u'domains': u'Domains for categorical columns',
  u'end_time': u'End time in milliseconds',
  u'help': u'Help information for output fields',
  u'model_category': u'Category of the model (e.g., Binomial)',
  u'model_summary': u'Model summary',
  u'names': u'Column names',
  u'normmul': u'Normalization/Standardization multipliers for numeric predictors',
  u'normrespmul': u'Normalization/Standardization multipliers for numeric response',
  u'normrespsub': u'Normalization/Standardization offsets for numeric response',
  u'normsub': u'Normalization/Standardization offsets for numeric predictors',
  u'run_time': u'Runtime in milliseconds',
  u'scoring_history': u'Scoring history',
  u'start_time': u'Start time in milliseconds',
  u'status': u'Job status',
  u'training_metrics': u'Training data model metrics',
  u'validation_metrics': u'Validation data model metrics',
  u'variable_importances': u'Variable Importances',
  u'weights': u'Frame keys for weight matrices'},
 u'model_category': u'AutoEncoder',
 u'model_summary': ,
 u'names': [u'mpg',
  u'cyl',
  u'disp',
  u'hp',
  u'drat',
  u'wt',
  u'qsec',
  u'vs',
  u'am',
  u'gear',
  u'carb'],
 u'normmul': None,
 u'normrespmul': None,
 u'normrespsub': None,
 u'normsub': None,
 u'run_time': 195,
 u'scoring_history': ,
 u'start_time': 1506554431589,
 u'status': None,
 u'training_metrics': ,
 u'validation_metrics': None,
 u'variable_importances': None,
 u'weights': None}

In [54]:
errors = model.anomaly(mtcars_filtered)
errors.describe()


Rows:32
Cols:1


Reconstruction.MSE
type real
mins 3.58212755649e-05
mean 0.00121255960698
maxs 0.0045209675693
sigma 0.00104339526792
zeros 0
missing0
0 5.44338776455e-05
1 9.93733842341e-05
2 0.000581063430137
3 0.000773853445528
4 0.00114676987647
5 0.000973639788484
6 0.00115921074657
7 0.00130753831699
8 0.00019531585826
9 0.00109688791205

In [65]:
# plot
error_as_pandas_df = errors.as_data_frame(use_pandas = True)
print(error_as_pandas_df.head())
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(error_as_pandas_df, 'ro')


   Reconstruction.MSE
0            0.000054
1            0.000099
2            0.000581
3            0.000774
4            0.001147
Out[65]:
[<matplotlib.lines.Line2D at 0x7f508499afd0>]

In [77]:
error_filtered = error_as_pandas_df[error_as_pandas_df['Reconstruction.MSE'] > 0.0025]

In [106]:
df.iloc[list(error_filtered.index.values)]


Out[106]:
model mpg cyl disp hp drat wt qsec vs am gear carb
26 Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2
30 Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8

In [ ]: