In [1]:
import h2o

In [2]:
# Connect this Python session to an H2O cluster; the status table printed
# below describes the cluster that was attached to (here 127.0.0.1:54321).
h2o.init()


H2O cluster uptime: 7 minutes 34 seconds 146 milliseconds
H2O cluster version: 3.1.0.99999
H2O cluster name: spencer
H2O cluster total nodes: 1
H2O cluster total memory: 14.22 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [3]:
# Upload the airlines sample archive into the cluster and keep a handle to the
# resulting frame as `air` (43,978 rows x 31 columns per the upload message).
air = h2o.upload_file(h2o.locate("smalldata/airlines/allyears2k_headers.zip"))


Parse Progress: [##################################################] 100%
Uploaded py2eedb6c5-0858-4b7f-833f-b24ab23140f8 into cluster with 43,978 rows and 31 cols

In [4]:
# Frame dimensions, displayed as [rows, columns].
air.dim


Out[4]:
[43978, 31]

In [5]:
# Count the missing (NA) entries in the DepTime column before any imputation.
numNAs = air["DepTime"].isna().sum()
# Parenthesized form prints the same value on Python 2 and also runs on Python 3.
print(numNAs)


1086.0

In [6]:
# Mean of DepTime with NA entries excluded (na_rm=True), for reference before
# the column is imputed.
DepTime_mean = air["DepTime"].mean(na_rm=True)
# Parenthesized form prints the same value on Python 2 and also runs on Python 3.
print(DepTime_mean)


1345.84666138

In [7]:
# Fill the missing DepTime values using the column median. impute() changes
# `air` itself: the recount below reports zero NAs without any reassignment.
# NOTE(review): combine_method="low" presumably picks the lower of the two
# middle values when the median falls between them -- confirm in the H2O docs.
air.impute("DepTime", method="median", combine_method="low")
# Recount the NAs to verify the imputation filled every gap.
numNAs = air["DepTime"].isna().sum()
# Parenthesized form prints the same value on Python 2 and also runs on Python 3.
print(numNAs)


0.0

In [8]:
# Re-upload the dataset so the next example starts from un-imputed data: the
# previous impute() call left DepTime with no NAs in the old frame.
air = h2o.upload_file(h2o.locate("smalldata/airlines/allyears2k_headers.zip"))


Parse Progress: [##################################################] 100%
Uploaded py74d01579-0aba-47ba-91a9-c9df3c8b4649 into cluster with 43,978 rows and 31 cols

In [9]:
# Impute DepTime with the mean, with `by` naming Origin and Distance as the
# grouping columns, then show the first rows of the result.
air.impute(
    "DepTime",
    method="mean",
    by=["Origin", "Distance"]
).show()


First 10 rows and first 31 columns: 
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed
1987 10 14 3 741 730 912 849 PS 1451 NA 91 79 23 11 SAN SFO 447 0 NA 0 YES YES
1987 10 15 4 729 730 903 849 PS 1451 NA 94 79 14 -1 SAN SFO 447 0 NA 0 YES NO
1987 10 17 6 741 730 918 849 PS 1451 NA 97 79 29 11 SAN SFO 447 0 NA 0 YES YES
1987 10 18 7 729 730 847 849 PS 1451 NA 78 79 -2 -1 SAN SFO 447 0 NA 0 NO NO
1987 10 19 1 749 730 922 849 PS 1451 NA 93 79 33 19 SAN SFO 447 0 NA 0 YES YES
1987 10 21 3 728 730 848 849 PS 1451 NA 80 79 -1 -2 SAN SFO 447 0 NA 0 NO NO
1987 10 22 4 728 730 852 849 PS 1451 NA 84 79 3 -2 SAN SFO 447 0 NA 0 YES NO
1987 10 23 5 731 730 902 849 PS 1451 NA 91 79 13 1 SAN SFO 447 0 NA 0 YES YES
1987 10 24 6 744 730 908 849 PS 1451 NA 84 79 19 14 SAN SFO 447 0 NA 0 YES YES
1987 10 25 7 729 730 851 849 PS 1451 NA 82 79 2 -1 SAN SFO 447 0 NA 0 YES NO

In [10]:
# Re-upload for a clean frame before the next example; impute() was seen
# earlier in this notebook to modify the frame it is called on.
air = h2o.upload_file(h2o.locate("smalldata/airlines/allyears2k_headers.zip"))


Parse Progress: [##################################################] 100%
Uploaded pyf6d4c8d3-78a3-4c26-9adb-a282583ae563 into cluster with 43,978 rows and 31 cols

In [11]:
# Impute the TailNum column with method "mode", then show the first rows
# of the result.
air.impute(
    "TailNum",
    method="mode"
).show()


First 10 rows and first 31 columns: 
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed
1987 10 14 3 741 730 912 849 PS 1451 NA 91 79 23 11 SAN SFO 447 0 NA 0 YES YES
1987 10 15 4 729 730 903 849 PS 1451 NA 94 79 14 -1 SAN SFO 447 0 NA 0 YES NO
1987 10 17 6 741 730 918 849 PS 1451 NA 97 79 29 11 SAN SFO 447 0 NA 0 YES YES
1987 10 18 7 729 730 847 849 PS 1451 NA 78 79 -2 -1 SAN SFO 447 0 NA 0 NO NO
1987 10 19 1 749 730 922 849 PS 1451 NA 93 79 33 19 SAN SFO 447 0 NA 0 YES YES
1987 10 21 3 728 730 848 849 PS 1451 NA 80 79 -1 -2 SAN SFO 447 0 NA 0 NO NO
1987 10 22 4 728 730 852 849 PS 1451 NA 84 79 3 -2 SAN SFO 447 0 NA 0 YES NO
1987 10 23 5 731 730 902 849 PS 1451 NA 91 79 13 1 SAN SFO 447 0 NA 0 YES YES
1987 10 24 6 744 730 908 849 PS 1451 NA 84 79 19 14 SAN SFO 447 0 NA 0 YES YES
1987 10 25 7 729 730 851 849 PS 1451 NA 82 79 2 -1 SAN SFO 447 0 NA 0 YES NO

In [12]:
# Re-upload for a clean frame before the grouped-mode example; impute() was
# seen earlier in this notebook to modify the frame it is called on.
air = h2o.upload_file(h2o.locate("smalldata/airlines/allyears2k_headers.zip"))


Parse Progress: [##################################################] 100%
Uploaded py77ce3a79-0fa0-4116-91d4-c928a7084e68 into cluster with 43,978 rows and 31 cols

In [13]:
# Impute TailNum with method "mode", with `by` naming Month and Year as the
# grouping columns, then show the first rows of the result.
air.impute(
    "TailNum",
    method="mode",
    by=["Month", "Year"]
).show()


First 10 rows and first 31 columns: 
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed
1987 10 14 3 741 730 912 849 PS 1451 NA 91 79 23 11 SAN SFO 447 0 NA 0 YES YES
1987 10 15 4 729 730 903 849 PS 1451 NA 94 79 14 -1 SAN SFO 447 0 NA 0 YES NO
1987 10 17 6 741 730 918 849 PS 1451 NA 97 79 29 11 SAN SFO 447 0 NA 0 YES YES
1987 10 18 7 729 730 847 849 PS 1451 NA 78 79 -2 -1 SAN SFO 447 0 NA 0 NO NO
1987 10 19 1 749 730 922 849 PS 1451 NA 93 79 33 19 SAN SFO 447 0 NA 0 YES YES
1987 10 21 3 728 730 848 849 PS 1451 NA 80 79 -1 -2 SAN SFO 447 0 NA 0 NO NO
1987 10 22 4 728 730 852 849 PS 1451 NA 84 79 3 -2 SAN SFO 447 0 NA 0 YES NO
1987 10 23 5 731 730 902 849 PS 1451 NA 91 79 13 1 SAN SFO 447 0 NA 0 YES YES
1987 10 24 6 744 730 908 849 PS 1451 NA 84 79 19 14 SAN SFO 447 0 NA 0 YES YES
1987 10 25 7 729 730 851 849 PS 1451 NA 82 79 2 -1 SAN SFO 447 0 NA 0 YES NO