Chapter 3 H2O

H2O is probably the easiest to learn and use.

## Load all packages first
library(h2o)
## Warning: package 'h2o' was built under R version 4.0.5
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(caret)
library(mlbench)
library(ggplot2)
library(reshape2)
library(DEEPR)
## Warning: package 'DEEPR' was built under R version 4.0.3
## Loading required package: dirmult
## Warning: package 'dirmult' was built under R version 4.0.3
# http://blog.revolutionanalytics.com/2014/04/a-dive-into-h2o.html
# https://discuss.analyticsvidhya.com/t/script-in-h2o-in-r-to-get-you-into-top-30-percentile-for-the-digit-recognizer-competition/6651/8

3.1 Initialise H2O Connection

## Launch a local H2O cluster from R (startH2O = TRUE) with at least 3 GB of
## memory, and keep the connection handle in localH2O.
localH2O <- h2o.init(
  ip = "localhost",
  port = 54321,
  startH2O = TRUE,
  min_mem_size = "3g"
)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         55 minutes 35 seconds 
##     H2O cluster timezone:       Africa/Johannesburg 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.0.1 
##     H2O cluster version age:    7 months and 11 days !!! 
##     H2O cluster name:           H2O_started_from_R_01438475_sup231 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.90 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.1 (2020-06-06)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (7 months and 11 days)!
## Please download and install the latest version from http://h2o.ai/download/
# Open the help page for h2o.deeplearning()
?h2o.deeplearning
## starting httpd help server ... done

3.2 Data in H2O format

# iris data ####
# Print the full built-in iris data set: four numeric measurements plus the
# Species factor for 150 flowers.
iris
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica
# Stratified 50/50 split: draw 25 rows at random from each species so that the
# training and test halves are both balanced.
# The seed is fixed so the split (and everything fitted on it) is reproducible.
set.seed(1)
# Group row numbers by species instead of hard-coding 1:50, 51:100, 101:150,
# which only works because iris happens to be sorted by Species.
rows_by_species <- split(seq_len(nrow(iris)), iris$Species)
index <- unlist(lapply(rows_by_species, sample, size = 25), use.names = FALSE)
irisTrain <- iris[index, ]
irisTest <- iris[-index, ]

# Convert the training data frame into an H2O frame
iris.h2oTrain <- as.h2o(irisTrain)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Convert the test data frame into an H2O frame
iris.h2oTest <- as.h2o(irisTest)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Fit a network with a single hidden layer on the training frame; the test
# frame is passed as validation data. Variable importances are requested so
# they can be inspected later.
iris.nn <- h2o.deeplearning(
  x = 1:4, # predictors: the four measurement columns
  y = 5, # response: Species
  training_frame = iris.h2oTrain, # data in H2O format
  validation_frame = iris.h2oTest,
  hidden = c(5), # one layer of 5 nodes
  activation = "Tanh",
  l1 = 1e-5,
  epochs = 100, # max. no. of epochs
  variable_importances = TRUE
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Same network as iris.nn, but additionally fitted with nfolds = 5 so that
# cross-validation metrics are available for it.
iris.nn.cv <- h2o.deeplearning(
  x = 1:4, # predictors: the four measurement columns
  y = 5, # response: Species
  training_frame = iris.h2oTrain, # data in H2O format
  validation_frame = iris.h2oTest,
  hidden = c(5), # one layer of 5 nodes
  activation = "Tanh",
  l1 = 1e-5,
  epochs = 100, # max. no. of epochs
  nfolds = 5
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |======================================================================| 100%
# List the parameters stored with the fitted model
iris.nn@parameters
## $model_id
## [1] "DeepLearning_model_R_1621504187964_6"
## 
## $training_frame
## [1] "irisTrain_sid_a943_1"
## 
## $validation_frame
## [1] "irisTest_sid_a943_3"
## 
## $activation
## [1] "Tanh"
## 
## $hidden
## [1] 5
## 
## $epochs
## [1] 100
## 
## $seed
## [1] "140125080671732877"
## 
## $l1
## [1] 1e-05
## 
## $distribution
## [1] "multinomial"
## 
## $stopping_metric
## [1] "logloss"
## 
## $categorical_encoding
## [1] "OneHotInternal"
## 
## $x
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## 
## $y
## [1] "Species"
# Performance metrics of iris.nn on the training data
h2o.performance(iris.nn, train = TRUE)
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("irisTrain_sid_a943_1")`
## MSE: (Extract with `h2o.mse`) 0.04022331
## RMSE: (Extract with `h2o.rmse`) 0.2005575
## Logloss: (Extract with `h2o.logloss`) 0.1443866
## Mean Per-Class Error: 0.04
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##            setosa versicolor virginica  Error     Rate
## setosa         25          0         0 0.0000 = 0 / 25
## versicolor      0         25         0 0.0000 = 0 / 25
## virginica       0          3        22 0.1200 = 3 / 25
## Totals         25         28        22 0.0400 = 3 / 75
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.960000
## 2 2  1.000000
## 3 3  1.000000
# Compare the MSE on the training frame, on the validation frame, and (for the
# model fitted with nfolds) from cross-validation
h2o.mse(iris.nn, train = TRUE)
## [1] 0.04022331
h2o.mse(iris.nn, valid = TRUE)
## [1] 0.03606636
h2o.mse(iris.nn.cv, xval = TRUE)
## [1] 0.08550383
# Variable importances (available because the model was fitted with
# variable_importances = TRUE)
h2o.varimp(iris.nn)
## Variable Importances: 
##       variable relative_importance scaled_importance percentage
## 1 Petal.Length            1.000000          1.000000   0.404590
## 2  Petal.Width            0.579513          0.579513   0.234465
## 3 Sepal.Length            0.559525          0.559525   0.226378
## 4  Sepal.Width            0.332603          0.332603   0.134568
# Now predict the class and the class probabilities for the training frame
predictionsTrain <- h2o.predict(iris.nn, iris.h2oTrain)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Preview the training predictions (predicted label plus one probability
# column per class)
predictionsTrain
##   predict    setosa  versicolor    virginica
## 1  setosa 0.9605705 0.039408090 2.136172e-05
## 2  setosa 0.9941566 0.005842733 6.915862e-07
## 3  setosa 0.9932530 0.006746304 7.382293e-07
## 4  setosa 0.9938170 0.006182201 7.907678e-07
## 5  setosa 0.9950551 0.004944093 8.316701e-07
## 6  setosa 0.9937289 0.006270050 1.058508e-06
## 
## [75 rows x 4 columns]
# Predict the class and the class probabilities for the test frame
predictionsTest <- h2o.predict(iris.nn, iris.h2oTest)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Preview the test predictions (predicted label plus one probability column
# per class)
predictionsTest
##   predict    setosa  versicolor    virginica
## 1  setosa 0.9947001 0.005298876 9.823197e-07
## 2  setosa 0.9930255 0.006973262 1.263730e-06
## 3  setosa 0.9945754 0.005423825 7.557000e-07
## 4  setosa 0.9932355 0.006763449 1.040563e-06
## 5  setosa 0.9925999 0.007398490 1.571062e-06
## 6  setosa 0.9941181 0.005880774 1.130556e-06
## 
## [75 rows x 4 columns]
# Pull the predicted labels back into R and compare them with the true species.
# The factor levels are taken from the reference rather than from the
# predictions, so both factors share the same levels even if some class is
# never predicted (caret otherwise warns or errors, depending on its version).
yhatTrain <- factor(
  as.vector(as.matrix(predictionsTrain$predict)),
  levels = levels(irisTrain$Species)
)
confusionMatrix(yhatTrain, irisTrain$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         25          0         0
##   versicolor      0         25         3
##   virginica       0          0        22
## 
## Overall Statistics
##                                           
##                Accuracy : 0.96            
##                  95% CI : (0.8875, 0.9917)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.94            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.8800
## Specificity                 1.0000            0.9400           1.0000
## Pos Pred Value              1.0000            0.8929           1.0000
## Neg Pred Value              1.0000            1.0000           0.9434
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.2933
## Detection Prevalence        0.3333            0.3733           0.2933
## Balanced Accuracy           1.0000            0.9700           0.9400
# Pull the predicted labels back into R and compare them with the true species.
# The factor levels are taken from the reference rather than from the
# predictions, so both factors share the same levels even if some class is
# never predicted (caret otherwise warns or errors, depending on its version).
yhatTest <- factor(
  as.vector(as.matrix(predictionsTest$predict)),
  levels = levels(irisTest$Species)
)
confusionMatrix(yhatTest, irisTest$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         25          0         0
##   versicolor      0         25         3
##   virginica       0          0        22
## 
## Overall Statistics
##                                           
##                Accuracy : 0.96            
##                  95% CI : (0.8875, 0.9917)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.94            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.8800
## Specificity                 1.0000            0.9400           1.0000
## Pos Pred Value              1.0000            0.8929           1.0000
## Neg Pred Value              1.0000            1.0000           0.9434
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.2933
## Detection Prevalence        0.3333            0.3733           0.2933
## Balanced Accuracy           1.0000            0.9700           0.9400

3.2.1 Grid Search for Complexity

https://h2o-release.s3.amazonaws.com/h2o/master/3190/docs-website/h2o-docs/booklets/DeepLearning_Vignette.pdf

# Candidate architectures: a single hidden layer with 1 to 10 nodes, plus four
# two-layer networks whose second layer has 4 nodes.
hidden_opt <- c(
  as.list(as.double(1:10)),
  list(c(3, 4), c(4, 4), c(5, 4), c(6, 4))
)
hyper_params <- list(hidden = hidden_opt)

# Fit one deep learning model per candidate, each with nfolds = 5 and a fixed
# seed (reproducible = TRUE).
model_grid <- h2o.grid(
  "deeplearning",
  x = 1:4,
  y = 5,
  training_frame = iris.h2oTrain, # data in H2O format
  validation_frame = iris.h2oTest,
  activation = "Tanh",
  nfolds = 5,
  seed = 1,
  reproducible = TRUE,
  hyper_params = hyper_params
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Print the grid summary (one row per fitted model)
model_grid
## H2O Grid Details
## ================
## 
## Grid ID: Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8 
## Used hyper parameters: 
##   -  hidden 
## Number of models: 14 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing logloss
##    hidden
## 1  [6, 4]
## 2    [10]
## 3     [9]
## 4  [5, 4]
## 5     [8]
## 6  [4, 4]
## 7     [4]
## 8     [5]
## 9     [7]
## 10 [3, 4]
## 11    [1]
## 12    [6]
## 13    [2]
## 14    [3]
##                                                                  model_ids
## 1  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_14
## 2  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_10
## 3   Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_9
## 4  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_13
## 5   Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_8
## 6  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_12
## 7   Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_4
## 8   Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_5
## 9   Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_7
## 10 Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_11
## 11  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_1
## 12  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_6
## 13  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_2
## 14  Grid_DeepLearning_irisTrain_sid_a943_1_model_R_1621504187964_8_model_3
##                logloss
## 1  0.30340859353378963
## 2   0.3241505862508419
## 3   0.3762406263763066
## 4   0.3859649879467963
## 5  0.41505146300210605
## 6  0.42160893762650176
## 7  0.44643795766409294
## 8   0.5033872528865022
## 9   0.5453880844019323
## 10  0.5799886955035108
## 11  0.5823825581709984
## 12  0.6108454650879463
## 13   0.702354413999542
## 14  0.7400050923793164
# The grid summary lists the models by increasing logloss, so the first model
# id is the best one by that metric; fetch it and report its
# cross-validation MSE.
model1 <- h2o.getModel(model_grid@model_ids[[1]])
h2o.mse(model1, xval = TRUE)
## [1] 0.09175803