Chapter 4 Keras
Developers: Daniel Falbel (contributor, copyright holder, maintainer), JJ Allaire (author, copyright holder), Francois Chollet (author, copyright holder), and others.
https://keras.rstudio.com/ https://keras.rstudio.com/articles/sequential_model.html
The keras package uses the pipe operator (%>%) to connect functions and operations together, and the datasets need to be supplied in matrix format.
4.1 Installing and Calling the Keras Package
# Install the development versions from GitHub if needed
#install.packages("devtools")
#require(devtools)
#install_github("rstudio/reticulate")
#install_github("rstudio/tensorflow")
#install_github("rstudio/keras", force = TRUE)
# Install TensorFlow
#install_tensorflow()
library(keras)
#install_keras()
library(tensorflow)
## 
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
## 
##     train
#install_tensorflow()
library(reticulate)
# Check the Python configuration that reticulate discovers
#reticulate::py_available(initialize = TRUE)
#reticulate::py_versions_windows()
#reticulate::py_config()
#reticulate::py_discover_config("keras")
4.2 Keras for Regression Problems
4.2.1 Load the dataset, standardize
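The head() output below is from the Boston housing data; a minimal sketch of the loading step, assuming the Boston data frame from the MASS package (whose columns match the output shown):
library(MASS)   # provides the Boston housing data frame (506 rows, 14 columns)
head(Boston)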
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Standardize the dataset
Boston.st = scale(Boston)
# 80/20 train/test split
set.seed(1)
index <- sample(1:nrow(Boston.st), round(0.8*nrow(Boston.st)))
trainBoston.st <- Boston.st[index,]
testBoston.st <- Boston.st[-index,]
head(testBoston.st)
## crim zn indus chas nox rm
## 6 -0.4166314 -0.48724019 -1.3055857 -0.2723291 -0.8344581 0.2068916
## 7 -0.4098372 0.04872402 -0.4761823 -0.2723291 -0.2648919 -0.3880270
## 9 -0.3955433 0.04872402 -0.4761823 -0.2723291 -0.2648919 -0.9302853
## 10 -0.4003331 0.04872402 -0.4761823 -0.2723291 -0.2648919 -0.3994130
## 11 -0.3939564 0.04872402 -0.4761823 -0.2723291 -0.2648919 0.1314594
## 12 -0.4064448 0.04872402 -0.4761823 -0.2723291 -0.2648919 -0.3922967
## age dis rad tax ptratio black lstat
## 6 -0.35080997 1.0766711 -0.7521778 -1.105022 0.1129203 0.4101651 -1.04229087
## 7 -0.07015919 0.8384142 -0.5224844 -0.576948 -1.5037485 0.4263763 -0.03123671
## 9 1.11638970 1.0861216 -0.5224844 -0.576948 -1.5037485 0.3281233 2.41937935
## 10 0.61548134 1.3283202 -0.5224844 -0.576948 -1.5037485 0.3289995 0.62272769
## 11 0.91389483 1.2117800 -0.5224844 -0.576948 -1.5037485 0.3926395 1.09184562
## 12 0.50890509 1.1547920 -0.5224844 -0.576948 -1.5037485 0.4406159 0.08639286
## medv
## 6 0.67055821
## 7 0.03992492
## 9 -0.65594629
## 10 -0.39499459
## 11 -0.81904111
## 12 -0.39499459
# The data needs to be in matrix format, and the X and Y variables should be
# provided in two separate matrices. The Y variable "medv" is in the 14th column of our dataset.
x_train.st <- as.matrix(trainBoston.st[,-14])
y_train.st <- as.matrix(trainBoston.st[,14])
x_test.st <- as.matrix(testBoston.st[,-14])
y_test.st <- as.matrix(testBoston.st[,14])
dim(x_train.st)
## [1] 405 13
dim(y_train.st)
## [1] 405 1
dim(x_test.st)
## [1] 101 13
dim(y_test.st)
## [1] 101 1
In fact, all of this is available in Keras through the dataset_boston_housing() function, so you could also load the data as shown below; remember that you still might need to standardize your dataset:
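A minimal sketch of that alternative: dataset_boston_housing() returns a list with train and test components, each holding x and y matrices, and the predictors come unscaled, so the standardization step is repeated here (using the training-set statistics for the test set):
boston <- dataset_boston_housing()
# Standardize the training predictors, then apply the same centering/scaling to the test set
x_train <- scale(boston$train$x)
y_train <- boston$train$y
x_test  <- scale(boston$test$x,
                 center = attr(x_train, "scaled:center"),
                 scale  = attr(x_train, "scaled:scale"))
y_test  <- boston$test$y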
4.2.2 Initialize a sequential feed forward neural network
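The initialization call does not appear in the text; a minimal sketch of the step that creates the (still empty) sequential model used in the next section:
model.regression <- keras_model_sequential()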
4.2.3 Define the structure of your model: layers and the activation functions
Add layers to the model: 1 input, 1 hidden, and 1 output layer.
Layers are defined with the layer_dense() function.
We don't need to define the input layer separately, since it gets associated with the first hidden layer and no activation occurs in the input layer.
We do, however, need to specify the output layer, because we need to define its activation function and dimension.
model.regression %>%
  layer_dense(units = 3, activation = 'relu', input_shape = c(13)) %>%
  # 13 X variables + 1 constant = the input layer has 14 neurons
  # The 14 neurons are fully connected to the 3 neurons in the hidden layer:
  # 14*3 = 42 weights are optimized
  # relu is the activation of the hidden-layer neurons
  layer_dense(units = 1, activation = 'relu')
  # 3 neurons in the hidden layer + 1 constant = 4 neurons in total
  # The 4 neurons are fully connected to the 1 neuron in the output layer:
  # 4*1 = 4 weights to optimize
  # The activation of the output layer; since this is a regression problem, relu or linear works best
# Print a summary of the model
summary(model.regression) # in total 46 parameters will be optimized
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_1 (Dense) (None, 3) 42
## ________________________________________________________________________________
## dense (Dense) (None, 1) 4
## ================================================================================
## Total params: 46
## Trainable params: 46
## Non-trainable params: 0
## ________________________________________________________________________________
4.2.4 Define how these weights will be optimized
This is where we define how the weights will be optimized: by minimizing the mean squared error, which is calculated as \(\frac{\sum_{i=1}^{n}(y_i-\hat y_i)^2}{n}\).
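The compile() call itself is not shown here; a minimal sketch, where the choice of the rmsprop optimizer is an assumption carried over from the classification example later in this chapter:
model.regression %>% compile(
  loss = 'mse',           # mean squared error, as defined above
  optimizer = 'rmsprop'   # assumed optimizer choice
)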
4.2.5 Run the model without validation
All of the previous steps could have been done at once, since we are using the pipe operator.
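The fit() call is likewise not shown in the text; a minimal sketch, where the number of epochs and the batch size are assumed values:
model.regression %>% fit(
  x_train.st, y_train.st,
  epochs = 100,      # assumed number of training epochs
  batch_size = 32    # assumed batch size
)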
We can then evaluate the performance of the model using the test set, which means we need to predict the Y values of the test set with this model:
predictions.y.train.st = model.regression %>% predict(x_train.st)
mse.train.st = sum((y_train.st - predictions.y.train.st)^2)/dim(x_train.st)[1]
mse.train.st
## [1] 0.5032314
predictions.y.test.st = model.regression %>% predict(x_test.st)
mse.test.st = sum((y_test.st - predictions.y.test.st)^2)/dim(x_test.st)[1]
mse.test.st
## [1] 0.4617062
The same training and test losses can be obtained directly with evaluate():
model.regression %>% evaluate(x_train.st, y_train.st)
## loss
## 0.5032314
model.regression %>% evaluate(x_test.st, y_test.st)
## loss
## 0.4617062
4.3 Keras for Classification Problems
For classification problems, the Y variable needs to be converted into dummy variables (one-hot encoding), and the output layer activation function needs to be a softmax or logistic function:
rm(list=ls())
iris$numericclasses = unclass(iris$Species)
set.seed(1)
index <- sample(1:nrow(iris),round(0.8*nrow(iris)))
trainiris <- iris[index,]
testiris <- iris[-index,]
# no need to standardize this dataset but we still need to convert to matrices
x_iristrain <- as.matrix(trainiris[,c(1:4)])
y_iristrain <- as.matrix(trainiris[,6])
# Convert labels to categorical one-hot encoding.
# to_categorical() expects 0-based class labels; since the labels here are 1-3,
# num_classes = 4 adds an all-zero first column, which is dropped later with [,-1].
y_iristrain.one_hot_labels <- to_categorical(y_iristrain, num_classes = 4)
x_iristest <- as.matrix(testiris[,c(1:4)])
y_iristest <- as.matrix(testiris[,6])
# Convert labels to categorical one-hot encoding
y_iristest.one_hot_labels <- to_categorical(y_iristest, num_classes = 4)
4.3.1 Initialize and define a sequential feed forward neural network
The only difference here is that we define all the components at once; remember that the output layer will have 3 neurons with a softmax (or logistic) activation function.
model.classification <- keras_model_sequential() %>%
  layer_dense(units = 3, activation = 'relu', input_shape = c(4)) %>%
  layer_dense(units = 3, activation = 'softmax') %>%
  compile(loss = 'categorical_crossentropy',
          optimizer = 'rmsprop',
          metrics = c('accuracy'))
summary(model.classification)
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_3 (Dense) (None, 3) 15
## ________________________________________________________________________________
## dense_2 (Dense) (None, 3) 12
## ================================================================================
## Total params: 27
## Trainable params: 27
## Non-trainable params: 0
## ________________________________________________________________________________
In total 27 parameters will be optimized: 4 variables + 1 bias = 5 neurons in the input layer, fully connected to a hidden layer with 3 neurons, giving 5*3 = 15 weights; 3 neurons + 1 bias = 4 neurons in the hidden layer, fully connected to the output layer with 3 neurons, giving 4*3 = 12 weights to optimize.
4.3.2 Run the model without validation
model.classification %>% fit(x_iristrain, y_iristrain.one_hot_labels[,-1], epochs = 200, batch_size = 32)
We can then evaluate the performance of the model on the training and test sets:
model.classification %>% evaluate(x_iristrain, y_iristrain.one_hot_labels[,-1], batch_size=32, verbose = 1)
## loss accuracy
## 0.4156227 0.8416666
model.classification %>% evaluate(x_iristest, y_iristest.one_hot_labels[,-1], batch_size=32, verbose = 1)
## loss accuracy
## 0.4307769 0.8666667
4.3.3 Run the model with validation
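The fit() call for this section is not shown in the text; a minimal sketch, where validation_split = 0.2 is an assumed value (20% of the training data is held out to monitor validation loss and accuracy during training):
model.classification %>% fit(
  x_iristrain, y_iristrain.one_hot_labels[,-1],
  epochs = 200, batch_size = 32,
  validation_split = 0.2   # assumed fraction held out for validation
)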
4.3.4 Evaluate the model
model.classification %>% evaluate(x_iristrain, y_iristrain.one_hot_labels[,-1], batch_size=32, verbose = 1)
## loss accuracy
## 0.3950052 0.8583333
model.classification %>% evaluate(x_iristest, y_iristest.one_hot_labels[,-1], batch_size=32, verbose = 1)
## loss accuracy
## 0.4106196 0.8666667
4.3.5 Obtain the predictions:
iris.train.predicted <- model.classification %>% predict(x_iristrain)
head(iris.train.predicted)
##             [,1]       [,2]        [,3]
## [1,] 0.069433182 0.51280409 0.417762667
## [2,] 0.001320021 0.28976113 0.708918810
## [3,] 0.948503733 0.04492656 0.006569748
## [4,] 0.944541514 0.04795999 0.007498457
## [5,] 0.035015721 0.53584474 0.429139525
## [6,] 0.022172937 0.45372736 0.524099648
# The predicted class probabilities in each row sum to (approximately) 1
sum(iris.train.predicted[1,])
## [1] 0.9999999
# Return the index of the column with the highest predicted probability
maxidx <- function(arr) {
  return(which(arr == max(arr)))
}
whichmax.train.predicted <- apply(iris.train.predicted, c(1), maxidx)
head(whichmax.train.predicted)
## [1] 2 3 1 1 2 3
prediction.train <- c('setosa', 'versicolor', 'virginica')[whichmax.train.predicted]
actual.train <- c('setosa', 'versicolor', 'virginica')[trainiris$Species]
table(prediction.train, actual.train)
## actual.train
## prediction.train setosa versicolor virginica
## setosa 39 0 0
## versicolor 0 21 0
## virginica 0 17 43
# Training accuracy: (39 + 21 + 43) correct out of 120 training observations
mean(prediction.train == actual.train)
## [1] 0.8583333