Cognition:
Programming Edition!

LatinR 2023

Hannah Frick

Reading and understanding code

Research indicates that almost 60% of programmers’ time is spent understanding rather than writing code.

Felienne Hermans - The Programmer’s Brain

Let’s read some code

my_fun <- function(a,  
                   b,  
                   c,  
                   d = 2,  
                   e = 3,  
                   g = 22,  
                   j = 324) {  
  i <- a + c  
  w <- (d + e) / ((a + c) + (d + e))  
  four <- e - a  
  f <- four - a  
  ff <- i + w  
  list(f, ff, four)  
}

Our short-term memory (STM) only holds two to six items.

This is not even R code!

SEXP Cdqrls(SEXP x, SEXP y, SEXP tol, SEXP chk)
{
    SEXP ans;
    SEXP qr, coefficients, residuals, effects, pivot, qraux;
    int n, ny = 0, p, rank, nprotect = 4, pivoted = 0;
    double rtol = asReal(tol), *work;
    Rboolean check = asLogical(chk);

    ans = getAttrib(x, R_DimSymbol);
    if(check && length(ans) != 2) error(_("'x' is not a matrix"));
    int *dims = INTEGER(ans);
    n = dims[0]; p = dims[1];
    if(n) ny = (int)(XLENGTH(y)/n); /* y :  n x ny, or an n - vector */
    if(check && n * ny != XLENGTH(y))
    error(_("dimensions of 'x' (%d,%d) and 'y' (%d) do not match"),
          n,p, XLENGTH(y));

    /* These lose attributes, so do after we have extracted dims */
    if (TYPEOF(x) != REALSXP) {
    PROTECT(x = coerceVector(x, REALSXP));
    nprotect++;
    }
    if (TYPEOF(y) != REALSXP) {
    PROTECT(y = coerceVector(y, REALSXP));
    nprotect++;
    }
/* < more code > */
}

This is not even R code!

SEXP Cdqrls(SEXP x, SEXP y, SEXP tol, SEXP chk)
{
    /* define some variables */

    /* check inputs x and y */

    /* need to protect x and y */

/* < more code > */
}

Our long-term memory (LTM) helps us aggregate items in our STM into chunks.

Modifying the right-hand side of a formula

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), drop_strata, in_plus = FALSE)
    expr
  } else {
    expr
  }
}

Tests show (important) use cases

test_that("`drop_strata()` removes strata term in a series of `+` calls", {
  expect_equal(
    drop_strata(rlang::expr(a + strata(x))),
    rlang::expr(a)
  )
  
  expect_equal(
    drop_strata(rlang::expr(a + strata(x) + b)),
    rlang::expr(a + b)
  )
})

test_that("`drop_strata()` does not remove strata in other cases", {
  expect_equal(
    drop_strata(rlang::expr(a * (b + strata(x)))),
    rlang::expr(a * (b + strata(x)))
  )
})

Abstract syntax tree

library(lobstr)

ast(strata(x))
#> █─strata 
#> └─x

ast(a + b)
#> █─`+` 
#> ├─a 
#> └─b

Abstract syntax tree

library(lobstr)

ast(a + strata(x))
#> █─`+` 
#> ├─a 
#> └─█─strata 
#>   └─x

ast(a + strata(x) + b)
#> █─`+` 
#> ├─█─`+` 
#> │ ├─a 
#> │ └─█─strata 
#> │   └─x 
#> └─b

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0 3

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0 3
-1 in: a

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0 3
-1 in: a
-1 out: a

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0 3 lhs is a

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0 3 lhs is a
0 4

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0 3 lhs is a
0 4 rhs is b

drop_strata(a + b)

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}
level line what
0 3 lhs is a
0 4 rhs is b
0 10 return a + b

Our working memory is our STM applied to a problem.

Our working memory only holds two to six items.

Challenges

  • Lack of information
  • Lack of knowledge
  • Lack of processing power

Challenges, for reasons

  • Lack of information
  • Lack of knowledge
  • Lack of processing power
  • Limited capacity of STM
  • Activation of LTM
  • Limited capacity of working memory

Help your brain out

  • Look for beacons: names, comments, paragraphs
  • Summarize code into chunks via comments or refactoring
  • Learn more: programming concepts, domain knowledge
  • Offload information to notes

Writing Code

Writing is for re-reading

Better understanding - fewer mistakes - better science

Names

Good names help activate knowledge from your LTM.

Coming soon:
Survival analysis in tidymodels

predict(survival_model, 
        type = "survival",  
        time = 2)

time: the time points at which the survival probability is estimated

Which time are we talking about?

  • Observed time

Which time are we talking about?

  • Observed time
  • Event time
  • Censoring time

Which time are we talking about?

  • Observed time
  • Event time
  • Censoring time
  • time

Which time are we talking about?

  • Observed time
  • Event time
  • Censoring time
  • time

Which time are we talking about?

  • Observed time
  • Event time
  • Censoring time
  • time

Bad names can hinder you by activating the wrong knowledge.

time -> eval_time

.time -> eval_time

stat_times -> eval_time

Make (re)thinking names a separate step

to avoid overloading your working memory.

Code Smells

Bad names are linguistic anti-patterns, code smells are structural anti-patterns.

Code smells

  • Origin: Martin Fowler (1999) Refactoring: Improving the Design of Existing Code

  • R edition: Jenny Bryan (useR 2018 keynote) Code Smells and Feels


Code smells

Long method
Long parameter list
Switch statements

Alternative classes with different interfaces
Primitive obsession
Incomplete library class
Large class
Lazy class
Data class
Temporary field
Data clumps

Divergent change
Feature envy
Inappropriate intimacy
Duplicated code
Comments
Message chains
Middle man
Parallel inheritance
Refused bequest
Shotgun surgery
Speculative generality

Code smells

Long method
Long parameter list
Switch statements

Alternative classes with different interfaces
Primitive obsession
Incomplete library class
Large class
Lazy class
Data class
Temporary field
Data clumps

Divergent change
Feature envy
Inappropriate intimacy
Duplicated code
Comments
Message chains
Middle man
Parallel inheritance
Refused bequest
Shotgun surgery
Speculative generality

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}


predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}


predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Duplicated code

predict_glmnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}


predict._elnet <- predict_glmnet
predict._lognet <- predict_glmnet

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}


predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}


predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  object$spec <- eval_args(object$spec)
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Through the cognitive lens

  • Exact copy: Lost opportunity of chunking things together!

  • (Very) similar code: Potential to chunk the wrong things together!

Duplicated code ~ Chunking gone wrong

Design patterns

Design patterns are reusable solutions to common problems.

Many arguments, revisited

my_fun <- function(x, 
                   y,
                   opt1 = 1,
                   opt2 = 2, 
                   opt3 = 3, 
                   opt4 = 4){
  ...  
}

Many arguments, revisited

my_fun <- function(x, y, options = my_fun_opts()) {
  ...
}

my_fun_opts <- function(opt1 = 1, opt2 = 2, opt3 = 3, opt4 = 4) {
  list(
    opt1 = opt1,
    opt2 = opt2,
    opt3 = opt3, 
    opt4 = opt4
  ) 
}

Principle/Pattern: Reduce argument clutter with an options object

Design patterns can help lower the cognitive load.

Work in progress:
https://design.tidyverse.org/

Tests

If you can’t make changes because you’re afraid of breaking something, it’s already broken.

Kara Woo

Tests are a diagnostic tool, make them obvious.

Michael Lynch - Why good developers write bad unit tests

── Failure (test-my_fun.R:4:3): my_fun() works ─────────────────────────────────
`one_thing` (`actual`) not equal to `another_thing` (`expected`).

`actual`:   FALSE
`expected`: TRUE 
[ FAIL 1 | WARN 0 | SKIP 0 | PASS 3 ]

Test complete

What "my_fun() works" looks like

test_that("my_fun() works", {
   # setup for thing 1
  expect_equal(my_fun(x_1), ...)
  
  # setup for thing 2
  expect_equal(my_fun(x_2), ...)
  
  # setup for thing 3
  expect_equal(my_fun(x_3), ...)  
  
  ...
})

Be specific in test names

test_that("my_fun() can do thing 1", {
   # setup for thing 1
  expect_equal(my_fun, ...)
})

test_that("my_fun() can do thing 2", {
  # setup for thing 2
  expect_equal(my_fun(x_2), ...)
})

test_that("my_fun() can do thing 3", {
  # setup for thing 3
  expect_equal(my_fun(x_3), ...)  
})

...

What if the second test breaks?

library(dplyr)

dat <- data.frame(a = 1:3, b = c("a", "b", "c"))

skip_if_not_installed("a_package")

test_that("my_fun() does this", {
  expect_equal(my_fun(dat), ...)
})

dat2 <- data.frame(x = 1:5, y = 6:10)

skip_on_os("windows")

test_that("my_fun_2() does that", {
  dat2 <- mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Example adapted from Hadley Wickham’s Package Development Masterclass at posit::conf(2023)

Make tests self-contained

library(dplyr)

skip_if_not_installed("a_package")

test_that("my_fun() does this", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  expect_equal(my_fun(dat), ...)
})

skip_on_os("windows")

test_that("my_fun_2() does that", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  dat2 <- data.frame(x = 1:5, y = 6:10)
  dat2 <- mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Make tests self-contained

skip_if_not_installed("a_package")

test_that("my_fun() does this", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  expect_equal(my_fun(dat), ...)
})

skip_on_os("windows")

test_that("my_fun_2() does that", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  dat2 <- data.frame(x = 1:5, y = 6:10)
  dat2 <- dplyr::mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Make tests self-contained

test_that("my_fun() does this", {
  skip_if_not_installed("a_package")
  
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  expect_equal(my_fun(dat), ...)
})

test_that("my_fun_2() does that", {
  skip_if_not_installed("a_package")
  skip_on_os("windows")

  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  dat2 <- data.frame(x = 1:5, y = 6:10)
  dat2 <- dplyr::mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Obvious =
Fits into working memory

Well-chunked code supports testing.

In tests, we get to strip away complexity instead of accommodating it.

Having a mental model of how your brain works helps you to work with it, not against it.

If you want your code to grow in complexity,
you need to keep (re-)chunking.

If you want your understanding to grow,
you need to keep (re-)chunking.

Thank you!

The village: Maëlle, Tracy, Emil, Max, Simon, tidy team, Mine, Julie