Cognition:
Programming Edition!

LatinR 2023

Hannah Frick

Reading and understanding code

Research indicates that almost 60% of programmers’ time is spent understanding rather than writing code.

Felienne Hermans - The Programmer’s Brain

Let’s read some code

my_fun <- function(a,  
                   b,  
                   c,  
                   d = 2,  
                   e = 3,  
                   g = 22,  
                   j = 324) {  
  i <- a + c  
  w <- (d + e) / ((a + c) + (d + e))  
  four <- e - a  
  f <- four - a  
  ff <- i + w  
  list(f, ff, four)  
}

Our short-term memory (STM) only holds two to six items.

This is not even R code!

SEXP Cdqrls(SEXP x, SEXP y, SEXP tol, SEXP chk)
{
    SEXP ans;
    SEXP qr, coefficients, residuals, effects, pivot, qraux;
    int n, ny = 0, p, rank, nprotect = 4, pivoted = 0;
    double rtol = asReal(tol), *work;
    Rboolean check = asLogical(chk);

    ans = getAttrib(x, R_DimSymbol);
    if(check && length(ans) != 2) error(_("'x' is not a matrix"));
    int *dims = INTEGER(ans);
    n = dims[0]; p = dims[1];
    if(n) ny = (int)(XLENGTH(y)/n); /* y :  n x ny, or an n - vector */
    if(check && n * ny != XLENGTH(y))
    error(_("dimensions of 'x' (%d,%d) and 'y' (%d) do not match"),
          n,p, XLENGTH(y));

    /* These lose attributes, so do after we have extracted dims */
    if (TYPEOF(x) != REALSXP) {
    PROTECT(x = coerceVector(x, REALSXP));
    nprotect++;
    }
    if (TYPEOF(y) != REALSXP) {
    PROTECT(y = coerceVector(y, REALSXP));
    nprotect++;
    }
/* < more code > */
}

This is not even R code!

SEXP Cdqrls(SEXP x, SEXP y, SEXP tol, SEXP chk)
{
    /* define some variables */

    /* check inputs x and y */

    /* need to protect x and y */

/* < more code > */
}

Our long-term memory (LTM) helps us aggregate items in our STM into chunks.

Modifying the right-hand side of a formula

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), drop_strata, in_plus = FALSE)
    expr
  } else {
    expr
  }
}

Tests show (important) use cases

test_that("`drop_strata()` removes strata term in a series of `+` calls", {
  expect_equal(
    drop_strata(rlang::expr(a + strata(x))),
    rlang::expr(a)
  )
  
  expect_equal(
    drop_strata(rlang::expr(a + strata(x) + b)),
    rlang::expr(a + b)
  )
})

test_that("`drop_strata()` does not remove strata in other cases", {
  expect_equal(
    drop_strata(rlang::expr(a * (b + strata(x)))),
    rlang::expr(a * (b + strata(x)))
  )
})

Abstract syntax tree

library(lobstr)

ast(strata(x))
#> █─strata 
#> └─x

ast(a + b)
#> █─`+` 
#> ├─a 
#> └─b

Abstract syntax tree

library(lobstr)

ast(a + strata(x))
#> █─`+` 
#> ├─a 
#> └─█─strata 
#>   └─x

ast(a + strata(x) + b)
#> █─`+` 
#> ├─█─`+` 
#> │ ├─a 
#> │ └─█─strata 
#> │   └─x 
#> └─b

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0	3

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0	3
-1		in: `a`

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0	3
-1		in: `a`
-1		out: `a`

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0	3	`lhs` is `a`

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0	3	`lhs` is `a`
0	4

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0	3	`lhs` is `a`
0	4	`rhs` is `b`

`drop_strata(a + b)`

drop_strata <- function(expr, in_plus = TRUE) {
  if (rlang::is_call(expr, "+", n = 2) && in_plus) {
    lhs <- drop_strata(expr[[2]], in_plus = in_plus)
    rhs <- drop_strata(expr[[3]], in_plus = in_plus)
    if (rlang::is_call(lhs, "strata")) {
      rhs
    } else if (rlang::is_call(rhs, "strata")) {
      lhs
    } else {
      rlang::call2("+", lhs, rhs)
    }
  } else if (rlang::is_call(expr)) {
    expr[-1] <- purrr::map(as.list(expr[-1]), 
                           drop_strata, 
                           in_plus = FALSE)
    expr
  } else {
    expr
  }
}

level	line	what
0	3	`lhs` is `a`
0	4	`rhs` is `b`
0	10	return `a + b`

Our working memory is our STM applied to a problem.

Our working memory only holds two to six items.

Challenges

Lack of information
Lack of knowledge
Lack of processing power

Challenges, for reasons

Lack of information
Lack of knowledge
Lack of processing power

Limited capacity of STM
Activation of LTM
Limited capacity of working memory

Help your brain out

Look for beacons: names, comments, paragraphs
Summarize code into chunks via comments or refactoring

Learn more: programming concepts, domain knowledge

Offload information to notes

Writing Code

Writing is for re-reading

Better understanding - fewer mistakes - better science

Names

Good names help activate knowledge from your LTM.

Coming soon:
Survival analysis in tidymodels

predict(survival_model, 
        type = "survival",  
        time = 2)

time: the time points at which the survival probability is estimated

Which time are we talking about?

Observed time

Which time are we talking about?

Observed time
Event time
Censoring time

Which time are we talking about?

Observed time
Event time
Censoring time
time

Which time are we talking about?

Observed time
Event time
Censoring time
time

Which time are we talking about?

Observed time
Event time
Censoring time
time

Bad names can hinder you by activating the wrong knowledge.

`time` -> `eval_time`

`.time` -> `eval_time`

`stat_times` -> `eval_time`

Make (re)thinking names a separate step

to avoid overloading your working memory.

Code Smells

Bad names are linguistic anti-patterns, code smells are structural anti-patterns.

Code smells

Origin: Martin Fowler (1999) Refactoring: Improving the Design of Existing Code
R edition: Jenny Bryan (useR 2018 keynote) Code Smells and Feels

Code smells

Long method
Long parameter list
Switch statements

Alternative classes with different interfaces
Primitive obsession
Incomplete library class
Large class
Lazy class
Data class
Temporary field
Data clumps

Divergent change
Feature envy
Inappropriate intimacy
Duplicated code
Comments
Message chains
Middle man
Parallel inheritance
Refused bequest
Shotgun surgery
Speculative generality

Code smells

Long method
Long parameter list
Switch statements

Alternative classes with different interfaces
Primitive obsession
Incomplete library class
Large class
Lazy class
Data class
Temporary field
Data clumps

Divergent change
Feature envy
Inappropriate intimacy
Duplicated code
Comments
Message chains
Middle man
Parallel inheritance
Refused bequest
Shotgun surgery
Speculative generality

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Duplicated code

predict_glmnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

predict._elnet <- predict_glmnet
predict._lognet <- predict_glmnet

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Duplicated code

predict._elnet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

predict._lognet <- function(object, new_data, type = NULL, penalty = NULL, ...) {
  # See discussion in https://github.com/tidymodels/parsnip/issues/195
  if (is.null(penalty)) penalty <- object$spec$args$penalty
  
  object$spec$args$penalty <- .check_glmnet_penalty_predict(penalty, object)
  
  object$spec <- eval_args(object$spec)
  predict.model_fit(object, new_data = new_data, type = type, ...)
}

Through the cognitive lens

Exact copy: Lost opportunity of chunking things together!
(Very) similar code: Potential to chunk the wrong things together!

Duplicated code ~ Chunking gone wrong

Design patterns

Design patterns are reusable solutions to common problems.

Many arguments, revisited

my_fun <- function(x, 
                   y,
                   opt1 = 1,
                   opt2 = 2, 
                   opt3 = 3, 
                   opt4 = 4){
  ...  
}

Many arguments, revisited

my_fun <- function(x, y, options = my_fun_opts()) {
  ...
}

my_fun_opts <- function(opt1 = 1, opt2 = 2, opt3 = 3, opt4 = 4) {
  list(
    opt1 = opt1,
    opt2 = opt2,
    opt3 = opt3, 
    opt4 = opt4
  ) 
}

Principle/Pattern: Reduce argument clutter with an options object

Design patterns can help lower the cognitive load.

Work in progress:
https://design.tidyverse.org/

Tests

If you can’t make changes because you’re afraid of breaking something, it’s already broken.

Kara Woo

Tests are a diagnostic tool, make them obvious.

Michael Lynch - Why good developers write bad unit tests

── Failure (test-my_fun.R:4:3): my_fun() works ─────────────────────────────────
`one_thing` (`actual`) not equal to `another_thing` (`expected`).

`actual`:   FALSE
`expected`: TRUE 
[ FAIL 1 | WARN 0 | SKIP 0 | PASS 3 ]

Test complete

What `"my_fun() works"` looks like

test_that("my_fun() works", {
   # setup for thing 1
  expect_equal(my_fun(x_1), ...)
  
  # setup for thing 2
  expect_equal(my_fun(x_2), ...)
  
  # setup for thing 3
  expect_equal(my_fun(x_3), ...)  
  
  ...
})

Be specific in test names

test_that("my_fun() can do thing 1", {
   # setup for thing 1
  expect_equal(my_fun, ...)
})

test_that("my_fun() can do thing 2", {
  # setup for thing 2
  expect_equal(my_fun(x_2), ...)
})

test_that("my_fun() can do thing 3", {
  # setup for thing 3
  expect_equal(my_fun(x_3), ...)  
})

...

What if the second test breaks?

library(dplyr)

dat <- data.frame(a = 1:3, b = c("a", "b", "c"))

skip_if_not_installed("a_package")

test_that("my_fun() does this", {
  expect_equal(my_fun(dat), ...)
})

dat2 <- data.frame(x = 1:5, y = 6:10)

skip_on_os("windows")

test_that("my_fun_2() does that", {
  dat2 <- mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Example adapted from Hadley Wickham’s Package Development Masterclass at posit::conf(2023)

Make tests self-contained

library(dplyr)

skip_if_not_installed("a_package")

test_that("my_fun() does this", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  expect_equal(my_fun(dat), ...)
})

skip_on_os("windows")

test_that("my_fun_2() does that", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  dat2 <- data.frame(x = 1:5, y = 6:10)
  dat2 <- mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Make tests self-contained

skip_if_not_installed("a_package")

test_that("my_fun() does this", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  expect_equal(my_fun(dat), ...)
})

skip_on_os("windows")

test_that("my_fun_2() does that", {
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  dat2 <- data.frame(x = 1:5, y = 6:10)
  dat2 <- dplyr::mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Make tests self-contained

test_that("my_fun() does this", {
  skip_if_not_installed("a_package")
  
  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  expect_equal(my_fun(dat), ...)
})

test_that("my_fun_2() does that", {
  skip_if_not_installed("a_package")
  skip_on_os("windows")

  dat <- data.frame(a = 1:3, b = c("a", "b", "c"))
  dat2 <- data.frame(x = 1:5, y = 6:10)
  dat2 <- dplyr::mutate(dat2, z = x + y)
  expect_equal(my_fun_2(dat, dat2), ...)
})

Obvious =
Fits into working memory

Well-chunked code supports testing.

In tests, we get to strip away complexity instead of accommodating it.

Having a mental model of how your brain works helps you to work with it, not against it.

If you want your code to grow in complexity,
you need to keep (re-)chunking.

If you want your understanding to grow,
you need to keep (re-)chunking.

Thank you!

The village: Maëlle, Tracy, Emil, Max, Simon, tidy team, Mine, Julie

Cognition: Programming Edition!

Reading and understanding code

Research indicates that almost 60% of programmers’ time is spent understanding rather than writing code.

Let’s read some code

Our short-term memory (STM) only holds two to six items.

This is not even R code!

This is not even R code!

Our long-term memory (LTM) helps us aggregate items in our STM into chunks.

Modifying the right-hand side of a formula

Tests show (important) use cases

Abstract syntax tree

Abstract syntax tree

drop_strata(a + b)

drop_strata(a + b)

drop_strata(a + b)

drop_strata(a + b)

drop_strata(a + b)

drop_strata(a + b)

drop_strata(a + b)

drop_strata(a + b)

drop_strata(a + b)

Our working memory is our STM applied to a problem.

Our working memory only holds two to six items.

Challenges

Challenges, for reasons

Help your brain out

Writing Code

Writing is for re-reading

Better understanding - fewer mistakes - better science

Names

Good names help activate knowledge from your LTM.

Which time are we talking about?

Which time are we talking about?

Which time are we talking about?

Which time are we talking about?

Which time are we talking about?

Bad names can hinder you by activating the wrong knowledge.

time -> eval_time

.time -> eval_time

stat_times -> eval_time

Make (re)thinking names a separate step

Code Smells

Bad names are linguistic anti-patterns, code smells are structural anti-patterns.

Code smells

Code smells

Code smells

Duplicated code

Duplicated code

Duplicated code

Duplicated code

Duplicated code

Through the cognitive lens

Design patterns

Design patterns are reusable solutions to common problems.

Many arguments, revisited

Many arguments, revisited

Design patterns can help lower the cognitive load.

Work in progress: https://design.tidyverse.org/

Tests

If you can’t make changes because you’re afraid of breaking something, it’s already broken.

Tests are a diagnostic tool, make them obvious.

What "my_fun() works" looks like

Be specific in test names

What if the second test breaks?

Make tests self-contained

Make tests self-contained

Make tests self-contained

Obvious = Fits into working memory

Well-chunked code supports testing.

In tests, we get to strip away complexity instead of accommodating it.

Having a mental model of how your brain works helps you to work with it, not against it.

If you want your code to grow in complexity, you need to keep (re-)chunking.

If you want your understanding to grow, you need to keep (re-)chunking.

Thank you!

Cognition:
Programming Edition!

`drop_strata(a + b)`

`drop_strata(a + b)`

`drop_strata(a + b)`

`drop_strata(a + b)`

`drop_strata(a + b)`

`drop_strata(a + b)`

`drop_strata(a + b)`

`drop_strata(a + b)`

`drop_strata(a + b)`

`time` -> `eval_time`

`.time` -> `eval_time`

`stat_times` -> `eval_time`

Work in progress:
https://design.tidyverse.org/

What `"my_fun() works"` looks like

Obvious =
Fits into working memory

If you want your code to grow in complexity,
you need to keep (re-)chunking.

If you want your understanding to grow,
you need to keep (re-)chunking.