The CRTsampleSearch package implements the simulation-based sample size estimation approach proposed in the paper (link). The main function is CRTsearch, which takes four major sets of parameters.
The parameters (nrep, \(n_t\), \(n_c\)) set the number of simulation replications and the initial numbers of clusters in the intervention arm and the control arm, respectively.
The parameter FUN_clustersize is a user-defined input function for the distribution of cluster sizes in the trial. It must take at least one input parameter, the total number of clusters included in both arms of the trial, \(N\). The output should be a vector of integers of length \(N\).
The parameter FUN_Ys is a user-defined input function for the distribution of the potential outcomes. It must take at least one input parameter, the size of a cluster, \(m\). The output should be an \((m \times 2)\) matrix with column names set as (Y0, Y1).
The parameter FUN_TestStat is a user-defined input function for the test statistic proposed for the trial. It takes in the sampled trial data, (Y0, Y1, ClusterID), and outputs the calculated value of the statistic.
In addition, two major simulation settings are supported by the CRTsearch function. First, the input data distributions have known analytic forms, e.g. Normal, Bernoulli, or Poisson distributions, as in the examples in Section 1 of this document. Second, the input data distributions are empirical distributions constructed from, for example, data collected in other studies. In this document, we demonstrate the usage of CRTsearch under both settings, with particular attention to how to specify parameters 2-4.
devtools::install_github("RuoshuiAqua/CRTsampleSearch")
library(CRTsampleSearch)
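Before turning to the examples, the call pattern looks as follows. This is only a schematic sketch: the my_* names and placeholder bodies are illustrative, and the worked examples in the rest of this document give complete versions of each function.
my_cluster_size = function(N, ...){
# placeholder: equal cluster sizes; must return a vector of N positive integers
rep(100, N)
}
my_potential_outcomes = function(m, ...){
# placeholder outcome model; must return an (m x 2) matrix with columns Y0, Y1
Y0 = rnorm(m)
Y1 = Y0 + 1
re = cbind(Y0, Y1)
colnames(re) = c("Y0", "Y1")
return(re)
}
my_teststat = function(W, data, ...){
# placeholder statistic: difference of the sample means between the two study arms
sum(data[,"Y1"][W==1])/sum(W==1) - sum(data[,"Y0"][W==0])/sum(W==0)
}
CRTsearch(nrep=1e3, nt=10, nc=10, FUN_clustersize=my_cluster_size, FUN_Ys=my_potential_outcomes, FUN_TestStat=my_teststat)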
The goal of this section is to show you how to use CRTsampleSearch to estimate the optimal number of clusters for a cluster randomization trial (CRT) when the distributions of cluster sizes and potential outcomes have known analytic forms. Specifically, all the simulation examples in Section 6 of the paper are included in this document.
## simulate size of a cluster: log-normal distribution
sim_cluster_size = function(N, ...){
# input:
# N: an integer of the total number of clusters
# output: a vector of N integers indicating the size of each cluster in the trial
size = round(100*rlnorm(N, 0, 1), 0)
size[size<=0] = 1
return(size)
}
## continuous outcome: Normal and Normal Two-level Model + fixed_continuous_change
sim_potential_outcomes = function(m,...){
# input:
# m: an integer of the number of individuals in a cluster (i.e. size of the cluster)
# output: a (m x 2) matrix of the two potential outcomes for each and every individual in the cluster
muibar = rnorm(1, 0, 1)
Y0 = rnorm(m, muibar, 10)
Y1 = Y0 + 0.8
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
## test statistic: pooled difference between the two arm means
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: a (Nm x 2) matrix of the two potential outcomes for all individuals participating in the trial, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the sample means between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
re = sum(Y1[W==1])/sum(W==1) - sum(Y0[W==0])/sum(W==0)
return(re)
}
## CRTsampleSearch
n=CRTsearch(nrep=1e4, nt=100, nc=100, FUN_clustersize=sim_cluster_size, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
## using formula (Rutterford et al. 2015)
sigma2n = 1
m = 100 * exp(0 + sigma2n/2) ## average cluster size
m_sd = sqrt(100^2*(exp(sigma2n)-1)*exp(2*0+sigma2n)) ## standard deviation of the cluster size
sigma2 = 1 + 10 ## overall variance
ICC = 1 / 11 ## intra-cluster correlation
alpha = 0.05
minpower=0.8
delta = 1
N = sigma2*(qnorm(alpha/2)+qnorm(1-minpower))^2/delta^2/m
DE1 = 1+(m-1)*ICC ## Design Effect
N_1 = 2*ceiling(2*N*DE1)
N_1
## formula with correlation
DE2 = 1+(((m_sd/m)^2+1)*m-1)*ICC ## Design Effect
N_2 = 2*ceiling(2*N*DE2)
N_2
d3 = m*ICC/(1+(m-1)*ICC) ## assumed definition of d3: the lambda term in the relative-efficiency formula for varying cluster sizes (van Breukelen et al. 2007)
DE3 = 1/(1-(m_sd/m)^2*d3*(1-d3)) ## Design Effect
N_3 = 2*ceiling(2*N*DE1*DE3)
N_3
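For reference, the three totals above are assumed to follow the standard design-effect formula for the required total number of clusters (Rutterford et al. 2015), with DE2 and DE3 further adjusting for the variation in cluster size:
\[N_{total} = 2\left\lceil \frac{2\sigma^2 (z_{1-\alpha/2}+z_{1-\beta})^2}{\bar{m}\,\delta^2}\, DE\right\rceil, \qquad DE_1 = 1+(\bar{m}-1)\rho,\]
where \(\bar{m}\) is the average cluster size, \(\rho\) is the intra-cluster correlation, \(\delta\) is the target difference in means, and \(1-\beta\) is the minimum power.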
## simulate size of a cluster: equal cluster size (m=100)
sim_cluster_size = function(N, ...){
# input:
# N: an integer of the total number of clusters
# output: a vector of N integers indicating the size of each cluster in the trial
size = rep(100,N)
return(size)
}
## continuous outcome: Normal and 0-log-Normal Two-level Model + fixed_continuous_change
sim_potential_outcomes = function(m,...){
# input:
# m: an integer of the number of individuals in a cluster (i.e. size of the cluster)
# output: a (m x 2) matrix of the two potential outcomes for each and every individual in the cluster
meanlog2 = log(1 / sqrt( 5/1^2 +1) )
sd2log2 = log( 5/ 1^2 +1 )
muibar = rlnorm(1, mean=meanlog2, sd=sqrt(sd2log2))
p0 = 0.2
v0 = rbinom(m,1,prob=1-p0)
varlogN = 5
meanlog = log(muibar / sqrt( varlogN/muibar^2 +1) )
sd2log = log( varlogN/ muibar^2 +1 )
Y0 = v0 * rlnorm(m, meanlog, sqrt(sd2log))
Y1 = Y0 + 1
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
## test statistic: pooled difference between the two arm means
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: a (Nm x 2) matrix of the two potential outcomes for all individuals participating in the trial, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the sample means between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
re = sum(Y1[W==1])/sum(W==1) - sum(Y0[W==0])/sum(W==0)
return(re)
}
## CRTsampleSearch
CRTsearch(nrep=1e4, nt=20, nc=20, FUN_clustersize=sim_cluster_size, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
## using formula
p0 = 0.2
mu0 = 10
sigma2mui = 10
varlogN = 10
delta = 1
m = 100
sigma2 = (1-p0)*varlogN + p0*(1-p0)*mu0^2 + (1-p0)*sigma2mui
sigma2B = sigma2mui
ICC = sigma2B / sigma2
alpha = 0.05; minpower=0.8
DE = 1+(m-1)*ICC ## Design Effect
N = 2*ceiling(2*sigma2*( qnorm(alpha/2)+qnorm(1-minpower))^2/delta^2/m*DE)
N
## simulate size of a cluster: equal cluster size (m=100)
sim_cluster_size = function(N, ...){
# input:
# N: an integer of the total number of clusters
# output: a vector of N integers indicating the size of each cluster in the trial
size = rep(100,N)
return(size)
}
## binary outcome: Beta and Bernoulli two-level model + dependent Bernoulli increase
sim_potential_outcomes = function(m,...){
# input:
# m: an integer of the number of individuals in a cluster (i.e. size of the cluster)
# output: a (m x 2) matrix of the two potential outcomes for each and every individual in the cluster
pbar=rbeta(1,shape1=1, shape2=4)
Y0= rbinom(m, size=1, prob=pbar )
Y1 = rbinom(m, size=1, prob=0.2)*(1-Y0) + Y0
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
## test statistic: pooled difference between the two arm means
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: a (Nm x 2) matrix of the two potential outcomes for all individuals participating in the trial, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the sample means between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
re = sum(Y1[W==1])/sum(W==1) - sum(Y0[W==0])/sum(W==0)
return(re)
}
## CRTsampleSearch
CRTsearch(nrep=1e4, nt=20, nc=20, FUN_clustersize=sim_cluster_size, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
## using formula
alpha0 = 1
beta0 = 4
delta = 0.05
m = 100
p0 = alpha0/(alpha0+beta0)
p1 = p0 + delta*(1-p0)
sigma2B = (alpha0*beta0)/(alpha0+beta0+1)/(alpha0+beta0)^2
sigma2 = (alpha0*beta0)/(alpha0+beta0)^2
ICCH0 = sigma2B/sigma2
alpha = 0.05; minpower=0.8
DE = 1+(m-1)*ICCH0 ## Design Effect
N = 2*ceiling((p0*(1-p0)+p1*(1-p1))*(qnorm(alpha/2)+qnorm(1-minpower))^2/(p1-p0)^2*DE/m)
N
## simulate size of a cluster: equal cluster size (m=100)
sim_cluster_size = function(N, ...){
# input:
# N: an integer of the total number of clusters
# output: a vector of N integers indicating the size of each cluster in the trial
size = rep(100,N)
return(size)
}
## count outcome: Gamma and Poisson two-level model + independent Poisson increase
sim_potential_outcomes = function(m,...){
# input:
# m: an integer of the number of individuals in a cluster (i.e. size of the cluster)
# output: a (m x 2) matrix of the two potential outcomes for each and every individual in the cluster
lambdai = rgamma(n=1, shape=10, rate=2)
Y0 = rpois(m, lambda=lambdai)
Y1 = rpois(m, lambda=0.1) + Y0
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
## test statistic: pooled difference between the two arm means
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: a (Nm x 2) matrix of the two potential outcomes for all individuals participating in the trial, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the sample means between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
re = sum(Y1[W==1])/sum(W==1) - sum(Y0[W==0])/sum(W==0)
return(re)
}
## CRTsampleSearch
CRTsearch(nrep=1e4, nt=20, nc=20, FUN_clustersize=sim_cluster_size, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
## using Hayes's formula
alpha0 = 10
beta0 = 2
lambda0 = alpha0 / beta0
delta = 5
lambda1 = lambda0+delta
m = 100
meanY0B = alpha0/beta0
varY0B =alpha0/beta0^2
CV = sqrt(varY0B)/meanY0B
alpha = 0.05; minpower=0.8
N = 2*ceiling(1+(qnorm(alpha/2)+qnorm(1-minpower))^2*((lambda0+lambda1)/m +CV^2*(lambda0^2+lambda1^2))/delta^2)
N
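For reference, the calculation above is assumed to follow Hayes and Bennett's formula for the number of clusters per arm needed to compare two rates, doubled here to give the total:
\[c = 1 + (z_{1-\alpha/2}+z_{1-\beta})^2\, \frac{(\lambda_0+\lambda_1)/m + k^2(\lambda_0^2+\lambda_1^2)}{(\lambda_0-\lambda_1)^2},\]
where \(m\) is the number of individuals per cluster and \(k\) is the between-cluster coefficient of variation of the underlying rate.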
## simulate size of a cluster: equal cluster size (m=100)
sim_cluster_size = function(N, ...){
# input:
# N: an integer of the total number of clusters
# output: a vector of N integers indicating the size of each cluster in the trial
size = rep(100,N)
return(size)
}
## count outcome: Gamma and zero-inflated Poisson (ZIP) two-level model + independent Poisson increase
sim_potential_outcomes = function(m,...){
# input:
# m: an integer of the number of individuals in a cluster (i.e. size of the cluster)
# output: a (m x 2) matrix of the two potential outcomes for each and every individual in the cluster
lambdai = rgamma(n=1, shape=10, rate=2)
Y0c = rpois(m, lambda=lambdai)
p0=0.2
Y0b = rbinom(n=m, size=1, prob=1-p0)
Y0 = Y0c * Y0b
Y1 = rpois(m, lambda=0.1) + Y0
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
## test statistic: pooled difference between the two arm means
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: a (Nm x 2) matrix of the two potential outcomes for all individuals participating in the trial, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the sample means between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
re = sum(Y1[W==1])/sum(W==1) - sum(Y0[W==0])/sum(W==0)
return(re)
}
## CRTsampleSearch
CRTsearch(nrep=1e4, nt=20, nc=20, FUN_clustersize=sim_cluster_size, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
## using Hayes's formula
p0=0.2
alpha0 = 10
beta0 = 2
lambda0 = (1-p0)*alpha0/beta0
delta = 1
lambda1 = lambda0+delta
m = 100
meanY0B = (1-p0)*alpha0/beta0
varY0B =(1-p0)^2*alpha0/beta0^2
CV = sqrt(varY0B)/meanY0B
alpha = 0.05; minpower=0.8
N = 2*ceiling(1+(qnorm(alpha/2)+qnorm(1-minpower))^2*((lambda0+lambda1)/m+CV^2*(lambda0^2+lambda1^2))/delta^2)
N
The goal of this section is to show you how to use CRTsampleSearch to estimate the optimal number of clusters for a cluster randomization trial (CRT) when the distributions of baseline outcomes, cluster sizes, and study demographics, such as the cluster characteristic used in stratified randomization, are better approximated by empirical distributions collected from previous studies. Specifically, in this document, I will repeat the examples in Section 7 of our paper using a synthetic data set generated from the Flu Data in our paper with the R synthpop package. The data set is attached to the package.
synFluData_loc = system.file("extdata", "synFluData.csv", package = "CRTsampleSearch")
synFluData = read.csv(synFluData_loc)
The synthetic flu data, similar to the original, include data from 816 clusters with an average size of 64.9 (ranging from 5 to 498). All clusterID and patientID values are synthetic IDs. count is the number of hospitalizations experienced by a patient while he/she was followed in the study. On average, patients were followed for NA days, and 0% were followed for the whole flu season (3 months, 211 days). race_aa is an indicator of whether the patient is African American. race_aa_pct is the percentage of African Americans within each cluster. race_aa_strata is a three-level stratum indicator defined from race_aa_pct using its 33% and 67% percentiles.
head(synFluData)
#> clusterID patientID rate count time_days time_years race_aa race_aa_pct
#> 1 1 1 0 0 211 0.5776865 0 0.164557
#> 2 1 2 0 0 211 0.5776865 0 0.164557
#> 3 1 3 0 0 211 0.5776865 0 0.164557
#> 4 1 4 0 0 211 0.5776865 0 0.164557
#> 5 1 5 0 0 211 0.5776865 0 0.164557
#> 6 1 6 0 0 211 0.5776865 0 0.164557
#> race_aa_strata
#> 1 3
#> 2 3
#> 3 3
#> 4 3
#> 5 3
#> 6 3
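The summaries quoted above can be checked directly from the loaded data; the following quick sketch uses only columns already shown in head(synFluData).
clusterSizes = table(synFluData$clusterID)
length(clusterSizes) ## number of clusters
mean(clusterSizes) ## average cluster size
range(clusterSizes) ## smallest and largest cluster
mean(synFluData$time_days) ## average follow-up time in days
mean(synFluData$time_days == 211) ## proportion followed for the whole flu season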
Suppose the intervention in the new study is expected to reduce the number of hospitalizations by at most one (some patients never experienced hospitalization in the flu study); the potential outcome under intervention can then be simulated from the following \(f_1\): \[f_1: Y_{ij}(1) = \max(Y_{ij}(0)-1, 0)\]
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
Y0 = data[,"Y0"]
Y1 = Y0 - 1 ## effect size = -1
Y1[Y1<0] = 0
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1) and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the pooled rates between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
time = data[,"time_years"]
## pooled rates
re = sum(Y1[W==1])/sum(time[W==1]) - sum(Y0[W==0])/sum(time[W==0])
## person-level rates
##re = mean(Y1[W==1]/time[W==1]) - mean(Y0[W==0]/time[W==0])
return(re)
}
CRTsearch(nrep=1e4, nt=10, nc=10, dataset=synFluData, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
We may also be interested in the effect of the intervention on the hospitalization rate. For example, suppose the intervention in the new study is expected to reduce the patient-level hospitalization rate by at most 0.5; the potential outcome under intervention can then be simulated from the following \(f_1\), where \(\lambda_{ij}(0)\) is the patient's observed hospitalization rate and \(t_{ij}\) the follow-up time: \[f_1: Y_{ij}(1) \sim Poisson\big(\max(\lambda_{ij}(0)-0.5, 0)\, t_{ij}\big)\]
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
lambda0 = data[,"rate"]
lambda1 = lambda0 - 0.5 ## effect size = -0.5
lambda1[lambda1<0] = 0
Y1 = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
Y0 = data[,"Y0"]
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1) and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the pooled rates between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
time = data[,"time_years"]
## pooled rates
re = sum(Y1[W==1])/sum(time[W==1]) - sum(Y0[W==0])/sum(time[W==0])
## person-level rates
##re = mean(Y1[W==1]/time[W==1]) - mean(Y0[W==0]/time[W==0])
return(re)
}
CRTsearch(nrep=1e4, nt=10, nc=10, dataset=synFluData, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
Y0 = data[,"Y0"]
lambda0 = data[,"rate"]
lambda1 = lambda0 - 0.4 ## effect size = -0.4
lambda1[lambda1<0] = 0
Y1 = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1), the cluster ID, and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the estimated intervention effect from a generalized linear mixed-effects model.
outcome = data[,"Y1"]*W + data[,"Y0"]*(1-W)
AnaData = data.frame(cbind(outcome, W, data$cID, data$time_years))
preg = lme4::glmer(outcome ~ W + (1|V3), family=poisson, offset=V4, nAGQ=1, data=AnaData)
re = coef(preg)$V3$W[1]
return(re)
}
CRTsearch(nrep=1e3, nt=15, nc=15, dataset=synFluData, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
When the test statistic is the effect of the intervention estimated from a generalized estimating equation (GEE), i.e. \(\beta\) from the following model:
\[Y_{ij}^{obs} = Y_{ij}(0)(1-W_{i}) + Y_{ij}(1) W_{i}\]
\[E(Y_{ij}^{obs}/t_{ij}) = \mu_{ij}; \quad g(\mu_{ij}) = log(\mu_{ij}) = \alpha + W_{i}\beta\] \[\sum_{i=1}^{n}\Big( \frac{\partial \mu_{i}}{\partial \beta} \Big)^T V_i^{-1}\big(Y_i^{obs} - \mu_i\big) = 0\] \[\sum_{i=1}^{n}\Big( \frac{\partial \mu_{i}}{\partial \alpha} \Big)^T V_i^{-1}\big(Y_i^{obs} - \mu_i\big) = 0 \] \(W_{i}\) is the indicator for intervention assignment at the cluster level, \(g(\cdot)\) is the log link function, \(V_i\) is the working covariance matrix of \(Y^{obs}_{i}\), \(\alpha\) is a constant unknown auxiliary parameter and \(\beta\) is the parameter of interest.
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
Y0 = data[,"Y0"]
lambda0 = data[,"rate"]
lambda1 = lambda0 - 0.4 ## effect size = -0.4
lambda1[lambda1<0] = 0
Y1 = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1), the cluster ID, and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the estimated intervention effect from a GEE model.
outcome = data[,"Y1"]*W + data[,"Y0"]*(1-W)
preg = gee::gee(outcome ~ W + offset(data$time_years), id=data$cID, family = "poisson", corstr="exchangeable", silent=TRUE)
re = preg$coef[2]
return(re)
}
CRTsearch(nrep=1e3, nt=15, nc=15, dataset=synFluData, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
As the flu study data suggests, the percentage of African Americans varies among nursing homes (clusters).
library(ggplot2)
#> Registered S3 methods overwritten by 'ggplot2':
#> method from
#> [.quosures rlang
#> c.quosures rlang
#> print.quosures rlang
racedist = synFluData[!duplicated(synFluData$clusterID),c("clusterID","race_aa_pct")]
ggplot(data=racedist, aes(x=race_aa_pct))+
geom_histogram(binwidth=0.01, alpha=0.7)+
xlab("Percentage of African Americans per Nursing Home")+
theme_bw()+
geom_vline(xintercept=quantile(racedist$race_aa_pct,0.25))+
geom_vline(xintercept=quantile(racedist$race_aa_pct,0.5))+
geom_vline(xintercept=quantile(racedist$race_aa_pct,0.75))
Suppose the new study adopts stratified randomization, i.e. randomizing sampled clusters within each stratum defined by the percentage of African Americans in the nursing home; CRTsearch should then be called with the stratifyBy argument as follows:
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
lambda0 = data[,"rate"]
lambda1 = lambda0 - 0.5 ## effect size = -0.5
lambda1[lambda1<0] = 0
Y1 = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
Y0 = data[,"Y0"]
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1) and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the pooled rates between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
time = data[,"time_years"]
## pooled rates
re = sum(Y1[W==1])/sum(time[W==1]) - sum(Y0[W==0])/sum(time[W==0])
## person-level rates
##re = mean(Y1[W==1]/time[W==1]) - mean(Y0[W==0]/time[W==0])
return(re)
}
CRTsearch(nrep=1e4, nt=15, nc=15, dataset=synFluData, outcome="count", clusterID="clusterID", stratifyBy="race_aa_strata", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
The flu study demonstrated that the hospitalization counts include more zeros than expected under a Poisson distribution. We can fit a zero-inflated Poisson (ZIP) model for each nursing home.
# estimating the amount of zero-inflation per cluster
clusterID = unique(synFluData$clusterID)
for(i in 1:816){
dtmp = synFluData[synFluData$clusterID==clusterID[i],]
clusterSize = dim(dtmp)[1]
synFluData$clusterSize[synFluData$clusterID==clusterID[i]] = clusterSize
if(sum(dtmp$count)==0){
synFluData$clusterP0[synFluData$clusterID==clusterID[i]] = 0
synFluData$clusterLambda0[synFluData$clusterID==clusterID[i]] = 0
}else{
fit = pscl::zeroinfl(count ~ offset(log(time_years)) , data=dtmp, dist="poisson", link="logit")
clusterP0 = 1/(1+exp(-fit$coefficients$zero))
clusterLambda0 = exp(fit$coefficients$count)
synFluData$clusterP0[synFluData$clusterID==clusterID[i]] = clusterP0
synFluData$clusterLambda0[synFluData$clusterID==clusterID[i]] = clusterLambda0
}
}
head(synFluData)
#> clusterID patientID rate count time_days time_years race_aa race_aa_pct
#> 1 1 1 0 0 211 0.5776865 0 0.164557
#> 2 1 2 0 0 211 0.5776865 0 0.164557
#> 3 1 3 0 0 211 0.5776865 0 0.164557
#> 4 1 4 0 0 211 0.5776865 0 0.164557
#> 5 1 5 0 0 211 0.5776865 0 0.164557
#> 6 1 6 0 0 211 0.5776865 0 0.164557
#> race_aa_strata clusterSize clusterP0 clusterLambda0
#> 1 3 79 0.822865 0.3443982
#> 2 3 79 0.822865 0.3443982
#> 3 3 79 0.822865 0.3443982
#> 4 3 79 0.822865 0.3443982
#> 5 3 79 0.822865 0.3443982
#> 6 3 79 0.822865 0.3443982
Proportions of zero-inflation vary across nursing homes.
synfluZIP = synFluData[!duplicated(synFluData$clusterID),c("clusterID","clusterP0","clusterLambda0")]
ggplot(data=synfluZIP, aes(x=clusterP0))+
geom_histogram(binwidth=0.005, alpha=0.7)+
xlab("Estimated proportion of zero-inflation from a ZIP model among Nursing Homes")+
theme_bw()
Hospitalization rates after adjusting for the zero-inflation also vary across nursing homes.
ggplot(data=synfluZIP, aes(x=clusterLambda0))+
geom_histogram(binwidth=0.01, alpha=0.7)+
xlab("Estimated hospitalization rate from a ZIP model among Nursing Homes")+
theme_bw()
Suppose the intervention is expected to reduce the number of hospitalizations for the “sub-population” of patients who would ever experience hospitalization, i.e. the Poisson part of the ZIP model; the potential outcome under intervention can then be simulated from the following \(f_1\): \[f_1: Y_{ij}(1) \sim ZIP\big(P_i, \max(\lambda_i-0.1, 0)\big)\] \(P_i\) and \(\lambda_i\) are the proportion of zero-inflation and the event rate estimated using a ZIP model for nursing home \(i\).
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
Y0 = data[,"Y0"]
lambda1 = data[,"clusterLambda0"] - 0.1 ## effect size = -0.1
lambda1 = ifelse(lambda1<=0,0,lambda1)
clusterP1 = data[,"clusterP0"]
Y1 = sapply(lambda1*data[,"time_years"], rpois, n=1)
zerocount = sapply(1-clusterP1,rbinom,n=1,size=1)
Y1 = Y1*zerocount
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1) and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the pooled rates between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
time = data[,"time_years"]
## pooled rates
re = sum(Y1[W==1])/sum(time[W==1]) - sum(Y0[W==0])/sum(time[W==0])
## person-level rates
##re = mean(Y1[W==1]/time[W==1]) - mean(Y0[W==0]/time[W==0])
return(re)
}
CRTsearch(nrep=1e4, nt=10, nc=10, dataset=synFluData, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
Y0 = data[,"Y0"]
lambda0 = data[,"clusterLambda0"]
lambda1 = lambda0 - 0.1 ## effect size = -0.1
lambda1[lambda1<0] = 0
Y11 = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
Y10 = unlist(lapply(1-data[,"clusterP0"], rbinom, size=1, n=1)) ## 1 with probability 1-P0, i.e. the non-zero-inflated part
Y1 = Y11*Y10
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1), the cluster ID, and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the estimated intervention effect from a GEE model.
outcome = data[,"Y1"]*W + data[,"Y0"]*(1-W)
preg = gee::gee(outcome ~ W + offset(data$time_years), id=data$cID, family = "poisson", corstr="exchangeable", silent=TRUE)
re = preg$coef[2]
return(re)
}
CRTsearch(nrep=1e3, nt=15, nc=15, dataset=synFluData, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
Y0 = data[,"Y0"]
lambda0 = data[,"clusterLambda0"]
lambda1 = lambda0 - 0.1 ## effect size = -0.1
lambda1[lambda1<0] = 0
Y11 = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
Y10 = unlist(lapply(1-data[,"clusterP0"], rbinom, size=1, n=1)) ## 1 with probability 1-P0, i.e. the non-zero-inflated part
Y1 = Y11*Y10
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1), the cluster ID, and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the estimated intervention effect from a generalized linear mixed-effects model.
outcome = data[,"Y1"]*W + data[,"Y0"]*(1-W)
AnaData = data.frame(cbind(outcome, W, data$cID, data$time_years))
preg = lme4::glmer(outcome ~ W + (1|V3), family=poisson, offset=V4, nAGQ=1, data=AnaData)
re = coef(preg)$V3$W[1]
return(re)
}
CRTsearch(nrep=1e3, nt=15, nc=15, dataset=synFluData, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
When the available data from previous studies include only a small sample of clusters (in terms of the number of clusters, not the total number of individual units), sampling with replacement assumes that the study population of the new study is infinite, whereas sampling without replacement assumes that the study population of the new study is finite. To demonstrate the difference, we take a smaller data set of 16 nursing homes from the original 816.
set.seed(2019)
subset16 = sample(unique(synFluData$clusterID),16)
synFluData16 = synFluData[synFluData$clusterID %in% subset16,]
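The two sampling schemes can be illustrated with base R's sample() applied to the 16 cluster IDs; this is a toy sketch only, since CRTsearch and simPower perform the actual cluster resampling internally through their replacement argument.
ids16 = unique(synFluData16$clusterID)
sample(ids16, size=20, replace=TRUE) ## with replacement: clusters may repeat, so any number of clusters can be drawn
sample(ids16, size=16, replace=FALSE) ## without replacement: at most the 16 observed clusters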
The power for total numbers of clusters from 10 to 20 is calculated in the following way. Note that when sampling without replacement, the maximum number of clusters is 16.
sim_potential_outcomes = function(data, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
lambda0 = data[,"rate"]
lambda1 = lambda0 - 0.5 ## effect size = -0.5
lambda1[lambda1<0] = 0
Y1 = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
Y0 = data[,"Y0"]
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1) and the follow-up time for all individuals, ordered by their cluster ID.
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the pooled rates between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
time = data[,"time_years"]
## pooled rates
re = sum(Y1[W==1])/sum(time[W==1]) - sum(Y0[W==0])/sum(time[W==0])
## person-level rates
##re = mean(Y1[W==1]/time[W==1]) - mean(Y0[W==0]/time[W==0])
return(re)
}
power16 = NULL
for(n in 5:10){
if(n*2>16){
power16_finite = NA
power16_finite_PT = NA
}else{
power16_finite = simPower(nrep=1e4, nt=n, nc=n, dataset=synFluData16, outcome="count", clusterID="clusterID", replacement=FALSE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
power16_finite_PT = simPowerPT(nrep=1e4, nt=n, nc=n, dataset=synFluData16, outcome="count", clusterID="clusterID", replacement=FALSE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
}
power16_infinite = simPower(nrep=1e4, nt=n, nc=n, dataset=synFluData16, outcome="count", clusterID="clusterID", replacement=TRUE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat)
power16_i = cbind(n*2, power16_finite, power16_finite_PT, power16_infinite)
colnames(power16_i) = list("ntotal", "power16_finite", "power16_finite_PT", "power16_infinite")
power16 = rbind(power16, power16_i)
}
power16
When the study population is small, it might be interesting to investigate whether extending the follow-up time will increase the study power, especially when individual units are followed for different amounts of time due to loss to follow-up. Suppose the intervention in the new study is expected to reduce the rate of hospitalization by at most 0.2; the potential outcomes with extended follow-up time \(\Delta t\) can be simulated as follows:
\[f_0: Y_{ij}(0) \sim \big( \lambda_0 * t + Poisson(\lambda_0*\Delta t) \big) / \big( t + \Delta t\big) \]
\[f_1: Y_{ij}(1) \sim \big( max(\lambda_0-0.2,0) * t + Poisson(max(\lambda_0-0.2,0)*\Delta t) \big) / \big( t + \Delta t\big)\]
If the previous study data suggest that a patient was not followed until the end of the study, it does not make sense to extend the follow-up time for that patient, so \(\Delta t = 0\) for patients who were not followed for the full 211 days in the flu study. Users should add a lost-to-follow-up indicator to the data and pass the additional follow-up time through the extendtime argument, as follows:
sim_potential_outcomes = function(data, extendtime=0, ...){
# input:
# data: a data frame of the sampled trial data with at least two columns (clusterID, Y0), giving each individual's cluster membership and the empirical potential outcome Y0.
# extendtime: additional follow-up time (in years), passed through from the simPower call below; defaults to 0
# output: a (Nm x 2) matrix of the two potential outcomes for every individual in the trial, ordered by cluster ID.
data = data[order(data$clusterID),]
Y0count = data[,"Y0"]
lambda0 = data[,"rate"]
lambda1 = lambda0 - 0.4 ## effect size = -0.4
lambda1[ lambda1 < 0 ] = 0
Y0count_extend = unlist(lapply(lambda0*extendtime, rpois, n=1)) ## extra counts over the extended follow-up time
Y1count = unlist(lapply(lambda1*data[,"time_years"], rpois, n=1))
Y1count_extend = unlist(lapply(lambda1*extendtime, rpois, n=1))
Y1 = (Y1count + Y1count_extend*(1-data[,"lostfollowup"]))
Y0 = (Y0count + Y0count_extend*(1-data[,"lostfollowup"]))
re = cbind(Y0, Y1)
colnames(re) = list("Y0", "Y1")
return(re)
}
calc_teststat = function(W, data, extendtime=0, ...){
# input:
# W: a vector of length N (the total number of clusters in the trial) composed of 1s and 0s indicating the randomization assignment to the intervention arm and the control arm, respectively.
# data: the sampled trial data, including the two potential outcomes (Y0, Y1), the follow-up time, and the lost-to-follow-up indicator for all individuals, ordered by their cluster ID.
# extendtime: additional follow-up time (in years), passed through from the simPower call below; defaults to 0
# output: a numeric object for the value of the calculated test statistic. Here, it is the difference of the pooled rates between the two study arms.
Y0 = data[,"Y0"]
Y1 = data[,"Y1"]
time = data[,"time_years"] + extendtime*(1-data[,"lostfollowup"]) ## follow-up extended only for patients not lost to follow-up
## pooled rates
re = sum(Y1[W==1])/sum(time[W==1]) - sum(Y0[W==0])/sum(time[W==0])
## person-level rates
##re = mean(Y1[W==1]/time[W==1]) - mean(Y0[W==0]/time[W==0])
return(re)
}
synFluData16$lostfollowup = ifelse(synFluData16$time_days < 211, 1 , 0)
power_extendtime=NULL
addtime = 0
for(addtime in seq(0, 1, 0.25)){
power_i = simPower(nrep=1e2, nt=8, nc=8, dataset=synFluData16, outcome="count", clusterID="clusterID", replacement=FALSE, FUN_Ys=sim_potential_outcomes, FUN_TestStat=calc_teststat, extendtime=addtime)
power_extendtime = rbind(power_extendtime, cbind(addtime, power_i))
}
power_extendtime