###############################################################
#
# R Routines for Simulating and Calibrating Inference on
# PPV or NPV for Diagnostic Methods
#
# Following the paper by Steinberg, Fine and Chappell
#
###############################################################

###################################################
#
# All routines are written in terms of PPV.
# Reversing the roles of the case and control
# data and parameters lets you use them to
# study how the methods work for NPV.
#
###################################################

#############################################
#
# Simulate data and check inference
# =================================
#
# INPUT parameters are:
#   nca  = number of cases in study
#   nco  = number of healthy subjects in study
#   se   = actual sensitivity
#   sp   = actual specificity
#   nsim = the number of simulation runs
#
# OUTPUT focuses on the ability to estimate
#   phi = log(1-sp) - log(se)
#
# The value of phi is printed on the screen.
#
# Note that problems arise for estimating phi from a sample in
# which se-hat = 0 (no cases test positive) or
# sp-hat = 1 (no controls test positive).
#
# The output summaries below (average, SD, etc.) are
# computed ONLY from simulated samples in which the above
# problem does NOT occur.
#
# The program prints the value of phi and returns
# a vector with the following elements:
#   -- Average of phi-hat
#   -- SD of phi-hat over the simulation samples
#   -- Coverage fraction for the "standard" 95% lower confidence
#      bound (phi-hat - 1.645*estimated SE of phi-hat)
#   -- Multiplier needed to achieve 95% coverage (i.e. the smallest
#      number k such that phi-hat - k*estimated SE of phi-hat
#      achieves 95% coverage of the true value)
#   -- Fraction of simulation samples with either se-hat = 0 or
#      sp-hat = 1.
#
###############################################################

sim.phi.hat <- function(nca, nco, se, sp, nsim){
  phi <- log(1-sp) - log(se)
  print("phi is")
  print(phi)
  # Simulate estimated sensitivity and specificity for each run.
  se.hat <- rbinom(nsim, nca, se)/nca
  sp.hat <- rbinom(nsim, nco, sp)/nco
  # Drop degenerate samples for which phi-hat is undefined.
  sel <- ((se.hat == 0) | (sp.hat == 1))
  phi.hat <- log(1-sp.hat[!sel]) - log(se.hat[!sel])
  se.phi.hat <- sqrt(((1-se.hat[!sel])/(se.hat[!sel]*nca)) +
                     (sp.hat[!sel]/((1-sp.hat[!sel])*nco)))
  # Coverage of the standard 95% lower confidence bound.
  cb95 <- phi.hat - qnorm(0.95, 0, 1)*se.phi.hat
  coverage95 <- sum(cb95 <= phi)/length(cb95)
  # Determine the multiplier needed for 95% coverage.
  if (coverage95 >= 0.95){
    mult <- 1.65
    check <- 1
    while (check == 1){
      mult <- mult - 0.01
      cb <- phi.hat - mult*se.phi.hat
      cov <- sum(cb <= phi)/length(cb)
      check <- (cov >= 0.95)
    }
    mult <- mult + 0.01
  }
  if (coverage95 < 0.95){
    mult <- 1.65
    check <- 0
    while (check == 0){
      mult <- mult + 0.01
      cb <- phi.hat - mult*se.phi.hat
      cov <- sum(cb <= phi)/length(cb)
      check <- (cov >= 0.95)
    }
  }
  phi.sum <- c(mean(phi.hat), sd(phi.hat), coverage95, mult, sum(sel)/nsim)
  return(phi.sum)
}
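# Example call (illustrative only; the values below are hypothetical
# and not taken from the paper). With 50 cases, 50 controls, se = 0.8,
# sp = 0.9 and 10000 simulation runs, the returned vector gives the
# mean and SD of phi-hat, the coverage of the standard 95% bound, the
# calibrated multiplier, and the fraction of degenerate samples.
#
# set.seed(123)
# sim.phi.hat(nca = 50, nco = 50, se = 0.8, sp = 0.9, nsim = 10000)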
#########################################################
#
# Compute optimal allocation and sample
# size for studying PPV only
# =====================================
#
# The goal here is to find a sample allocation and size such
# that a lower (1-alpha) confidence bound for PPV is greater
# than or equal to (1-gamma) with a given power.
#
# INPUT parameters are:
#   z1    = multiplier for the lower confidence bound
#   pow   = desired power
#   se    = assumed sensitivity
#   sp    = assumed specificity
#   p     = prevalence of condition in test population
#   gamma = 1 - lower bound desired for PPV
#
# As a default, z1 can be taken as the normal quantile for the
# desired confidence level. Alternatively, the function sim.phi.hat
# can be used to produce a multiplier with the desired (simulated)
# coverage.
#
# OUTPUT vector gives
#   -- Fraction Pca of subjects that should be cases
#   -- Total sample size
#
# These values are also printed to the screen.
#
###############################################################

sam.ppv <- function(z1, pow, se, sp, p, gamma){
  # Optimal case fraction minimizes the variance of phi-hat.
  r <- (1-se)*(1-sp)/(se*sp)
  Pca <- sqrt(r)/(1+sqrt(r))
  sigsq <- (1-se)/(Pca*se) + sp/((1-sp)*(1-Pca))
  phi0 <- log(1-sp) - log(se)
  # Sanity checks on gamma relative to the prevalence and the
  # assumed accuracy.
  check1 <- (gamma < (1-p))
  if (check1 == 0) {print("gamma is too big")}
  check2 <- (gamma*(p+(1-p)*exp(phi0)) > ((1-p)*exp(phi0)))
  if (check2 == 0) {print("gamma is too small")}
  if ((check1 == 1) & (check2 == 1)){
    n <- (z1+qnorm(pow,0,1))^2*sigsq/(phi0-log(p*gamma/((1-p)*(1-gamma))))^2
    n <- ceiling(n)
    print("The optimal fraction of diseased subjects is")
    print(Pca)
    print("The total sample size is")
    print(n)
    return(c(Pca, n))
  }
}
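# Example call (illustrative only; hypothetical design values, not
# from the paper). Suppose the test is expected to have se = 0.8 and
# sp = 0.95, the prevalence is 0.10, and we want a 95% lower
# confidence bound (z1 = qnorm(0.95)) to exceed a PPV of 0.50
# (gamma = 0.50) with power 0.80. The call returns the optimal case
# fraction and the total sample size.
#
# sam.ppv(z1 = qnorm(0.95), pow = 0.80, se = 0.8, sp = 0.95,
#         p = 0.10, gamma = 0.50)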
#########################################################
#
# Compute optimal allocation and sample
# size for studying PPV and NPV
# =====================================
#
# The goal here is to find a sample allocation and size such
# that the lower (1-alpha) confidence bounds for BOTH PPV and
# NPV are greater than or equal to their targets, (1-gammap)
# and (1-gamman), each with its own desired power.
#
# INPUT parameters are:
#   z1p    = multiplier for the lower confidence bound for PPV
#   powp   = desired power for PPV
#   z1n    = multiplier for the lower confidence bound for NPV
#   pown   = desired power for NPV
#   se     = assumed sensitivity
#   sp     = assumed specificity
#   p      = prevalence of condition in test population
#   gammap = 1 - lower bound desired for PPV
#   gamman = 1 - lower bound desired for NPV
#
# As a default, z1p and z1n can be taken as the normal quantiles
# for the desired confidence levels. Alternatively, the function
# sim.phi.hat can be used to produce multipliers with the desired
# (simulated) coverage.
#
# OUTPUT vector gives
#   -- Fraction Pca of subjects that should be cases
#   -- Total sample size
#
# These values are also printed to the screen.
#
###############################################################

sam.ppv.npv <- function(z1p, powp, z1n, pown, se, sp, p, gammap, gamman){
  phi1.0 <- log(1-sp) - log(se)   # phi for PPV
  phi2.0 <- log(1-se) - log(sp)   # phi for NPV
  # Sanity checks on gammap and gamman relative to the prevalence
  # and the assumed accuracy.
  check1 <- (gammap < (1-p))
  if (check1 == 0) {print("gamma for PPV is too big")}
  check2 <- (gammap*(p+(1-p)*exp(phi1.0)) > ((1-p)*exp(phi1.0)))
  if (check2 == 0) {print("gamma for PPV is too small")}
  check3 <- (gamman < p)
  if (check3 == 0) {print("gamma for NPV is too big")}
  check4 <- (exp(phi2.0)*p*(1-gamman) < (1-p)*gamman)
  if (check4 == 0) {print("gamma for NPV is too small")}
  if ((check1 == 1) & (check2 == 1) & (check3 == 1) & (check4 == 1)){
    r <- (1-se)*(1-sp)/(se*sp)
    PcaP <- sqrt(r)/(1+sqrt(r))   # allocation that is optimal for PPV
    PcaN <- 1-PcaP                # allocation that is optimal for NPV
    # The optimal P is sought by comparing the separate sample
    # size functions (for PPV and NPV) on a grid of possible values
    # between PcaP and PcaN.
    P <- seq(PcaP, PcaN, (PcaN-PcaP)/200)
    sigsq1 <- (1-se)/(P*se) + sp/((1-sp)*(1-P))
    sigsq2 <- se/((1-se)*P) + (1-sp)/(sp*(1-P))
    np <- (z1p+qnorm(powp,0,1))^2*sigsq1/(phi1.0-log(p*gammap/((1-p)*(1-gammap))))^2
    nn <- (z1n+qnorm(pown,0,1))^2*sigsq2/(phi2.0-log((1-p)*gamman/(p*(1-gamman))))^2
    # Across the grid np is nondecreasing and nn is nonincreasing
    # (np is minimized at PcaP, nn at PcaN), so the binding constraint
    # is determined by the endpoints or by the crossing point.
    check5 <- (np[1] > nn[1])
    check6 <- (np[201] > nn[201])
    if ((check5 == 1) & (check6 == 1)){
      # PPV drives the sample size everywhere on the grid.
      Popt <- P[1]
      n <- ceiling(np[1])
    }
    if ((check5 == 0) & (check6 == 0)){
      # NPV drives the sample size everywhere on the grid.
      Popt <- P[201]
      n <- ceiling(nn[201])
    }
    if ((check5 == 0) & (check6 == 1)){
      # The curves cross; take the grid point where they are closest.
      ind <- which.min(abs(nn-np))
      Popt <- P[ind]
      n <- ceiling(max(np[ind], nn[ind]))
      print(cbind(np, nn))
    }
    print("The optimal fraction of diseased subjects is")
    print(Popt)
    print("The total sample size is")
    print(n)
    return(c(Popt, n))
  }
}

#####################################################
#
# Simulate data and compare performance for PPV
# for a collection of possible allocations
# =============================================
#
# INPUT parameters are:
#   P    = vector with possible allocations (proportion of cases)
#   n    = total sample size
#   se   = actual sensitivity
#   sp   = actual specificity
#   nsim = the number of simulation runs
#
# OUTPUT focuses on the ability to estimate
#   phi = log(1-sp) - log(se)
#
# The value of phi is printed on the screen.
#
# Note that problems arise for estimating phi from a sample in
# which se-hat = 0 (no cases test positive) or
# sp-hat = 1 (no controls test positive).
#
# The output summaries below (average, SD, etc.) are
# computed ONLY from simulated samples in which the above
# problem does NOT occur.
#
# The program prints the value of phi and returns a matrix with
# the following columns. Each row of the matrix corresponds to
# one of the allocations.
#   -- The allocation
#   -- Average of phi-hat
#   -- SD of phi-hat over the simulation samples
#   -- Coverage fraction for the "standard" 95% lower confidence
#      bound (phi-hat - 1.645*estimated SE of phi-hat)
#   -- Multiplier needed to achieve 95% coverage (i.e. the smallest
#      number k such that phi-hat - k*estimated SE of phi-hat
#      achieves 95% coverage of the true value)
#   -- Fraction of simulation samples with either se-hat = 0 or
#      sp-hat = 1.
#
# Note: It is assumed that the actual sample sizes are the
# nearest integers to those computed directly from the allocation.
#
###############################################################

sim.ppv <- function(P, se, sp, n, nsim){
  nca <- round(n*P)
  nco <- n-nca
  phi <- log(1-sp) - log(se)
  out <- matrix(0, nrow = length(P), ncol = 6)
  for (i in 1:length(nca)){
    # Simulate estimated sensitivity and specificity for allocation i.
    se.hat <- rbinom(nsim, nca[i], se)/nca[i]
    sp.hat <- rbinom(nsim, nco[i], sp)/nco[i]
    # Drop degenerate samples for which phi-hat is undefined.
    sel <- ((se.hat == 0) | (sp.hat == 1))
    phi.hat <- log(1-sp.hat[!sel]) - log(se.hat[!sel])
    se.phi.hat <- sqrt(((1-se.hat[!sel])/(se.hat[!sel]*nca[i])) +
                       (sp.hat[!sel]/((1-sp.hat[!sel])*nco[i])))
    cb95 <- phi.hat - qnorm(0.95, 0, 1)*se.phi.hat
    coverage95 <- sum(cb95 <= phi)/length(cb95)
    # Determine the multiplier needed for 95% coverage.
    if (coverage95 >= 0.95){
      mult <- 1.65
      check <- 1
      while (check == 1){
        mult <- mult - 0.01
        cb <- phi.hat - mult*se.phi.hat
        cov <- sum(cb <= phi)/length(cb)
        check <- (cov >= 0.95)
      }
      mult <- mult + 0.01
    }
    if (coverage95 < 0.95){
      mult <- 1.65
      check <- 0
      while (check == 0){
        mult <- mult + 0.01
        cb <- phi.hat - mult*se.phi.hat
        cov <- sum(cb <= phi)/length(cb)
        check <- (cov >= 0.95)
      }
    }
    phi.sum <- c(P[i], mean(phi.hat), sd(phi.hat), coverage95, mult, sum(sel)/nsim)
    out[i,] <- phi.sum
  }
  print("phi is")
  print(phi)
  return(out)
}
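# Example calls (illustrative only; the design values below are
# hypothetical and not taken from the paper).
#
# Joint PPV/NPV design: se = 0.8, sp = 0.95, prevalence 0.10, with
# 95% lower confidence bounds (multipliers qnorm(0.95)) required to
# exceed a PPV of 0.50 (gammap = 0.50) and an NPV of 0.95
# (gamman = 0.05), each with power 0.80.
#
# sam.ppv.npv(z1p = qnorm(0.95), powp = 0.80,
#             z1n = qnorm(0.95), pown = 0.80,
#             se = 0.8, sp = 0.95, p = 0.10,
#             gammap = 0.50, gamman = 0.05)
#
# Simulation check of the PPV inference for a few candidate
# allocations at a total sample size of, say, 500.
#
# set.seed(123)
# sim.ppv(P = c(0.10, 0.25, 0.50), se = 0.8, sp = 0.95,
#         n = 500, nsim = 10000)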