# 1. Specify simulation parameters # --------------------------------- # 1.a "Study" parameters (with default values) # p.A <- 0.30 # Probability of genetic marker A # p.cure.X.B <- 0.40 # Treatment Y cure probability for patients with genetic marker A # p.cure.Y.B <- 0.40 # Treatment Y cure probability for patients with genetic marker B # p.cure.X.A <- 0.25 # Treatment X cure probability for patients with genetic marker A # p.cure.Y.A <- 0.25 # Treatment X cure probability for patients with genetic marker B # 1.b Design parameters # n <- 100 # Total sample size # p.alloc.X.B <- 0.50 # Treatment X allocation probability for patients with genetic marker B # p.alloc.X.A <- 0.50 # Treatment X allocation probability for patients with genetic marker A # 2. Aux functions # ---------------- generate.data <- function(n = 1000, p.cure.X.B = 0.40, p.cure.Y.B = 0.40, p.cure.X.A = 0.25,p.cure.Y.A = 0.25, p.A = 0.30, p.alloc.X.B = 0.50,p.alloc.X.A = 0.50) { data.matrix <- array(dim=c(n,3)) dimnames(data.matrix) <- list(paste("obs - ",1:n,sep = ""),c("Marker","Treatment","Cure")) data.matrix[,1] <- sample(c("A","B"),n,replace = T, prob = c(p.A,1-p.A)) is.A <- data.matrix[,1] == "A" is.B <- !is.A data.matrix[is.A,2] <- sample(c("X","Y"),sum(is.A),replace = T, prob = c(p.alloc.X.A,1-p.alloc.X.A)) data.matrix[is.B,2] <- sample(c("X","Y"),sum(is.B),replace = T, prob = c(p.alloc.X.B,1-p.alloc.X.B)) is.X <- data.matrix[,2] == "X" p.cure <- rep(NA,n) p.cure[is.X & is.A] <- p.cure.X.A p.cure[is.X & is.B] <- p.cure.X.B p.cure[!is.X & is.A] <- p.cure.Y.A p.cure[!is.X & is.B] <- p.cure.Y.B data.matrix[,3] <- rbinom(n,1,p.cure) return(data.matrix) } z.rate.diff <- function(x.vec,y.vec) { n.x <- length(x.vec) n.y <- length(y.vec) p.hat.x <- mean(as.numeric(x.vec)) p.hat.y <- mean(as.numeric(y.vec)) var.hat.x <- p.hat.x*(1-p.hat.x)/n.x var.hat.y <- p.hat.y*(1-p.hat.y)/n.y z.diff <- (p.hat.x - p.hat.y) / sqrt(var.hat.x+var.hat.y) return(z.diff) } # 3. 1st simulation -- determine sample size needed to ensure 0.80 power for 1st research question # --------------------------------------------------------------------------------------------------- n.vec <- seq(100,600,by = 10) pow.vec <- rep(0,51) sig.vec <- rep(0,51) for(i in 1:51) { print(i) for(sim in 1:1000) { data.0 <- generate.data(n = n.vec[i], p.cure.X.B = 0.32, p.cure.Y.B = 0.32, p.cure.X.A = 0.32,p.cure.Y.A = 0.32) is.A <- data.0[,1] == "A" is.B <- data.0[,1] == "B" is.X <- data.0[,2] == "X" is.Y <- data.0[,2] == "Y" z.diff.B.A.X <- z.rate.diff(data.0[is.B & is.X,3],data.0[is.A & is.X,3]) z.diff.B.A.Y <- z.rate.diff(data.0[is.B & is.Y,3],data.0[is.A & is.Y,3]) z.diff.B.A <- (z.diff.B.A.X + z.diff.B.A.Y) / sqrt(2) if(qnorm(0.95) <= z.diff.B.A) sig.vec[i] <- sig.vec[i] + 1/1000 data.1 <- generate.data(n = n.vec[i], p.cure.X.B = 0.40, p.cure.Y.B = 0.40, p.cure.X.A = 0.25,p.cure.Y.A = 0.25) is.A <- data.1[,1] == "A" is.B <- data.1[,1] == "B" is.X <- data.1[,2] == "X" is.Y <- data.1[,2] == "Y" z.diff.B.A.X <- z.rate.diff(data.1[is.B & is.X,3],data.1[is.A & is.X,3]) z.diff.B.A.Y <- z.rate.diff(data.1[is.B & is.Y,3],data.1[is.A & is.Y,3]) z.diff.B.A <- (z.diff.B.A.X + z.diff.B.A.Y) / sqrt(2) if(qnorm(0.95) <= z.diff.B.A) pow.vec[i] <- pow.vec[i] + 1/1000 } } par(mfcol=c(1,1)) plot(n.vec,pow.vec,type = "b",col = 3,ylim = c(0,1)) lines(n.vec,sig.vec, type = "b",col=2) abline(h= 0.05,lty = 2) abline(h= 0.80,lty = 2) abline(h= 1,lty = 2) # 3.1 Find sample size needed for power = 0.80 lowess.pow <- lowess(n.vec,pow.vec,f=0.3) lines(lowess.pow,col = 3) (n.80 <- ceiling(approx(lowess.pow$y, lowess.pow$x,xout = 0.80)$y)) # 4. 2nd simulation -- compare 4 different statistics for a fixed sample size of 300 patients # --------------------------------------------------------------------------------------------- n0.A.vec <- rep(NA,10^4) n1.A.vec <- rep(NA,10^4) z0.diff.B.A.vec <- rep(NA,10^4) z1.diff.B.A.vec <- rep(NA,10^4) diff0.B.A.succ.prop <- rep(NA,10^4) diff1.B.A.succ.prop <- rep(NA,10^4) z0.diff.B.A.succ.prop <- rep(NA,10^4) z1.diff.B.A.succ.prop <- rep(NA,10^4) smp0.log.LRT <- rep(NA,10^4) smp1.log.LRT <- rep(NA,10^4) for(i in 1:(10^4)) { if( i%%1000 == 1) print(i) data.0 <- generate.data(n = 300, p.cure.X.B = 0.32, p.cure.Y.B = 0.32, p.cure.X.A = 0.32,p.cure.Y.A = 0.32) is.A <- data.0[,1] == "A" is.B <- data.0[,1] == "B" is.X <- data.0[,2] == "X" is.Y <- data.0[,2] == "Y" is.cure <- data.0[,3] == "1" n0.A.vec[i] <- sum(is.A) diff0.B.A.succ.prop[i] <- (sum(is.B & is.cure) / sum(is.B)) - ( sum(is.A & is.cure) / sum(is.A)) z0.diff.B.A.succ.prop[i]<- z.rate.diff(data.0[is.B,3],data.0[is.A,3]) log.lik0 <- (is.cure*log(0.32)) + ((1-is.cure)*log(0.68)) log.lik1 <- (is.cure*is.A *log(0.25)) + ((1-is.cure)*is.A *log(0.75)) + (is.cure*is.B *log(0.40)) + ((1-is.cure)*is.B *log(0.60)) smp0.log.LRT[i] <- sum(log.lik1 - log.lik0) z.diff.B.A.X <- z.rate.diff(data.0[is.B & is.X,3],data.0[is.A & is.X,3]) z.diff.B.A.Y <- z.rate.diff(data.0[is.B & is.Y,3],data.0[is.A & is.Y,3]) z0.diff.B.A.vec[i] <- (z.diff.B.A.X + z.diff.B.A.Y) / sqrt(2) data.1 <- generate.data(n = 300, p.cure.X.B = 0.40, p.cure.Y.B = 0.40, p.cure.X.A = 0.25,p.cure.Y.A = 0.25) is.A <- data.1[,1] == "A" is.B <- data.1[,1] == "B" is.X <- data.1[,2] == "X" is.Y <- data.1[,2] == "Y" is.cure <- data.1[,3] == "1" n1.A.vec[i] <- sum(is.A) diff1.B.A.succ.prop[i] <- (sum(is.B & is.cure) / sum(is.B)) - ( sum(is.A & is.cure) / sum(is.A)) z1.diff.B.A.succ.prop[i]<- z.rate.diff(data.1[is.B,3],data.1[is.A,3]) log.lik0 <- (is.cure*log(0.32)) + ((1-is.cure)*log(0.68)) log.lik1 <- (is.cure*is.A *log(0.25)) + ((1-is.cure)*is.A *log(0.75)) + (is.cure*is.B *log(0.40)) + ((1-is.cure)*is.B *log(0.60)) smp1.log.LRT[i] <- sum(log.lik1 - log.lik0) z.diff.B.A.X <- z.rate.diff(data.1[is.B & is.X,3],data.1[is.A & is.X,3]) z.diff.B.A.Y <- z.rate.diff(data.1[is.B & is.Y,3],data.1[is.A & is.Y,3]) z1.diff.B.A.vec[i] <- (z.diff.B.A.X + z.diff.B.A.Y) / sqrt(2) } # 4.1 Study first statistic par(mfcol = c(1,3)) qqnorm(z0.diff.B.A.vec) qqline(z0.diff.B.A.vec,col=2) abline(0,1,col=3) qqnorm(z1.diff.B.A.vec) qqline(z1.diff.B.A.vec,col=2) abline(0,1,col=3) boxplot(list(z0.diff.B.A.vec,z1.diff.B.A.vec)) mean(z1.diff.B.A.vec) sd(z1.diff.B.A.vec) mean(z0.diff.B.A.vec > qnorm(0.95)) mean(z1.diff.B.A.vec > qnorm(0.95)) # 4.2 Study second statistic par(mfcol = c(1,3)) qqnorm(diff0.B.A.succ.prop) qqline(diff0.B.A.succ.prop,col=2) qqnorm(diff1.B.A.succ.prop) qqline(diff1.B.A.succ.prop,col=2) boxplot(list(diff0.B.A.succ.prop,diff1.B.A.succ.prop)) mean(diff0.B.A.succ.prop > quantile(diff0.B.A.succ.prop,0.95)) mean(diff1.B.A.succ.prop > quantile(diff0.B.A.succ.prop,0.95)) # 4.3 Study third statistic par(mfcol = c(1,3)) qqnorm(z0.diff.B.A.succ.prop) qqline(z0.diff.B.A.succ.prop,col=2) abline(0,1,col=3) qqnorm(z1.diff.B.A.succ.prop) qqline(z1.diff.B.A.succ.prop,col=2) abline(0,1,col=3) boxplot(list(z0.diff.B.A.succ.prop,z1.diff.B.A.succ.prop)) mean(z0.diff.B.A.succ.prop > qnorm(0.95)) mean(z1.diff.B.A.succ.prop > qnorm(0.95)) # 4.4 Compare second and third statistics par(mfcol = c(1,2)) plot(diff0.B.A.succ.prop,z0.diff.B.A.succ.prop) plot(diff1.B.A.succ.prop,z1.diff.B.A.succ.prop) boxplot(split(z0.diff.B.A.vec,n0.A.vec),varwidth = T,main = "Null z-score distribution") boxplot(split(diff0.B.A.succ.prop,n0.A.vec),varwidth = T,main = "Null z-score distribution") # 4.5 Study fourth statistic par(mfcol = c(1,3)) qqnorm(smp0.log.LRT) qqline(smp0.log.LRT,col=2) abline(0,1,col=3) qqnorm(smp1.log.LRT) qqline(smp1.log.LRT,col=2) abline(0,1,col=3) boxplot(list(smp0.log.LRT,smp1.log.LRT)) mean(smp1.log.LRT > quantile(smp0.log.LRT,0.95)) # 4.6 Compare all four statistics # Scatter plot of all pairs par(mfcol = c(1,1) pairs(cbind(z0.diff.B.A.vec,diff0.B.A.succ.prop,z0.diff.B.A.succ.prop,smp0.log.LRT)) pairs(cbind(z1.diff.B.A.vec,diff1.B.A.succ.prop,z1.diff.B.A.succ.prop,smp1.log.LRT)) # ROC curves stat1.vals <- seq(-4,9,length = 5000) stat1.TP <- (10^4 - (rank(c(stat1.vals,z1.diff.B.A.vec))[1:5000] - (1:5000))) / 10^4 stat1.FP <- (10^4 - (rank(c(stat1.vals,z0.diff.B.A.vec))[1:5000] - (1:5000))) / 10^4 stat2.vals <- seq(-0.4,1,length = 5000) stat2.TP <- (10^4 - (rank(c(stat2.vals,diff1.B.A.succ.prop))[1:5000] - (1:5000))) / 10^4 stat2.FP <- (10^4 - (rank(c(stat2.vals,diff0.B.A.succ.prop))[1:5000] - (1:5000))) / 10^4 stat3.vals <- seq(-4,9,length = 5000) stat3.TP <- (10^4 - (rank(c(stat1.vals,z1.diff.B.A.succ.prop))[1:5000] - (1:5000))) / 10^4 stat3.FP <- (10^4 - (rank(c(stat1.vals,z0.diff.B.A.succ.prop))[1:5000] - (1:5000))) / 10^4 stat4.vals <- seq(-15,15,length = 5000) stat4.TP <- (10^4 - (rank(c(stat3.vals,smp1.log.LRT))[1:5000] - (1:5000))) / 10^4 stat4.FP <- (10^4 - (rank(c(stat3.vals,smp0.log.LRT))[1:5000] - (1:5000))) / 10^4 par(mfcol = c(1,1)) plot(stat1.FP,stat1.TP,col=2,type = "l", main = "ROC curve",xlab = "False positive",ylab = "True positive") lines(stat2.FP,stat2.TP,col=3) lines(stat3.FP,stat3.TP,col=4) lines(stat4.FP,stat4.TP,col=5) abline(v = 0.05,lty = 2) abline(h = 0.80,lty = 2) par(mfcol = c(1,1)) plot(stat1.FP,stat1.TP,col=2,type = "l", main = "ROC curve",xlab = "False positive",ylab = "True positive",ylim = c(0.75,0.90),xlim = c(0.03,0.06)) lines(stat2.FP,stat2.TP,col=3) lines(stat3.FP,stat3.TP,col=4) lines(stat4.FP,stat4.TP,col=5) abline(v = 0.05,lty = 2) abline(h = 0.80,lty = 2)