###############################################################
#
# R Routines for Simulating and Calibrating Inference on
# PPV or NPV for Diagnostic Methods
#
# Following the paper by Steinberg, Fine and Chappell
#
###############################################################

###################################################
#
# All routines are written in terms of PPV.
# Reversing the roles of the case and control
# data and parameters lets you use them to
# study how the methods work for NPV.
#
###################################################

#############################################
#
# Simulate data and check inference
# =================================
#
# INPUT parameters are:
#   nca  = number of cases in study
#   nco  = number of healthy subjects in study
#   se   = actual sensitivity
#   sp   = actual specificity
#   nsim = the number of simulation runs
#
# OUTPUT focuses on the ability to estimate
#   phi = log(1-sp) - log(se)
#
# The value of phi is printed on the screen.
#
# Note that problems arise for estimating phi from a sample in
# which se-hat = 0 (no cases test positive) or
# sp-hat = 1 (no controls test positive).
#
# The output summaries below (average, SD, etc.) are
# computed ONLY from simulated samples in which the above
# problem does NOT occur.
#
# The program prints the value of phi and returns
# a vector with the following elements:
#   -- Average of phi-hat
#   -- SD of phi-hat over the simulation samples
#   -- Coverage fraction for the "standard" 95% lower confidence
#      bound (phi-hat - 1.645*estimated SE of phi-hat)
#   -- Multiplier needed to achieve 95% coverage (i.e. the smallest
#      number k such that phi-hat - k*estimated SE of phi-hat
#      achieves 95% coverage of the true value)
#   -- Fraction of simulation samples with either se-hat = 0 or
#      sp-hat = 1.
#
###############################################################

sim.phi.hat <- function(nca, nco, se, sp, nsim){
  phi <- log(1-sp) - log(se)
  print("phi is")
  print(phi)
  # Simulate estimated sensitivity and specificity for each run.
  se.hat <- rbinom(nsim, nca, se)/nca
  sp.hat <- rbinom(nsim, nco, sp)/nco
  # Drop degenerate samples for which phi-hat is undefined.
  sel <- ((se.hat == 0) | (sp.hat == 1))
  phi.hat <- log(1-sp.hat[!sel]) - log(se.hat[!sel])
  se.phi.hat <- sqrt(((1-se.hat[!sel])/(se.hat[!sel]*nca)) +
                     (sp.hat[!sel]/((1-sp.hat[!sel])*nco)))
  # Coverage of the standard 95% lower confidence bound.
  cb95 <- phi.hat - qnorm(0.95, 0, 1)*se.phi.hat
  coverage95 <- sum(cb95 <= phi)/length(cb95)
  # Determine the multiplier needed for 95% coverage.
  if (coverage95 >= 0.95){
    mult <- 1.65
    check <- 1
    while (check == 1){
      mult <- mult - 0.01
      cb <- phi.hat - mult*se.phi.hat
      cov <- sum(cb <= phi)/length(cb)
      check <- (cov >= 0.95)
    }
    mult <- mult + 0.01
  }
  if (coverage95 < 0.95){
    mult <- 1.65
    check <- 0
    while (check == 0){
      mult <- mult + 0.01
      cb <- phi.hat - mult*se.phi.hat
      cov <- sum(cb <= phi)/length(cb)
      check <- (cov >= 0.95)
    }
  }
  phi.sum <- c(mean(phi.hat), sd(phi.hat), coverage95, mult, sum(sel)/nsim)
  return(phi.sum)
}
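# Example call (illustrative only; the values below are hypothetical
# and not taken from the paper). With 50 cases, 50 controls, se = 0.8,
# sp = 0.9 and 10000 simulation runs, the returned vector gives the
# mean and SD of phi-hat, the coverage of the standard 95% bound, the
# calibrated multiplier, and the fraction of degenerate samples.
#
# set.seed(123)
# sim.phi.hat(nca = 50, nco = 50, se = 0.8, sp = 0.9, nsim = 10000)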
#########################################################
#
# Compute optimal allocation and sample
# size for studying PPV only
# =====================================
#
# The goal here is to find a sample allocation and size such
# that a lower (1-alpha) confidence bound for PPV is greater
# than or equal to (1-gamma) with a given power.
#
# INPUT parameters are:
#   z1    = multiplier for the lower confidence bound
#   pow   = desired power
#   se    = assumed sensitivity
#   sp    = assumed specificity
#   p     = prevalence of condition in test population
#   gamma = 1 - lower bound desired for PPV
#
# As a default, z1 can be taken as the normal quantile for the
# desired confidence level. Alternatively, the function sim.phi.hat
# can be used to produce a multiplier with the desired (simulated)
# coverage.
#
# OUTPUT vector gives
#   -- Fraction Pca of subjects that should be cases
#   -- Total sample size
#
# These values are also printed to the screen.
#
###############################################################

sam.ppv <- function(z1, pow, se, sp, p, gamma){
  # Optimal case fraction minimizes the variance of phi-hat.
  r <- (1-se)*(1-sp)/(se*sp)
  Pca <- sqrt(r)/(1+sqrt(r))
  sigsq <- (1-se)/(Pca*se) + sp/((1-sp)*(1-Pca))
  phi0 <- log(1-sp) - log(se)
  # Sanity checks on gamma relative to the prevalence and the
  # assumed accuracy.
  check1 <- (gamma < (1-p))
  if (check1 == 0) {print("gamma is too big")}
  check2 <- (gamma*(p+(1-p)*exp(phi0)) > ((1-p)*exp(phi0)))
  if (check2 == 0) {print("gamma is too small")}
  if ((check1 == 1) & (check2 == 1)){
    n <- (z1+qnorm(pow,0,1))^2*sigsq/(phi0-log(p*gamma/((1-p)*(1-gamma))))^2
    n <- ceiling(n)
    print("The optimal fraction of diseased subjects is")
    print(Pca)
    print("The total sample size is")
    print(n)
    return(c(Pca, n))
  }
}
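# Example call (illustrative only; hypothetical design values, not
# from the paper). Suppose the test is expected to have se = 0.8 and
# sp = 0.95, the prevalence is 0.10, and we want a 95% lower
# confidence bound (z1 = qnorm(0.95)) to exceed a PPV of 0.50
# (gamma = 0.50) with power 0.80. The call returns the optimal case
# fraction and the total sample size.
#
# sam.ppv(z1 = qnorm(0.95), pow = 0.80, se = 0.8, sp = 0.95,
#         p = 0.10, gamma = 0.50)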
#########################################################
#
# Compute optimal allocation and sample
# size for studying PPV and NPV
# =====================================
#
# The goal here is to find a sample allocation and size such
# that the lower (1-alpha) confidence bounds for BOTH PPV and
# NPV are greater than or equal to their targets, (1-gammap)
# and (1-gamman), each with its own desired power.
#
# INPUT parameters are:
#   z1p    = multiplier for the lower confidence bound for PPV
#   powp   = desired power for PPV
#   z1n    = multiplier for the lower confidence bound for NPV
#   pown   = desired power for NPV
#   se     = assumed sensitivity
#   sp     = assumed specificity
#   p      = prevalence of condition in test population
#   gammap = 1 - lower bound desired for PPV
#   gamman = 1 - lower bound desired for NPV
#
# As a default, z1p and z1n can be taken as the normal quantiles
# for the desired confidence levels. Alternatively, the function
# sim.phi.hat can be used to produce multipliers with the desired
# (simulated) coverage.
#
# OUTPUT vector gives
#   -- Fraction Pca of subjects that should be cases
#   -- Total sample size
#
# These values are also printed to the screen.
#
###############################################################

sam.ppv.npv <- function(z1p, powp, z1n, pown, se, sp, p, gammap, gamman){
  phi1.0 <- log(1-sp) - log(se)   # phi for PPV
  phi2.0 <- log(1-se) - log(sp)   # phi for NPV
  # Sanity checks on gammap and gamman relative to the prevalence
  # and the assumed accuracy.
  check1 <- (gammap < (1-p))
  if (check1 == 0) {print("gamma for PPV is too big")}
  check2 <- (gammap*(p+(1-p)*exp(phi1.0)) > ((1-p)*exp(phi1.0)))
  if (check2 == 0) {print("gamma for PPV is too small")}
  check3 <- (gamman < p)
  if (check3 == 0) {print("gamma for NPV is too big")}
  check4 <- (exp(phi2.0)*p*(1-gamman) < (1-p)*gamman)
  if (check4 == 0) {print("gamma for NPV is too small")}
  if ((check1 == 1) & (check2 == 1) & (check3 == 1) & (check4 == 1)){
    r <- (1-se)*(1-sp)/(se*sp)
    PcaP <- sqrt(r)/(1+sqrt(r))   # allocation that is optimal for PPV
    PcaN <- 1-PcaP                # allocation that is optimal for NPV
    # The optimal P is sought by comparing the separate sample
    # size functions (for PPV and NPV) on a grid of possible values
    # between PcaP and PcaN.
    P <- seq(PcaP, PcaN, (PcaN-PcaP)/200)
    sigsq1 <- (1-se)/(P*se) + sp/((1-sp)*(1-P))
    sigsq2 <- se/((1-se)*P) + (1-sp)/(sp*(1-P))
    np <- (z1p+qnorm(powp,0,1))^2*sigsq1/(phi1.0-log(p*gammap/((1-p)*(1-gammap))))^2
    nn <- (z1n+qnorm(pown,0,1))^2*sigsq2/(phi2.0-log((1-p)*gamman/(p*(1-gamman))))^2
    # Across the grid np is nondecreasing and nn is nonincreasing
    # (np is minimized at PcaP, nn at PcaN), so the binding constraint
    # is determined by the endpoints or by the crossing point.
    check5 <- (np[1] > nn[1])
    check6 <- (np[201] > nn[201])
    if ((check5 == 1) & (check6 == 1)){
      # PPV drives the sample size everywhere on the grid.
      Popt <- P[1]
      n <- ceiling(np[1])
    }
    if ((check5 == 0) & (check6 == 0)){
      # NPV drives the sample size everywhere on the grid.
      Popt <- P[201]
      n <- ceiling(nn[201])
    }
    if ((check5 == 0) & (check6 == 1)){
      # The curves cross; take the grid point where they are closest.
      ind <- which.min(abs(nn-np))
      Popt <- P[ind]
      n <- ceiling(max(np[ind], nn[ind]))
      print(cbind(np, nn))
    }
    print("The optimal fraction of diseased subjects is")
    print(Popt)
    print("The total sample size is")
    print(n)
    return(c(Popt, n))
  }
}

#####################################################
#
# Simulate data and compare performance for PPV
# for a collection of possible allocations
# =============================================
#
# INPUT parameters are:
#   P    = vector with possible allocations (proportion of cases)
#   n    = total sample size
#   se   = actual sensitivity
#   sp   = actual specificity
#   nsim = the number of simulation runs
#
# OUTPUT focuses on the ability to estimate
#   phi = log(1-sp) - log(se)
#
# The value of phi is printed on the screen.
#
# Note that problems arise for estimating phi from a sample in
# which se-hat = 0 (no cases test positive) or
# sp-hat = 1 (no controls test positive).
#
# The output summaries below (average, SD, etc.) are
# computed ONLY from simulated samples in which the above
# problem does NOT occur.
#
# The program prints the value of phi and returns a matrix with
# the following columns. Each row of the matrix corresponds to
# one of the allocations.
#   -- The allocation
#   -- Average of phi-hat
#   -- SD of phi-hat over the simulation samples
#   -- Coverage fraction for the "standard" 95% lower confidence
#      bound (phi-hat - 1.645*estimated SE of phi-hat)
#   -- Multiplier needed to achieve 95% coverage (i.e. the smallest
#      number k such that phi-hat - k*estimated SE of phi-hat
#      achieves 95% coverage of the true value)
#   -- Fraction of simulation samples with either se-hat = 0 or
#      sp-hat = 1.
#
# Note: It is assumed that the actual sample sizes are the
# nearest integers to those computed directly from the allocation.
#
###############################################################

sim.ppv <- function(P, se, sp, n, nsim){
  nca <- round(n*P)
  nco <- n-nca
  phi <- log(1-sp) - log(se)
  out <- matrix(0, nrow = length(P), ncol = 6)
  for (i in 1:length(nca)){
    # Simulate estimated sensitivity and specificity for allocation i.
    se.hat <- rbinom(nsim, nca[i], se)/nca[i]
    sp.hat <- rbinom(nsim, nco[i], sp)/nco[i]
    # Drop degenerate samples for which phi-hat is undefined.
    sel <- ((se.hat == 0) | (sp.hat == 1))
    phi.hat <- log(1-sp.hat[!sel]) - log(se.hat[!sel])
    se.phi.hat <- sqrt(((1-se.hat[!sel])/(se.hat[!sel]*nca[i])) +
                       (sp.hat[!sel]/((1-sp.hat[!sel])*nco[i])))
    cb95 <- phi.hat - qnorm(0.95, 0, 1)*se.phi.hat
    coverage95 <- sum(cb95 <= phi)/length(cb95)
    # Determine the multiplier needed for 95% coverage.
    if (coverage95 >= 0.95){
      mult <- 1.65
      check <- 1
      while (check == 1){
        mult <- mult - 0.01
        cb <- phi.hat - mult*se.phi.hat
        cov <- sum(cb <= phi)/length(cb)
        check <- (cov >= 0.95)
      }
      mult <- mult + 0.01
    }
    if (coverage95 < 0.95){
      mult <- 1.65
      check <- 0
      while (check == 0){
        mult <- mult + 0.01
        cb <- phi.hat - mult*se.phi.hat
        cov <- sum(cb <= phi)/length(cb)
        check <- (cov >= 0.95)
      }
    }
    phi.sum <- c(P[i], mean(phi.hat), sd(phi.hat), coverage95, mult, sum(sel)/nsim)
    out[i,] <- phi.sum
  }
  print("phi is")
  print(phi)
  return(out)
}
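# Example calls (illustrative only; the design values below are
# hypothetical and not taken from the paper).
#
# Joint PPV/NPV design: se = 0.8, sp = 0.95, prevalence 0.10, with
# 95% lower confidence bounds (multipliers qnorm(0.95)) required to
# exceed a PPV of 0.50 (gammap = 0.50) and an NPV of 0.95
# (gamman = 0.05), each with power 0.80.
#
# sam.ppv.npv(z1p = qnorm(0.95), powp = 0.80,
#             z1n = qnorm(0.95), pown = 0.80,
#             se = 0.8, sp = 0.95, p = 0.10,
#             gammap = 0.50, gamman = 0.05)
#
# Simulation check of the PPV inference for a few candidate
# allocations at a total sample size of, say, 500.
#
# set.seed(123)
# sim.ppv(P = c(0.10, 0.25, 0.50), se = 0.8, sp = 0.95,
#         n = 500, nsim = 10000)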