#Supplementary material: 
#A Flexible Two Stage Procedure for Identifying Gene Sets that are Differentially Expressed
#Ruth Heller, Elisabetta Manduchi, Gregory Grant, Warren Ewens. 


#---R code for the two stage procedure 3.1----------#

library(multtest)
library(MASS)

#--------------------THE FUNCTIONS------------------#


################################################################################
# Author of the following 3 functions:  Dan Nettleton                          #
# Date: August 7, 2007                                                         #
# Downloaded from: http://www.public.iastate.edu/~dnett/useR/GeneCatTesting.txt#
################################################################################


# Equalize the columns of y: each column is multiplied by 1/(sum of the
# pairwise distances between its entries), so that every variable contributes
# the same total amount to the between-row distances.
#
#   y: numeric matrix; dist() is applied to each column separately, so rows
#      are the observations and columns the variables.
#
# Returns y with rescaled columns. A column whose pairwise distances sum to 0
# (all entries equal, or a single row) is returned unchanged: dividing by 0
# would fill it with Inf/NaN, and such a column adds nothing to any
# between-row distance whatever its scale.
e.com=function (y) {#Nettleton
  K=ncol(y)
  total=vapply(seq_len(K),function(i) sum(dist(y[,i])),numeric(1))
  scale=1/total
  # NA totals (missing data in the column) are propagated as before
  scale[!is.na(total) & total==0]=1
  y=sweep(y,2,scale,FUN="*")
  return(y)
}

# Weighted average, over the levels of x, of the mean pairwise distance
# between observations that share a level.
#   x: grouping of the N observations (mrpp passes a factor)
#   D: distances between the observations (mrpp passes dist(y))
#   n: number of observations per level (mrpp passes table(x))
#   N: total number of observations
#   w: weight of each level (mrpp passes n/N)
get.delta=function (x,D,n,N,w) {#Nettleton
  # n*(n-1) counts ordered pairs; inter.dist() sums each unordered pair once,
  # hence the factor 2 below
  ordered.pairs=n*(n-1)
  within.sum=tapply(1:N,x,FUN="inter.dist",D)
  within.mean=within.sum*2/ordered.pairs
  sum(w*within.mean)
}

# Sum of the pairwise distances among the observations selected by index.
#   index: positions of the observations in the distance matrix
#   D:     anything as.matrix() turns into a square distance matrix,
#          e.g. a dist object
# The sub-matrix of a dist object holds every pair twice (once above and once
# below the zero diagonal), so the total is halved.
inter.dist=function (index,D) {#Nettleton
  full=as.matrix(D)
  within=full[index,index]
  sum(within)/2
}

# Permutation p-value for a gene set, based on the get.delta() statistic.
#   x:     group label of each observation (converted with as.factor)
#   y:     data matrix; dist(y) measures distances between its rows, so
#          rows must be the observations that x labels
#   nperm: number of random relabelings, a single number >= 1
#
# Returns the fraction of relabelings whose statistic is less than or equal
# to the observed one. Small within-group distances give small p-values.
# Note: the observed labeling is not counted among the permutations, so the
# returned value can be exactly 0.
mrpp=function (x,y,nperm) { #Nettleton's p-value for a gene set. 
  # 1:nperm would still run the loop for nperm=0 and report a p-value
  # based on one permutation, so reject such input explicitly
  if (length(nperm)!=1 || is.na(nperm) || nperm<1)
    stop("nperm must be a single number >= 1")
  D=dist(y)
  x=as.factor(x)
  n=table(x)
  N=sum(n)
  w=n/N
  delta.obs=get.delta(x,D,n,N,w)
  delta=rep(0,nperm)
  for(i in seq_len(nperm)){
    # sample(x) shuffles the labels; the distances stay fixed
    delta[i]=get.delta(sample(x),D,n,N,w)
  }
  mean(delta.obs>=delta)
}

################################################################################
# End of Dan Nettleton's code                                                  #
################################################################################


# Benjamini-Hochberg step-up threshold.
#   pv: vector of p-values
#   q:  FDR level
#
# Returns the largest sorted p-value p(i) satisfying p(i) <= i*q/G, where G
# is the number of valid p-values; hypotheses with p-value <= this threshold
# are rejected. Returns NA when no p-value qualifies or when there are no
# valid p-values at all.
# Missing p-values and p-values outside [0,1] trigger a warning and are left
# out of the computation (they count neither towards G nor as candidates).
fdr = function(pv,q){
invalid = is.na(pv) | pv<0 | pv>1
if (any(invalid)) 
	warning("Some p-values are not-valid or missing! These p-values are ignored")
spv = sort(pv[!invalid])
G = length(spv)
if (G==0) return(NA_real_)
passed = spv<=seq_len(G)*q/G
if (any(passed)) max(spv[passed]) else NA_real_
} 


#-----------------------THE DATA----------------------#

#Input: 

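#set2geneindex: a matrix with S rows, where S is the number of gene sets, and as many columns as the size of the largest gene set. 
#               entry (i,j) is the row of ysub that holds the j-th gene of the i-th gene set; entries beyond the size of set i are NA. 
#n: a vector of size S; n[i] is the number of genes in the i-th gene set. 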

#ysub: a matrix of size Gxm, where G is the number of genes that are in at least one gene set and m is the number of replicates. 
#      entry (i,j) is the expression level of gene i in replicate j. 

#mylabels: a vector of size m, with entries 0 or 1 corresponding to replicates from the first or the second group respectively. 

#THE CODE FOR THE DATA USED IN THE APPLICATION TO A MICROARRAY STUDY IN SECTION 4 IS AT THE END OF THIS SCRIPT. 


#----------------------THE ANALYSIS-----------------#

#----screening stage---#

#compute screening p-values
#one mrpp() permutation p-value (100 permutations) per gene set, computed on
#the equalized expression values of the genes in the set
pmrpp  = rep(NA,S)
for (i in seq_len(S)){#for every gene set
	print(paste("gene set ",i))
	#drop=FALSE keeps a genes x replicates matrix even for a one-gene set, so
	#that t() always yields replicates in rows, as e.com() and mrpp() expect
	pmrpp[i] = mrpp(mylabels,e.com(t(ysub[set2geneindex[i,seq_len(n[i])],,drop=FALSE])),nperm=100)
#	pmrpp[i] = mrpp(type.b,e.com(t(y[set2geneindex[i,1:n[i]],])),nperm=100)
}#for i


#BH on screening p-values
q=0.05
pID = fdr(pmrpp,q)
#index: positions of the gene sets whose screening p-value is <= the BH
#threshold. which() discards NA comparisons, so index is empty (rather than a
#vector of NAs) when fdr() returns NA because no gene set passes the screen,
#and gene sets with a missing screening p-value are never selected.
index = which(pmrpp<=pID)

length(index)

#-----testing after screening stage---#

#one row per gene set and one column per position within the set, matching
#the layout of set2geneindex; rows of unselected sets stay NA
pWY = matrix(nrow=S,ncol=ncol(set2geneindex)) #the adjusted p-values by WY

for (k in seq_along(index)){
	i = index[k] #the index of the rejected gene set
	print(paste("selected gene set ",k))
	#drop=FALSE keeps a matrix even when the set has a single gene
	out = mt.minP(ysub[set2geneindex[i,seq_len(n[i])],,drop=FALSE],mylabels,test = "wilcoxon",  B=100000) 
#	out = mt.minP(ysub[set2geneindex[i,1:n[i]],],mylabels,test = "wilcoxon",side="upper",  B=100000) 
	#out$index is used to put each adjusted p-value back at the position of
	#its gene within the set (mt.minP reports its rows in sorted order - see
	#the multtest documentation)
	pWY[i,out$index] = out$adjp
}

#NOTE(review): the file name is fixed, so when this section runs inside the
#loop over Iindex each pass overwrites the results of the previous one
save(mylabels, mysample,pmrpp,index, pWY, file ="out2sided")

#This brace has no opening counterpart above it. It closes the loop
#`for (Iindex in 1:length(Ivec)){` that is opened in the data section at the
#end of this script, so that section has to be run first and the analysis
#above (screening stage through save) forms the body of that loop.
}#for Iindex

#--------output----#
#restore mylabels, mysample, pmrpp, index and pWY as saved by the testing stage
load("out2sided")
q = 0.05
S = nrow(pWY)
#threshold applied to the adjusted p-values: the fraction of gene sets that
#were selected, multiplied by q
alpha = (length(index)/S)*q
#for every gene set, the number of adjusted p-values that are <= alpha
#(NA cells are skipped)
numGenesPerSet = apply(pWY<=alpha, 1, function(hits) sum(hits, na.rm=TRUE))
#for the sets with at least one such gene: set size, number of genes found,
#and that number as a percentage of the set size
cbind(n[numGenesPerSet>0],
      numGenesPerSet[numGenesPerSet>0],
      100*(numGenesPerSet[numGenesPerSet>0]/n[numGenesPerSet>0]))

#can display the number of genes detected per gene set in a table.  
table(numGenesPerSet[numGenesPerSet>0])

#number of distinct genes found over all gene sets
temp = set2geneindex[which(pWY<=alpha)]
temp = temp[!is.na(temp)]
length(unique(temp))


#--------compare to BH on all genes----#
I = length(mysample)/2
mylabels = c(rep(0,I), rep(1,I))

#expression values of every gene that belongs to at least one gene set, for
#the sampled replicates only; drop=FALSE keeps a matrix even for one gene
ysub = y[unique(set2geneindex[!is.na(set2geneindex)]),mysample,drop=FALSE]
G = nrow(ysub)

#two-sided Wilcoxon rank-sum p-value of every gene
pw = rep(NA,G)
for (j in seq_len(G)){
	mydata = ysub[j,]
	pw[j] = wilcox.test(mydata[mylabels==0], mydata[mylabels==1])$p.value
#	pw[j] = wilcox.test(mydata[mylabels==0], mydata[mylabels==1],alternative="less")$p.value
}#for j

q=0.05
pID = fdr(pw,q)
#number of genes rejected by BH at level q; nothing is shown when fdr()
#returns NA (no rejection). fdr() leaves missing p-values out, so they are
#left out of the count as well instead of turning it into NA.
if (!is.na(pID)){
	sum(pw<=pID,na.rm=TRUE)
}

#---------output table of discoveries---#
mysetdiscoveries = (names.goBP2hgu[setID])
#row i will hold the IDs of the genes discovered in gene set i
mygeneswithinsetdiscoveries = matrix(nrow = S,ncol= dim(set2geneindex)[2])
numgenediscoveries = rep(0,S)
for (i in seq_along(index)){ 
	#column j of pWY belongs to the j-th gene of the set, i.e. to column j of
	#set2geneID, so the IDs are picked by column position. which() skips NA
	#cells, so a missing adjusted p-value cannot shift the p-values against
	#the gene IDs.
	hits = which(pWY[index[i],]<=alpha)
	if (length(hits)>0){
		numgenediscoveries[index[i]] = length(hits)
		mygeneswithinsetdiscoveries[index[i],seq_along(hits)] = set2geneID[index[i],hits]
	}
}


#number of discoveries (first row) and set size (second row) per selected set
rbind(numgenediscoveries[index],n[index])
temp = numgenediscoveries[index]/n[index]

#selected sets in which all, three quarters, or (to 3 decimals) 0.667 of the
#genes were discovered
sub = c(index[temp==1],index[temp==0.75],index[round(temp,3)==0.667])

numgenediscoveries[sub]

mysetdiscoveries[sub]
for (i in seq_along(sub)){
	print(mygeneswithinsetdiscoveries[sub[i],seq_len(numgenediscoveries[sub[i]])])
}

#####################################################################
#--THE DATA FOR THE APPLICATION TO A MICROARRAY STUDY IN SECTION 4--#
#####################################################################
#---the gene sets---#
library(GO)
packageDescription('GO', field='Version')
library(hgu95av2)
packageDescription('hgu95av2', field='Version')
go<-as.list(GOTERM)
#goBP: the GO ids of all terms whose ontology is "BP", named by their term
#description. Built in one pass instead of growing the vector inside a loop.
isBP<-vapply(go,function(term) isTRUE(Ontology(term) == "BP"),logical(1))
goBP<-vapply(go[isBP],GOID,character(1),USE.NAMES=FALSE)
names(goBP)<-vapply(go[isBP],Term,character(1),USE.NAMES=FALSE)

length(goBP) 
hgu<-as.list(hgu95av2GO2ALLPROBES)
#the GO ids that have probes on the chip are the names of the list
goINhgu<-as.character(names(hgu))

length(hgu)
length(goINhgu)
#keep a BP term only when its GO id occurs exactly once in goINhgu:
#  goBP2hgu[[i]]     - the probes annotated to the i-th kept term
#  names.goBP2hgu[i] - "<GO id><tab><term description>" of that term
pos<-match(goBP,goINhgu)
once<-!is.na(pos) & !(goBP %in% goINhgu[duplicated(goINhgu)])
goBP2hgu<-unname(hgu[pos[once]])
names.goBP2hgu<-paste(goBP[once],names(goBP)[once],sep="\t")

length(goBP2hgu)
length(names.goBP2hgu)
names.goBP2hgu[1]
goBP2hgu[[1]]
length(goBP2hgu[])#4364 gene sets
#the gene set sizes: number of distinct probes per gene set
nS = vapply(goBP2hgu,function(probes) length(unique(probes)),integer(1),USE.NAMES=FALSE)
setID = which(nS>1 & nS<=500) #the index of the gene set in goBP2hgu
S = length(setID)# 3367 gene sets of size 2 to 500 - consider these for analysis/
n = nS[setID] #n[k]: size of the k-th gene set kept for analysis
#row k holds the distinct probe ids of the k-th kept gene set, padded with NA
set2geneID = matrix(nrow = S, ncol=500)
for (k in seq_len(S)){
	set2geneID[k,seq_len(n[k])] = unique(goBP2hgu[[setID[k]]])
}

rm("nS")

#---the expression data---#
#install the ALL data package only when it is missing, instead of downloading
#and running the installer script on every run
#NOTE(review): this sources remote code over plain http, and biocLite.R is
#the legacy Bioconductor installer - current Bioconductor releases document
#BiocManager::install() instead; confirm which applies to the R version used
if (!requireNamespace("ALL", quietly=TRUE)){
	source("http://bioconductor.org/biocLite.R")
	biocLite("ALL")
}
library(ALL)
data(ALL)
show(ALL)
print(summary(pData(ALL)))
dim(exprs(ALL))


probenames = featureNames(ALL) #looked up once, not once per gene set
set2geneindex = matrix(nrow = S, ncol=ncol(set2geneID)) #for a  gene set the rows in expression data that correspond to it
for (i in seq_len(S)){
	#match() finds exact names only; pmatch() would also accept a probe id
	#that is merely the unique beginning of another feature name. Ids absent
	#from the expression data give NA.
	set2geneindex[i,seq_len(n[i])] = match(set2geneID[i,seq_len(n[i])],probenames)
}

#show the last gene set as a check
set2geneindex[i,]
set2geneID[i,]

#The B type  and T type in ALLtype. 
phenoData(ALL)$BT
#ALLtype: factor built from the first character of each BT label
ALLtype=factor(substring(phenoData(ALL)$BT,first=1,last=1))
#type.b: TRUE for the samples whose type letter is "B"
type.b=ALLtype=="B"

#y: the expression matrix of the ALL data set
y=exprs(ALL)

#----------------------THE ANALYSIS-----------------#
#Draw 30 column positions without replacement from 1..95 and 30 from 96..128.
#NOTE(review): this presumably relies on the B samples being columns 1-95 and
#the T samples columns 96-128 of exprs(ALL) - confirm against
#phenoData(ALL)$BT. No random seed is set in this section, so the draw may
#differ from run to run.
Bindex = sample(seq(1,95,1),30)
Tindex = sample(seq(96,128,1),30) 


#group sizes to analyse: I replicates per group
Ivec = c(5,10,15,20,25,30)

#This loop is not closed in this section: its body is meant to continue with
#the section "THE ANALYSIS" near the top of the script, and the closing brace
#is the line `}#for Iindex` found there.
for (Iindex in 1:length(Ivec)){
I = Ivec[Iindex]

#the first I sampled columns of each group
mysample = c(Bindex[1:I], Tindex[1:I])

#expression values of all genes for the sampled replicates
ysub = y[,mysample]
dim(ysub)
#0 for the I columns taken from Bindex, 1 for the I columns taken from Tindex
mylabels = c(rep(0,I), rep(1,I))