# Supplementary material:
# A Flexible Two Stage Procedure for Identifying Gene Sets that are
# Differentially Expressed
# Ruth Heller, Elisabetta Manduchi, Gregory Grant, Warren Ewens.

#---R code for the two stage procedure 3.1----------#
library(multtest)
library(MASS)

#--------------------THE FUNCTIONS------------------#

################################################################################
# Author of the following 3 functions: Dan Nettleton                           #
# Date: August 7, 2007                                                         #
# Downloaded from: http://www.public.iastate.edu/~dnett/useR/GeneCatTesting.txt#
################################################################################

# Put every column of y on a common scale.
#   y: numeric matrix; distances are taken between the entries of each column.
# Each column is multiplied by 1 / (sum of all pairwise distances between its
# entries, as returned by dist()).
# Returns a matrix with the same dimensions as y.
e.com <- function(y) { # Nettleton
  K <- ncol(y)
  scale <- rep(0, K)
  for (i in seq_len(K)) {
    total <- sum(dist(y[, i]))
    # A constant column has total == 0; dividing would give Inf and turn the
    # column into Inf/NaN, which poisons every distance computed afterwards.
    # Such a column carries no information, so it is left unscaled.
    scale[i] <- if (!is.na(total) && total == 0) 1 else 1 / total
  }
  sweep(y, 2, scale, FUN = "*")
}

# Weighted mean within-group distance.
#   x: factor of group labels, one per observation
#   D: distances between observations (a "dist" object or a square matrix)
#   n: table of group sizes, N: total number of observations, w: group weights
# Returns sum over groups of w * (mean pairwise distance inside the group).
# A group of size 1 has no pairs and yields NaN (0 / 0).
get.delta <- function(x, D, n, N, w) { # Nettleton
  inter.sum <- tapply(seq_len(N), x, FUN = inter.dist, D)
  inter.mean <- inter.sum * 2 / (n * (n - 1))
  sum(w * inter.mean)
}

# Sum of the pairwise distances among the observations listed in `index`.
#   D: a "dist" object or an already expanded square distance matrix.
# The square sub-matrix counts every pair twice, hence the division by 2.
inter.dist <- function(index, D) { # Nettleton
  if (!is.matrix(D)) {
    D <- as.matrix(D)
  }
  sum(D[index, index]) / 2
}

# Nettleton's permutation p-value for a gene set.
#   x:     group labels, one per row of y
#   y:     numeric matrix, one row per replicate
#   nperm: number of random relabellings
# Returns the proportion of relabellings whose delta is <= the observed delta.
# The observed labelling is not counted among the permutations, so the result
# can be exactly 0.
mrpp <- function(x, y, nperm) {
  # Expand the distances once; inter.dist() would otherwise rebuild the full
  # matrix for every group of every permutation.
  D <- as.matrix(dist(y))
  x <- as.factor(x)
  n <- table(x)
  N <- sum(n)
  w <- n / N
  delta.obs <- get.delta(x, D, n, N, w)
  delta <- rep(0, nperm)
  for (i in seq_len(nperm)) {
    delta[i] <- get.delta(sample(x), D, n, N, w)
  }
  mean(delta.obs >= delta)
}
################################################################################
# End of Dan Nettleton's code                                                  #
################################################################################

# Benjamini-Hochberg rejection threshold.
#   pv: vector of p-values
#   q:  FDR level
# Returns the largest sorted p-value p(k) with p(k) <= k * q / G, where G is
# the number of valid p-values, or NA when no p-value passes (including the
# case of no valid p-values at all).
# Missing p-values and p-values outside [0, 1] trigger a warning and are
# dropped, as the warning text states.
fdr <- function(pv, q) {
  valid <- !is.na(pv) & pv >= 0 & pv <= 1
  if (!all(valid)) {
    warning("Some p-values are not-valid or missing! These p-values are ignored")
  }
  spv <- sort(pv[valid])
  G <- length(spv)
  passed <- spv[spv <= seq_len(G) * q / G]
  if (length(passed) > 0) max(passed) else NA
}

#-----------------------THE DATA----------------------#
#Input:
#set2geneindex: a matrix of size Sxn, where S is the number of gene sets and n the size of the largest gene set.
#      entry (i,j) is the gene id of the j-th gene in the i-th gene set.
#ysub: a matrix of size Gxm, where G is the number of genes that are in at least one gene set and m is the number of replicates. entry (i,j) is the expression level of gene i in replicate j.
#mylabels: a vector of size m, with entries 0 or 1 corresponding to replicates from the first or the second group respectively.
#The code below also reads S (the number of gene sets) and n (n[i] is the number of genes in gene set i).
#THE CODE FOR THE DATA USED IN THE APPLICATION TO A MICROARRAY STUDY IN SECTION 4 IS AT THE END OF THIS SCRIPT.

#----------------------THE ANALYSIS-----------------#
#----screening stage---#
#compute screening p-values
pmrpp <- rep(NA, S)
for (i in seq_len(S)) { #for every gene set
  print(paste("gene set ", i))
  pmrpp[i] <- mrpp(mylabels, e.com(t(ysub[set2geneindex[i, 1:n[i]], ])), nperm = 100)
#  pmrpp[i] = mrpp(type.b,e.com(t(y[set2geneindex[i,1:n[i]],])),nperm=100)
} #for i

#BH on screening p-values
q <- 0.05
pID <- fdr(pmrpp, q)
# fdr() returns NA when no gene set passes; comparing against NA would turn
# `index` into a vector of NAs, so that case is mapped to an empty selection.
# which() also drops gene sets whose screening p-value is missing.
index <- if (is.na(pID)) integer(0) else which(pmrpp <= pID)
length(index)

#-----testing after screening stage---#
#the adjusted p-values by WY; same shape as set2geneindex so that the two
#matrices can be indexed against each other in the output section
pWY <- matrix(nrow = S, ncol = ncol(set2geneindex))
for (k in seq_along(index)) {
  i <- index[k] #the index of the rejected gene set
  print(paste("selected gene set ", k))
  out <- mt.minP(ysub[set2geneindex[i, 1:n[i]], ], mylabels, test = "wilcoxon", B = 100000)
#  out = mt.minP(ysub[set2geneindex[i,1:n[i]],],mylabels,test = "wilcoxon",side="upper", B=100000)
  # out$index is used as the position of each gene inside the set, so column j
  # of pWY holds the adjusted p-value of the j-th gene of set i
  pWY[i, out$index] <- out$adjp
}
save(mylabels, mysample, pmrpp, index, pWY, file = "out2sided")
# The brace below closes the `for (Iindex in ...)` loop whose header is in
# the second "THE ANALYSIS" section at the end of this script. Run that
# section first, then this one.
}#for Iindex

#--------output----#
load("out2sided")
S <- dim(pWY)[1]
q <- 0.05
alpha <- length(index) / S * q
numGenesPerSet <- rowSums(pWY <= alpha, na.rm = TRUE)
hasDiscovery <- numGenesPerSet > 0
#set size, number of genes detected, and percentage detected per gene set
cbind(n[hasDiscovery], numGenesPerSet[hasDiscovery],
      100 * (numGenesPerSet[hasDiscovery] / n[hasDiscovery]))
table(numGenesPerSet[hasDiscovery]) #can display the number of genes detected per gene set in a table.
#number of distinct genes detected over all selected gene sets
temp <- set2geneindex[pWY <= alpha]
temp <- temp[!is.na(temp)]
length(unique(temp))

#--------compare to BH on all genes----#
genesInSets <- unique(set2geneindex[!is.na(set2geneindex)])
G <- length(genesInSets)
I <- length(mysample) / 2
mylabels <- c(rep(0, I), rep(1, I))
pw <- rep(NA, G)
ysub <- y[genesInSets, ]
ysub <- ysub[, mysample]
for (j in seq_len(nrow(ysub))) {
  mydata <- ysub[j, ]
  pw[j] <- wilcox.test(mydata[mylabels == 0], mydata[mylabels == 1])$p.value
#  pw[j] = wilcox.test(mydata[mylabels==0], mydata[mylabels==1],alternative="less")$p.value
} #for j
q <- 0.05
pID <- fdr(pw, q)
if (!is.na(pID)) {
  sum(pw <= pID)
}

#---------output table of discoveries---#
mysetdiscoveries <- names.goBP2hgu[setID]
mygeneswithinsetdiscoveries <- matrix(nrow = S, ncol = dim(set2geneindex)[2])
numgenediscoveries <- rep(0, S)
for (i in seq_along(index)) {
  # Column j of pWY belongs to the j-th gene of the set, i.e. to
  # set2geneID[index[i], j]. Select by column position rather than compacting
  # the p-values first: dropping NAs shifts the remaining p-values and
  # mis-assigns gene IDs when an adjusted p-value is missing mid-row.
  hits <- which(pWY[index[i], ] <= alpha)
  if (length(hits) > 0) {
    numgenediscoveries[index[i]] <- length(hits)
    mygeneswithinsetdiscoveries[index[i], seq_along(hits)] <- set2geneID[index[i], hits]
  }
}
rbind(numgenediscoveries[index], n[index])
#fraction of each selected gene set that was detected
temp <- numgenediscoveries[index] / n[index]
#gene sets with all, three quarters, or two thirds of their genes detected
sub <- c(index[temp == 1], index[temp == 0.75], index[round(temp, 3) == 0.667])
numgenediscoveries[sub]
mysetdiscoveries[sub]
for (i in seq_along(sub)) {
  print(mygeneswithinsetdiscoveries[sub[i], 1:numgenediscoveries[sub[i]]])
}

#####################################################################
#--THE DATA FOR THE APPLICATION TO A MICROARRAY STUDY IN SECTION 4--#
#####################################################################
#---the gene sets---#
# NOTE(review): the GO and hgu95av2 annotation packages appear to have been
# superseded by GO.db and hgu95av2.db in current Bioconductor - confirm before
# running on a recent installation.
library(GO)
packageDescription("GO", field = "Version")
library(hgu95av2)
packageDescription("hgu95av2", field = "Version")

#collect the GO ids of all Biological Process terms, named by their term text
go <- as.list(GOTERM)
goBP <- character()
i <- 1
for (j in seq_along(go)) {
  if (Ontology(go[[j]]) == "BP") {
    goBP[i] <- GOID(go[[j]])
    names(goBP)[i] <- Term(go[[j]])
    i <- i + 1
  }
}
length(goBP)

#GO ids that have probes on the chip
hgu <- as.list(hgu95av2GO2ALLPROBES)
goINhgu <- names(hgu)
length(hgu)
length(goINhgu)

#keep the BP terms that occur exactly once among the chip's GO ids
goBP2hgu <- list()
names.goBP2hgu <- character()
i <- 1
for (j in seq_along(goBP)) {
  v <- which(goINhgu == goBP[j])
  if (length(v) == 1) {
    goBP2hgu[i] <- hgu[v[1]]
    names.goBP2hgu[i] <- paste(goBP[j], names(goBP)[j], sep = "\t")
    i <- i + 1
  }
}
length(goBP2hgu)
length(names.goBP2hgu)
names.goBP2hgu[1]
goBP2hgu[[1]]
length(goBP2hgu[]) #4364 gene sets

max.set.size <- 500 #largest gene set size considered; also the matrix width
nS <- rep(0, length(goBP2hgu[])) #the gene set sizes
for (i in seq_along(goBP2hgu)) {
  nS[i] <- length(unique(goBP2hgu[[i]]))
}
S <- length(nS[nS > 1 & nS <= max.set.size]) # 3367 gene sets of size 2 to 500 - consider these for analysis/
n <- rep(0, S)
set2geneID <- matrix(nrow = S, ncol = max.set.size)
setID <- rep(0, S) #the index of the gene set in goBP2hgu
k <- 1
for (i in seq_along(goBP2hgu)) {
  if (nS[i] > 1 && nS[i] <= max.set.size) {
    set2geneID[k, 1:nS[i]] <- unique(goBP2hgu[[i]])
    setID[k] <- i
    n[k] <- nS[i]
    k <- k + 1
  }
}
rm("nS")

#---the expression data---#
# NOTE(review): this sources an installer script over plain http, and biocLite
# appears to have been replaced by BiocManager in current Bioconductor
# releases - confirm before running.
source("http://bioconductor.org/biocLite.R")
biocLite("ALL")
library(ALL)
data(ALL)
show(ALL)
print(summary(pData(ALL)))
dim(exprs(ALL))

#for a gene set the rows in expression data that correspond to it
set2geneindex <- matrix(nrow = S, ncol = max.set.size)
for (i in seq_len(S)) {
  # pmatch() gives NA for an id it cannot match, so set2geneindex can hold NAs
  # inside the first n[i] columns
  set2geneindex[i, 1:n[i]] <- pmatch(set2geneID[i, 1:n[i]], featureNames(ALL))
}
set2geneindex[i, ]
set2geneID[i, ]

#The B type and T type in ALLtype.
phenoData(ALL)$BT
ALLtype <- as.factor(substr(phenoData(ALL)$BT, 1, 1)) #Define a new factor that has B- or T-cell ALL type designation.
type.b <- (ALLtype == "B") # Indicator for B cell type
y <- exprs(ALL)

#----------------------THE ANALYSIS-----------------#
# presumably columns 1-95 of y are the B-cell samples and 96-128 the T-cell
# samples - TODO confirm against type.b
# No seed is set, so the drawn samples differ between runs.
Bindex <- sample(seq(1, 95, 1), 30)
Tindex <- sample(seq(96, 128, 1), 30)
Ivec <- c(5, 10, 15, 20, 25, 30) #group sizes to analyse
# This loop is intentionally left open here: its body continues with the
# screening/testing code in the first "THE ANALYSIS" section, which ends with
# the closing brace marked "#for Iindex".
for (Iindex in seq_along(Ivec)) {
  I <- Ivec[Iindex]
  mysample <- c(Bindex[1:I], Tindex[1:I])
  ysub <- y[, mysample]
  dim(ysub)
  mylabels <- c(rep(0, I), rep(1, I))