# ThresholdTest.r
##############################################################
# This file contains the R functions for the paper:
#
# "Two-Sample and ANOVA Tests for High Dimensional Means"
# by Song Xi Chen, Jun Li and Ping-Shou Zhong
#
# Questions and comments, please contact
# csx@gsm.pku.edu.cn, jli49@kent.edu
###########################################################
##########################################################
# function to obtain a parametric bootstrap copy of the
# maximum thresholding test statistic
# input: p--dimension of the random vectors
#        mat--covariance matrix of the random vectors
#        gam--square root of mat, i.e. gam=mat^{1/2}
#        size1(2)--sample size of random sample 1(2)
# output: bootstrap copy of the maximum thresholding statistic
#         based on the generated samples sam1 and sam2
# inside the code:
#        random sample 1 (2) with size=size1 (size2) and dimension=p
#        is generated from the multivariate model
#        X_{ij}=Sigma_i^{1/2}Z_{ij}+mu_i
#        where Z_{ij} are iid p-dimensional random vectors with
#        E(Z_{ij})=0 and Var(Z_{ij})=I_p (the p by p identity matrix)
################################################################
resampling_method<-function(p,mat,gam,size1,size2){
mu1<-rep(0,p);mu2<-rep(0,p);eta<-0.05
Z_mat1<-matrix(rnorm(size1*p,0, 1),ncol=p)
Z_mat2<-matrix(rnorm(size2*p,0, 1),ncol=p)
sam1<-(Z_mat1)%*% t(gam)+matrix(mu1,ncol=p,nrow=size1,byrow=T)
sam2<-(Z_mat2)%*% t(gam)+matrix(mu2,ncol=p,nrow=size2,byrow=T)
test_stat<-max_threshold(sam1,sam2,mat,size1,size2)
return(test_stat)
}
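##############################################################
# illustrative usage (a small sketch; the dimension, AR(1)
# covariance and sample sizes below are arbitrary example
# values, not taken from the paper): draw one bootstrap copy
# of the statistic. uncomment to run after sourcing the whole
# file, since resampling_method calls max_threshold, which is
# defined further below
##############################################################
# ex_p<-50
# ex_mat<-0.6^abs(outer(1:ex_p,1:ex_p,"-"))               # AR(1) covariance
# ex_eig<-eigen(ex_mat,symmetric=T)
# ex_gam<-ex_eig$vectors%*%diag(sqrt(ex_eig$values),ex_p)  # ex_mat^{1/2}
# ex_boot<-resampling_method(ex_p,ex_mat,ex_gam,30,40)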
########################################################
# function to obtain the maximum thresholding test statistic,
# defined by eqn (3.2) (without data transformation) or
# eqn (4.9) (with data transformation) in the paper
# input: sample1(2)--the two random samples
#        sig_mat--either the covariance matrix or the inverse
#                 covariance matrix, depending on whether the
#                 statistic is (3.2) or (4.9)
#        n1(n2)--sample size of sample 1(2)
# output: maximum thresholding statistic (3.2) or (4.9)
########################################################
max_threshold<-function(sample1,sample2,sig_mat,n1,n2){
p<-dim(sample1)[2]
eta<-0.05
a_f<-sqrt(2*log(log(p)))
b_f<-2*log(log(p))+0.5*log(log(log(p)))-0.5*log(4*pi/(1-eta)^2)
T_orig<-(colMeans(sample1)-colMeans(sample2))^2/((1/n1+1/n2)*diag(sig_mat))
s_level<-T_orig[T_orig>0 & T_orig<=2*(1-eta)*log(p)]
s_m<-matrix(s_level,length(s_level),p,byrow=F)
T_m<-matrix((T_orig-1),length(s_level),p,byrow=T)
T_m[sign(T_m+1-s_m)==-1]<-0
thr<-rowSums(T_m)
mean_thr<-2*sqrt(s_level)*dnorm(sqrt(s_level))*p
sd_thr<-sqrt(p*(2*((sqrt(s_level))^3+sqrt(s_level))*dnorm(sqrt(s_level))+4-4*pnorm(sqrt(s_level)))-mean_thr^2/p)
max_threshold<-max((thr-mean_thr)/sd_thr)*a_f-b_f
return(max_threshold)
}
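########################################################
# illustrative usage (a sketch continuing the ex_* example
# above; all ex_* objects and the sample sizes 30 and 40 are
# arbitrary example values): compute statistic (3.2) directly
# from two samples generated under the null, with the
# covariance matrix itself passed as sig_mat.
# uncomment to run after sourcing the whole file
########################################################
# ex_sam1<-matrix(rnorm(30*ex_p),ncol=ex_p)%*%t(ex_gam)
# ex_sam2<-matrix(rnorm(40*ex_p),ncol=ex_p)%*%t(ex_gam)
# ex_stat<-max_threshold(ex_sam1,ex_sam2,ex_mat,30,40)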
##############################################################
# function to obtain the inverse of the covariance matrix
# input: sample1(2)--the two random samples
#        band--banding width parameter used in the Cholesky
#              decomposition to estimate Omega, defined by (4.6)
#              in the paper
# output: Cholesky decomposition estimate of Omega
##############################################################
chole<-function(sample1,sample2,band){
n1<-dim(sample1)[1];n2<-dim(sample2)[1];p<-dim(sample1)[2]
shift_vec<-(1/(n1*n2)^0.5)*colSums(sample2[1:n1,])-(1/n2)*colSums(sample2)
sample_cen<-sample1-(n1/n2)^0.5*sample2[1:n1,]+matrix(shift_vec,ncol=p,nrow=n1,byrow=T)
d<-NULL; d[1]<-1/(n1)*sum((sample_cen[,1])^2)
a<-matrix(0,ncol=p,nrow=p)
#estimate a_i and d_i^2 using least squares regression
for(j in 2:p){
X_mat<-sample_cen[,(max(j-band,1)):(j-1)]
beta_hat<-solve(t(X_mat)%*%X_mat)%*%t(X_mat)%*%sample_cen[,j]
a[j,(max(j-band,1)):(j-1)]<-beta_hat
d[j]<-1/(n1)*sum((sample_cen[,j]-sample_cen%*%a[j,])^2)
}
ma<-t(diag(1,p)-a)%*%solve(diag(d,p))%*%(diag(1,p)-a)
inv_hat<-((n1+n2)/n2)*ma
return(inv_hat)
}
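##############################################################
# illustrative usage (a sketch; the ex_* objects continue the
# example above and the banding width 1 is an arbitrary
# choice): estimate Omega from the two samples and use it to
# transform the data as in (4.9).
# uncomment to run after sourcing the whole file
##############################################################
# ex_omega<-chole(ex_sam1,ex_sam2,1)
# ex_sam1_tran<-ex_sam1%*%ex_omega
# ex_sam2_tran<-ex_sam2%*%ex_omega
# ex_stat_tran<-max_threshold(ex_sam1_tran,ex_sam2_tran,ex_omega,30,40)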
##############################################################
# function to obtain the covariance matrix; it is called by the
# function banding_choice to estimate the optimal banding
# parameter
# input: sa--a random sample
#        band--banding parameter
# output: Cholesky decomposition estimate of the covariance matrix
##############################################################
chole_sigma<-function(sa,band){
n<-dim(sa)[1];p<-dim(sa)[2]
d<-NULL; d[1]<-var(sa[,1])
a<-matrix(0,ncol=p,nrow=p)
#estimate a_i and d_i^2 using least squares regression
for(j in 2:p){
if(band>=1){
X_mat<-sa[,(max(j-band,1)):(j-1)]
beta_hat<-qr.solve(X_mat, sa[,j], tol=1e-200)
a[j,(max(j-band,1)):(j-1)]<-beta_hat
d[j]<-var(sa[,j]-sa%*%a[j,])
}else{
d[j]<-var(sa[,j])
}
}
mat_hat<-solve(diag(1,p)-a)%*%(diag(d,p))%*%solve(t(diag(1,p)-a))
return(mat_hat)
}
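##############################################################
# illustrative usage (a sketch; ex_sam1 and the banding width 1
# are the arbitrary example values used above): banded Cholesky
# estimate of the covariance matrix from a single sample.
# uncomment to run after sourcing the whole file
##############################################################
# ex_sigma_hat<-chole_sigma(ex_sam1,1)
# sum((ex_sigma_hat-ex_mat)^2)  # squared Frobenius discrepancy from the true covariance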
###############################################################
# function to find the optimal banding parameter by minimizing
# (5.1) in the paper
# input: x--random sample
#        k.vec--candidate banding parameters (optional; defaults
#               to 0:min(n.tr-2, p-1))
# output: optimal banding parameter defined by (5.1)
###############################################################
banding_choice<-function(x, k.vec=NULL){
n<-dim(x)[1]; p<-dim(x)[2]
n.tr<-round(n*2/3);n.va<-n-n.tr
if(is.null(k.vec))
{
k.vec<-0:min(c(n.tr-2, p-1))
}
n.split<-50
disc<-array(0, c(length(k.vec), n.split))
for(i in 1:n.split){
id<-sample(n); id1<-id[1:n.tr]; id2<-id[(n.tr+1):n]
x.tr<-scale( x[id1,,drop=FALSE], scale=FALSE)
s.va<-cov(x[id2,,drop=FALSE])*(n.va-1)/n.va
for(j in 1:length(k.vec))
{
sigma_hat<-chole_sigma(x.tr,k.vec[j])
disc[j,i]<-sum((sigma_hat-s.va)^2)
}
}
risk<-apply(disc, 1, sum)
optim.k<-k.vec[which.min(risk)]
return(optim.k)
}
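###############################################################
# illustrative usage (a sketch; ex_sam1 is the arbitrary example
# sample used above, and k.vec is restricted to 0:5 only to keep
# the illustration fast): select the banding parameter by (5.1).
# uncomment to run after sourcing the whole file
###############################################################
# ex_k<-banding_choice(ex_sam1,k.vec=0:5)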
################################################################
# an example that obtains the empirical size or power of the
# maximum thresholding tests, with and without data transformation
################################################################
library(mvtnorm)
p<-200 # the dimension of multivariate random vector
k<-1 # optimal banding parameter
n1<-30; n2<-40 # two sample sizes
#####################################################
# two covariance matrices following AR(1) model
rho1<-0.6; rho2<-0.6
times<-1:p; H<- abs(outer(times, times, "-"))
sigma1<-rho1^H; sigma2<-rho2^H
#######################################################
# inverse of linear combination of two covariance matrices
# used to transform data to enhance signal strength
sigma_inv<-solve((n2/(n1+n2))*sigma1+(n1/(n1+n2))*sigma2)
########################################################
s_beta<-0.4 # sparsity parameter used to control the number of
# signals
mm<-round(p^(1-s_beta)) # number of nonzero signals
mu1<-rep(0,p) # without loss of generality, assume mu1=0
post<-sort(sample(1:p,mm,replace=F))# randomly select the locations
# of the non-zero coordinates of mu2
r<-0 # signal strength of mu2
# when r=0, the simulation study is for empirical size
# otherwise, it is for empirical power
mu2<-rep(0,p)
mu2[post]<-sqrt(2*r*log(p)*(1/n1+1/n2))
################################################################
test_thr1<-NULL
test_thr2<-NULL
p_thr1<-p_thr2<-NULL
gam<-eigen(sigma1,symmetric=T)$vectors%*%diag(sqrt(eigen(sigma1,symmetric=T)$values),p)
n<-100 # the number of the simulations
for (i in 1:n){
#######################################################
# generate samples by the factor model
#######################################################
Z_mat1<-matrix(rnorm(n1*p,0, 1),ncol=p)
Z_mat2<-matrix(rnorm(n2*p,0, 1),ncol=p)
sam1<-(Z_mat1)%*% t(gam)+matrix(mu1,ncol=p,nrow=n1,byrow=T)
sam2<-(Z_mat2)%*% t(gam)+matrix(mu2,ncol=p,nrow=n2,byrow=T)
#######################################################
sigma_inv_est<-chole(sam1,sam2,k)# estimate Omega by cholesky
gam_inv_est<-eigen(sigma_inv_est,symmetric=T)$vectors%*%diag(sqrt(eigen(sigma_inv_est,symmetric=T)$values),p)
sam1_tran_est<-sam1%*%sigma_inv_est # transform data by Omega
sam2_tran_est<-sam2%*%sigma_inv_est
sigma_e<-solve(sigma_inv_est)
gam_e<-eigen(sigma_e,symmetric=T)$vectors%*%diag(sqrt(eigen(sigma_e,symmetric=T)$values),p)
##############################################################
# bootstrap copies of maximum thresholding test statistics without
# and with data transformation
##############################################################
iter<-100
stat.thr1<-stat.thr2<-NULL
for(j in 1:iter){
stat.thr1[j]<-resampling_method(p,sigma_e,gam_e,100,100)
stat.thr2[j]<-resampling_method(p,sigma_inv_est,gam_inv_est,100,100)
}
###############################################################
# obtain p-values of maximum thresholding tests
###############################################################
test_thr1<-max_threshold(sam1,sam2,sigma1,n1,n2)
test_thr2<-max_threshold(sam1_tran_est,sam2_tran_est,sigma_inv_est,n1,n2)
p_thr1[i]<-length(which(sort(stat.thr1)>=test_thr1))/iter
p_thr2[i]<-length(which(sort(stat.thr2)>=test_thr2))/iter
}
#############################################################
# power_thres1: empirical size or power of maximum thresholding
# test without data transformation
# power_thres2: empirical size or power of maximum thresholding
# test with data transformation
#############################################################
power_thres1<-length(which(p_thr1<=0.05))/n
power_thres2<-length(which(p_thr2<=0.05))/n
power<-c(power_thres1,power_thres2)
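################################################################
# optional sketch: the banding parameter k above was fixed at 1;
# it could instead be chosen by the criterion (5.1) from a pilot
# sample drawn under the same covariance model. the pilot sample
# and the restricted candidate set 0:4 below are illustrative
# choices, and rmvnorm comes from the mvtnorm package loaded
# above. uncomment to run
################################################################
# pilot_sam<-rmvnorm(n1,mean=rep(0,p),sigma=sigma1)
# k_pilot<-banding_choice(pilot_sam,k.vec=0:4)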