PartiallyFD/PartiallyFD_simulation.r at master · stefanrameseder/PartiallyFD · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
##########################################################################################
##########################################################################################
## Supplement for   "Partially Observed Functional Data: The Case of Systematically Missing"
## by Dominik Liebl & Stefan Rameseder
## Generate partially observed functional data from different DGPs and apply estimators

rm(list = ls())
library("devtools")
install_github("stefanrameseder/PartiallyFD")
library("PartiallyFD")

##########################################################################################
## Generate Functional Data:
p        		<- 501 # Number of evaluation points per function
comp_dom        	<- seq(from = 0, to = 1,len = p) # Seq. of equidistant evaluation points for complete domain
small_dom		<- comp_dom[ comp_dom <= 0.5 ] # Seq. of equidistant evaluation points for small domain
nbasis_dgp 		<- 5   # Number of basis functions
nbasis_est 		<- 5  # Number of basis functions for estimation process
seq.ev   		<- seq(from = 10, to = 2, len = nbasis_dgp) # Seq. of eigenvalues or variances
sd 			<- sqrt(seq.ev[1])  # standard deviation of normal marginal; in our case the same as the first score
xi_means 		<- c(5,2,0,0,0)
mean_dom		<- 1 # mean of right domain border
prob 			<- 0.9 # binomial probability for normal marginal
basis 			    <- c(BSpline = "bspline", Fourier = "fourier", Monomial = "monomial", Power = "power")
base            <- basis[2] # Just for Fourier
x_var			<- sum(seq.ev)
alpha			<- 0.05 # significance level for bootstraps
B				<- 1000 #number of bootstraps
printout		<- 1
der 			<- 1
base			<- basis[which(basis == "fourier")]
name			<- names(base)

f_stat 			<- PartiallyFD:::f_stat2 # Choose the test statistic function for Romano Wolf

b_choice		<- "Med" # or Max # The choice method for the overall basis selection
maxAllowedBasis	    <- 51 # The number of maximal allowed overall basis length
basis_seq 		<- seq(3,maxAllowedBasis,2) # possible basis lengths
derFds			<- FALSE # Should the estimator be used with the derivatives of the functional data sample or with the basis projection
percentageCompDom   <- 0.02 # how many functions should be forced to be observed over the whole domain
sd_part 		<- 1 # Which part of the d s should be trnaformed to 0.5 or 1 (here 100%)
xi_1_d_cor		<- 0.995 # the correlation in the truncated normal case
d_var 			<- 1 # The variance of d in the truncated normal cases
sigma_sc            <- 3 # the variance of the sigma in the transformation1 case
positions 		<- c(0.625, 0.75, 0.875, 1)
DGP_names		<- c("SD","SC", "ID", "IC")
ver             <- "FirstSimulation" # Name of dataset to save

# The Supplied Setting list for all DGPs
suppliedSettings <- list(sd_part = sd_part, xi_1_d_cor = xi_1_d_cor, d_var = d_var, sigma_sc = sigma_sc)

replications		<- 500


ns                  <- c(50, 150, 250, 500) # Different n's

require(parallel) 		# for parallel spline smoothing

for(n in ns){ # n <- 50

    # Version for Saving the .RData
    print(version 			<- paste0(ver,"_p", p, "_n", n, "_R", replications, "_SD", 100*suppliedSettings$sd_part, "_SD", 1000*suppliedSettings$xi_1_d_cor,"_BC_", b_choice,maxAllowedBasis ))

    # make the cluster
    cl <- makeCluster(8)

    # load required libraries
    clusterEvalQ(cl, {
        library("xtable")
        library("systemfit") 	# install.packages("systemfit") # for SUR estimation
        library("fda")    		# convenient basis functions
        library("scales") 		# colors
        library(copula)			# install.packages("copula")
        library(psych)
        library(vcdExtra)		# install.packages("vcdExtra")
        library(skewt) 			# install.packages("skewt") Skew T Distribution
        library(e1071) 			# for skewness function
        library(MASS) 			# mvrnorm
        library(tmvtnorm)		# install.packages("tmvtnorm")
        library(xtable) 		# latex tables
        library(MBESS)			# install.packages("MBESS")
        library(RColorBrewer) 	# for color definitions
        library(forecast) 		# for arima estimation
        require(MASS) 			# for df
        require(coneproj) 		# cone projections for smoothing parameter lambda
        require(orthogonalsplinebasis) # cone projections for smoothing parameter lambda
        library(PartiallyFD)		# nice corrplots
    })


    ## export required objects
    clusterExport(cl , varlist = c("p", "n", "comp_dom", "small_dom", "nbasis_dgp", "nbasis_est", "sigma_sc",
                                   "basis_seq", "seq.ev", "sd", "xi_means", "mean_dom", "prob", "b_choice",
                                   "cor", "basis", "x_var", "alpha", "B", "printout", "der", "f_stat", "derFds", "percentageCompDom",
                                   "DGP_names", "replications", "base", "suppliedSettings", "maxAllowedBasis") )

    # For each base in Basis
    # for each dgp ind DGPs
    # for all replications

    sys.time <- system.time(resDGP <- lapply(DGP_names, function(dgp){ # dgp <- "SC"
        #system.time(resDGP <- lapply(DGP_names, function(dgp){ # dgp <- "ID"

        # Define basis once
        basis_fd 		<- get(paste0("create.",base,".basis"))(rangeval = range(comp_dom), nbasis = 5)
        trueMean 		<- PartiallyFD:::calcTrueMean(basis_fd, xi_means, comp_dom)

        # Calculate true covariance via cov(s,t) = sum_i=1^b lambda_i * psi_i(t)*psi_i(s)
        trueCov 		<- PartiallyFD:::calcTrueCovariance(basis_fd, seq.ev, comp_dom, Lfdobj=0)

        # Calculate true derivative of covariance via cov''(s,t) = sum_i=1^b lambda_i * psi_i'(t)*psi'_i(s)
        trueCovDer 		<- PartiallyFD:::calcTrueCovariance(basis_fd, seq.ev, comp_dom, Lfdobj=1)

        # For all replications
        set.seed(10)
        resRep <- parLapply(cl, 1:replications, function(rep){ # rep = 1

            ### Simulate for n cruves xi_i1 and d_i data, where xi_1_mean is the first entry of xi_means
            if(dgp == "IC"){
                sim_data 					<- PartiallyFD:::simDataIC(n, xi_1_mean = xi_means[1], ev = seq.ev[1], suppliedSettings)
            } else if (dgp == "ID"){
                sim_data 					<- PartiallyFD:::simDataID(n, xi_1_mean = xi_means[1], ev = seq.ev[1], suppliedSettings)
            } else if (dgp == "SC"){
                sim_data 					<- PartiallyFD:::simDataSC(n, xi_1_mean = xi_means[1], ev = seq.ev[1], suppliedSettings)
            } else if (dgp == "SD"){
                sim_data 					<- PartiallyFD:::simDataSD(n, xi_1_mean = xi_means[1], ev = seq.ev[1], suppliedSettings)
            } else {
                print("Could not find simulation function")
            }

            ### If the maximum value of d != 1, define percentageCompDom% of the sample as observed on the full domain
            sim_data[rev(order(sim_data[ ,"d"]))[1:round( percentageCompDom * n ,0)] ,"d"] <- max(comp_dom)

            ### Construct corresponding functional data set of n curves observed at p points
            simFD 						<- PartiallyFD:::simMeanBias(	sim_data = sim_data, nbasis_dgp = nbasis_dgp, p =p, n = n, seq.ev = seq.ev,
                                        comp_dom = comp_dom, small_dom = small_dom, basis = basis_fd, xi_means = xi_means)
            ### Calculate the Krauss Mean
            krausMean      <- rowMeans(simFD$X_ad_m, na.rm = TRUE)

            ### Choose a number of basis elements b by Minimizing the BIC over all projections onto basis_seq
            bicMat 		<- PartiallyFD:::selBasisLength(X = simFD$X_ad_m, comp_dom, base = "fourier", basis_seq)

            ### Select an overall basis length b via Median or Max Basis Length
            selBasis  	<- PartiallyFD:::calcMedBasisDim(bicMat, basis_seq)

            ### Take evaluated psi_i and "der" derivative of psi_i of chosen fourier basis of length selBasis
            basis_fd_chosen <- get(paste0("create.",base,".basis"))(rangeval = range(comp_dom), nbasis = selBasis)
            Psi_i			<- eval.basis( evalarg = comp_dom, basisobj = basis_fd_chosen)
            Psi_i_der		<- eval.basis( evalarg = comp_dom, basisobj = basis_fd_chosen, Lfdobj = der)

            ### Estimate coefficients/scores in that chosen basis psi_i, i = 1, ..., selBasis
            coefs <- apply(simFD$X_ad_m, 2, function(X_i){ # X_i <- simFD$X_ad_m[, 1]
                dom_ind 		<- which(!is.na(X_i))
                y               <- as.numeric(na.omit(X_i)) # length(y)
                X_mat           <- Psi_i[dom_ind, ]
                return(coefs    <- lm(y ~ -1 + X_mat)$coefficients)
            })

            ### Estimate the FTC Estimator like defined in the paper
            ftcMean 		<- PartiallyFD:::calcFTCEstimator(fds = simFD$X_ad_m, evalBase = Psi_i, evalDer = Psi_i_der, coefs, small_dom, comp_dom, derFds = derFds)
            ftcMean_prime 	<- ftcMean$firstDer
            ftcMean			<- ftcMean$mean

            ### Estimate the FTC Covariance like defined in the paper small_dom = comp_dom
            ftcCov 		<-  PartiallyFD:::calcFTCCovariance(fds = simFD$X_ad_m, ftcMean = ftcMean, ftcMean_prime = ftcMean_prime, evalBase = Psi_i, evalDer = Psi_i_der, coefs, small_dom, comp_dom)
            krausCov    <-  PartiallyFD:::calcKrausCovariance(fds = simFD$X_ad_m, fmean = krausMean)

            if(any(dgp == c("SD", "SC"))){ # dgp = "IC"
                krausCov_ftc    <- PartiallyFD:::calcKrausCovariance(fds = simFD$X_ad_m, fmean = krausMean)
            } else {
                krausCov_ftc    <- krausCov
            }

            ### Apply RomanoWolf with F-Statistic
            romWolf <-  PartiallyFD:::RomWolf(X = t(coefs), alpha, B, printout = FALSE, d = sim_data[, "d"], f_stat = f_stat) # plot(X[,1], d)

            return(list(krausMean = krausMean,
                        ftcMean = ftcMean,
                        ftcCov = ftcCov,
                        krausCov = krausCov,
                        krausCov_ftc = krausCov_ftc,
                        selBasis  = selBasis,
                        romWolf = romWolf,
                        cor = cor(sim_data)))
        })
        return(resRep)
    }))

    names(resDGP) <- DGP_names
    stopCluster(cl)
    print(file <- paste0("PartiallyFD_sim_", version,".RData"))
    print(sys.time)
    save(resDGP, file = paste0("PartiallyFD_sim_", version,".RData"))
}