Simulate model — sim.lvm • lava

Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

Arguments

x: Model object
n: Number of simulated values/individuals
p: Parameter value (optional)
normal: Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
cond: for internal use
sigma: Default residual variance (1)
rho: Default covariance parameter (0.5)
X: Optional matrix of fixed values of variables (manipulation)
unlink: Return Inverse link transformed data
latent: Include latent variables (default TRUE)
use.labels: convert categorical variables to factors before applying transformation
seed: Random seed
...: Additional arguments to be passed to the low level functions

Author

Klaus K. Holst

Examples

##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#>   y          x z
#> 1 1 -0.3890521 0
#> 2 1  1.5760916 1
#> 3 1  1.5709196 1
#> 4 1  1.4971902 1
#> 5 1  1.3515534 1
#> 6 1  0.6453046 0

e <- estimate(m,d,estimator="glm")
e
#>              Estimate Std. Error  Z-value   P-value
#> Regressions:                                       
#>    y~x        1.06972    0.09349 11.44259    <1e-12
#>    y~z        1.01578    0.16776  6.05499 1.404e-09
#>     x~z       0.96477    0.06532 14.77041    <1e-12
#> Intercepts:                                        
#>    y         -0.13044    0.10078 -1.29431    0.1956
#>    x          0.02210    0.04701  0.47012    0.6383
#> Dispersion:                                        
#>    x          1.06874                              
## Simulate a few observation from estimated model
sim(e,n=5)
#>   y         x z
#> 1 1 0.7578086 1
#> 2 0 0.6592641 1
#> 3 1 1.2263657 1
#> 4 1 0.2300801 0
#> 5 1 0.9931559 1

##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#>    y           x z
#> 1 14  1.01864520 1
#> 2  1  0.09862234 0
#> 3  0 -0.28820655 0
#> 4 19  1.53349369 1
#> 5  0  0.46584650 0
#> 6  6  1.12191083 1
estimate(m,d,estimator="glm")
#>                Estimate Std. Error    Z-value  P-value
#> Regressions:                                          
#>    y~x          1.99880    0.00192 1042.74787   <1e-12
#>    y~z          0.98743    0.01116   88.49805   <1e-12
#>     x~z         0.99262    0.02273   43.66784   <1e-12
#> Intercepts:                                           
#>    y           -0.98504    0.01167  -84.44281   <1e-12
#>    x            0.00358    0.01950    0.18345   0.8544
#> Dispersion:                                           
#>    x            1.00426                               
mean(d$z); lava:::expit(1)
#> [1] 0.7251
#> [1] 0.7310586

summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#> 
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -4.2216 -1.0570  0.0261  0.9492  4.5983 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.99738    0.04548   21.93   <2e-16 ***
#> x            3.99724    0.04426   90.31   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.438 on 998 degrees of freedom
#> Multiple R-squared:  0.891,	Adjusted R-squared:  0.8909 
#> F-statistic:  8156 on 1 and 998 DF,  p-value: < 2.2e-16
#> 

##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#> 
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.505874   0.005102   99.15   <2e-16 ***
#> x           1.008180   0.016302   61.84   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> (Dispersion parameter for Gamma family taken to be 0.5156528)
#> 
#>     Null deviance: 8397.4  on 9999  degrees of freedom
#> Residual deviance: 5546.3  on 9998  degrees of freedom
#> AIC: 20678
#> 
#> Number of Fisher Scoring iterations: 6
#> 
if (FALSE) MASS::gamma.shape(g) # \dontrun{}

args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, 
#>     log = FALSE, ...) 
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#>  [1] -1.14993675  0.06110529 -0.78986992  1.68142084  0.13917097 -0.35847861
#>  [7]  0.73652104 -1.62401234 -0.39977265 -1.16007252

##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#>         y
#> y 1.14993
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#>           y
#> y 0.0521512

##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#> 
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.1528 -0.6738 -0.0486  0.7210  3.6658 
#> 
#> Coefficients:
#>                 Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)     0.021540   0.061655   0.349    0.727    
#> x               0.987755   0.047652  20.728   <2e-16 ***
#> z               1.016270   0.054549  18.630   <2e-16 ***
#> I(z > 0)TRUE   -0.005932   0.106185  -0.056    0.955    
#> x:I(z > 0)TRUE  1.009156   0.065653  15.371   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.018 on 995 degrees of freedom
#> Multiple R-squared:  0.7704,	Adjusted R-squared:  0.7694 
#> F-statistic: 834.5 on 4 and 995 DF,  p-value: < 2.2e-16
#> 

##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
                               Binary.lvm(),       ## Vector of ones
                               Binary.lvm(0.5),    ##  0.5n 0, 0.5n 1
                               Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#>            x z v w
#> 1  0.0000000 1 0 0
#> 2  0.5555556 1 0 0
#> 3  1.1111111 1 0 1
#> 4  1.6666667 1 0 1
#> 5  2.2222222 1 0 1
#> 6  2.7777778 1 1 0
#> 7  3.3333333 1 1 0
#> 8  3.8888889 1 1 1
#> 9  4.4444444 1 1 1
#> 10 5.0000000 1 1 1

##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)

if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$t)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }

##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
                             loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$y)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#>              y          x            z             t
#> 1  -7.52089416 -0.3975626 0.0521017141 -3.918515e-01
#> 2   2.20571704  0.3470455 0.5086850008  1.122015e+00
#> 3  -4.36804686 -0.3792286 0.5905137765 -2.579392e+00
#> 4  -0.07640396 -1.5337391 0.2058743050 -1.572961e-02
#> 5  21.43594787 -0.2640809 0.0099792760  2.139152e-01
#> 6   4.25372148  0.1839688 0.4443781625  1.890261e+00
#> 7  -0.09900900 -1.4246557 0.0006665787 -6.599729e-05
#> 8  -1.15906345 -1.0335661 0.8045995309 -9.325819e-01
#> 9   6.72402679 -0.2305547 0.6546673730  4.402001e+00
#> 10  5.53307450 -0.1962768 0.2215395973  1.225795e+00
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }

##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)

m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }

m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#>          y1        y2        y3
#> y1 2.546658 1.2446886 1.1131815
#> y2 1.244689 2.1601392 0.8173183
#> y3 1.113182 0.8173183 1.8257718

## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#>           y          x           z
#> 1 1000.5214  0.2177780  1.93296840
#> 2  999.1417 -0.4748423  0.10528753
#> 3  999.1108 -2.2866782 -0.09999915
#> 4  998.8764 -0.7593130 -0.13847376
#> 5 1001.0689 -0.3586061  0.24695597
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#>            y          x          z
#> 1 -0.5899834  0.5347800 -2.3598938
#> 2 -0.4243369 -0.9962041 -0.5107305
#> 3  3.5456795  1.7645527  1.5297775
#> 4 -1.4880155 -0.7726338 -0.1372156
#> 5 -0.2474045  0.3482477 -0.1443716
sim(m,5,p=c(m1=100))
#>           y            x          z
#> 1 102.25740  0.129915309  2.7875521
#> 2 101.53418  1.820998862 -0.3153138
#> 3  98.71054 -0.007577379 -0.2353492
#> 4  99.17310 -0.901698372 -0.4610830
#> 5 103.29305  0.072049531  1.9003038

##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)

if (interactive()) {
    d <- sim(m,1e3)
    require(survival)
    (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
    plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}

##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"

regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }

m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#> 
#>    A    B    C 
#> 5004 3018 1978 
glm(y~v, data=sim(m,1e4))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>     0.02492      1.99774     -1.00229  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    22580 
#> Residual Deviance: 9997 	AIC: 28380
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>    0.003437     2.987178    -1.009686  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    34490 
#> Residual Deviance: 10330 	AIC: 28710

transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#>    v          y          z    v2
#> 1  B  2.2298156  4.1679282 FALSE
#> 2  B  0.6586923  3.7329303 FALSE
#> 3  B  2.6336803  2.2362200 FALSE
#> 4  C -1.2815957 -3.2806097 FALSE
#> 5  B  1.7904833  1.7922099 FALSE
#> 6  A  1.5817981 -2.4106877  TRUE
#> 7  A  0.9435654  0.8197093  TRUE
#> 8  A  0.5989207  0.9182792  TRUE
#> 9  C -0.5337450  0.2826166 FALSE
#> 10 A  0.7108434 -1.3749699  TRUE

##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#>             y          x
#> 1  -2.9969686 -0.5822500
#> 2   2.0932122  0.4675629
#> 3   2.4677225  1.1228049
#> 4   1.3229557  1.3806512
#> 5   3.1524416  1.9010546
#> 6  -0.6734257 -0.6149359
#> 7   0.4034541 -0.4402311
#> 8   0.5278260 -0.2312745
#> 9   1.3184855  0.7196587
#> 10 -0.6764807 -0.2205306
sim(m2,10) ## Faster
#>             y          x
#> 1   0.2093413 -0.1414400
#> 2   1.7423583  0.9679083
#> 3   2.3735490  0.8827087
#> 4  -2.9847372 -1.3114255
#> 5   0.8275647 -0.2983199
#> 6  -3.4592201 -1.8890341
#> 7   2.4341144  0.9988011
#> 8  -1.1049947 -0.4407522
#> 9  -0.4269766 -0.1266534
#> 10 -1.4520110 -0.6076527