Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.
# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)Model object
Number of simulated values/individuals
Parameter value (optional)
Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
for internal use
Default residual variance (1)
Default covariance parameter (0.5)
Optional matrix of fixed values of variables (manipulation)
Return Inverse link transformed data
Include latent variables (default TRUE)
convert categorical variables to factors before applying transformation
Random seed
Additional arguments to be passed to the low level functions
##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#> y x z
#> 1 1 -0.3890521 0
#> 2 1 1.5760916 1
#> 3 1 1.5709196 1
#> 4 1 1.4971902 1
#> 5 1 1.3515534 1
#> 6 1 0.6453046 0
e <- estimate(m,d,estimator="glm")
e
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 1.06972 0.09349 11.44259 <1e-12
#> y~z 1.01578 0.16776 6.05499 1.404e-09
#> x~z 0.96477 0.06532 14.77041 <1e-12
#> Intercepts:
#> y -0.13044 0.10078 -1.29431 0.1956
#> x 0.02210 0.04701 0.47012 0.6383
#> Dispersion:
#> x 1.06874
## Simulate a few observation from estimated model
sim(e,n=5)
#> y x z
#> 1 1 0.7578086 1
#> 2 0 0.6592641 1
#> 3 1 1.2263657 1
#> 4 1 0.2300801 0
#> 5 1 0.9931559 1
##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#> y x z
#> 1 14 1.01864520 1
#> 2 1 0.09862234 0
#> 3 0 -0.28820655 0
#> 4 19 1.53349369 1
#> 5 0 0.46584650 0
#> 6 6 1.12191083 1
estimate(m,d,estimator="glm")
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 1.99880 0.00192 1042.74787 <1e-12
#> y~z 0.98743 0.01116 88.49805 <1e-12
#> x~z 0.99262 0.02273 43.66784 <1e-12
#> Intercepts:
#> y -0.98504 0.01167 -84.44281 <1e-12
#> x 0.00358 0.01950 0.18345 0.8544
#> Dispersion:
#> x 1.00426
mean(d$z); lava:::expit(1)
#> [1] 0.7251
#> [1] 0.7310586
summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#>
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -4.2216 -1.0570 0.0261 0.9492 4.5983
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.99738 0.04548 21.93 <2e-16 ***
#> x 3.99724 0.04426 90.31 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.438 on 998 degrees of freedom
#> Multiple R-squared: 0.891, Adjusted R-squared: 0.8909
#> F-statistic: 8156 on 1 and 998 DF, p-value: < 2.2e-16
#>
##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#>
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.505874 0.005102 99.15 <2e-16 ***
#> x 1.008180 0.016302 61.84 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> (Dispersion parameter for Gamma family taken to be 0.5156528)
#>
#> Null deviance: 8397.4 on 9999 degrees of freedom
#> Residual deviance: 5546.3 on 9998 degrees of freedom
#> AIC: 20678
#>
#> Number of Fisher Scoring iterations: 6
#>
if (FALSE) MASS::gamma.shape(g) # \dontrun{}
args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE,
#> log = FALSE, ...)
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#> [1] -1.14993675 0.06110529 -0.78986992 1.68142084 0.13917097 -0.35847861
#> [7] 0.73652104 -1.62401234 -0.39977265 -1.16007252
##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#> y
#> y 1.14993
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#> y
#> y 0.0521512
##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#>
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.1528 -0.6738 -0.0486 0.7210 3.6658
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.021540 0.061655 0.349 0.727
#> x 0.987755 0.047652 20.728 <2e-16 ***
#> z 1.016270 0.054549 18.630 <2e-16 ***
#> I(z > 0)TRUE -0.005932 0.106185 -0.056 0.955
#> x:I(z > 0)TRUE 1.009156 0.065653 15.371 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.018 on 995 degrees of freedom
#> Multiple R-squared: 0.7704, Adjusted R-squared: 0.7694
#> F-statistic: 834.5 on 4 and 995 DF, p-value: < 2.2e-16
#>
##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
Binary.lvm(), ## Vector of ones
Binary.lvm(0.5), ## 0.5n 0, 0.5n 1
Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#> x z v w
#> 1 0.0000000 1 0 0
#> 2 0.5555556 1 0 0
#> 3 1.1111111 1 0 1
#> 4 1.6666667 1 0 1
#> 5 2.2222222 1 0 1
#> 6 2.7777778 1 1 0
#> 7 3.3333333 1 1 0
#> 8 3.8888889 1 1 1
#> 9 4.4444444 1 1 1
#> 10 5.0000000 1 1 1
##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$t)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$y)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#> y x z t
#> 1 -7.52089416 -0.3975626 0.0521017141 -3.918515e-01
#> 2 2.20571704 0.3470455 0.5086850008 1.122015e+00
#> 3 -4.36804686 -0.3792286 0.5905137765 -2.579392e+00
#> 4 -0.07640396 -1.5337391 0.2058743050 -1.572961e-02
#> 5 21.43594787 -0.2640809 0.0099792760 2.139152e-01
#> 6 4.25372148 0.1839688 0.4443781625 1.890261e+00
#> 7 -0.09900900 -1.4246557 0.0006665787 -6.599729e-05
#> 8 -1.15906345 -1.0335661 0.8045995309 -9.325819e-01
#> 9 6.72402679 -0.2305547 0.6546673730 4.402001e+00
#> 10 5.53307450 -0.1962768 0.2215395973 1.225795e+00
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }
##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)
m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }
m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#> y1 y2 y3
#> y1 2.546658 1.2446886 1.1131815
#> y2 1.244689 2.1601392 0.8173183
#> y3 1.113182 0.8173183 1.8257718
## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#> y x z
#> 1 1000.5214 0.2177780 1.93296840
#> 2 999.1417 -0.4748423 0.10528753
#> 3 999.1108 -2.2866782 -0.09999915
#> 4 998.8764 -0.7593130 -0.13847376
#> 5 1001.0689 -0.3586061 0.24695597
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#> y x z
#> 1 -0.5899834 0.5347800 -2.3598938
#> 2 -0.4243369 -0.9962041 -0.5107305
#> 3 3.5456795 1.7645527 1.5297775
#> 4 -1.4880155 -0.7726338 -0.1372156
#> 5 -0.2474045 0.3482477 -0.1443716
sim(m,5,p=c(m1=100))
#> y x z
#> 1 102.25740 0.129915309 2.7875521
#> 2 101.53418 1.820998862 -0.3153138
#> 3 98.71054 -0.007577379 -0.2353492
#> 4 99.17310 -0.901698372 -0.4610830
#> 5 103.29305 0.072049531 1.9003038
##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)
if (interactive()) {
d <- sim(m,1e3)
require(survival)
(cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}
##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"
regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }
m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#>
#> A B C
#> 5004 3018 1978
glm(y~v, data=sim(m,1e4))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.02492 1.99774 -1.00229
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 22580
#> Residual Deviance: 9997 AIC: 28380
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.003437 2.987178 -1.009686
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 34490
#> Residual Deviance: 10330 AIC: 28710
transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#> v y z v2
#> 1 B 2.2298156 4.1679282 FALSE
#> 2 B 0.6586923 3.7329303 FALSE
#> 3 B 2.6336803 2.2362200 FALSE
#> 4 C -1.2815957 -3.2806097 FALSE
#> 5 B 1.7904833 1.7922099 FALSE
#> 6 A 1.5817981 -2.4106877 TRUE
#> 7 A 0.9435654 0.8197093 TRUE
#> 8 A 0.5989207 0.9182792 TRUE
#> 9 C -0.5337450 0.2826166 FALSE
#> 10 A 0.7108434 -1.3749699 TRUE
##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#> y x
#> 1 -2.9969686 -0.5822500
#> 2 2.0932122 0.4675629
#> 3 2.4677225 1.1228049
#> 4 1.3229557 1.3806512
#> 5 3.1524416 1.9010546
#> 6 -0.6734257 -0.6149359
#> 7 0.4034541 -0.4402311
#> 8 0.5278260 -0.2312745
#> 9 1.3184855 0.7196587
#> 10 -0.6764807 -0.2205306
sim(m2,10) ## Faster
#> y x
#> 1 0.2093413 -0.1414400
#> 2 1.7423583 0.9679083
#> 3 2.3735490 0.8827087
#> 4 -2.9847372 -1.3114255
#> 5 0.8275647 -0.2983199
#> 6 -3.4592201 -1.8890341
#> 7 2.4341144 0.9988011
#> 8 -1.1049947 -0.4407522
#> 9 -0.4269766 -0.1266534
#> 10 -1.4520110 -0.6076527