@@ -189,8 +189,10 @@ is_causal_power <- function(G, beta, residual_variance, power = 0.80) {
189189# ' @param inf_sd Standard deviation for drawing infinitesimal effects (default 0.01).
190190# ' @param standardize Logical; if TRUE, the genotype matrix will be standardized.
191191# ' @param independent Logical; if TRUE, ensures all sparse and oligogenic SNPs have |r| < ld_threshold with each other (default TRUE).
192- # ' @param ld_threshold Numeric; maximum allowed absolute correlation between causal variants when independent = TRUE (default 0.15).
193- # ' @param max_attempts Integer; maximum number of attempts to find SNPs satisfying LD constraints (default 200).
192+ # ' @param ld_threshold Numeric; starting maximum allowed absolute correlation between causal variants when independent = TRUE (default 0.10). If constraints cannot be satisfied, the threshold is progressively increased by ld_step.
193+ # ' @param ld_step Numeric; amount to increase ld_threshold after max_attempts failures (default 0.10).
194+ # ' @param ld_max Numeric; maximum ld_threshold to try before giving up (default 0.50).
195+ # ' @param max_attempts Integer; maximum number of attempts per LD threshold level to find SNPs satisfying LD constraints (default 200).
194196# ' @param seed Optional seed for reproducibility.
195197# ' @return A list containing the standardized genotype matrix, simulated phenotype,
196198# ' combined beta values, indices for each effect component, realized heritability estimates,
@@ -210,7 +212,9 @@ generate_cis_qtl_data <- function(G,
210212 inf_sd = 0.01 ,
211213 standardize = TRUE ,
212214 independent = TRUE ,
213- ld_threshold = 0.15 ,
215+ ld_threshold = 0.10 ,
216+ ld_step = 0.10 ,
217+ ld_max = 0.50 ,
214218 max_attempts = 200 ,
215219 seed = NULL ) {
216220 # Input validation
@@ -232,96 +236,111 @@ generate_cis_qtl_data <- function(G,
232236 # All columns are valid for selection
233237 valid_cols <- 1 : n_features
234238
235- # Setup for LD constraint checking
236- max_attempts <- if (independent ) max_attempts else 1
237- attempt <- 0
239+ # Setup for progressive LD constraint checking
238240 ld_satisfied <- FALSE
241+ current_ld_threshold <- ld_threshold
242+ total_attempts <- 0
239243
240244 # Set initial seed if provided
241245 if (! is.null(seed )) {
242246 set.seed(seed )
243247 }
244248
245- while (! ld_satisfied && attempt < max_attempts ) {
246- attempt <- attempt + 1
249+ # Progressive LD threshold loop
250+ while (! ld_satisfied && current_ld_threshold < = ld_max ) {
251+ attempt <- 0
247252
248- # Set seed for this attempt (for reproducibility)
249- if (! is.null(seed ) && attempt > 1 ) {
250- set.seed(seed * 1000 + attempt )
251- }
252-
253- # 1. Sparse Effects using target-based scaling
254- sparse_res <- simulate_sparse_effects(G , h2_sparse , n_sparse , sparse_sd , valid_cols )
255- beta_sparse <- sparse_res $ beta
256- sparse_indices <- sparse_res $ sparse_indices
257-
258- # 2. Oligogenic Effects using target-based scaling
259- non_sparse_indices <- setdiff(valid_cols , sparse_indices )
260- oligo_res <- simulate_oligogenic_effects(G , h2_oligogenic , n_oligogenic ,
261- mixture_props , non_sparse_indices ,
262- oligo_sds )
263- beta_oligo <- oligo_res $ beta
264- oligogenic_indices <- oligo_res $ oligogenic_indices
265-
266- # 3. Infinitesimal Effects using target-based scaling
267- infinitesimal_pool <- setdiff(non_sparse_indices , oligogenic_indices )
268-
269- # Select n_inf SNPs from the infinitesimal pool if n_inf is specified
270- if (! is.null(n_inf )) {
271- n_inf_actual <- min(n_inf , length(infinitesimal_pool ))
272- infinitesimal_indices <- sample(infinitesimal_pool , n_inf_actual )
273- } else {
274- # Default behavior: all remaining SNPs get infinitesimal effects
275- infinitesimal_indices <- infinitesimal_pool
276- }
277-
278- beta_inf <- simulate_infinitesimal_effects(G , h2_infinitesimal , infinitesimal_indices ,
279- inf_sd )
280-
281- # Combine all effect components
282- beta <- beta_sparse + beta_oligo + beta_inf
253+ while (! ld_satisfied && attempt < max_attempts ) {
254+ attempt <- attempt + 1
255+ total_attempts <- total_attempts + 1
283256
284- # Generate latent genetic component and phenotype
285- g <- as.vector(G %*% beta )
286- var_g <- var(g )
287- var_epsilon <- var_g * (1 - h2g ) / h2g
288- epsilon <- rnorm(n_samples , 0 , sqrt(var_epsilon ))
289- y <- g + epsilon
290-
291- # Check LD constraint if independent = TRUE (applied to sparse and oligogenic SNPs)
292- if (independent ) {
293- ld_satisfied <- TRUE # Assume satisfied unless violations found
294-
295- # Check sparse SNPs: |r| < ld_threshold
296- if (length(sparse_indices ) > 1 ) {
297- sparse_cor <- cor(G [, sparse_indices , drop = FALSE ])
298- high_ld_sparse <- which(abs(sparse_cor ) > = ld_threshold & upper.tri(sparse_cor , diag = FALSE ), arr.ind = TRUE )
299- if (nrow(high_ld_sparse ) > 0 ) ld_satisfied <- FALSE
257+ # Set seed for this attempt (for reproducibility)
258+ if (! is.null(seed ) && total_attempts > 1 ) {
259+ set.seed(seed * 1000 + total_attempts )
300260 }
301261
302- # Check oligogenic SNPs: |r| < ld_threshold
303- if (ld_satisfied && length(oligogenic_indices ) > 1 ) {
304- oligo_cor <- cor(G [, oligogenic_indices , drop = FALSE ])
305- high_ld_oligo <- which(abs(oligo_cor ) > = ld_threshold & upper.tri(oligo_cor , diag = FALSE ), arr.ind = TRUE )
306- if (nrow(high_ld_oligo ) > 0 ) ld_satisfied <- FALSE
262+ # 1. Sparse Effects using target-based scaling
263+ sparse_res <- simulate_sparse_effects(G , h2_sparse , n_sparse , sparse_sd , valid_cols )
264+ beta_sparse <- sparse_res $ beta
265+ sparse_indices <- sparse_res $ sparse_indices
266+
267+ # 2. Oligogenic Effects using target-based scaling
268+ non_sparse_indices <- setdiff(valid_cols , sparse_indices )
269+ oligo_res <- simulate_oligogenic_effects(G , h2_oligogenic , n_oligogenic ,
270+ mixture_props , non_sparse_indices ,
271+ oligo_sds )
272+ beta_oligo <- oligo_res $ beta
273+ oligogenic_indices <- oligo_res $ oligogenic_indices
274+
275+ # 3. Infinitesimal Effects using target-based scaling
276+ infinitesimal_pool <- setdiff(non_sparse_indices , oligogenic_indices )
277+
278+ # Select n_inf SNPs from the infinitesimal pool if n_inf is specified
279+ if (! is.null(n_inf )) {
280+ n_inf_actual <- min(n_inf , length(infinitesimal_pool ))
281+ infinitesimal_indices <- sample(infinitesimal_pool , n_inf_actual )
282+ } else {
283+ # Default behavior: all remaining SNPs get infinitesimal effects
284+ infinitesimal_indices <- infinitesimal_pool
307285 }
308286
309- # Check between sparse and oligogenic: |r| < ld_threshold
310- if (ld_satisfied && length(sparse_indices ) > 0 && length(oligogenic_indices ) > 0 ) {
311- cross_cor <- cor(G [, sparse_indices , drop = FALSE ], G [, oligogenic_indices , drop = FALSE ])
312- high_ld_cross <- which(abs(cross_cor ) > = ld_threshold , arr.ind = TRUE )
313- if (nrow(high_ld_cross ) > 0 ) ld_satisfied <- FALSE
287+ beta_inf <- simulate_infinitesimal_effects(G , h2_infinitesimal , infinitesimal_indices ,
288+ inf_sd )
289+
290+ # Combine all effect components
291+ beta <- beta_sparse + beta_oligo + beta_inf
292+
293+ # Generate latent genetic component and phenotype
294+ g <- as.vector(G %*% beta )
295+ var_g <- var(g )
296+ var_epsilon <- var_g * (1 - h2g ) / h2g
297+ epsilon <- rnorm(n_samples , 0 , sqrt(var_epsilon ))
298+ y <- g + epsilon
299+
300+ # Check LD constraint if independent = TRUE (applied to sparse and oligogenic SNPs)
301+ if (independent ) {
302+ ld_satisfied <- TRUE # Assume satisfied unless violations found
303+
304+ # Check sparse SNPs: |r| < current_ld_threshold
305+ if (length(sparse_indices ) > 1 ) {
306+ sparse_cor <- cor(G [, sparse_indices , drop = FALSE ])
307+ high_ld_sparse <- which(abs(sparse_cor ) > = current_ld_threshold & upper.tri(sparse_cor , diag = FALSE ), arr.ind = TRUE )
308+ if (nrow(high_ld_sparse ) > 0 ) ld_satisfied <- FALSE
309+ }
310+
311+ # Check oligogenic SNPs: |r| < current_ld_threshold
312+ if (ld_satisfied && length(oligogenic_indices ) > 1 ) {
313+ oligo_cor <- cor(G [, oligogenic_indices , drop = FALSE ])
314+ high_ld_oligo <- which(abs(oligo_cor ) > = current_ld_threshold & upper.tri(oligo_cor , diag = FALSE ), arr.ind = TRUE )
315+ if (nrow(high_ld_oligo ) > 0 ) ld_satisfied <- FALSE
316+ }
317+
318+ # Check between sparse and oligogenic: |r| < current_ld_threshold
319+ if (ld_satisfied && length(sparse_indices ) > 0 && length(oligogenic_indices ) > 0 ) {
320+ cross_cor <- cor(G [, sparse_indices , drop = FALSE ], G [, oligogenic_indices , drop = FALSE ])
321+ high_ld_cross <- which(abs(cross_cor ) > = current_ld_threshold , arr.ind = TRUE )
322+ if (nrow(high_ld_cross ) > 0 ) ld_satisfied <- FALSE
323+ }
324+ } else {
325+ # If not independent, accept immediately
326+ ld_satisfied <- TRUE
314327 }
315- } else {
316- # If not independent, accept
317- ld_satisfied <- TRUE
328+ }
329+
330+ # If not satisfied at current threshold, increase and try again
331+ if (! ld_satisfied && independent ) {
332+ current_ld_threshold <- current_ld_threshold + ld_step
318333 }
319334 }
320335
321- # Warn if LD constraints not satisfied
336+ # Warn if LD constraints not satisfied even at maximum threshold
322337 if (independent && ! ld_satisfied ) {
323- warning(paste0(" Failed to satisfy LD constraints after " , max_attempts ,
324- " attempts. Returning last generated data with LD violations." ))
338+ warning(paste0(" Failed to satisfy LD constraints after " , total_attempts ,
339+ " total attempts (up to ld_threshold = " , ld_max ,
340+ " ). Returning last generated data with LD violations." ))
341+ } else if (independent && current_ld_threshold > ld_threshold ) {
342+ message(paste0(" LD constraints satisfied at ld_threshold = " , current_ld_threshold ,
343+ " after " , total_attempts , " total attempts." ))
325344 }
326345
327346 # Calculate causal indices using power-based identification (non-central chi-square with Bonferroni correction)
@@ -346,6 +365,7 @@ generate_cis_qtl_data <- function(G,
346365 oligogenic_indices = oligogenic_indices ,
347366 infinitesimal_indices = infinitesimal_indices ,
348367 residual_variance = var_epsilon ,
349- causal = causal_indices
368+ causal = causal_indices ,
369+ ld_threshold_used = if (independent ) current_ld_threshold else NA
350370 ))
351371}
0 commit comments