ethzplus
diff --git a/‎DESCRIPTION‎
Lines changed: 29 additions & 2 deletions b/‎DESCRIPTION‎
Lines changed: 29 additions & 2 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 7 additions & 2 deletions b/‎NAMESPACE‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎R/RcppExports.R‎
Lines changed: 2 additions & 2 deletions b/‎R/RcppExports.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/coords_t.R‎
Lines changed: 2 additions & 2 deletions b/‎R/coords_t.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/covariance_filter.R‎
Lines changed: 168 additions & 0 deletions b/‎R/covariance_filter.R‎
Lines changed: 168 additions & 0 deletions
@@ -28,8 +28,35 @@ Imports:
     terra
 Suggests:
     tinytest,
-    quarto
+    quarto,
+    ranger
 VignetteBuilder: quarto
 Config/testthat/edition: 3
-LinkingTo: 
+LinkingTo:
     Rcpp
+Collate:
+    'RcppExports.R'
+    'alloc_params_t.R'
+    'coords_t.R'
+    'covariance_filter.R'
+    'parquet_duckdb.R'
+    'evoland_db.R'
+    'evoland_db_neighbors.R'
+    'evoland_db_tables.R'
+    'evoland_db_views.R'
+    'grrf_filter.r'
+    'init.R'
+    'intrv_masks_t.R'
+    'intrv_meta_t.R'
+    'lulc_data_t.R'
+    'lulc_meta_t.R'
+    'neighbors_t.R'
+    'periods_t.R'
+    'pred_data_t.R'
+    'pred_meta_t.R'
+    'trans_meta_t.R'
+    'trans_models_t.R'
+    'trans_preds_t.R'
+    'util.R'
+    'util_download.R'
+    'util_terra.R'
@@ -8,6 +8,7 @@ S3method(print,intrv_masks_t)
 S3method(print,intrv_meta_t)
 S3method(print,lulc_data_t)
 S3method(print,lulc_meta_t)
+S3method(print,neighbors_t)
 S3method(print,periods_t)
 S3method(print,pred_data_t)
 S3method(print,pred_meta_t)
@@ -22,6 +23,7 @@ S3method(validate,intrv_masks_t)
 S3method(validate,intrv_meta_t)
 S3method(validate,lulc_data_t)
 S3method(validate,lulc_meta_t)
+S3method(validate,neighbors_t)
 S3method(validate,periods_t)
 S3method(validate,pred_data_t)
 S3method(validate,pred_data_t_bool)
@@ -37,24 +39,27 @@ export(as_intrv_masks_t)
 export(as_intrv_meta_t)
 export(as_lulc_data_t)
 export(as_lulc_meta_t)
+export(as_neighbors_t)
 export(as_periods_t)
 export(as_pred_data_t)
 export(as_pred_meta_t)
 export(as_trans_meta_t)
 export(as_trans_models_t)
 export(as_trans_preds_t)
-export(compute_neighbors)
+export(covariance_filter)
 export(create_coords_t_square)
 export(create_intrv_meta_t)
 export(create_intrv_meta_t_row)
 export(create_lulc_meta_t)
+export(create_neighbors_t)
 export(create_periods_t)
 export(create_pred_meta_t)
 export(create_trans_meta_t)
-export(create_trans_preds_t)
 export(download_and_verify)
 export(evoland_db)
 export(extract_using_coords_t)
+export(grrf_filter)
+export(parquet_duckdb)
 export(print_rowwise_yaml)
 export(validate)
 importFrom(Rcpp,sourceCpp)
 
@@ -1,7 +1,7 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) {
-    .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution)
+distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) {
+    .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet)
 }
 
@@ -34,9 +34,9 @@ as_coords_t <- function(x) {
       geom_polygon = list()
     )
   }
-  cast_dt_col(x, "id_coord", as.integer)
+  cast_dt_col(x, "id_coord", "int")
   if (!is.null(x[["region"]])) {
-    cast_dt_col(x, "region", as.factor)
+    cast_dt_col(x, "region", "factor")
   }
   new_evoland_table(
     x,
 
@@ -0,0 +1,168 @@
+#' Two stage covariate filtering
+#'
+#' The `covariance_filter` returns a set of covariates for land use land cover change
+#' (LULCC) models based on a two-stage variable selection: a first statistical fit
+#' estimates a covariate's quality for a given prediction task. A second step selects
+#' all variables below a given correlation threshold: We iterate over a correlation
+#' matrix ordered by the ranking of the first step. Starting with the leftmost column,
+#' all rows (i.e. candidates) whose absolute correlation with that column exceeds the
+#' given threshold are dropped from the set of candidates. The reduced candidate set
+#' is then used to pick the next column, until no candidates are left. The columns
+#' that were iterated over are returned as a character vector of selected variable
+#' names.
+#'
+#' @param data A data.table of target variable and candidate covariates to be filtered;
+#'        wide format with one predictor per column. If more than one covariate is
+#'        present, `data` is converted to a data.table by reference using
+#'        [data.table::setDT()].
+#' @param result_col Name of the column representing the transition results (0: no
+#'        trans, 1: trans)
+#' @param rank_fun Optional function to compute ranking scores for each covariate.
+#'        Should take arguments (x, y, weights, ...) and return a single numeric value
+#'        (lower = better). Defaults to polynomial GLM p-value ranking.
+#' @param weights Optional vector of weights to be used in the ranking function. Defaults to
+#'        class-balanced weights
+#' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates whose
+#'        absolute correlation with an already selected covariate is above this
+#'        threshold are filtered out. Default is 0.7; a value of 1 disables the
+#'        correlation filtering and only returns the ranked covariates.
+#' @param ... Additional arguments passed to rank_fun.
+#'
+#' @return A character vector of column names (covariates) to retain, ordered from
+#'         best (lowest) to worst ranking score. Covariates whose ranking score is
+#'         `NA` are not returned.
+#'
+#' @details
+#' The function first ranks covariates using the provided ranking function (default:
+#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson)
+#' correlated variables based on the correlation cutoff threshold, preserving variables
+#' in order of their ranking. See
+#' <https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r> for
+#' where the concept came from. The original author was Antoine Adde, with edits by
+#' Benjamin Black. A similar mechanism is found in <https://github.com/antadde/covsel/>.
+#'
+#' @name covariance_filter
+#'
+#' @export
+
+covariance_filter <- function(
+  data,
+  result_col = "result",
+  rank_fun = rank_poly_glm,
+  weights = compute_balanced_weights(data[[result_col]]),
+  corcut = 0.7,
+  ...
+) {
+  # Validate the threshold before any early return so bad input always fails
+  stopifnot(
+    "corcut must be between 0 and 1" = is.numeric(corcut) &&
+      length(corcut) == 1L &&
+      !is.na(corcut) &&
+      corcut >= 0 &&
+      corcut <= 1
+  )
+
+  # Every column except the outcome is a candidate covariate
+  covariates <- setdiff(names(data), result_col)
+
+  # With zero or one candidate there is nothing to rank or filter; still return
+  # column names (not the data itself) to honour the documented return type
+  if (length(covariates) <= 1L) {
+    return(covariates)
+  }
+
+  stopifnot(
+    "result_col must name a column of data" = result_col %in% names(data)
+  )
+
+  data.table::setDT(data)
+
+  # Compute one ranking score per covariate (columns are visited in data order)
+  scores <- vapply(
+    data[, ..covariates],
+    rank_fun,
+    FUN.VALUE = numeric(1),
+    y = data[[result_col]],
+    weights = weights,
+    ...
+  )
+
+  # Sort by scores (lower = better/more significant); sort() drops NA scores
+  ranked_order <- names(sort(scores))
+
+  # No correlation filtering requested, or too few covariates left to correlate
+  if (corcut == 1 || length(ranked_order) <= 1L) {
+    return(ranked_order)
+  }
+
+  # Compute the absolute correlation matrix once, in ranking order
+  cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs"))
+
+  # Iteratively select covariates based on correlation threshold
+  select_by_correlation(cor_mat, corcut)
+}
+
+
+#' @describeIn covariance_filter Default ranking function using a quasibinomial GLM
+#' on a second-degree polynomial of the covariate. Returns the smallest p-value among
+#' the estimated polynomial terms (linear and quadratic), or `Inf` if no p-value is
+#' available. Observations with a missing covariate, outcome or weight are left out
+#' of the fit.
+#' @param x A numeric vector representing a single covariate
+#' @param y A binary outcome vector (0/1)
+#' @param weights Optional weights vector
+#' @keywords internal
+rank_poly_glm <- function(x, y, weights = NULL, ...) {
+  # poly() refuses missing values, so restrict the fit to complete observations
+  complete <- !is.na(x) & !is.na(y)
+  if (!is.null(weights)) {
+    complete <- complete & !is.na(weights)
+  }
+  if (!all(complete)) {
+    x <- x[complete]
+    y <- y[complete]
+    if (!is.null(weights)) {
+      weights <- weights[complete]
+    }
+  }
+
+  fit <- stats::glm.fit(
+    x = cbind(1, stats::poly(x, degree = 2, simple = TRUE)),
+    y = y,
+    family = stats::quasibinomial(),
+    weights = weights
+  )
+
+  # Coefficient table: one row per estimated term, p-values in column 4
+  coef_summary <- stats::summary.glm(fit)$coefficients
+
+  # Drop the intercept row by position instead of hard-coding rows 2:3, so a
+  # term that could not be estimated does not cause an out-of-bounds error
+  p_values <- coef_summary[-1, 4]
+  p_values <- p_values[!is.na(p_values)]
+
+  # Without any usable p-value the covariate gets the worst possible score
+  if (length(p_values) == 0L) {
+    return(Inf)
+  }
+
+  # Return minimum p-value (most significant term)
+  min(p_values)
+}
+
+
+#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary
+#' outcomes; returns a numeric vector with one weight per observation
+#' @param trans_result Binary outcome vector (0/1 or logical)
+#' @param legacy Bool, use legacy weighting?
+#' @keywords internal
+compute_balanced_weights <- function(trans_result, legacy = FALSE) {
+  # Coerce to logical before subsetting: a numeric 0/1 vector used as a subscript
+  # is read as positions (0 is dropped, 1 is the first element), not as a mask
+  is_trans <- as.logical(trans_result)
+
+  n_total <- length(is_trans)
+  n_trans <- sum(is_trans)
+  n_non_trans <- sum(!is_trans)
+
+  # Compute inverse frequency weights
+  weights <- numeric(n_total)
+
+  if (legacy) {
+    # I found this weighting in evoland-plus-legacy, but the models wouldn't converge
+    # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r
+    # This is actually just setting the transition class to the rounded imbalance ratio
+    weights[!is_trans] <- 1
+    weights[is_trans] <- round(n_non_trans / n_trans)
+    return(weights)
+  }
+
+  # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y))
+  # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint
+  # With these weights both classes sum to the same total weight, n_total / 2
+  weights[is_trans] <- n_total / (2 * n_trans)
+  weights[!is_trans] <- n_total / (2 * n_non_trans)
+
+  weights
+}
+
+
+#' @describeIn covariance_filter Implements the iterative selection procedure:
+#' repeatedly picks the first remaining variable and discards all remaining variables
+#' whose correlation with it is above `corcut` or missing. Returns the selected
+#' variable names in their original column order.
+#' @param cor_mat Absolute correlation matrix
+#' @param corcut Correlation cutoff threshold
+#' @keywords internal
+select_by_correlation <- function(cor_mat, corcut) {
+  var_names <- colnames(cor_mat)
+
+  # Early return if all correlations are below threshold. isTRUE() sends a matrix
+  # containing NA to the loop below instead of failing on a missing condition
+  if (isTRUE(all(cor_mat[lower.tri(cor_mat)] < corcut))) {
+    return(var_names)
+  }
+
+  # Logical mask instead of growing a character vector inside the loop
+  is_selected <- logical(length(var_names))
+  remaining_idx <- seq_along(var_names)
+
+  while (length(remaining_idx) > 0L) {
+    # Select the first remaining variable (highest ranked)
+    current_var <- remaining_idx[1L]
+    is_selected[current_var] <- TRUE
+
+    # Remove the current variable explicitly rather than relying on its
+    # self-correlation exceeding corcut, so the loop ends even for corcut = 1
+    candidates <- remaining_idx[-1L]
+
+    # Keep candidates with correlation <= corcut; NA correlations are dropped
+    keep <- cor_mat[candidates, current_var] <= corcut
+    remaining_idx <- candidates[!is.na(keep) & keep]
+  }
+
+  var_names[is_selected]
+}
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`# Generated by using Rcpp::compileAttributes() -> do not edit by hand`
`2`	`2`	`# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393`
`3`	`3`
`4`		`-distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) {`
`5`		- .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution)
	`4`	`+distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) {`
	`5`	+ .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet)
`6`	`6`	`}`
`7`	`7`
Original file line number	Diff line number	Diff line change
`@@ -34,9 +34,9 @@ as_coords_t <- function(x) {`
`34`	`34`	`geom_polygon = list()`
`35`	`35`	`)`
`36`	`36`	`}`
`37`		`- cast_dt_col(x, "id_coord", as.integer)`
	`37`	`+ cast_dt_col(x, "id_coord", "int")`
`38`	`38`	`if (!is.null(x[["region"]])) {`
`39`		`- cast_dt_col(x, "region", as.factor)`
	`39`	`+ cast_dt_col(x, "region", "factor")`
`40`	`40`	`}`
`41`	`41`	`new_evoland_table(`
`42`	`42`	`x,`