Skip to content

Commit caa3d35

Browse files
authored
Merge pull request #18 from ethzplus/9-trans-preds-t
Implement transition predictors pruning (generate + prune trans_preds_t) - Refactored database logic for better table management and in-memory support - Filtering capabilities: covariance and GRRF filters Closes #9
2 parents a32c51f + e6c8e5c commit caa3d35

38 files changed

+3577
-1604
lines changed

DESCRIPTION

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,35 @@ Imports:
2828
terra
2929
Suggests:
3030
tinytest,
31-
quarto
31+
quarto,
32+
ranger
3233
VignetteBuilder: quarto
3334
Config/testthat/edition: 3
34-
LinkingTo:
35+
LinkingTo:
3536
Rcpp
37+
Collate:
38+
'RcppExports.R'
39+
'alloc_params_t.R'
40+
'coords_t.R'
41+
'covariance_filter.R'
42+
'parquet_duckdb.R'
43+
'evoland_db.R'
44+
'evoland_db_neighbors.R'
45+
'evoland_db_tables.R'
46+
'evoland_db_views.R'
47+
'grrf_filter.r'
48+
'init.R'
49+
'intrv_masks_t.R'
50+
'intrv_meta_t.R'
51+
'lulc_data_t.R'
52+
'lulc_meta_t.R'
53+
'neighbors_t.R'
54+
'periods_t.R'
55+
'pred_data_t.R'
56+
'pred_meta_t.R'
57+
'trans_meta_t.R'
58+
'trans_models_t.R'
59+
'trans_preds_t.R'
60+
'util.R'
61+
'util_download.R'
62+
'util_terra.R'

NAMESPACE

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ S3method(print,intrv_masks_t)
88
S3method(print,intrv_meta_t)
99
S3method(print,lulc_data_t)
1010
S3method(print,lulc_meta_t)
11+
S3method(print,neighbors_t)
1112
S3method(print,periods_t)
1213
S3method(print,pred_data_t)
1314
S3method(print,pred_meta_t)
@@ -22,6 +23,7 @@ S3method(validate,intrv_masks_t)
2223
S3method(validate,intrv_meta_t)
2324
S3method(validate,lulc_data_t)
2425
S3method(validate,lulc_meta_t)
26+
S3method(validate,neighbors_t)
2527
S3method(validate,periods_t)
2628
S3method(validate,pred_data_t)
2729
S3method(validate,pred_data_t_bool)
@@ -37,24 +39,27 @@ export(as_intrv_masks_t)
3739
export(as_intrv_meta_t)
3840
export(as_lulc_data_t)
3941
export(as_lulc_meta_t)
42+
export(as_neighbors_t)
4043
export(as_periods_t)
4144
export(as_pred_data_t)
4245
export(as_pred_meta_t)
4346
export(as_trans_meta_t)
4447
export(as_trans_models_t)
4548
export(as_trans_preds_t)
46-
export(compute_neighbors)
49+
export(covariance_filter)
4750
export(create_coords_t_square)
4851
export(create_intrv_meta_t)
4952
export(create_intrv_meta_t_row)
5053
export(create_lulc_meta_t)
54+
export(create_neighbors_t)
5155
export(create_periods_t)
5256
export(create_pred_meta_t)
5357
export(create_trans_meta_t)
54-
export(create_trans_preds_t)
5558
export(download_and_verify)
5659
export(evoland_db)
5760
export(extract_using_coords_t)
61+
export(grrf_filter)
62+
export(parquet_duckdb)
5863
export(print_rowwise_yaml)
5964
export(validate)
6065
importFrom(Rcpp,sourceCpp)

R/RcppExports.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
22
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
33

4-
distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) {
5-
.Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution)
4+
distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) {
5+
.Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet)
66
}
77

R/coords_t.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ as_coords_t <- function(x) {
3434
geom_polygon = list()
3535
)
3636
}
37-
cast_dt_col(x, "id_coord", as.integer)
37+
cast_dt_col(x, "id_coord", "int")
3838
if (!is.null(x[["region"]])) {
39-
cast_dt_col(x, "region", as.factor)
39+
cast_dt_col(x, "region", "factor")
4040
}
4141
new_evoland_table(
4242
x,

R/covariance_filter.R

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#' Two stage covariate filtering
2+
#'
3+
#' The `covariance_filter` returns a set of covariates for land use land cover change
4+
#' (LULCC) models based on a two-stage variable selection: a first statistical fit
5+
#' estimates a covariate's quality for a given prediction task. A second step selects
6+
#' all variables below a given correlation threshold: We iterate over a correlation
7+
#' matrix ordered in the first step. Starting within the leftmost column, all rows (i.e.
8+
#' candidates) greater than the given threshold are dropped from the full set of
9+
#' candidates. This candidate selection is retained and used to select the next column,
10+
#' until no further columns are left to investigate. The columns that were iterated over
11+
#' are those returned as a character vector of selected variable names.
12+
#'
13+
#' @param data A data.table of target variable and candidate covariates to be filtered;
14+
#' wide format with one predictor per column.
15+
#' @param result_col Name of the column representing the transition results (0: no
16+
#' trans, 1: trans)
17+
#' @param rank_fun Optional function to compute ranking scores for each covariate.
18+
#' Should take arguments (x, y, weights, ...) and return a single numeric value
19+
#' (lower = better). Defaults to polynomial GLM p-value ranking.
20+
#' @param weights Optional vector of weights to be used in the ranking function. Defaults to
21+
#' class-balanced weights
22+
#' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates with correlation
23+
#' coefficients above this threshold will be filtered out. Default is 0 (no filtering).
24+
#' @param ... Additional arguments passed to rank_fun.
25+
#'
26+
#' @return A set of column names (covariates) to retain
27+
#'
28+
#' @details
29+
#' The function first ranks covariates using the provided ranking function (default:
30+
#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson)
31+
#' correlated variables based on the correlation cutoff threshold, preserving variables
32+
#' in order of their ranking. See
33+
#' <https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r> for
34+
#' where the concept came from. The original author was Antoine Adde, with edits by
35+
#' Benjamin Black. A similar mechanism is found in <https://github.com/antadde/covsel/>.
36+
#'
37+
#' @name covariance_filter
38+
#'
39+
#' @export
40+
41+
covariance_filter <- function(
42+
data,
43+
result_col = "result",
44+
rank_fun = rank_poly_glm,
45+
weights = compute_balanced_weights(data[[result_col]]),
46+
corcut = 0.7,
47+
...
48+
) {
49+
# Early return for single covariate
50+
if (ncol(data) == 1) {
51+
return(data)
52+
}
53+
54+
data.table::setDT(data)
55+
56+
# Validate binary outcome
57+
stopifnot(
58+
"corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1
59+
)
60+
61+
# Compute ranking scores for all covariates (vectorized where possible)
62+
scores <- vapply(
63+
data[, -..result_col],
64+
rank_fun,
65+
FUN.VALUE = numeric(1),
66+
y = data[[result_col]],
67+
weights = weights,
68+
...
69+
)
70+
71+
# Sort by scores (lower = better/more significant)
72+
ranked_order <- names(sort(scores))
73+
74+
# If no correlation filtering needed, return ranked predictors
75+
if (corcut == 1) {
76+
return(ranked_order)
77+
}
78+
79+
# Compute correlation matrix once
80+
cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs"))
81+
82+
# Iteratively select covariates based on correlation threshold
83+
select_by_correlation(cor_mat, corcut)
84+
}
85+
86+
87+
#' @describeIn covariance_filter Default ranking function using polynomial GLM. Returns
88+
#' the lower p value for each of the polynomial terms
89+
#' @param x A numeric vector representing a single covariate
90+
#' @param y A binary outcome vector (0/1)
91+
#' @param weights Optional weights vector
92+
#' @keywords internal
93+
rank_poly_glm <- function(x, y, weights = NULL, ...) {
94+
fit <- glm.fit(
95+
x = cbind(1, poly(x, degree = 2, simple = TRUE)),
96+
y = y,
97+
family = quasibinomial(),
98+
weights = weights
99+
)
100+
101+
# Get p-values for linear and quadratic terms
102+
coef_summary <- summary.glm(fit)$coefficients
103+
104+
# Return minimum p-value (most significant term)
105+
min(coef_summary[2:3, 4], na.rm = TRUE)
106+
}
107+
108+
109+
#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary
110+
#' outcomes; returns a numeric vector
111+
#' @param trans_result Binary outcome vector (0/1)
112+
#' @param legacy Bool, use legacy weighting?
113+
#' @keywords internal
114+
compute_balanced_weights <- function(trans_result, legacy = FALSE) {
115+
n_total <- length(trans_result)
116+
n_trans <- sum(trans_result)
117+
n_non_trans <- sum(!trans_result)
118+
119+
# Compute inverse frequency weights
120+
weights <- numeric(n_total)
121+
122+
if (legacy) {
123+
# I found this weighting in evoland-plus-legacy, but the models wouldn't converge
124+
# https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r
125+
# This is actually just setting the underrepresented class to the rounded imbalance ratio
126+
weights[!trans_result] <- 1
127+
weights[trans_result] <- round(n_non_trans / n_trans)
128+
return(weights)
129+
}
130+
131+
# This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y))
132+
# https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint
133+
# This weighting maintains the exact imbalance ratio
134+
weights[trans_result] <- n_total / (2 * n_trans)
135+
weights[!trans_result] <- n_total / (2 * n_non_trans)
136+
137+
weights
138+
}
139+
140+
141+
#' @describeIn covariance_filter Implements the iterative selection procedure.
142+
#' @param cor_mat Absolute correlation matrix
143+
#' @param corcut Correlation cutoff threshold
144+
#' @keywords internal
145+
select_by_correlation <- function(cor_mat, corcut) {
146+
var_names <- colnames(cor_mat)
147+
148+
# Early return if all correlations are below threshold
149+
if (all(cor_mat[lower.tri(cor_mat)] < corcut)) {
150+
return(var_names)
151+
}
152+
153+
selected <- character(0)
154+
remaining_idx <- seq_along(var_names)
155+
156+
while (length(remaining_idx) > 0) {
157+
# Select the first remaining variable (highest ranked)
158+
current_var <- remaining_idx[1]
159+
selected <- c(selected, var_names[current_var])
160+
161+
# Find variables with correlation <= corcut with current variable
162+
# (excluding the variable itself)
163+
keep_idx <- which(cor_mat[remaining_idx, current_var] <= corcut)
164+
remaining_idx <- remaining_idx[keep_idx]
165+
}
166+
167+
selected
168+
}

0 commit comments

Comments
 (0)