From 876cbe213d7c41e4b99bb6aa904d35721ea2553d Mon Sep 17 00:00:00 2001 From: Bruce Delo Date: Fri, 12 Jun 2026 10:38:46 -0300 Subject: [PATCH] A handful of changes to the process turned up during the metadata ingestion into remora. --- R/otn_imos_parquet_column_map.R | 100 +++++++++++++++++--------------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/R/otn_imos_parquet_column_map.R b/R/otn_imos_parquet_column_map.R index 4ba20ad..4821917 100644 --- a/R/otn_imos_parquet_column_map.R +++ b/R/otn_imos_parquet_column_map.R @@ -35,21 +35,21 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, # - A receiver dataframe with appropriate columns, if necessary with data derived from the detections dataframe. # - A tag dataframe, same constraints as above. # - An animal measurements dataframe with data derived from the tag dataframe. - + # At the end of all this we return three dataframes- det_data, rec_data, tag_data- that can be passed through to the merge at the bottom of the get_data function. # Animal measurements is a bit weird so I'm going to ignore it for now. - + # This probably won't have the full range of columns that the equivalent IMOS data would. - + # This way, if we don't end up having any way to change these throughout- i.e, no rcvr/tag sheets have been passed- # we just return whatever we got, unaltered. Probably null. tag_return <- tag_dataframe rcvr_return <- rcvr_dataframe - + # Quit instantly if there is no detections dataframe. This is unlikely since this check already happens in the function above, but for completeness' # sake we'll include it. if (is.null(det_dataframe)) stop("\033[31;1mCan not run otn -> imos conversion without a detections file!\033[0m\n") - + if (!is.data.frame(det_dataframe)) { if (format == "parquet") { det_dataframe <- read_parquet(det_dataframe) @@ -57,25 +57,25 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, det_dataframe <- read.csv(det_dataframe, na = c("", "null", "NA")) } } - + # If we don't get passed a receiver or tag dataframe, we derive them from det. This will give us hopefully enough info that we can create the final # detection dataframe to be returned, which will be valid for Remora. Ideally. if (is.null(rcvr_dataframe) && derive) { message("Deriving receiver dataframe...") rcvr_return <- derive_rcvr_from_new_style_det(det_dataframe) } - + if (is.null(tag_dataframe) && derive) { message("Deriving tag dataframe...") tag_return <- derive_tag_from_new_style_det(det_dataframe, tagname_column) } - + # Construct a little lookup table for the aphiaIDs. This keeps us from having to query the WORMS database over and over again (for example, the data I tested on had 300 entries for 'blue shark')- lot of redundant querying there. lookup <- get_unique_aphiaids(det_dataframe$scientificName) - + # Start by mapping the Detections dataframe. det_return <- det_dataframe %>% - select( + dplyr::select( dateCollectedUTC, catalogNumber, tagName, @@ -134,7 +134,7 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, ) det_return$transmitter_deployment_id <- det_return$tag_id det_return$tagging_project_name <- det_return$installation_name - + # If we have receiver_meta, convert that to an IMOS friendly version. if (!is.null(rcvr_dataframe) && !derive) { rcvr_return <- rcvr_dataframe %>% @@ -169,7 +169,7 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, sep = "-", remove = FALSE ) %>% unite( - receiver_deployment_id, c("OTN_ARRAY", "STATION", "INS_SERIAL_NUMBER"), + receiver_deployment_id, c("OTN_ARRAY", "STATION_NO", "INS_SERIAL_NUMBER"), sep = "-", remove = FALSE ) %>% rename( @@ -180,16 +180,17 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, receiver_deployment_latitude = DEPLOY_LAT, receiver_deployment_longitude = DEPLOY_LONG, depth_below_surface = INSTRUMENT_DEPTH, - receiever_recovery_datetime = RECOVER_DATE_TIME, + receiver_recovery_datetime = RECOVER_DATE_TIME, receiver_recovery_latitude = RECOVER_LAT, receiver_recovery_longitude = RECOVER_LONG, + receiver_id = INS_SERIAL_NUMBER ) %>% mutate( - purchasing_organization = NA, + purchasing_organisation = NA, receiver_project_name = coll_code, ) } - + # And if we have tag metadata, convert that too. if (!is.null(tag_dataframe) && !derive) { tag_return <- tag_dataframe %>% @@ -225,8 +226,8 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, AGE_UNITS, SEX, DNA_SAMPLE_TAKEN, - TREATMENT_TYPE, - RELEASE_GROUP, + #TREATMENT_TYPE, + #RELEASE_GROUP, UTC_RELEASE_DATE_TIME, RELEASE_LOCATION, RELEASE_LATITUDE, @@ -243,6 +244,10 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, embargo_date = NA, transmitter_recovery_latitude = NA, transmitter_recovery_longitude = NA, + #Need these in here but I also need to check- I might need to do something more elaborate to get the tag stuff. We'll see + #after Dave tests it. + latitude = NA, + longitude = NA, tagging_project_name = coll_code, ) %>% unite( @@ -250,7 +255,8 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, sep = "-", remove = FALSE ) %>% unite( - transmitter_deployment_id, c("TAG_CODE_SPACE", "TAG_ID_CODE", "cleandate", "RELEASE_LATITUDE", "RELEASE_LONGITUDE"), + #transmitter_deployment_id, c("TAG_CODE_SPACE", "TAG_ID_CODE", "cleandate", "RELEASE_LATITUDE", "RELEASE_LONGITUDE"), + transmitter_deployment_id, c("tagging_project_name", "ANIMAL_ID (floy tag ID, pit tag code, etc.)", "cleandate"), sep = "-", remove = FALSE ) %>% rename( @@ -291,48 +297,48 @@ otn_imos_new_style_column_map <- function(det_dataframe, rcvr_dataframe = NULL, derive_rcvr_from_new_style_det <- function(det_dataframe) { # To start, we will filter the releases out of our detections dataframe. no_releases <- det_dataframe %>% filter(receiver != "release") - + # The first thing we need to do is gin up some inferred min and max deployment dates. # We'll use the following code to do so. rcvr_grouped <- NULL - + # Start by grouping the detections by station, and ordering them by date. rcvr_grouped_list <- no_releases %>% group_by(station) %>% arrange(dateCollectedUTC, .by_group = TRUE) - + # Set min date and max date to null. minDate <- NULL maxDate <- NULL - + # Create a 'lead' dataframe for us to compare our current dataframe against. rcvr_grouped_list_next <- lead(rcvr_grouped_list) - + # For each row in the list for (i in 1:nrow(rcvr_grouped_list)) { row <- rcvr_grouped_list[i, ] - + # If minDate is null, set it to the currently available date. minDate being null implies that # we're just starting with this station (see where it's set to Null, below) if (is.null(minDate)) { minDate <- row$dateCollectedUTC } - + # Get the next row from our "lead" frame. nextStation <- rcvr_grouped_list_next[i, ] - + # If our next station is Null (i.e, we're at the end of the frame), or the next station is different from # the current one (i.e, we've reached the end of this time chunk)... if (is.na(nextStation$receiver) || nextStation$receiver != row$receiver) { # Set Maxdate to our current date. maxDate <- row$dateCollectedUTC - + # Add the min and max dates as entries in the row. row <- row %>% mutate( minDetectionDate = minDate, maxDetectionDate = maxDate ) - + # if rcvr_group hasn't been instantiated yet, use row to create it. if (is.null(rcvr_grouped)) { rcvr_grouped <- row @@ -341,18 +347,18 @@ derive_rcvr_from_new_style_det <- function(det_dataframe) { else { rcvr_grouped <- rbind(rcvr_grouped, row) } - + # reset our min and max date to null so the next group will be handled properly. minDate <- NULL maxDate <- NULL } } - + # Now we have rcvr_grouped, which contains the receiver metadata with the inferred min and max dates. # We can now rename the columns and do the remainder of the manipulation work as normal. - + rcvr <- rcvr_grouped %>% - select( + dplyr::select( # Select the columns from the detection extract that we have access to. collectionCode, detectedBy, @@ -388,7 +394,7 @@ derive_rcvr_from_new_style_det <- function(det_dataframe) { receiver_recovery_longitude = NA, receiver_recovery_latitude = NA ) - + return(as.data.frame(rcvr)) } @@ -416,7 +422,7 @@ derive_tag_from_new_style_det <- function(det_dataframe, tagname_column = "tagNa distinctTag <- det_dataframe %>% group_by(across(tagname_column)) %>% distinct(across(tagname_column), .keep_all = TRUE) - + # To get the correct transmitter lat/lon, we need to get the releases. releases <- det_dataframe %>% filter(receiver == "release") %>% @@ -432,7 +438,7 @@ derive_tag_from_new_style_det <- function(det_dataframe, tagname_column = "tagNa message("Number of releases:") message(nrow(releases)) tag <- distinctTag %>% - select( + dplyr::select( collectionCode, tagName, commonName, @@ -467,19 +473,19 @@ derive_tag_from_new_style_det <- function(det_dataframe, tagname_column = "tagNa transmitter_recovery_latitude = NA, transmitter_recovery_longitude = NA, ) - - + + # Now we can join the releases to get the appropriate transmitter_deployment_lat/lon tag <- left_join(tag, - releases %>% dplyr::select( - transmitter_deployment_id, - transmitter_deployment_latitude, - transmitter_deployment_longitude, - transmitter_deployment_datetime, - transmitter_deployment_locality - ), - by = "transmitter_deployment_id" + releases %>% dplyr::select( + transmitter_deployment_id, + transmitter_deployment_latitude, + transmitter_deployment_longitude, + transmitter_deployment_datetime, + transmitter_deployment_locality + ), + by = "transmitter_deployment_id" ) - + return(as.data.frame(tag)) -} +} \ No newline at end of file