Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions docs/snowflake_arrow_handling.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Snowflake Arrow Handling

This document lists the Snowflake-specific Arrow metadata cases currently handled by Ladybug.

## Supported Cases

| Snowflake signal | Example metadata | Arrow physical storage | Ladybug result | Notes |
| --- | --- | --- | --- | --- |
| Raw Snowflake decimal type via `DATA_TYPE` | `DATA_TYPE=NUMBER(12,4)` | Any Arrow storage type | `DECIMAL(12,4)` | Parsed from Snowflake table-schema metadata. |
| Raw Snowflake decimal type via `DATA_TYPE` with implicit scale | `DATA_TYPE=NUMBER(18)` | Any Arrow storage type | `DECIMAL(18,0)` | Missing scale defaults to `0`. |
| Raw Snowflake decimal aliases via `DATA_TYPE` | `DATA_TYPE=NUMERIC(10,3)` or `DATA_TYPE=DECIMAL(10,3)` | Any Arrow storage type | `DECIMAL(10,3)` | Matching is case-insensitive and whitespace-tolerant. |
| Snowflake logical decimal metadata | `logicalType=FIXED`, `precision=7`, `scale=2` | Integer-backed Arrow (`INT8/16/32/64`, `UINT8/16/32/64`) | `DECIMAL(7,2)` | Used for query-result Arrow schemas. |
| Snowflake logical decimal metadata | `logicalType=FIXED`, `precision=9`, `scale=2` | Float-backed Arrow (`FLOAT`, `DOUBLE`) | `DECIMAL(9,2)` | Values are cast into decimal backing storage during scan. |
| Snowflake raw type fallback to logical metadata | malformed `DATA_TYPE` plus valid `logicalType=FIXED` metadata | Integer-backed or float-backed Arrow | `DECIMAL(p,s)` from `logicalType` metadata | If raw `DATA_TYPE` parsing fails, Snowflake `logicalType` parsing is tried next. |
| Snowflake metadata precedence over generic metadata | `DATA_TYPE=NUMBER(12,4)` plus generic `logicalType=DECIMAL`, `precision=9`, `scale=3` | Any Arrow storage type | `DECIMAL(12,4)` | Snowflake raw type metadata wins over generic metadata. |

## Current Scope

Only Snowflake decimal semantics are handled today.

Specifically:

- `NUMBER(p,s)`
- `NUMBER(p)`
- `NUMERIC(p,s)`
- `NUMERIC(p)`
- `DECIMAL(p,s)`
- `DECIMAL(p)`
- `logicalType=FIXED`

## Not Yet Handled

The Snowflake ADBC driver documents additional logical types that are not currently interpreted in a Snowflake-specific way here, including:

- `real`
- `date`
- `time`
- `timestamp_ltz`
- `timestamp_ntz`
- `timestamp_tz`
- `text`
- `binary`
- `variant`
- `object`
- `array`
- `boolean`

For those, Ladybug currently relies on the standard Arrow physical type; Snowflake-specific decoding may be added in the future.
4 changes: 4 additions & 0 deletions src/common/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ add_library(lbug_common_arrow
OBJECT
arrow_array_scan.cpp
arrow_converter.cpp
arrow_schema_metadata_generic_decoder.cpp
arrow_schema_metadata_snowflake_decoder.cpp
arrow_schema_metadata_utils.cpp
arrow_schema_metadata.cpp
arrow_null_mask_tree.cpp
arrow_row_batch.cpp
arrow_type.cpp)
Expand Down
118 changes: 118 additions & 0 deletions src/common/arrow/arrow_array_scan.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#include "common/arrow/arrow_converter.h"
#include "common/arrow/arrow_schema_metadata.h"
#include "common/exception/runtime.h"
#include "common/types/int128_t.h"
#include "common/types/interval_t.h"
#include "common/types/types.h"
#include "common/vector/value_vector.h"
#include "function/cast/functions/cast_decimal.h"
#include "function/cast/functions/numeric_limits.h"

namespace lbug {
Expand Down Expand Up @@ -58,6 +60,74 @@ static void scanArrowArrayFixedSizePrimitiveAndCastTo(const ArrowArray* array,
});
}

// Scans an Arrow array whose element type is SRC into a decimal output vector.
// The destination element type is picked from the output vector's physical type
// (INT16, INT32, INT64 or INT128) and the work is delegated to
// scanArrowArrayFixedSizePrimitiveAndCastTo<SRC, DST>.
// Throws RuntimeException for any other physical type.
template<typename SRC>
static void scanArrowArrayIntegerBackedDecimal(const ArrowArray* array, ValueVector& outputVector,
    ArrowNullMaskTree* mask, uint64_t srcOffset, uint64_t dstOffset, uint64_t count) {
    const auto physicalType = outputVector.dataType.getPhysicalType();
    if (physicalType == PhysicalTypeID::INT16) {
        scanArrowArrayFixedSizePrimitiveAndCastTo<SRC, int16_t>(array, outputVector, mask,
            srcOffset, dstOffset, count);
    } else if (physicalType == PhysicalTypeID::INT32) {
        scanArrowArrayFixedSizePrimitiveAndCastTo<SRC, int32_t>(array, outputVector, mask,
            srcOffset, dstOffset, count);
    } else if (physicalType == PhysicalTypeID::INT64) {
        scanArrowArrayFixedSizePrimitiveAndCastTo<SRC, int64_t>(array, outputVector, mask,
            srcOffset, dstOffset, count);
    } else if (physicalType == PhysicalTypeID::INT128) {
        scanArrowArrayFixedSizePrimitiveAndCastTo<SRC, int128_t>(array, outputVector, mask,
            srcOffset, dstOffset, count);
    } else {
        // Decimal vectors are only expected to use the four storage widths above.
        throw RuntimeException(
            "Invalid decimal output type: " + PhysicalTypeUtils::toString(physicalType));
    }
}

// Converts a single source value to the decimal storage type DST and stores it
// at position `pos` of the output vector.
// The output vector is passed to CastToDecimal::operation for both vector
// arguments because no separate input vector exists on this scan path.
template<typename SRC, typename DST>
static void castArrowArrayDecimalValue(SRC input, ValueVector& outputVector, uint64_t pos) {
    DST converted{};
    function::CastToDecimal::operation(input, converted, outputVector, outputVector);
    outputVector.setValue<DST>(pos, converted);
}

// Scans `count` values of type SRC from the Arrow array and writes them to the
// output vector as decimals stored in DST, converting each non-null value via
// castArrowArrayDecimalValue.
// Reads start at `srcOffset` in the source buffer; writes start at `dstOffset`
// in the output vector.
template<typename SRC, typename DST>
static void scanArrowArrayDecimalWithCastTo(const ArrowArray* array, ValueVector& outputVector,
    ArrowNullMaskTree* mask, uint64_t srcOffset, uint64_t dstOffset, uint64_t count) {
    // buffers[1] is read as the contiguous SRC value buffer.
    const auto* sourceValues = static_cast<const SRC*>(array->buffers[1]);

    // Propagate nulls first so only non-null rows need a conversion below.
    mask->copyToValueVector(&outputVector, dstOffset, count);

    rowIter(outputVector, count, [&](auto row) {
        if (mask->isNull(row)) {
            return;
        }
        castArrowArrayDecimalValue<SRC, DST>(sourceValues[row + srcOffset], outputVector,
            row + dstOffset);
    });
}

// Scans an Arrow array of SRC values into a decimal output vector, converting
// each value through the decimal cast path.
// The destination storage type is chosen from the output vector's physical type
// (INT16, INT32, INT64 or INT128); any other physical type raises
// RuntimeException.
template<typename SRC>
static void scanArrowArrayDecimalWithCast(const ArrowArray* array, ValueVector& outputVector,
    ArrowNullMaskTree* mask, uint64_t srcOffset, uint64_t dstOffset, uint64_t count) {
    const auto physicalType = outputVector.dataType.getPhysicalType();
    if (physicalType == PhysicalTypeID::INT16) {
        scanArrowArrayDecimalWithCastTo<SRC, int16_t>(array, outputVector, mask, srcOffset,
            dstOffset, count);
    } else if (physicalType == PhysicalTypeID::INT32) {
        scanArrowArrayDecimalWithCastTo<SRC, int32_t>(array, outputVector, mask, srcOffset,
            dstOffset, count);
    } else if (physicalType == PhysicalTypeID::INT64) {
        scanArrowArrayDecimalWithCastTo<SRC, int64_t>(array, outputVector, mask, srcOffset,
            dstOffset, count);
    } else if (physicalType == PhysicalTypeID::INT128) {
        scanArrowArrayDecimalWithCastTo<SRC, int128_t>(array, outputVector, mask, srcOffset,
            dstOffset, count);
    } else {
        throw RuntimeException(
            "Invalid decimal output type: " + PhysicalTypeUtils::toString(physicalType));
    }
}

template<>
void scanArrowArrayFixedSizePrimitive<bool>(const ArrowArray* array, ValueVector& outputVector,
ArrowNullMaskTree* mask, uint64_t srcOffset, uint64_t dstOffset, uint64_t count) {
Expand Down Expand Up @@ -408,7 +478,55 @@ static void scanArrowArrayRunEndEncoded(const ArrowSchema* schema, const ArrowAr
void ArrowConverter::fromArrowArray(const ArrowSchema* schema, const ArrowArray* array,
ValueVector& outputVector, ArrowNullMaskTree* mask, uint64_t srcOffset, uint64_t dstOffset,
uint64_t count) {
return fromArrowArray(schema, array, outputVector, mask, srcOffset, dstOffset, count, nullptr);
}

void ArrowConverter::fromArrowArray(const ArrowSchema* schema, const ArrowArray* array,
ValueVector& outputVector, ArrowNullMaskTree* mask, uint64_t srcOffset, uint64_t dstOffset,
uint64_t count, const std::optional<ArrowLogicalTypeInfo>* logicalTypeInfo) {
const auto arrowType = schema->format;
std::optional<ArrowLogicalTypeInfo> parsedLogicalTypeInfo;
if (logicalTypeInfo == nullptr) {
parsedLogicalTypeInfo = tryGetArrowLogicalTypeInfo(schema);
logicalTypeInfo = &parsedLogicalTypeInfo;
}
if (logicalTypeInfo->has_value() &&
(*logicalTypeInfo)->type == ArrowLogicalTypeInfo::Type::DECIMAL) {
switch (arrowType[0]) {
case 'c':
return scanArrowArrayIntegerBackedDecimal<int8_t>(array, outputVector, mask, srcOffset,
dstOffset, count);
case 'C':
return scanArrowArrayIntegerBackedDecimal<uint8_t>(array, outputVector, mask, srcOffset,
dstOffset, count);
case 's':
return scanArrowArrayIntegerBackedDecimal<int16_t>(array, outputVector, mask, srcOffset,
dstOffset, count);
case 'S':
return scanArrowArrayIntegerBackedDecimal<uint16_t>(array, outputVector, mask,
srcOffset, dstOffset, count);
case 'i':
return scanArrowArrayIntegerBackedDecimal<int32_t>(array, outputVector, mask, srcOffset,
dstOffset, count);
case 'I':
return scanArrowArrayIntegerBackedDecimal<uint32_t>(array, outputVector, mask,
srcOffset, dstOffset, count);
case 'l':
return scanArrowArrayIntegerBackedDecimal<int64_t>(array, outputVector, mask, srcOffset,
dstOffset, count);
case 'L':
return scanArrowArrayIntegerBackedDecimal<uint64_t>(array, outputVector, mask,
srcOffset, dstOffset, count);
case 'f':
return scanArrowArrayDecimalWithCast<float>(array, outputVector, mask, srcOffset,
dstOffset, count);
case 'g':
return scanArrowArrayDecimalWithCast<double>(array, outputVector, mask, srcOffset,
dstOffset, count);
default:
break;
}
}
if (array->dictionary != nullptr) {
switch (arrowType[0]) {
case 'c':
Expand Down
25 changes: 25 additions & 0 deletions src/common/arrow/arrow_schema_metadata.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#include "common/arrow/arrow_schema_metadata.h"

#include "arrow_schema_metadata_internal.h"

namespace lbug {
namespace common {

std::optional<ArrowLogicalTypeInfo> tryGetArrowLogicalTypeInfo(const ArrowSchema* schema) {
if (schema == nullptr || schema->format == nullptr || schema->metadata == nullptr) {
return std::nullopt;
}
const auto metadata = readArrowMetadata(schema->metadata);
if (auto snowflakeRawDataTypeInfo = tryParseSnowflakeRawDataTypeInfo(metadata);
snowflakeRawDataTypeInfo.has_value()) {
return snowflakeRawDataTypeInfo;
}
if (auto snowflakeTypeInfo = tryParseSnowflakeLogicalTypeInfo(schema, metadata);
snowflakeTypeInfo.has_value()) {
return snowflakeTypeInfo;
}
return tryParseGenericIntegerBackedDecimalMetadata(schema, metadata);
}

} // namespace common
} // namespace lbug
35 changes: 35 additions & 0 deletions src/common/arrow/arrow_schema_metadata_generic_decoder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "arrow_schema_metadata_internal.h"

namespace lbug {
namespace common {

// Recognises generic (non-Snowflake) decimal metadata on an integer-backed
// Arrow column.
// A result is produced only when all of the following hold:
//   - the schema format is an integral storage type,
//   - the "logicaltype" entry, lower-cased, is "decimal", "number" or "numeric",
//   - both "precision" and "scale" entries exist and parse as uint32,
//   - the parsed pair passes isValidDecimalParameters.
// Otherwise std::nullopt is returned.
std::optional<ArrowLogicalTypeInfo> tryParseGenericIntegerBackedDecimalMetadata(
    const ArrowSchema* schema, const ArrowMetadataMap& metadata) {
    if (!isIntegralArrowStorageType(schema->format)) {
        return std::nullopt;
    }
    const auto rawLogicalType = getMetadataValue(metadata, "logicaltype");
    if (!rawLogicalType.has_value()) {
        return std::nullopt;
    }
    const auto lowered = toLower(*rawLogicalType);
    const bool namesDecimal = lowered == "decimal" || lowered == "number" || lowered == "numeric";
    if (!namesDecimal) {
        return std::nullopt;
    }
    const auto rawPrecision = getMetadataValue(metadata, "precision");
    const auto rawScale = getMetadataValue(metadata, "scale");
    if (!(rawPrecision.has_value() && rawScale.has_value())) {
        return std::nullopt;
    }
    const auto precisionValue = tryParseUint32(*rawPrecision);
    const auto scaleValue = tryParseUint32(*rawScale);
    if (!precisionValue.has_value() || !scaleValue.has_value()) {
        return std::nullopt;
    }
    if (!isValidDecimalParameters(*precisionValue, *scaleValue)) {
        return std::nullopt;
    }
    return ArrowLogicalTypeInfo{ArrowLogicalTypeInfo::Source::GENERIC_METADATA,
        ArrowLogicalTypeInfo::Type::DECIMAL, ArrowDecimalTypeInfo{*precisionValue, *scaleValue}};
}

} // namespace common
} // namespace lbug
34 changes: 34 additions & 0 deletions src/common/arrow/arrow_schema_metadata_internal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#pragma once

#include <map>
#include <optional>
#include <string>
#include <vector>

#include "common/arrow/arrow_schema_metadata.h"

namespace lbug {
namespace common {

// Key/value representation of an Arrow schema's metadata, as returned by
// readArrowMetadata() and consumed by the decoders declared below.
using ArrowMetadataMap = std::map<std::string, std::string>;

// ---------------------------------------------------------------------------
// Shared helpers used by the metadata decoders. Their definitions are not part
// of this header.
// NOTE(review): presumably implemented in arrow_schema_metadata_utils.cpp — confirm.
// ---------------------------------------------------------------------------

// Storage-type predicates; the decoders call these with ArrowSchema::format.
bool isIntegralArrowStorageType(const char* arrowType);
bool isFloatingArrowStorageType(const char* arrowType);
// Used by the decoders to normalise type names before comparing them against
// lower-case literals such as "decimal" or "fixed".
std::string toLower(std::string value);
// Builds an ArrowMetadataMap from ArrowSchema::metadata. Callers in this module
// check the pointer for null before calling.
ArrowMetadataMap readArrowMetadata(const char* metadata);
// String helpers used when parsing raw Snowflake type strings such as
// "NUMBER(12,4)".
std::string trim(std::string value);
std::vector<std::string> splitCommaSeparated(std::string value);
// Looks up a metadata entry; the result is empty when no value is available.
// The decoders pass lower-case keys ("data_type", "logicaltype", "precision",
// "scale").
// NOTE(review): presumably keys are matched case-insensitively or normalised on
// read — confirm against the implementation.
std::optional<std::string> getMetadataValue(const ArrowMetadataMap& metadata,
const std::string& key);
// Parses a decimal parameter; the decoders treat an empty result as a parse
// failure.
std::optional<uint32_t> tryParseUint32(const std::string& value);
// Validity check applied by every decoder to a parsed (precision, scale) pair
// before it is accepted.
bool isValidDecimalParameters(uint32_t precision, uint32_t scale);

// ---------------------------------------------------------------------------
// Decoders. Each returns std::nullopt when the metadata does not match the
// shape it looks for. tryGetArrowLogicalTypeInfo() tries them in the order in
// which they are declared here.
// ---------------------------------------------------------------------------

// Decodes the "data_type" entry when it holds a Snowflake decimal type string
// (number / numeric / decimal with one or two parenthesised arguments).
std::optional<ArrowLogicalTypeInfo> tryParseSnowflakeRawDataTypeInfo(
const ArrowMetadataMap& metadata);
// Decodes logicaltype=fixed together with "precision" and "scale" entries, for
// integer-backed or float-backed Arrow storage.
std::optional<ArrowLogicalTypeInfo> tryParseSnowflakeLogicalTypeInfo(const ArrowSchema* schema,
const ArrowMetadataMap& metadata);
// Decodes logicaltype=decimal/number/numeric together with "precision" and
// "scale" entries, for integer-backed Arrow storage only.
std::optional<ArrowLogicalTypeInfo> tryParseGenericIntegerBackedDecimalMetadata(
const ArrowSchema* schema, const ArrowMetadataMap& metadata);

} // namespace common
} // namespace lbug
86 changes: 86 additions & 0 deletions src/common/arrow/arrow_schema_metadata_snowflake_decoder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#include "arrow_schema_metadata_internal.h"

namespace lbug {
namespace common {

namespace {

// Parses a raw Snowflake type string of the form NAME(precision[, scale]).
// NAME must be "number", "numeric" or "decimal"; the input is trimmed and
// lower-cased before matching. A missing scale defaults to 0.
// Returns std::nullopt when the parentheses are missing, the name is not a
// decimal type, the argument count is not 1 or 2, an argument does not parse as
// uint32, or the (precision, scale) pair fails isValidDecimalParameters.
std::optional<ArrowDecimalTypeInfo> tryParseSnowflakeDecimalType(const std::string& rawDataType) {
    const auto lowered = toLower(trim(rawDataType));
    const auto lparen = lowered.find('(');
    if (lparen == std::string::npos) {
        return std::nullopt;
    }
    const auto baseName = trim(lowered.substr(0, lparen));
    const bool isDecimalName =
        baseName == "number" || baseName == "numeric" || baseName == "decimal";
    if (!isDecimalName) {
        return std::nullopt;
    }
    const auto argsBegin = lparen + 1;
    const auto rparen = lowered.find(')', argsBegin);
    if (rparen == std::string::npos) {
        return std::nullopt;
    }
    const auto parts = splitCommaSeparated(lowered.substr(argsBegin, rparen - argsBegin));
    if (parts.size() != 1 && parts.size() != 2) {
        return std::nullopt;
    }
    const auto precisionValue = tryParseUint32(parts[0]);
    if (!precisionValue.has_value()) {
        return std::nullopt;
    }
    // A single argument means only the precision was given; scale is then 0.
    const auto scaleValue =
        parts.size() == 2 ? tryParseUint32(parts[1]) : std::optional<uint32_t>{0};
    if (!scaleValue.has_value()) {
        return std::nullopt;
    }
    if (!isValidDecimalParameters(*precisionValue, *scaleValue)) {
        return std::nullopt;
    }
    return ArrowDecimalTypeInfo{*precisionValue, *scaleValue};
}

} // namespace

// Decodes the "data_type" metadata entry as a Snowflake decimal type.
// Returns a SNOWFLAKE-sourced DECIMAL descriptor when the entry exists and
// parses as a decimal type string; otherwise returns std::nullopt.
std::optional<ArrowLogicalTypeInfo> tryParseSnowflakeRawDataTypeInfo(
    const ArrowMetadataMap& metadata) {
    const auto dataTypeEntry = getMetadataValue(metadata, "data_type");
    if (dataTypeEntry.has_value()) {
        if (const auto parsedDecimal = tryParseSnowflakeDecimalType(*dataTypeEntry);
            parsedDecimal.has_value()) {
            return ArrowLogicalTypeInfo{ArrowLogicalTypeInfo::Source::SNOWFLAKE,
                ArrowLogicalTypeInfo::Type::DECIMAL, *parsedDecimal};
        }
    }
    return std::nullopt;
}

// Decodes Snowflake logical decimal metadata (logicaltype = "fixed").
// A result is produced only when all of the following hold:
//   - the schema format is an integral or floating storage type,
//   - the "logicaltype" entry, lower-cased, equals "fixed",
//   - both "precision" and "scale" entries exist and parse as uint32,
//   - the parsed pair passes isValidDecimalParameters.
// Otherwise std::nullopt is returned.
std::optional<ArrowLogicalTypeInfo> tryParseSnowflakeLogicalTypeInfo(const ArrowSchema* schema,
    const ArrowMetadataMap& metadata) {
    const bool hasNumericStorage = isIntegralArrowStorageType(schema->format) ||
                                   isFloatingArrowStorageType(schema->format);
    if (!hasNumericStorage) {
        return std::nullopt;
    }
    const auto rawLogicalType = getMetadataValue(metadata, "logicaltype");
    if (!rawLogicalType.has_value()) {
        return std::nullopt;
    }
    if (toLower(*rawLogicalType) != "fixed") {
        return std::nullopt;
    }
    const auto rawPrecision = getMetadataValue(metadata, "precision");
    const auto rawScale = getMetadataValue(metadata, "scale");
    if (!(rawPrecision.has_value() && rawScale.has_value())) {
        return std::nullopt;
    }
    const auto precisionValue = tryParseUint32(*rawPrecision);
    const auto scaleValue = tryParseUint32(*rawScale);
    if (!precisionValue.has_value() || !scaleValue.has_value()) {
        return std::nullopt;
    }
    if (!isValidDecimalParameters(*precisionValue, *scaleValue)) {
        return std::nullopt;
    }
    return ArrowLogicalTypeInfo{ArrowLogicalTypeInfo::Source::SNOWFLAKE,
        ArrowLogicalTypeInfo::Type::DECIMAL, ArrowDecimalTypeInfo{*precisionValue, *scaleValue}};
}

} // namespace common
} // namespace lbug
Loading
Loading