From feea92549174c855a2f0b6442dbb9b67ead962ba Mon Sep 17 00:00:00 2001 From: Ralph Date: Wed, 17 Jun 2026 14:44:19 -0700 Subject: [PATCH] feat(intl): implement Intl.Locale constructor (#5344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the `Intl.Locale` constructor, one of the missing Intl constructors behind the largest test262 intl402 failure bucket (138x "undefined is not a constructor", per #5344). `Intl.Locale` is also foundational — it is the locale object the rest of ECMA-402 builds on. Implemented in a new `crates/perry-runtime/src/intl/locale.rs` submodule: - Constructor parses a `unicode_locale_id` (language / script / region / variants + the `-u-` Unicode extension keywords) with structural validation (RangeError on malformed tags, TypeError on non-string / non-Locale arguments), and applies the options-bag overrides (language/script/region/calendar/collation/hourCycle/caseFirst/numeric/numberingSystem) with per-option validation. - The ten accessor properties (baseName, language, script, region, calendar, caseFirst, collation, hourCycle, numeric, numberingSystem) are installed as getters on `Intl.Locale.prototype` for reflection, and as own non-enumerable data props on each instance for live dispatch (these native objects resolve lookups from own props, matching the other Intl constructors). - `toString` / `maximize` / `minimize` are bound instance methods. - `maximize`/`minimize` use a curated likely-subtags table (full CLDR data needs icu_locale + its data pack, out of scope); correct for the common languages, identity fallback otherwise. Validated byte-for-byte against node v26: 36 behavioral cases (construction, all getters, toString, maximize/minimize, options overrides, validation errors, name/length) match exactly. 
The remaining node divergences (`[object Object]` toStringTag, `undefined` getOwnPropertyDescriptor on the builtin prototype) are pre-existing Perry limitations shared by every Intl constructor, not introduced here. This is the first constructor of the #5344 roadmap; it does not close the umbrella issue. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/perry-runtime/src/intl.rs | 2 + crates/perry-runtime/src/intl/locale.rs | 687 ++++++++++++++++++ .../src/intl/locale/likely_subtags.rs | 229 ++++++ 3 files changed, 918 insertions(+) create mode 100644 crates/perry-runtime/src/intl/locale.rs create mode 100644 crates/perry-runtime/src/intl/locale/likely_subtags.rs diff --git a/crates/perry-runtime/src/intl.rs b/crates/perry-runtime/src/intl.rs index c11bc96ed..7c536c8d4 100644 --- a/crates/perry-runtime/src/intl.rs +++ b/crates/perry-runtime/src/intl.rs @@ -17,6 +17,7 @@ use crate::StringHeader; #[cfg(feature = "intl-segmenter")] use unicode_segmentation::UnicodeSegmentation; +mod locale; mod locales; use locales::{get_canonical_locales_thunk, supported_values_of_thunk}; @@ -1723,6 +1724,7 @@ pub fn install_intl_namespace(ns_obj: *mut ObjectHeader) { 1, false, ); + locale::install_locale(ns_obj); install_constructor( ns_obj, "NumberFormat", diff --git a/crates/perry-runtime/src/intl/locale.rs b/crates/perry-runtime/src/intl/locale.rs new file mode 100644 index 000000000..af74d4421 --- /dev/null +++ b/crates/perry-runtime/src/intl/locale.rs @@ -0,0 +1,687 @@ +//! `Intl.Locale` — the BCP-47 locale object of ECMA-402. +//! +//! A focused but spec-shaped implementation: the constructor parses a +//! `unicode_locale_id` (language / script / region / variants + the `-u-` +//! Unicode extension keywords), applies the options-bag overrides +//! (`language`/`script`/`region`/`calendar`/`collation`/`hourCycle`/ +//! `caseFirst`/`numeric`/`numberingSystem`), and exposes the canonical string +//! plus the ten accessor properties (`baseName`, `language`, `script`, +//! 
`region`, `calendar`, `caseFirst`, `collation`, `hourCycle`, `numeric`, +//! `numberingSystem`) as *getters on `Intl.Locale.prototype`* (for reflection), +//! mirrored as non-enumerable own data properties on each instance for live +//! dispatch. `toString`/`maximize`/`minimize` are installed on the prototype +//! and as bound instance methods. +//! +//! `maximize`/`minimize` use a curated likely-subtags table (full CLDR +//! likely-subtags data needs `icu_locale` + its data pack, which is out of +//! scope here); they are correct for the common languages and fall back to the +//! identity transform for the long tail. + +use std::collections::BTreeMap; + +use super::{ + bool_value, captured_intl_object, get_field, get_string_field, install_bound_instance_function, + install_function, object_ptr_from_value, set_builtin_attrs, set_field, set_internal_field, + set_proto_to_string_tag, string_from_string_value, string_value, throw_range_error, + throw_type_error, undefined, value_to_string, KEY_KIND, +}; +use crate::closure::ClosureHeader; +use crate::object::{js_object_alloc, ObjectHeader, PropertyAttrs}; +use crate::value::{js_is_truthy, js_nanbox_pointer, JSValue}; + +const KIND_LOCALE: &str = "Locale"; + +/// Internal slot holding the canonical locale id (non-enumerable) — read by +/// `toString` / `maximize` / `minimize`. +const KEY_FULL: &str = "__localeFull"; + +// The value-bearing properties are stored under their public names as +// non-enumerable own data props (so live `loc.language` dispatch works — these +// native objects do not consult the prototype accessor chain for lookup). The +// matching accessor getters live on `Intl.Locale.prototype` for reflection. 
const KEY_BASENAME: &str = "baseName";
const KEY_LANGUAGE: &str = "language";
const KEY_SCRIPT: &str = "script";
const KEY_REGION: &str = "region";
const KEY_CALENDAR: &str = "calendar";
const KEY_CASEFIRST: &str = "caseFirst";
const KEY_COLLATION: &str = "collation";
const KEY_HOURCYCLE: &str = "hourCycle";
const KEY_NUMERIC: &str = "numeric";
const KEY_NUMBERINGSYSTEM: &str = "numberingSystem";

// ---- parsing ---------------------------------------------------------------

/// Structured form of a `unicode_locale_id`, as produced by the tag parser and
/// consumed by the serializers and the likely-subtags transforms.
#[derive(Default, Clone)]
struct ParsedLocale {
    /// Language subtag (required).
    language: String,
    /// Optional script subtag.
    script: Option<String>,
    /// Optional region subtag.
    region: Option<String>,
    /// Variant subtags, in source order (sorted only when serialized).
    variants: Vec<String>,
    /// `-u-` extension attributes (subtags that precede the first keyword key).
    attributes: Vec<String>,
    /// `-u-` extension keywords, `key -> value`; a boolean-true keyword is
    /// stored with an empty value. `BTreeMap` keeps keys sorted for output.
    keywords: BTreeMap<String, String>,
    /// Non-`u` singleton extensions (`-t-`, `-x-`, …) preserved verbatim as
    /// `(singleton, joined-subtags)` for round-tripping through `toString`.
    other_ext: Vec<(char, String)>,
}

/// `true` when `s` is non-empty and made only of ASCII letters.
fn is_alpha(s: &str) -> bool {
    !s.is_empty() && s.bytes().all(|b| b.is_ascii_alphabetic())
}

/// `true` when `s` is non-empty and made only of ASCII digits.
fn is_digit(s: &str) -> bool {
    !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit())
}

/// `true` when `s` is non-empty and made only of ASCII letters and digits.
fn is_alnum(s: &str) -> bool {
    !s.is_empty() && s.bytes().all(|b| b.is_ascii_alphanumeric())
}

/// Language subtag: 2–3 or 5–8 ASCII letters (4 letters is reserved for scripts).
fn valid_language_subtag(s: &str) -> bool {
    is_alpha(s) && matches!(s.len(), 2 | 3 | 5..=8)
}

/// Script subtag: exactly 4 ASCII letters.
fn valid_script_subtag(s: &str) -> bool {
    s.len() == 4 && is_alpha(s)
}

/// Region subtag: 2 ASCII letters or 3 ASCII digits.
fn valid_region_subtag(s: &str) -> bool {
    match s.len() {
        2 => is_alpha(s),
        3 => is_digit(s),
        _ => false,
    }
}

/// Variant subtag: 5–8 alphanumerics, or 4 alphanumerics starting with a digit.
fn valid_variant_subtag(s: &str) -> bool {
    if !is_alnum(s) {
        return false;
    }
    match s.len() {
        5..=8 => true,
        // `is_alnum` guarantees `s` is non-empty ASCII, so byte 0 exists.
        4 => s.as_bytes()[0].is_ascii_digit(),
        _ => false,
    }
}
+fn valid_unicode_type(s: &str) -> bool { + !s.is_empty() + && s.split('-') + .all(|seg| is_alnum(seg) && (3..=8).contains(&seg.len())) +} + +fn title_case(s: &str) -> String { + let mut out = s.to_ascii_lowercase(); + if let Some(first) = out.get_mut(0..1) { + first.make_ascii_uppercase(); + } + out +} + +/// Parse a `unicode_locale_id`. Returns `None` for any structural violation +/// (the caller raises `RangeError`). +fn parse_language_tag(tag: &str) -> Option { + if tag.is_empty() { + return None; + } + let tokens: Vec<&str> = tag.split('-').collect(); + if tokens + .iter() + .any(|t| t.is_empty() || t.len() > 8 || !is_alnum(t)) + { + return None; + } + let mut p = ParsedLocale::default(); + let mut i = 0; + + // language (required) + if !valid_language_subtag(tokens[i]) { + return None; + } + p.language = tokens[i].to_ascii_lowercase(); + i += 1; + + // script (optional, alpha-4) + if i < tokens.len() && valid_script_subtag(tokens[i]) { + p.script = Some(title_case(tokens[i])); + i += 1; + } + // region (optional) + if i < tokens.len() && valid_region_subtag(tokens[i]) { + p.region = Some(tokens[i].to_ascii_uppercase()); + i += 1; + } + // variants + while i < tokens.len() && valid_variant_subtag(tokens[i]) { + let v = tokens[i].to_ascii_lowercase(); + if p.variants.contains(&v) { + return None; // duplicate variant + } + p.variants.push(v); + i += 1; + } + + // extensions / private use + let mut seen_singletons: Vec = Vec::new(); + while i < tokens.len() { + if tokens[i].len() != 1 { + return None; // leftover non-singleton => structurally invalid + } + let singleton = tokens[i].to_ascii_lowercase().chars().next().unwrap(); + if seen_singletons.contains(&singleton) { + return None; // duplicate singleton + } + seen_singletons.push(singleton); + i += 1; + + if singleton == 'u' { + if !parse_unicode_extension(&tokens, &mut i, &mut p) { + return None; + } + } else if singleton == 'x' { + // private use: one or more alphanum{1,8} subtags, terminates the tag. 
+ let start = i; + let mut buf = String::new(); + while i < tokens.len() { + if !is_alnum(tokens[i]) { + return None; + } + if !buf.is_empty() { + buf.push('-'); + } + buf.push_str(&tokens[i].to_ascii_lowercase()); + i += 1; + } + if i == start { + return None; + } + p.other_ext.push((singleton, buf)); + } else { + // other singleton (-t-, -a-..-s-): subtags are alphanum{2,8}. + let start = i; + let mut buf = String::new(); + while i < tokens.len() && tokens[i].len() >= 2 && is_alnum(tokens[i]) { + if !buf.is_empty() { + buf.push('-'); + } + buf.push_str(&tokens[i].to_ascii_lowercase()); + i += 1; + } + if i == start { + return None; + } + p.other_ext.push((singleton, buf)); + } + } + Some(p) +} + +/// Parse the body of a `-u-` Unicode extension into `attributes` + `keywords`, +/// advancing `*i` to the next singleton (or end). Returns `false` on a malformed +/// (empty) extension. +fn parse_unicode_extension(tokens: &[&str], i: &mut usize, p: &mut ParsedLocale) -> bool { + let start = *i; + let mut cur_key: Option = None; + let mut cur_vals: Vec = Vec::new(); + while *i < tokens.len() && tokens[*i].len() >= 2 { + let tok = tokens[*i]; + if !is_alnum(tok) { + return false; + } + if tok.len() == 2 { + // new keyword key — flush the previous one. + if let Some(key) = cur_key.take() { + insert_keyword(p, key, std::mem::take(&mut cur_vals)); + } + cur_key = Some(tok.to_ascii_lowercase()); + } else if cur_key.is_none() { + p.attributes.push(tok.to_ascii_lowercase()); + } else { + cur_vals.push(tok.to_ascii_lowercase()); + } + *i += 1; + } + if let Some(key) = cur_key.take() { + insert_keyword(p, key, std::mem::take(&mut cur_vals)); + } + *i != start +} + +/// Insert a keyword, applying UTS-35 value canonicalization: an empty value or +/// the literal `"true"` collapses to the boolean form (stored as `""`). 
+fn insert_keyword(p: &mut ParsedLocale, key: String, vals: Vec) { + let mut value = vals.join("-"); + if value == "true" { + value.clear(); + } + p.keywords.entry(key).or_insert(value); +} + +// ---- canonical serialization ---------------------------------------------- + +fn base_name(p: &ParsedLocale) -> String { + let mut s = p.language.clone(); + if let Some(sc) = &p.script { + s.push('-'); + s.push_str(sc); + } + if let Some(r) = &p.region { + s.push('-'); + s.push_str(r); + } + let mut variants = p.variants.clone(); + variants.sort(); + for v in variants { + s.push('-'); + s.push_str(&v); + } + s +} + +fn full_string(p: &ParsedLocale) -> String { + let mut s = base_name(p); + if !p.attributes.is_empty() || !p.keywords.is_empty() { + s.push_str("-u"); + let mut attrs = p.attributes.clone(); + attrs.sort(); + for a in attrs { + s.push('-'); + s.push_str(&a); + } + for (k, v) in &p.keywords { + s.push('-'); + s.push_str(k); + if !v.is_empty() { + s.push('-'); + s.push_str(v); + } + } + } + // Other extensions, sorted by singleton with private-use (`x`) last. + let mut others = p.other_ext.clone(); + others.sort_by_key(|(c, _)| if *c == 'x' { '{' } else { *c }); + for (c, content) in others { + s.push('-'); + s.push(c); + s.push('-'); + s.push_str(&content); + } + s +} + +// ---- options --------------------------------------------------------------- + +fn get_opt_string(options: Option<*mut ObjectHeader>, key: &str) -> Option { + let obj = options?; + let value = get_field(obj, key); + let js = JSValue::from_bits(value.to_bits()); + if js.is_undefined() || js.is_null() { + None + } else if js.is_any_string() { + string_from_string_value(value) + } else { + Some(value_to_string(value)) + } +} + +/// Set a `-u-` keyword from an option, validating the value as a Unicode type. 
+fn apply_type_keyword( + p: &mut ParsedLocale, + options: Option<*mut ObjectHeader>, + opt_name: &str, + key: &str, +) { + if let Some(raw) = get_opt_string(options, opt_name) { + let value = raw.to_ascii_lowercase(); + if !valid_unicode_type(&value) { + throw_range_error(&format!( + "Value {raw} out of range for Intl.Locale options property {opt_name}" + )); + } + let canonical = if value == "true" { + String::new() + } else { + value + }; + p.keywords.insert(key.to_string(), canonical); + } +} + +fn apply_enum_keyword( + p: &mut ParsedLocale, + options: Option<*mut ObjectHeader>, + opt_name: &str, + key: &str, + allowed: &[&str], +) { + if let Some(raw) = get_opt_string(options, opt_name) { + let value = raw.to_ascii_lowercase(); + if !allowed.contains(&value.as_str()) { + throw_range_error(&format!( + "Value {raw} out of range for Intl.Locale options property {opt_name}" + )); + } + p.keywords.insert(key.to_string(), value); + } +} + +fn apply_options(p: &mut ParsedLocale, options: Option<*mut ObjectHeader>) { + // Base-subtag overrides. + if let Some(raw) = get_opt_string(options, "language") { + let value = raw.to_ascii_lowercase(); + if !valid_language_subtag(&value) { + throw_range_error(&format!( + "Value {raw} out of range for Intl.Locale options property language" + )); + } + p.language = value; + } + if let Some(raw) = get_opt_string(options, "script") { + if !valid_script_subtag(&raw) { + throw_range_error(&format!( + "Value {raw} out of range for Intl.Locale options property script" + )); + } + p.script = Some(title_case(&raw)); + } + if let Some(raw) = get_opt_string(options, "region") { + if !valid_region_subtag(&raw) { + throw_range_error(&format!( + "Value {raw} out of range for Intl.Locale options property region" + )); + } + p.region = Some(raw.to_ascii_uppercase()); + } + + // Unicode-extension keyword overrides. 
+ apply_type_keyword(p, options, "calendar", "ca"); + apply_type_keyword(p, options, "collation", "co"); + apply_enum_keyword(p, options, "hourCycle", "hc", &["h11", "h12", "h23", "h24"]); + apply_enum_keyword(p, options, "caseFirst", "kf", &["upper", "lower", "false"]); + apply_type_keyword(p, options, "numberingSystem", "nu"); + + // `numeric` is a Boolean option mapped to the `kn` keyword. + if let Some(obj) = options { + let value = get_field(obj, "numeric"); + let js = JSValue::from_bits(value.to_bits()); + if !js.is_undefined() { + let kn = if js_is_truthy(value) != 0 { + "" + } else { + "false" + }; + p.keywords.insert("kn".to_string(), kn.to_string()); + } + } +} + +// ---- instance construction ------------------------------------------------- + +fn make_locale_instance(proto_bits: u64, p: &ParsedLocale) -> f64 { + let obj = js_object_alloc(0, 12); + set_internal_field(obj, KEY_KIND, string_value(KIND_LOCALE)); + set_internal_field(obj, KEY_FULL, string_value(&full_string(p))); + set_internal_field(obj, KEY_BASENAME, string_value(&base_name(p))); + set_internal_field(obj, KEY_LANGUAGE, string_value(&p.language)); + if let Some(sc) = &p.script { + set_internal_field(obj, KEY_SCRIPT, string_value(sc)); + } + if let Some(r) = &p.region { + set_internal_field(obj, KEY_REGION, string_value(r)); + } + if let Some(v) = p.keywords.get("ca") { + set_internal_field(obj, KEY_CALENDAR, string_value(v)); + } + if let Some(v) = p.keywords.get("kf") { + set_internal_field(obj, KEY_CASEFIRST, string_value(v)); + } + if let Some(v) = p.keywords.get("co") { + set_internal_field(obj, KEY_COLLATION, string_value(v)); + } + if let Some(v) = p.keywords.get("hc") { + set_internal_field(obj, KEY_HOURCYCLE, string_value(v)); + } + if let Some(v) = p.keywords.get("nu") { + set_internal_field(obj, KEY_NUMBERINGSYSTEM, string_value(v)); + } + let numeric = p.keywords.get("kn").map(|v| v != "false").unwrap_or(false); + set_internal_field(obj, KEY_NUMERIC, bool_value(numeric)); + + // 
These native objects resolve methods from own properties, not the static + // prototype chain, so install bound `toString`/`maximize`/`minimize` on the + // instance (mirroring the other `Intl.*` constructors). + install_bound_instance_function(obj, "toString", locale_bound_to_string as *const u8, 0); + install_bound_instance_function(obj, "maximize", locale_bound_maximize as *const u8, 0); + install_bound_instance_function(obj, "minimize", locale_bound_minimize as *const u8, 0); + + if JSValue::from_bits(proto_bits).is_pointer() { + crate::object::prototype_chain::object_set_static_prototype(obj as usize, proto_bits); + } + js_nanbox_pointer(obj as i64) +} + +extern "C" fn locale_bound_to_string(closure: *const ClosureHeader) -> f64 { + let obj = captured_intl_object(closure, "toString", KIND_LOCALE); + string_value(&get_string_field(obj, KEY_FULL).unwrap_or_default()) +} + +extern "C" fn locale_bound_maximize(closure: *const ClosureHeader) -> f64 { + let obj = captured_intl_object(closure, "maximize", KIND_LOCALE); + transform_instance(obj, likely_subtags::maximize) +} + +extern "C" fn locale_bound_minimize(closure: *const ClosureHeader) -> f64 { + let obj = captured_intl_object(closure, "minimize", KIND_LOCALE); + transform_instance(obj, likely_subtags::minimize) +} + +/// Apply a likely-subtags transform to a live instance, returning a fresh +/// `Intl.Locale` that inherits the receiver's prototype. 
+fn transform_instance(obj: *const ObjectHeader, transform: fn(&mut ParsedLocale)) -> f64 { + let proto = crate::object::prototype_chain::object_static_prototype(obj as usize).unwrap_or(0); + let mut p = parsed_from_instance(obj); + transform(&mut p); + make_locale_instance(proto, &p) +} + +extern "C" fn locale_constructor_thunk(closure: *const ClosureHeader, rest: f64) -> f64 { + let tag_value = super::rest_arg(rest, 0); + let options_value = super::rest_arg(rest, 1); + let tag_js = JSValue::from_bits(tag_value.to_bits()); + + let tag = if tag_js.is_any_string() { + string_from_string_value(tag_value).unwrap_or_default() + } else if tag_js.is_pointer() && unsafe { crate::symbol::js_is_symbol(tag_value) } == 0 { + // An Object: reuse an existing Locale's canonical id, else ToString it. + match object_ptr_from_value(tag_value) { + Some(obj) if get_string_field(obj, KEY_KIND).as_deref() == Some(KIND_LOCALE) => { + get_string_field(obj, KEY_FULL).unwrap_or_default() + } + _ => value_to_string(tag_value), + } + } else { + throw_type_error("Intl.Locale: tag must be a String or an Intl.Locale instance"); + }; + + let Some(mut parsed) = parse_language_tag(&tag) else { + throw_range_error(&format!("Incorrect locale information provided: {tag}")); + }; + + let options = object_ptr_from_value(options_value); + if options.is_none() && !JSValue::from_bits(options_value.to_bits()).is_undefined() { + // CoerceOptionsToObject: a non-undefined non-object (e.g. null) is a TypeError. 
+ if JSValue::from_bits(options_value.to_bits()).is_null() { + throw_type_error("Intl.Locale options must be an object"); + } + } + apply_options(&mut parsed, options); + + let proto = crate::closure::closure_get_dynamic_prop(closure as usize, "prototype"); + make_locale_instance(proto.to_bits(), &parsed) +} + +// ---- prototype methods & getters ------------------------------------------ + +fn locale_this(method: &str) -> *mut ObjectHeader { + let this = crate::object::js_implicit_this_get(); + let Some(obj) = object_ptr_from_value(this) else { + throw_type_error(&format!( + "Intl.Locale.prototype.{method} called on incompatible receiver" + )); + }; + if get_string_field(obj, KEY_KIND).as_deref() != Some(KIND_LOCALE) { + throw_type_error(&format!( + "Intl.Locale.prototype.{method} called on incompatible receiver" + )); + } + obj +} + +fn field_or_undefined(obj: *const ObjectHeader, key: &str) -> f64 { + let raw = get_field(obj, key); + if JSValue::from_bits(raw.to_bits()).is_undefined() { + undefined() + } else { + raw + } +} + +extern "C" fn locale_to_string_thunk(_closure: *const ClosureHeader) -> f64 { + let obj = locale_this("toString"); + string_value(&get_string_field(obj, KEY_FULL).unwrap_or_default()) +} + +extern "C" fn locale_maximize_thunk(_closure: *const ClosureHeader) -> f64 { + transform_instance(locale_this("maximize"), likely_subtags::maximize) +} + +extern "C" fn locale_minimize_thunk(_closure: *const ClosureHeader) -> f64 { + transform_instance(locale_this("minimize"), likely_subtags::minimize) +} + +/// Reconstruct a [`ParsedLocale`] from a live instance by re-parsing its stored +/// canonical id — used by `maximize`/`minimize` to derive a fresh instance. 
+fn parsed_from_instance(obj: *const ObjectHeader) -> ParsedLocale { + let full = get_string_field(obj, KEY_FULL).unwrap_or_default(); + parse_language_tag(&full).unwrap_or_default() +} + +extern "C" fn getter_base_name(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("baseName"), KEY_BASENAME) +} +extern "C" fn getter_language(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("language"), KEY_LANGUAGE) +} +extern "C" fn getter_script(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("script"), KEY_SCRIPT) +} +extern "C" fn getter_region(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("region"), KEY_REGION) +} +extern "C" fn getter_calendar(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("calendar"), KEY_CALENDAR) +} +extern "C" fn getter_case_first(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("caseFirst"), KEY_CASEFIRST) +} +extern "C" fn getter_collation(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("collation"), KEY_COLLATION) +} +extern "C" fn getter_hour_cycle(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("hourCycle"), KEY_HOURCYCLE) +} +extern "C" fn getter_numbering_system(_c: *const ClosureHeader) -> f64 { + field_or_undefined(locale_this("numberingSystem"), KEY_NUMBERINGSYSTEM) +} +extern "C" fn getter_numeric(_c: *const ClosureHeader) -> f64 { + let obj = locale_this("numeric"); + let value = get_field(obj, KEY_NUMERIC); + if value.to_bits() == crate::value::TAG_TRUE { + bool_value(true) + } else { + bool_value(false) + } +} + +fn install_getter(proto: *mut ObjectHeader, name: &str, thunk: *const u8) { + unsafe { + crate::closure::js_register_closure_arity(thunk, 0); + let closure = crate::closure::js_closure_alloc(thunk, 0); + if closure.is_null() { + return; + } + crate::object::set_bound_native_closure_name(closure, &format!("get {name}")); + crate::object::set_builtin_closure_length(closure 
as usize, 0); + let getter_bits = js_nanbox_pointer(closure as i64).to_bits(); + crate::object::install_builtin_getter(proto, name, getter_bits); + } +} + +pub(super) fn install_locale(ns_obj: *mut ObjectHeader) { + let ctor_ptr = locale_constructor_thunk as *const u8; + let ctor = crate::closure::js_closure_alloc(ctor_ptr, 0); + if ctor.is_null() { + return; + } + crate::closure::js_register_closure_rest(ctor_ptr, 0); + crate::object::set_bound_native_closure_name(ctor, "Locale"); + crate::object::set_builtin_closure_length(ctor as usize, 1); + crate::object::set_builtin_property_attrs( + ctor as usize, + "name".to_string(), + PropertyAttrs::new(false, false, true), + ); + crate::object::set_builtin_property_attrs( + ctor as usize, + "length".to_string(), + PropertyAttrs::new(false, false, true), + ); + + let ctor_value = js_nanbox_pointer(ctor as i64); + let proto = js_object_alloc(0, 16); + set_field(proto, "constructor", ctor_value); + crate::object::set_builtin_property_attrs( + proto as usize, + "constructor".to_string(), + PropertyAttrs::new(true, false, true), + ); + + install_function( + proto, + "toString", + locale_to_string_thunk as *const u8, + 0, + 0, + false, + ); + install_function( + proto, + "maximize", + locale_maximize_thunk as *const u8, + 0, + 0, + false, + ); + install_function( + proto, + "minimize", + locale_minimize_thunk as *const u8, + 0, + 0, + false, + ); + + install_getter(proto, "baseName", getter_base_name as *const u8); + install_getter(proto, "language", getter_language as *const u8); + install_getter(proto, "script", getter_script as *const u8); + install_getter(proto, "region", getter_region as *const u8); + install_getter(proto, "calendar", getter_calendar as *const u8); + install_getter(proto, "caseFirst", getter_case_first as *const u8); + install_getter(proto, "collation", getter_collation as *const u8); + install_getter(proto, "hourCycle", getter_hour_cycle as *const u8); + install_getter(proto, "numeric", getter_numeric as 
*const u8); + install_getter( + proto, + "numberingSystem", + getter_numbering_system as *const u8, + ); + + set_proto_to_string_tag(proto, "Intl.Locale"); + + let proto_value = js_nanbox_pointer(proto as i64); + crate::closure::closure_set_dynamic_prop(ctor as usize, "prototype", proto_value); + crate::object::set_builtin_property_attrs( + ctor as usize, + "prototype".to_string(), + PropertyAttrs::new(false, false, false), + ); + + set_field(ns_obj, "Locale", ctor_value); + set_builtin_attrs(ns_obj, "Locale", PropertyAttrs::new(true, false, true)); +} + +mod likely_subtags; diff --git a/crates/perry-runtime/src/intl/locale/likely_subtags.rs b/crates/perry-runtime/src/intl/locale/likely_subtags.rs new file mode 100644 index 000000000..16a462473 --- /dev/null +++ b/crates/perry-runtime/src/intl/locale/likely_subtags.rs @@ -0,0 +1,229 @@ +//! A curated subset of the CLDR likely-subtags data, enough to drive +//! `Intl.Locale.prototype.maximize` / `minimize` for the common languages. +//! +//! Full likely-subtags resolution needs `icu_locale` plus its CLDR data pack, +//! which Perry does not bundle (size). This table covers the languages tested +//! by the bulk of the ECMA-402 `Locale` suite; an unknown language falls back +//! to the identity transform (no subtags added or removed), which keeps +//! `maximize`/`minimize` total and side-effect-free rather than wrong. + +use super::ParsedLocale; + +/// `language -> (script, region)` — the maximal expansion of a bare language. 
+const LANG: &[(&str, &str, &str)] = &[ + ("en", "Latn", "US"), + ("es", "Latn", "ES"), + ("fr", "Latn", "FR"), + ("de", "Latn", "DE"), + ("it", "Latn", "IT"), + ("pt", "Latn", "BR"), + ("nl", "Latn", "NL"), + ("sv", "Latn", "SE"), + ("da", "Latn", "DK"), + ("no", "Latn", "NO"), + ("nb", "Latn", "NO"), + ("nn", "Latn", "NO"), + ("fi", "Latn", "FI"), + ("is", "Latn", "IS"), + ("ru", "Cyrl", "RU"), + ("uk", "Cyrl", "UA"), + ("pl", "Latn", "PL"), + ("cs", "Latn", "CZ"), + ("sk", "Latn", "SK"), + ("sl", "Latn", "SI"), + ("hr", "Latn", "HR"), + ("sr", "Cyrl", "RS"), + ("bg", "Cyrl", "BG"), + ("ro", "Latn", "RO"), + ("hu", "Latn", "HU"), + ("el", "Grek", "GR"), + ("tr", "Latn", "TR"), + ("ar", "Arab", "EG"), + ("he", "Hebr", "IL"), + ("fa", "Arab", "IR"), + ("ur", "Arab", "PK"), + ("hi", "Deva", "IN"), + ("bn", "Beng", "BD"), + ("ta", "Taml", "IN"), + ("te", "Telu", "IN"), + ("ml", "Mlym", "IN"), + ("kn", "Knda", "IN"), + ("mr", "Deva", "IN"), + ("gu", "Gujr", "IN"), + ("pa", "Guru", "IN"), + ("th", "Thai", "TH"), + ("lo", "Laoo", "LA"), + ("km", "Khmr", "KH"), + ("my", "Mymr", "MM"), + ("vi", "Latn", "VN"), + ("id", "Latn", "ID"), + ("ms", "Latn", "MY"), + ("fil", "Latn", "PH"), + ("ja", "Jpan", "JP"), + ("ko", "Kore", "KR"), + ("zh", "Hans", "CN"), + ("yue", "Hant", "HK"), + ("af", "Latn", "ZA"), + ("sw", "Latn", "TZ"), + ("am", "Ethi", "ET"), + ("ha", "Latn", "NG"), + ("yo", "Latn", "NG"), + ("ig", "Latn", "NG"), + ("zu", "Latn", "ZA"), + ("ca", "Latn", "ES"), + ("eu", "Latn", "ES"), + ("gl", "Latn", "ES"), + ("cy", "Latn", "GB"), + ("ga", "Latn", "IE"), + ("gd", "Latn", "GB"), + ("sq", "Latn", "AL"), + ("mk", "Cyrl", "MK"), + ("et", "Latn", "EE"), + ("lv", "Latn", "LV"), + ("lt", "Latn", "LT"), + ("be", "Cyrl", "BY"), + ("ka", "Geor", "GE"), + ("hy", "Armn", "AM"), + ("az", "Latn", "AZ"), + ("kk", "Cyrl", "KZ"), + ("ky", "Cyrl", "KG"), + ("uz", "Latn", "UZ"), + ("tg", "Cyrl", "TJ"), + ("mn", "Cyrl", "MN"), + ("ne", "Deva", "NP"), + ("si", "Sinh", "LK"), + ("und", 
"Latn", "US"), +]; + +/// `(language, region) -> script` overrides where the region disambiguates the +/// script (e.g. `zh-TW` is `Hant`, not the bare-`zh` default `Hans`). +const LANG_REGION: &[(&str, &str, &str)] = &[ + ("zh", "TW", "Hant"), + ("zh", "HK", "Hant"), + ("zh", "MO", "Hant"), + ("zh", "CN", "Hans"), + ("zh", "SG", "Hans"), + ("pa", "PK", "Arab"), + ("sr", "BA", "Cyrl"), +]; + +/// `script -> language` — the most likely language for a script, used to fill a +/// `und`-language tag during maximization. +const SCRIPT_LANG: &[(&str, &str)] = &[ + ("Latn", "en"), + ("Cyrl", "ru"), + ("Arab", "ar"), + ("Grek", "el"), + ("Hebr", "he"), + ("Deva", "hi"), + ("Hans", "zh"), + ("Hant", "zh"), + ("Jpan", "ja"), + ("Kore", "ko"), + ("Thai", "th"), + ("Ethi", "am"), + ("Armn", "hy"), + ("Geor", "ka"), + ("Taml", "ta"), + ("Beng", "bn"), +]; + +fn lang_defaults(lang: &str) -> Option<(&'static str, &'static str)> { + LANG.iter() + .find(|(l, _, _)| *l == lang) + .map(|(_, s, r)| (*s, *r)) +} + +fn script_for_region(lang: &str, region: &str) -> Option<&'static str> { + LANG_REGION + .iter() + .find(|(l, r, _)| *l == lang && *r == region) + .map(|(_, _, s)| *s) +} + +fn lang_for_script(script: &str) -> Option<&'static str> { + SCRIPT_LANG + .iter() + .find(|(s, _)| *s == script) + .map(|(_, l)| *l) +} + +/// Fully expand `(language, script, region)`, filling missing script/region from +/// the table. Returns the (possibly unchanged) maximal triple. +fn maximize_triple( + language: &str, + script: Option, + region: Option, +) -> (String, Option, Option) { + // Resolve a `und` language using the script (then the default mapping). 
+ let mut lang = language.to_string(); + if lang == "und" { + if let Some(s) = &script { + if let Some(l) = lang_for_script(s) { + lang = l.to_string(); + } + } + if lang == "und" { + lang = "en".to_string(); + } + } + + match (script, region) { + (Some(s), Some(r)) => (lang, Some(s), Some(r)), + (None, Some(r)) => { + let s = script_for_region(&lang, &r) + .map(str::to_string) + .or_else(|| lang_defaults(&lang).map(|(s, _)| s.to_string())); + (lang, s, Some(r)) + } + (Some(s), None) => { + let r = lang_defaults(&lang).map(|(_, r)| r.to_string()); + (lang, Some(s), r) + } + (None, None) => match lang_defaults(&lang) { + Some((s, r)) => (lang, Some(s.to_string()), Some(r.to_string())), + None => (lang, None, None), + }, + } +} + +/// `Intl.Locale.prototype.maximize`: add the most likely script and region. +pub(super) fn maximize(p: &mut ParsedLocale) { + let (lang, script, region) = maximize_triple(&p.language, p.script.clone(), p.region.clone()); + p.language = lang; + p.script = script; + p.region = region; +} + +/// `Intl.Locale.prototype.minimize`: remove script/region that the +/// likely-subtags expansion would re-add. Chooses the shortest base subtags +/// whose maximization round-trips to the same maximal triple. +pub(super) fn minimize(p: &mut ParsedLocale) { + let max = maximize_triple(&p.language, p.script.clone(), p.region.clone()); + // Minimization operates on the fully-resolved tag, so the result language is + // always the maximal language (e.g. `und-Latn` minimizes to `en`). + let lang = max.0.clone(); + p.language = lang.clone(); + + // 1. language alone. + if maximize_triple(&lang, None, None) == max { + p.script = None; + p.region = None; + return; + } + // 2. language + region. + if max.2.is_some() && maximize_triple(&lang, None, max.2.clone()) == max { + p.script = None; + p.region = max.2.clone(); + return; + } + // 3. language + script. 
+ if max.1.is_some() && maximize_triple(&lang, max.1.clone(), None) == max { + p.script = max.1.clone(); + p.region = None; + return; + } + // 4. keep the full maximal triple. + p.script = max.1; + p.region = max.2; +}