Adding template_parts as a field in the template class

jzbahrai · jzbahrai · commit 607d4ef2e4b0 · 2026-01-29T17:43:02.000Z
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -1,4 +1,4 @@
-FROM mcr.microsoft.com/vscode/devcontainers/python:3.12
+FROM mcr.microsoft.com/vscode/devcontainers/python:3.12-bookworm
 
 ENV POETRY_VERSION="1.7.1"
 ENV POETRY_VENV_PATH="/home/vscode/.venv/workspace"
diff --git a/.github/actions/waffles/requirements.txt b/.github/actions/waffles/requirements.txt
@@ -2,4 +2,4 @@ docopt==0.6.2
 Flask==2.3.3
 markupsafe==2.1.5
 setuptools==78.1.1 # required for distutils in Python 3.12
-git+https://github.com/cds-snc/notifier-utils.git@53.2.13#egg=notifications-utils
+git+https://github.com/cds-snc/notifier-utils.git@53.2.14#egg=notifications-utils
diff --git a/notifications_utils/sanitise_text.py b/notifications_utils/sanitise_text.py
@@ -100,7 +100,7 @@ class SanitiseSMS(SanitiseText):
 
     # Welsh characters not already included in GSM
     WELSH_NON_GSM_CHARACTERS = set("ÂâÊêÎîÔôÛûŴŵŶŷ")
-    FRENCH_NON_GSM_CHARACTESR = set("ÀÂËÎÏÔŒÙÛâçêëîïôœû")
+    FRENCH_NON_GSM_CHARACTERS = set("ÀÂËÎÏÔŒÙÛâçêëîïôœû")
     INUKTITUK_CHARACTERS = set(
         "ᐁᐯᑌᑫᕴᒉᒣᓀᓭᓓᔦᑦᔦᕓᕂᙯᖅᑫᙰᐃᐱᑎᑭᕵᒋᒥᓂᓯ𑪶𑪰ᓕᔨᑦᔨᖨᕕᕆᕿᖅᑭᖏᙱᖠᐄᐲᑏᑮᕶᒌᒦᓃᓰ𑪷𑪱ᓖᔩᑦᔩᖩᕖᕇᖀᖅᑮᖐᙲᖡᐅᐳᑐᑯᕷᒍᒧᓄᓱ𑪸𑪲ᓗᔪᑦᔪᖪᕗᕈᖁᖅᑯᖑᙳᖢᐊᐸᑕᑲᕹᒐᒪᓇᓴ𑪺𑪴ᓚᔭᑦᔭᖬᕙᕋᖃᖅᑲᖓᙵᖤᑉᑦᒃᕻᒡᒻᓐᔅᓪᔾᑦᔾᖮᕝᕐᖅᖅᒃᖕᖖᖦᖯᕼᑊ"  # noqa: E501
     )
@@ -116,7 +116,7 @@ class SanitiseSMS(SanitiseText):
             + "^{}\\[~]|€"  # character set extension
         )
         | WELSH_NON_GSM_CHARACTERS
-        | FRENCH_NON_GSM_CHARACTESR
+        | FRENCH_NON_GSM_CHARACTERS
         | INUKTITUK_CHARACTERS
         | CREE_CHARACTERS
         | OJIBWE_CHARACTERS
diff --git a/notifications_utils/template.py b/notifications_utils/template.py
@@ -225,6 +225,17 @@ def is_message_too_long(self):
     def is_name_too_long(self):
         return len(self.name) > self.NAME_CHAR_LIMIT
 
+    def message_parts(self):
+        content_with_placeholders = str(self)
+        character_count = self.content_count
+        unicode = is_unicode(content_with_placeholders)
+        fragment_count = get_sms_fragment_count(character_count, unicode)
+        return {
+            "character_count": character_count,
+            "fragment_count": fragment_count,
+            "unicode": unicode,
+        }
+
 
 class SMSPreviewTemplate(SMSMessageTemplate):
     def __init__(
@@ -803,7 +814,7 @@ def get_sms_fragment_count(character_count, is_unicode):
 
 
 def is_unicode(content):
-    return set(content) & set(SanitiseSMS.WELSH_NON_GSM_CHARACTERS)
+    return set(content) & (set(SanitiseSMS.WELSH_NON_GSM_CHARACTERS) | set(SanitiseSMS.FRENCH_NON_GSM_CHARACTERS))
 
 
 def get_html_email_body(template_content, template_values, redact_missing_personalisation=False, html="escape"):
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "notifications-utils"
-version = "53.2.13"
+version = "53.2.14"
 description = "Shared python code for Notification - Provides logging utils etc."
 authors = ["Canadian Digital Service"]
 license = "MIT license"
diff --git a/tests/test_template.py b/tests/test_template.py
@@ -1,6 +1,6 @@
 import pytest
 from bs4 import BeautifulSoup
-from notifications_utils.template import get_html_email_body
+from notifications_utils.template import SMSMessageTemplate, get_html_email_body
 
 
 def test_lang_tags_in_templates():
@@ -198,3 +198,198 @@ def test_rtl_tags_work_with_other_features(self, content: str, extra_tag: str):
         assert '<div dir="rtl">' in html
         assert "RTL CONTENT" in html
         assert "<{}".format(extra_tag) in html
+
+
+class TestTemplateParts:
+    def test_message_parts_basic(self):
+        template = {"content": "Hello world", "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 11
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == set()  # Empty set for non-unicode
+
+    def test_message_parts_with_unicode(self):
+        # Welsh character 'â' triggers unicode ('â' is 2 bytes in UTF-8)
+        template = {"content": "Helo byd â", "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 11  # 9 ASCII bytes + 2 bytes for 'â' = 11 total bytes
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == {"â"}  # Set containing unicode char
+
+    def test_message_parts_long_non_unicode_single_fragment(self):
+        # 160 bytes is the limit for single non-unicode SMS
+        template = {"content": "a" * 160, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 160
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == set()  # Empty set for non-unicode
+
+    def test_message_parts_long_non_unicode_multiple_fragments(self):
+        # 161 bytes triggers multi-part SMS (153 bytes per fragment)
+        template = {"content": "a" * 161, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 161
+        assert parts["fragment_count"] == 2
+        assert parts["unicode"] == set()  # Empty set for non-unicode
+
+    def test_message_parts_long_unicode_single_fragment(self):
+        # 70 bytes is the limit for single unicode SMS ('â' is 2 bytes each)
+        template = {"content": "â" * 35, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 70  # 35 chars * 2 bytes each
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == {"â"}  # Set containing unicode char
+
+    def test_message_parts_long_unicode_multiple_fragments(self):
+        # Anything over 70 bytes triggers multi-part unicode SMS (67 bytes per fragment); 36 'â' = 72 bytes
+        template = {"content": "â" * 36, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 72  # 36 chars * 2 bytes each
+        assert parts["fragment_count"] == 2
+        assert parts["unicode"] == {"â"}  # Set containing unicode char
+
+    def test_message_parts_with_placeholders(self):
+        template = {"content": "Hello ((name))", "template_type": "sms"}
+        sms = SMSMessageTemplate(template, values={"name": "Alice"})
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 11  # "Hello Alice"
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == set()  # Empty set for non-unicode
+
+    def test_message_parts_with_unicode_placeholder(self):
+        template = {"content": "Hello ((name))", "template_type": "sms"}
+        sms = SMSMessageTemplate(template, values={"name": "Siân"})
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 11  # "Hello Siân" (â is 2 bytes)
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == {"â"}  # Set containing unicode char
+
+    def test_message_parts_with_prefix(self):
+        template = {"content": "Hello world", "template_type": "sms"}
+        sms = SMSMessageTemplate(template, prefix="Service")
+        parts = sms.message_parts()
+
+        # "Service: Hello world" = 20 bytes
+        assert parts["character_count"] == 20
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == set()  # Empty set for non-unicode
+
+    def test_message_parts_with_prefix_hidden(self):
+        template = {"content": "Hello world", "template_type": "sms"}
+        sms = SMSMessageTemplate(template, prefix="Service", show_prefix=False)
+        parts = sms.message_parts()
+
+        # Prefix not shown, so just "Hello world" = 11 bytes
+        assert parts["character_count"] == 11
+        assert parts["fragment_count"] == 1
+        assert parts["unicode"] == set()  # Empty set for non-unicode
+
+    @pytest.mark.parametrize(
+        "content, byte_count, fragment_count, has_unicode",
+        [
+            # Non-unicode: single fragment up to 160 bytes, then 153 bytes per fragment
+            ("a" * 160, 160, 1, False),
+            ("a" * 161, 161, 2, False),
+            ("a" * 306, 306, 2, False),
+            ("a" * 307, 307, 3, False),
+            # Unicode: single fragment up to 70 bytes, then 67 bytes per fragment
+            # 'â' is 2 bytes in UTF-8
+            ("â" * 35, 70, 1, True),  # 35 chars * 2 = 70 bytes
+            ("â" * 36, 72, 2, True),  # 36 chars * 2 = 72 bytes (>70)
+            ("â" * 67, 134, 2, True),  # 67 chars * 2 = 134 bytes
+            ("â" * 68, 136, 3, True),  # 68 chars * 2 = 136 bytes (>134)
+        ],
+    )
+    def test_message_parts_fragment_boundaries(self, content, byte_count, fragment_count, has_unicode):
+        template = {"content": content, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == byte_count
+        assert parts["fragment_count"] == fragment_count
+        # Check if unicode set is empty or not
+        assert bool(parts["unicode"]) == has_unicode
+
+    def test_message_parts_with_multiple_unicode_chars_near_250_bytes(self):
+        # Test with 4 different French non-GSM unicode characters (each 2 bytes in UTF-8)
+        # Using: â, ê, î, ô from FRENCH_NON_GSM_CHARACTERS
+        # Unicode SMS fragments: 70 bytes for single, then 67 bytes per fragment
+        # Once a message exceeds 70 bytes it becomes multi-part at 67 bytes per fragment, so 4 fragments hold up to 268 bytes
+        # Boundary: 201 bytes = 3 fragments, 202 bytes = 4 fragments
+
+        # Create content with mix of 4 French non-GSM unicode characters: â, ê, î, ô
+        # Each is 2 bytes, so we need 100 chars total = 200 bytes (just under boundary)
+        content_200_bytes = "âêîô" * 25  # 4 chars * 25 = 100 chars * 2 bytes = 200 bytes
+        template = {"content": content_200_bytes, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 200
+        assert parts["fragment_count"] == 3  # 200 bytes = 3 fragments (67*2 = 134, need 3rd for remaining 66)
+        assert len(parts["unicode"]) == 4  # 4 different non-GSM chars
+
+        # Now add one more unicode char to cross the boundary to 202 bytes
+        content_202_bytes = content_200_bytes + "â"  # +2 bytes = 202 bytes total
+        template = {"content": content_202_bytes, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 202
+        assert parts["fragment_count"] == 4  # 202 bytes crosses boundary, needs 4 fragments
+        assert len(parts["unicode"]) == 4  # Still 4 different non-GSM chars
+
+        # Test at exactly 250 bytes (still in 4-fragment range: 202-268 bytes)
+        # Need 125 chars * 2 bytes = 250 bytes
+        content_250_bytes = "âêîô" * 31 + "âê"  # (4*31 + 2) = 126 chars * 2 bytes = 252 bytes
+        # The 126-char value above is 252 bytes and is overwritten below: 125 chars = 250 bytes exactly
+        content_250_bytes = "âêîô" * 31 + "â"  # (4*31 + 1) = 125 chars * 2 bytes = 250 bytes
+        template = {"content": content_250_bytes, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        assert parts["character_count"] == 250
+        assert parts["fragment_count"] == 4  # 250 bytes = 4 fragments
+        assert len(parts["unicode"]) == 4  # 4 different non-GSM chars
+
+    def test_message_parts_with_multiple_unicode_chars(self):
+        # Real-world bilingual emergency test message with French accented characters
+        content = (
+            "NB- xxxxxxxx, 120 Harbourview Blvd: This is a test for the xxxxxxxx employees, "
+            "and no action is required from you at this time. The purpose of this exercise is "
+            "to ensure that our emergency communication system is functioning properly and that "
+            "everyone is familiar with the process.\n"
+            "Ceci est uniquement un test pour les employés xx xxxxxxxx et aucune action n'est "
+            "requise de votre part pour le moment. L'objectif de cet exercice est de s'assurer "
+            "que notre système de communication d'urgence fonctionne correctement et que chacun "
+            "connaît la procédure."
+        )
+
+        template = {"content": content, "template_type": "sms"}
+        sms = SMSMessageTemplate(template)
+        parts = sms.message_parts()
+
+        # Verify it's detected as unicode (has French accented characters)
+        assert len(parts["unicode"]) > 0
+
+        # Content is large enough to require multiple SMS fragments
+        # With unicode, a single SMS holds up to 70 bytes; multi-part messages hold 67 bytes per fragment
+        assert parts["character_count"] > 500  # Should be around 580+ bytes
+        assert parts["fragment_count"] >= 9  # Should need multiple fragments
+
+        # Verify specific unicode characters from French text
+        french_unicode_chars = {"î"}
+        assert french_unicode_chars.issubset(parts["unicode"])

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM mcr.microsoft.com/vscode/devcontainers/python:3.12`
	`1`	`+FROM mcr.microsoft.com/vscode/devcontainers/python:3.12-bookworm`
`2`	`2`
`3`	`3`	`ENV POETRY_VERSION="1.7.1"`
`4`	`4`	`ENV POETRY_VENV_PATH="/home/vscode/.venv/workspace"`