Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/vscode/devcontainers/python:3.12
FROM mcr.microsoft.com/vscode/devcontainers/python:3.12-bookworm

ENV POETRY_VERSION="1.7.1"
ENV POETRY_VENV_PATH="/home/vscode/.venv/workspace"
Expand Down
2 changes: 1 addition & 1 deletion .github/actions/waffles/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ docopt==0.6.2
Flask==2.3.3
markupsafe==2.1.5
setuptools==78.1.1 # required for distutils in Python 3.12
git+https://github.com/cds-snc/notifier-utils.git@53.2.13#egg=notifications-utils
git+https://github.com/cds-snc/notifier-utils.git@53.2.14#egg=notifications-utils
4 changes: 2 additions & 2 deletions notifications_utils/sanitise_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class SanitiseSMS(SanitiseText):

# Welsh characters not already included in GSM
WELSH_NON_GSM_CHARACTERS = set("ÂâÊêÎîÔôÛûŴŵŶŷ")
FRENCH_NON_GSM_CHARACTESR = set("ÀÂËÎÏÔŒÙÛâçêëîïôœû")
FRENCH_NON_GSM_CHARACTERS = set("ÀÂËÎÏÔŒÙÛâçêëîïôœû")
INUKTITUK_CHARACTERS = set(
"ᐁᐯᑌᑫᕴᒉᒣᓀᓭᓓᔦᑦᔦᕓᕂᙯᖅᑫᙰᐃᐱᑎᑭᕵᒋᒥᓂᓯ𑪶𑪰ᓕᔨᑦᔨᖨᕕᕆᕿᖅᑭᖏᙱᖠᐄᐲᑏᑮᕶᒌᒦᓃᓰ𑪷𑪱ᓖᔩᑦᔩᖩᕖᕇᖀᖅᑮᖐᙲᖡᐅᐳᑐᑯᕷᒍᒧᓄᓱ𑪸𑪲ᓗᔪᑦᔪᖪᕗᕈᖁᖅᑯᖑᙳᖢᐊᐸᑕᑲᕹᒐᒪᓇᓴ𑪺𑪴ᓚᔭᑦᔭᖬᕙᕋᖃᖅᑲᖓᙵᖤᑉᑦᒃᕻᒡᒻᓐᔅᓪᔾᑦᔾᖮᕝᕐᖅᖅᒃᖕᖖᖦᖯᕼᑊ" # noqa: E501
)
Expand All @@ -116,7 +116,7 @@ class SanitiseSMS(SanitiseText):
+ "^{}\\[~]|€" # character set extension
)
| WELSH_NON_GSM_CHARACTERS
| FRENCH_NON_GSM_CHARACTESR
| FRENCH_NON_GSM_CHARACTERS
| INUKTITUK_CHARACTERS
| CREE_CHARACTERS
| OJIBWE_CHARACTERS
Expand Down
13 changes: 12 additions & 1 deletion notifications_utils/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,17 @@ def is_message_too_long(self):
def is_name_too_long(self):
return len(self.name) > self.NAME_CHAR_LIMIT

def message_parts(self):
    """Summarise the rendered message as a small dict.

    Keys of the returned dict:

    * ``"character_count"`` -- the value of ``self.content_count``.
    * ``"fragment_count"`` -- ``get_sms_fragment_count`` applied to that
      count and to the unicode-detection result.
    * ``"unicode"`` -- the value returned by ``is_unicode`` for
      ``str(self)``; it is passed through unchanged, so it is truthy
      exactly when ``is_unicode`` found something.
    """
    # Render first so detection runs on the text with placeholders filled in.
    rendered = str(self)
    count = self.content_count
    non_gsm = is_unicode(rendered)
    return dict(
        character_count=count,
        fragment_count=get_sms_fragment_count(count, non_gsm),
        unicode=non_gsm,
    )


class SMSPreviewTemplate(SMSMessageTemplate):
def __init__(
Expand Down Expand Up @@ -803,7 +814,7 @@ def get_sms_fragment_count(character_count, is_unicode):


def is_unicode(content):
    """Return the characters of *content* found in the non-GSM character sets.

    The result is the intersection of the characters in ``content`` with the
    union of ``SanitiseSMS.WELSH_NON_GSM_CHARACTERS`` and
    ``SanitiseSMS.FRENCH_NON_GSM_CHARACTERS``. It is a ``set``: empty (falsy)
    when no such character is present, otherwise containing each matching
    character once, so callers may use it as a flag or inspect the characters.
    """
    # A single return: the earlier Welsh-only return made the French-aware
    # expression unreachable.
    non_gsm_characters = set(SanitiseSMS.WELSH_NON_GSM_CHARACTERS) | set(SanitiseSMS.FRENCH_NON_GSM_CHARACTERS)
    return set(content) & non_gsm_characters


def get_html_email_body(template_content, template_values, redact_missing_personalisation=False, html="escape"):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "notifications-utils"
version = "53.2.13"
version = "53.2.14"
description = "Shared python code for Notification - Provides logging utils etc."
authors = ["Canadian Digital Service"]
license = "MIT license"
Expand Down
196 changes: 195 additions & 1 deletion tests/test_template.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest
from bs4 import BeautifulSoup
from notifications_utils.template import get_html_email_body
from notifications_utils.template import SMSMessageTemplate, get_html_email_body


def test_lang_tags_in_templates():
Expand Down Expand Up @@ -198,3 +198,197 @@ def test_rtl_tags_work_with_other_features(self, content: str, extra_tag: str):
assert '<div dir="rtl">' in html
assert "RTL CONTENT" in html
assert "<{}".format(extra_tag) in html


class TestTemplateParts:
    """Tests for ``SMSMessageTemplate.message_parts()``.

    The expected ``character_count`` values count an accented character such
    as 'â' as 2 (its UTF-8 byte length), so the counts below are byte counts of
    the rendered message. ``parts["unicode"]`` is a set of the detected
    non-GSM characters; an empty set means none were found.
    """

    def test_message_parts_basic(self):
        template = {"content": "Hello world", "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 11
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == set()  # Empty set for non-unicode

    def test_message_parts_with_unicode(self):
        # Welsh character 'â' triggers unicode ('â' is 2 bytes in UTF-8)
        template = {"content": "Helo byd â", "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 11  # "Helo byd " (9 bytes) + "â" (2 bytes) = 11 total bytes
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == {"â"}  # Set containing unicode char

    def test_message_parts_long_non_unicode_single_fragment(self):
        # 160 bytes is the limit for single non-unicode SMS
        template = {"content": "a" * 160, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 160
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == set()  # Empty set for non-unicode

    def test_message_parts_long_non_unicode_multiple_fragments(self):
        # 161 bytes triggers multi-part SMS (153 bytes per fragment)
        template = {"content": "a" * 161, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 161
        assert parts["fragment_count"] == 2
        assert parts["unicode"] == set()  # Empty set for non-unicode

    def test_message_parts_long_unicode_single_fragment(self):
        # 70 bytes is the limit for single unicode SMS ('â' is 2 bytes each)
        template = {"content": "â" * 35, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 70  # 35 chars * 2 bytes each
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == {"â"}  # Set containing unicode char

    def test_message_parts_long_unicode_multiple_fragments(self):
        # Anything above 70 bytes is multi-part (67 bytes per fragment);
        # 36 'â' characters give 72 bytes, the first count above the limit
        # reachable with 2-byte characters only.
        template = {"content": "â" * 36, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 72  # 36 chars * 2 bytes each
        assert parts["fragment_count"] == 2
        assert parts["unicode"] == {"â"}  # Set containing unicode char

    def test_message_parts_with_placeholders(self):
        template = {"content": "Hello ((name))", "template_type": "sms"}
        sms = SMSMessageTemplate(template, values={"name": "Alice"})
        parts = sms.message_parts()

        assert parts["character_count"] == 11  # "Hello Alice"
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == set()  # Empty set for non-unicode

    def test_message_parts_with_unicode_placeholder(self):
        template = {"content": "Hello ((name))", "template_type": "sms"}
        sms = SMSMessageTemplate(template, values={"name": "Siân"})
        parts = sms.message_parts()

        assert parts["character_count"] == 11  # "Hello Siân": 10 characters, 'â' is 2 bytes
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == {"â"}  # Set containing unicode char

    def test_message_parts_with_prefix(self):
        template = {"content": "Hello world", "template_type": "sms"}
        sms = SMSMessageTemplate(template, prefix="Service")
        parts = sms.message_parts()

        # "Service: Hello world" = 20 bytes
        assert parts["character_count"] == 20
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == set()  # Empty set for non-unicode

    def test_message_parts_with_prefix_hidden(self):
        template = {"content": "Hello world", "template_type": "sms"}
        sms = SMSMessageTemplate(template, prefix="Service", show_prefix=False)
        parts = sms.message_parts()

        # Prefix not shown, so just "Hello world" = 11 bytes
        assert parts["character_count"] == 11
        assert parts["fragment_count"] == 1
        assert parts["unicode"] == set()  # Empty set for non-unicode

    @pytest.mark.parametrize(
        "content, byte_count, fragment_count, has_unicode",
        [
            # Non-unicode: single fragment up to 160 bytes, then 153 bytes per fragment
            ("a" * 160, 160, 1, False),
            ("a" * 161, 161, 2, False),
            ("a" * 306, 306, 2, False),
            ("a" * 307, 307, 3, False),
            # Unicode: single fragment up to 70 bytes, then 67 bytes per fragment
            # 'â' is 2 bytes in UTF-8
            ("â" * 35, 70, 1, True),  # 35 chars * 2 = 70 bytes
            ("â" * 36, 72, 2, True),  # 36 chars * 2 = 72 bytes (>70)
            ("â" * 67, 134, 2, True),  # 67 chars * 2 = 134 bytes
            ("â" * 68, 136, 3, True),  # 68 chars * 2 = 136 bytes (>134)
        ],
    )
    def test_message_parts_fragment_boundaries(self, content, byte_count, fragment_count, has_unicode):
        template = {"content": content, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == byte_count
        assert parts["fragment_count"] == fragment_count
        # Check if unicode set is empty or not
        assert bool(parts["unicode"]) == has_unicode

    @pytest.mark.parametrize("char", sorted("ÀËÏŒÙçëïœ"))
    def test_message_parts_detects_french_only_characters(self, char):
        # These characters are in FRENCH_NON_GSM_CHARACTERS but not in
        # WELSH_NON_GSM_CHARACTERS, so they are only detected when the French
        # set takes part in unicode detection. The other tests in this class
        # use â/ê/î/ô, which belong to both sets.
        template = {"content": "Bonjour " + char, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["unicode"] == {char}
        assert parts["fragment_count"] == 1

    def test_message_parts_with_multiple_unicode_chars_near_250_bytes(self):
        # Uses four accented characters (â, ê, î, ô), each 2 bytes in UTF-8.
        # Note: all four are present in both the Welsh and the French sets.
        # Multi-part unicode messages hold 67 bytes per fragment, so:
        #   135-201 bytes -> 3 fragments, 202-268 bytes -> 4 fragments.

        # 100 chars * 2 bytes = 200 bytes (just under the 201-byte boundary)
        content_200_bytes = "âêîô" * 25  # 4 chars * 25 = 100 chars * 2 bytes = 200 bytes
        template = {"content": content_200_bytes, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 200
        assert parts["fragment_count"] == 3  # 200 bytes = 3 fragments (67*2 = 134, need 3rd for remaining 66)
        assert len(parts["unicode"]) == 4  # 4 different non-GSM chars

        # Now add one more unicode char to cross the boundary to 202 bytes
        content_202_bytes = content_200_bytes + "â"  # +2 bytes = 202 bytes total
        template = {"content": content_202_bytes, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 202
        assert parts["fragment_count"] == 4  # 202 bytes crosses boundary, needs 4 fragments
        assert len(parts["unicode"]) == 4  # Still 4 different non-GSM chars

        # Test at exactly 250 bytes (still in 4-fragment range: 202-268 bytes)
        content_250_bytes = "âêîô" * 31 + "â"  # (4*31 + 1) = 125 chars * 2 bytes = 250 bytes
        template = {"content": content_250_bytes, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        assert parts["character_count"] == 250
        assert parts["fragment_count"] == 4  # 250 bytes = 4 fragments
        assert len(parts["unicode"]) == 4  # 4 different non-GSM chars

    def test_message_parts_with_multiple_unicode_chars(self):
        # Real-world bilingual emergency test message with French accented characters
        content = (
            "NB- xxxxxxxx, 120 xxxxxxxxxxx Blvd: This is a test for the xxxxxxxx employees, "
            "and no action is required from you at this time. The purpose of this exercise is "
            "to ensure that our emergency communication system is functioning properly and that "
            "everyone is familiar with the process.\n"
            "Ceci est uniquement un test pour les employés xx xxxxxxxx et aucune action n'est "
            "requise de votre part pour le moment. L'objectif de cet exercice est de s'assurer "
            "que notre système de communication d'urgence fonctionne correctement et que chacun "
            "connaît la procédure."
        )

        template = {"content": content, "template_type": "sms"}
        sms = SMSMessageTemplate(template)
        parts = sms.message_parts()

        # Verify it's detected as unicode (has French accented characters)
        assert len(parts["unicode"]) > 0

        # Content is large enough to require multiple SMS fragments.
        # The lower bound here is deliberately loose; the exact fragment count
        # below is the assertion that pins the size (67 bytes per fragment
        # for multi-part unicode messages).
        assert parts["character_count"] > 500
        assert parts["fragment_count"] == 9

        # 'î' (from "connaît") must be reported as a detected character
        french_unicode_chars = {"î"}
        assert french_unicode_chars.issubset(parts["unicode"])
Loading