From 7b50d62ff1094053459dcb0371c63ac82c208f65 Mon Sep 17 00:00:00 2001 From: Ryan Steele Date: Mon, 11 Aug 2025 16:10:53 -0400 Subject: [PATCH 1/4] Fix normalization for multi-segment TLDs using Fastmail --- email_normalize/__init__.py | 29 +++++++++++++++++++++---- setup.cfg | 1 + tests/test_normalize.py | 43 +++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/email_normalize/__init__.py b/email_normalize/__init__.py index 825ca1c..4abc021 100644 --- a/email_normalize/__init__.py +++ b/email_normalize/__init__.py @@ -13,6 +13,7 @@ import operator import time import typing +import tldextract from email import utils import aiodns @@ -205,10 +206,30 @@ async def normalize(self, email_address: str) -> Result: @staticmethod def _local_part_as_hostname(local_part: str, domain_part: str) -> typing.Tuple[str, str]: - domain_segments = domain_part.split('.') - if len(domain_segments) > 2: - local_part = domain_segments[0] - domain_part = '.'.join(domain_segments[1:]) + # Use tldextract to properly parse the domain + extracted = tldextract.extract(domain_part) + + # If there's a subdomain, use the first part of the subdomain as the local part + # and the rest (domain + suffix) as the domain part + if extracted.subdomain: + subdomain_parts = extracted.subdomain.split('.') + local_part = subdomain_parts[0] + + # Reconstruct domain_part: remaining subdomain parts + domain + suffix + remaining_subdomain = '.'.join(subdomain_parts[1:]) if len(subdomain_parts) > 1 else '' + domain_name = extracted.domain + suffix = extracted.suffix + + # Build the new domain part + domain_part_components = [] + if remaining_subdomain: + domain_part_components.append(remaining_subdomain) + if domain_name: + domain_part_components.append(domain_name) + if suffix: + domain_part_components.append(suffix) + + domain_part = '.'.join(domain_part_components) return local_part, domain_part @staticmethod diff --git a/setup.cfg b/setup.cfg index 
c329e2a..64f6dc5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,6 +37,7 @@ keywords = include_package_data = True install_requires = aiodns + tldextract packages = email_normalize zip_safe = true diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 3c73b5e..0ba5377 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -70,6 +70,49 @@ def test_fastmail_local_part_as_hostname(self): address, '{}@{}'.format(local_part, domain_part), mx_records, 'Fastmail') + def test_fastmail_multi_segment_tld_no_subdomain(self): + """Test that domains with multi-segment TLDs but no subdomain are not modified.""" + local_part = str(uuid.uuid4()) + domain_part = '{}.co.uk'.format(uuid.uuid4()) + address = '{}@{}'.format(local_part, domain_part) + mx_records = [(10, 'in1-smtp.messagingengine.com')] + self._perform_test( + address, '{}@{}'.format(local_part, domain_part), + mx_records, 'Fastmail') + + def test_fastmail_multi_segment_tld_with_subdomain(self): + """Test that domains with multi-segment TLDs and subdomains are correctly normalized.""" + local_part = str(uuid.uuid4()) + domain_part = '{}.com.au'.format(uuid.uuid4()) + address = 'testing@{}.{}'.format(local_part, domain_part) + mx_records = [(10, 'in1-smtp.messagingengine.com')] + self._perform_test( + address, '{}@{}'.format(local_part, domain_part), + mx_records, 'Fastmail') + + def test_fastmail_complex_multi_segment_tld(self): + """Test complex case with multiple subdomains and multi-segment TLD.""" + local_part = str(uuid.uuid4()) + subdomain_part = str(uuid.uuid4()) + domain_part = '{}.org.uk'.format(uuid.uuid4()) + address = 'testing@{}.{}.{}'.format(local_part, subdomain_part, domain_part) + mx_records = [(10, 'in1-smtp.messagingengine.com')] + self._perform_test( + address, '{}@{}.{}'.format(local_part, subdomain_part, domain_part), + mx_records, 'Fastmail') + + def test_fastmail_deep_subdomain_single_tld(self): + """Test deep subdomain structure with single TLD.""" + local_part = 
str(uuid.uuid4()) + subdomain1 = str(uuid.uuid4()) + subdomain2 = str(uuid.uuid4()) + domain_part = '{}.com'.format(uuid.uuid4()) + address = 'testing@{}.{}.{}.{}'.format(local_part, subdomain1, subdomain2, domain_part) + mx_records = [(10, 'in1-smtp.messagingengine.com')] + self._perform_test( + address, '{}@{}.{}.{}'.format(local_part, subdomain1, subdomain2, domain_part), + mx_records, 'Fastmail') + def test_google(self): local_part = str(uuid.uuid4()).replace('-', '.') domain_part = str(uuid.uuid4()) From ea0eea96ad1d4c870cef45d7b111c832760f822f Mon Sep 17 00:00:00 2001 From: Ryan Steele Date: Wed, 13 Aug 2025 16:45:50 -0400 Subject: [PATCH 2/4] Bump version --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 227cea2..4a36342 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.0 +3.0.0 From c17cf64fdefb44a4268a390ae90444988651ae34 Mon Sep 17 00:00:00 2001 From: Ryan Steele Date: Wed, 13 Aug 2025 17:09:17 -0400 Subject: [PATCH 3/4] Update docs --- docs/history.rst | 26 ++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 27 insertions(+) create mode 100644 docs/history.rst diff --git a/docs/history.rst b/docs/history.rst new file mode 100644 index 0000000..5cab099 --- /dev/null +++ b/docs/history.rst @@ -0,0 +1,26 @@ +Release History +=============== + +3.0.0 +----- +- FIXED: Multi-segment TLD normalization using tldextract library +- FIXED: Fastmail subdomain parsing for domains like .co.uk, .com.au + +2.0.0 +----- +- FIXED: Remove period stripping from Yahoo domain normalization +- ADDED: googlemail.com support and consistent host lowercasing +- CHANGED: Make Normalizer no longer a singleton +- FIXED: Handle domains with no MX records + +1.0.2 +----- +- FIXED: Documentation and distribution improvements +- CHANGED: Modernized test infrastructure and Python version support +- REMOVED: Python 3.5 and 3.6 support + +1.0.0 +----- +- Initial stable release with modernized codebase +- ADDED: Optional DNS 
resolution and timeout handling +- FIXED: Python 2/3 compatibility issues \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 18a45cc..ef5edee 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ Documentation normalizer mxrecords result + history Currently Supported Mailbox Providers ------------------------------------- From e276b12147f0fd10c8c07e05ec02bc13e4446ccd Mon Sep 17 00:00:00 2001 From: Ryan Steele Date: Thu, 14 Aug 2025 11:20:43 -0400 Subject: [PATCH 4/4] Address aiodns invalid ttl issue aiodns was returning TTL values of -1 for Gmail's MX records, causing cache entries to be considered immediately expired. This meant that on the second call to mx_records('gmail.com'), the cache entry was deleted and recreated instead of being reused, resetting the hit counter to 1. The fix filters out invalid TTL values (< 0) and falls back to failure_ttl when no valid TTL values are available, ensuring cache entries have reasonable expiration times. 
--- docs/history.rst | 3 ++- email_normalize/__init__.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/history.rst b/docs/history.rst index 5cab099..7d06d81 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -5,6 +5,7 @@ Release History ----- - FIXED: Multi-segment TLD normalization using tldextract library - FIXED: Fastmail subdomain parsing for domains like .co.uk, .com.au +- FIXED: Cache entries expiring immediately if aiodns returned invalid TTL 2.0.0 ----- @@ -23,4 +24,4 @@ Release History ----- - Initial stable release with modernized codebase - ADDED: Optional DNS resolution and timeout handling -- FIXED: Python 2/3 compatibility issues \ No newline at end of file +- FIXED: Python 2/3 compatibility issues diff --git a/email_normalize/__init__.py b/email_normalize/__init__.py index 4abc021..461a891 100644 --- a/email_normalize/__init__.py +++ b/email_normalize/__init__.py @@ -155,8 +155,7 @@ async def mx_records(self, domain_part: str) -> MXRecords: mx_records, ttl = [], self.failure_ttl else: mx_records = [(r.priority, r.host) for r in records] - ttl = min(r.ttl for r in records) \ - if records else self.failure_ttl + ttl = min((r.ttl for r in records if r.ttl >= 0), default=self.failure_ttl) # Prune the cache if over the limit, finding least used, oldest if len(cache.keys()) >= self.cache_limit: