diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 072f8e2..55a40c8 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -1,25 +1,31 @@ -name: Deployment +name: Publish to PyPI + on: - push: - branches-ignore: ["*"] - tags: ["*"] + release: + types: [created] + jobs: deploy: runs-on: ubuntu-latest - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') && github.repository == 'gmr/email-normalize' - environment: release + + environment: + name: pypi + url: https://pypi.org/project/email-normalize/ + permissions: contents: read id-token: write + steps: - - name: Checkout repository - uses: actions/checkout@v5 + - uses: actions/checkout@v5 - name: Install uv uses: astral-sh/setup-uv@v6 - - name: Build package - run: uv build + - name: Build and check package + run: | + uv build + uvx twine check dist/* - - name: Publish package + - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/testing.yaml b/.github/workflows/testing.yaml index f7fb3fc..314027f 100644 --- a/.github/workflows/testing.yaml +++ b/.github/workflows/testing.yaml @@ -1,10 +1,6 @@ name: Testing on: pull_request: - paths-ignore: - - 'docs/**' - - '*.md' - - '*.rst' push: branches: ["*"] paths-ignore: @@ -12,10 +8,10 @@ on: - '*.md' - '*.rst' tags-ignore: ["*"] + jobs: test: runs-on: ubuntu-latest - timeout-minutes: 3 strategy: fail-fast: false matrix: @@ -47,7 +43,6 @@ jobs: - name: Upload Coverage uses: codecov/codecov-action@v5 - if: github.event_name == 'push' && github.repository == 'gmr/email-normalize' with: files: ./build/coverage.xml flags: unittests diff --git a/email_normalize/__init__.py b/email_normalize/__init__.py index 7785ffa..87afb39 100644 --- a/email_normalize/__init__.py +++ b/email_normalize/__init__.py @@ -24,6 +24,10 @@ MXRecords = list[tuple[int, str]] +_tld_extract = tldextract.TLDExtract( + suffix_list_urls=(), cache_dir=None, fallback_to_snapshot=True +) + cache: dict[str, 'CachedItem'] = {} @@ -99,8 +103,11 @@ def __init__( cache_limit: int = 1024, cache_failures: bool = True, failure_ttl: int = 300, + skip_dns: bool = False, ) -> None: - self._resolver = aiodns.DNSResolver(name_servers) + self._skip_dns = skip_dns + if not skip_dns: + self._resolver = aiodns.DNSResolver(name_servers) self.cache_failures = cache_failures self.cache_limit = cache_limit self.failure_ttl = failure_ttl @@ -114,6 +121,8 @@ async def mx_records(self, domain_part: str) -> MXRecords: domain_part: The domain to resolve MX records for. """ + if self._skip_dns: + return [] if self._skip_cache(domain_part): try: records = await self._resolver.query(domain_part, 'MX') @@ -158,8 +167,12 @@ async def normalize(self, email_address: str) -> Result: """ address = utils.parseaddr(email_address) local_part, domain_part = address[1].lower().split('@') - mx_records = await self.mx_records(domain_part) - provider = self._lookup_provider(mx_records) + if self._skip_dns: + mx_records = [] + provider = self._lookup_provider_by_domain(domain_part) + else: + mx_records = await self.mx_records(domain_part) + provider = self._lookup_provider(mx_records) if provider: if provider.Flags & providers.Rules.LOCAL_PART_AS_HOSTNAME: local_part, domain_part = self._local_part_as_hostname( @@ -181,7 +194,7 @@ def _local_part_as_hostname( local_part: str, domain_part: str, ) -> tuple[str, str]: - extracted = tldextract.extract(domain_part) + extracted = _tld_extract(domain_part) if extracted.subdomain: subdomain_parts = extracted.subdomain.split('.') local_part = subdomain_parts[0] @@ -200,6 +213,12 @@ def _local_part_as_hostname( domain_part = '.'.join(components) return local_part, domain_part + @staticmethod + def _lookup_provider_by_domain( + domain_part: str, + ) -> type[providers.MailboxProvider] | None: + return providers.DomainMap.get(domain_part) + @staticmethod def _lookup_provider( mx_records: list[tuple[int, str]], @@ -221,7 +240,10 @@ def _skip_cache(self, domain: str) -> bool: return False -def normalize(email_address: str) -> Result: +def normalize( + email_address: str, + skip_dns: bool = False, +) -> Result: """Normalize an email address. This function abstracts the asyncio base for this library and @@ -231,10 +253,13 @@ def normalize(email_address: str) -> Result: Args: email_address: The address to normalize. + skip_dns: Skip DNS MX record lookups and use a static + domain map to detect well-known mailbox providers. + Defaults to ``False``. """ async def _normalize(): - return await Normalizer().normalize(email_address) + return await Normalizer(skip_dns=skip_dns).normalize(email_address) return asyncio.run(_normalize()) diff --git a/email_normalize/providers.py b/email_normalize/providers.py index a014f60..03c8833 100644 --- a/email_normalize/providers.py +++ b/email_normalize/providers.py @@ -84,3 +84,30 @@ class Zoho(MailboxProvider): Yandex, Zoho, ] + +DomainMap: dict[str, type[MailboxProvider]] = { + 'icloud.com': Apple, + 'me.com': Apple, + 'mac.com': Apple, + 'fastmail.com': Fastmail, + 'fastmail.fm': Fastmail, + 'gmail.com': Google, + 'googlemail.com': Google, + 'outlook.com': Microsoft, + 'hotmail.com': Microsoft, + 'live.com': Microsoft, + 'msn.com': Microsoft, + 'proton.me': ProtonMail, + 'protonmail.com': ProtonMail, + 'pm.me': ProtonMail, + 'yahoo.com': Yahoo, + 'yahoo.co.uk': Yahoo, + 'yahoo.co.jp': Yahoo, + 'ymail.com': Yahoo, + 'aol.com': Yahoo, + 'yandex.com': Yandex, + 'yandex.ru': Yandex, + 'ya.ru': Yandex, + 'zoho.com': Zoho, + 'zohomail.com': Zoho, +} diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 719e9ce..6e64a83 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -1,3 +1,4 @@ +import asyncio import unittest import uuid import warnings @@ -185,3 +186,118 @@ def test_zoho(self): self._perform_test( address, f'{local_part}@{domain_part}', mx_records, 'Zoho' ) + + +class SkipDNSTestCase(TestCase): + def _normalize(self, address): + return email_normalize.normalize(address, skip_dns=True) + + def _assert_provider(self, domain, expected_provider): + local_part = str(uuid.uuid4()) + address = f'{local_part}@{domain}' + result = self._normalize(address) + self.assertEqual(result.mailbox_provider, expected_provider) + self.assertListEqual(result.mx_records, []) + return result + + def test_gmail(self): + self._assert_provider('gmail.com', 'Google') + + def test_googlemail(self): + self._assert_provider('googlemail.com', 'Google') + + def test_outlook(self): + self._assert_provider('outlook.com', 'Microsoft') + + def test_hotmail(self): + self._assert_provider('hotmail.com', 'Microsoft') + + def test_live(self): + self._assert_provider('live.com', 'Microsoft') + + def test_msn(self): + self._assert_provider('msn.com', 'Microsoft') + + def test_icloud(self): + self._assert_provider('icloud.com', 'Apple') + + def test_me(self): + self._assert_provider('me.com', 'Apple') + + def test_mac(self): + self._assert_provider('mac.com', 'Apple') + + def test_fastmail(self): + self._assert_provider('fastmail.com', 'Fastmail') + + def test_fastmail_fm(self): + self._assert_provider('fastmail.fm', 'Fastmail') + + def test_protonmail(self): + self._assert_provider('protonmail.com', 'ProtonMail') + + def test_proton_me(self): + self._assert_provider('proton.me', 'ProtonMail') + + def test_pm_me(self): + self._assert_provider('pm.me', 'ProtonMail') + + def test_yahoo(self): + self._assert_provider('yahoo.com', 'Yahoo') + + def test_yahoo_co_uk(self): + self._assert_provider('yahoo.co.uk', 'Yahoo') + + def test_yahoo_co_jp(self): + self._assert_provider('yahoo.co.jp', 'Yahoo') + + def test_ymail(self): + self._assert_provider('ymail.com', 'Yahoo') + + def test_aol(self): + self._assert_provider('aol.com', 'Yahoo') + + def test_yandex_com(self): + self._assert_provider('yandex.com', 'Yandex') + + def test_yandex_ru(self): + self._assert_provider('yandex.ru', 'Yandex') + + def test_ya_ru(self): + self._assert_provider('ya.ru', 'Yandex') + + def test_zoho(self): + self._assert_provider('zoho.com', 'Zoho') + + def test_zohomail(self): + self._assert_provider('zohomail.com', 'Zoho') + + def test_google_rules_applied(self): + result = self._normalize('u.s.e.r+tag@gmail.com') + self.assertEqual(result.normalized_address, 'user@gmail.com') + self.assertEqual(result.mailbox_provider, 'Google') + + def test_microsoft_plus_addressing(self): + result = self._normalize('user+tag@outlook.com') + self.assertEqual(result.normalized_address, 'user@outlook.com') + + def test_unknown_domain(self): + domain = f'{uuid.uuid4()}.example.com' + result = self._normalize(f'user@{domain}') + self.assertIsNone(result.mailbox_provider) + self.assertListEqual(result.mx_records, []) + + def test_dns_not_called(self): + with mock.patch( + 'email_normalize.Normalizer.mx_records', + side_effect=AssertionError('DNS should not be called'), + ): + result = self._normalize('test@gmail.com') + self.assertEqual(result.mailbox_provider, 'Google') + + def test_async_normalizer(self): + normalizer = email_normalize.Normalizer(skip_dns=True) + result = asyncio.run(normalizer.normalize('u.s.e.r+tag@gmail.com')) + self.assertEqual(result.normalized_address, 'user@gmail.com') + self.assertEqual(result.mailbox_provider, 'Google') + self.assertListEqual(result.mx_records, [])