From 8158a7a9721127426bb8300c12045752b47360b5 Mon Sep 17 00:00:00 2001
From: Chris Doman
Date: Wed, 26 Feb 2025 13:52:06 +0000
Subject: [PATCH] Faster yield

---
 cloudgrep/cloud.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/cloudgrep/cloud.py b/cloudgrep/cloud.py
index 07c11d8..b29d5cf 100644
--- a/cloudgrep/cloud.py
+++ b/cloudgrep/cloud.py
@@ -166,14 +166,27 @@ def get_objects(
         from_date: Optional[datetime],
         end_date: Optional[datetime],
         file_size: int,
+        max_matches: int = 1000000,  # generous default cap on yielded keys
     ) -> Iterator[str]:
-        """Yield objects that match filter"""
-        s3 = boto3.client("s3")
-        paginator = s3.get_paginator("list_objects_v2")
-        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+        """Yield at most max_matches object keys that match the filter."""
+        # Reuse the S3 client across calls; create it lazily on first use.
+        if not hasattr(self, "s3_client"):
+            self.s3_client = boto3.client("s3")
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        count = 0
+        # PageSize=1000 is the list_objects_v2 maximum (full pages per call).
+        for page in paginator.paginate(
+            Bucket=bucket,
+            Prefix=prefix,
+            PaginationConfig={"PageSize": 1000},
+        ):
             for obj in page.get("Contents", []):
                 if self.filter_object(obj, key_contains, from_date, end_date, file_size):
+                    # Check before yielding so max_matches <= 0 yields nothing.
+                    if count >= max_matches:
+                        return
                     yield obj.get("Key")
+                    count += 1
 
     def get_azure_objects(
         self,