cado-security · adrien-goetz-wmx · Apr 3, 2025 · Apr 3, 2025 · Apr 3, 2025
diff --git a/cloudgrep/__main__.py b/cloudgrep/__main__.py
@@ -46,8 +46,13 @@ def main() -> None:
         "-lp", "--log_properties", type=list_of_strings, help="Comma-separated list of log properties to extract"
     )
     parser.add_argument("-jo", "--json_output", action="store_true", help="Output results in JSON format")
+    parser.add_argument(
+        "-cd", "--convert_date", action="store_true", help="Convert date to ISO format (YYYY-MM-DDTHH:MM:SS)"
+    )
+    parser.add_argument(
+        "-og", "--use_og_name", action="store_true", help="Decide if you want to use original key name or tmporary name for uncompress files"
+    )
     args = parser.parse_args()
-
     if len(sys.argv) == 1:
         parser.print_help(sys.stderr)
         sys.exit(1)
@@ -82,6 +87,8 @@ def main() -> None:
         log_properties=args.log_properties,
         profile=args.profile,
         json_output=args.json_output,
+        convert_date=args.convert_date,
+        use_og_name=args.use_og_name,
     )
 
 

diff --git a/cloudgrep/cloud.py b/cloudgrep/cloud.py
@@ -11,7 +11,7 @@
 from typing import Iterator, Optional, List, Any, Tuple
 import logging
 from cloudgrep.search import Search
-
+from pytz import timezone
 class Cloud:
     def __init__(self) -> None:
         self.search = Search()
@@ -42,6 +42,8 @@ def download_from_s3_multithread(
         log_format: Optional[str] = None,
         log_properties: List[str] = [],
         json_output: Optional[bool] = False,
+        convert_date: bool = False,
+        use_og_name : bool = False
     ) -> int:
         """Download and search files from AWS S3"""
         if log_properties is None:
@@ -53,8 +55,9 @@ def _download_search_s3(key: str) -> int:
             try:
                 logging.info(f"Downloading s3://{bucket}/{key} to {tmp_name}")
                 s3.download_file(bucket, key, tmp_name)
+                og_name = key if use_og_name else tmp_name
                 matched = self.search.search_file(
-                    tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
+                    tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output, convert_date=convert_date, og_name=og_name
                 )
                 return 1 if matched else 0
             except Exception:
@@ -166,7 +169,8 @@ def get_objects(
         from_date: Optional[datetime],
         end_date: Optional[datetime],
         file_size: int,
-        max_matches: int = 1000000 # generous default
+        max_matches: int = 1000000, # generous default
+        convert_date: bool = False,
     ) -> Iterator[str]:
         """Yield a maximum of max_matches objects that match filter"""
         # Reuse the S3 client if already created; otherwise, create one
@@ -180,7 +184,7 @@ def get_objects(
             PaginationConfig={'PageSize': 1000}
         ):
             for obj in page.get("Contents", []):
-                if self.filter_object(obj, key_contains, from_date, end_date, file_size):
+                if self.filter_object(obj, key_contains, from_date, end_date, file_size, convert_date=convert_date):
                     yield obj.get("Key")
                     count += 1
                     if count >= max_matches:
@@ -227,9 +231,17 @@ def filter_object(
         from_date: Optional[datetime],
         to_date: Optional[datetime],
         file_size: int,
+        convert_date: bool = False,
     ) -> bool:
         """Filter an S3 object based on modification date, size, and key substring"""
         last_modified = obj.get("LastModified")
+        # Normalize all dates to UTC to avoid "TypeError: can't compare offset-naive and offset-aware datetimes"
+        if convert_date:
+            from_date = from_date.astimezone(timezone("UTC")) if from_date else None
+            to_date = to_date.astimezone(timezone("UTC")) if to_date else None
+            # Convert last_modified to UTC if it's not already
+            if isinstance(last_modified, datetime):
+                last_modified = last_modified.astimezone(timezone("UTC"))
         if last_modified:
             if from_date and last_modified < from_date:
                 return False

diff --git a/cloudgrep/cloudgrep.py b/cloudgrep/cloudgrep.py
@@ -26,6 +26,7 @@ def list_files(
         from_date: Optional[datetime] = None,
         end_date: Optional[datetime] = None,
         file_size: int = 100_000_000, # 100MB
+        convert_date: Optional[bool] = False,
     ) -> Dict[str, List[Any]]:
         """
         Returns a dictionary of matching files for each cloud provider.
@@ -37,7 +38,7 @@ def list_files(
         """
         files = {}
         if bucket:
-            files["s3"] = list(self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size))
+            files["s3"] = list(self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size, convert_date=convert_date))
         if account_name and container_name:
             files["azure"] = list(
                 self.cloud.get_azure_objects(
@@ -69,6 +70,8 @@ def search(
         profile: Optional[str] = None,
         json_output: bool = False,
         files: Optional[Dict[str, List[Any]]] = None,
+        convert_date: Optional[bool] = False,
+        use_og_name: Optional[bool] = False,
     ) -> None:
         """
         Searches the contents of files matching the given queries.
@@ -98,6 +101,9 @@ def search(
             elif log_type.lower() == "azure":
                 log_format = "json"
                 log_properties = ["data"]
+            elif log_type.lower() == "waf":
+                log_format = "jsonl"
+                log_properties = None
             else:
                 logging.error(f"Invalid log_type: {log_type}")
                 return
@@ -109,14 +115,14 @@ def search(
                 matching_keys = files["s3"]
             else:
                 matching_keys = list(
-                    self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size)
+                    self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size, convert_date=convert_date)
                 )
             s3_client = boto3.client("s3")
             region = s3_client.get_bucket_location(Bucket=bucket).get("LocationConstraint", "unknown")
             logging.warning(f"Bucket region: {region}. (Search from the same region to avoid egress charges.)")
             logging.warning(f"Searching {len(matching_keys)} files in {bucket} for {query}...")
             self.cloud.download_from_s3_multithread(
-                bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output
+                bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output, convert_date=convert_date, use_og_name=use_og_name
             )
 
         if account_name and container_name:

diff --git a/cloudgrep/search.py b/cloudgrep/search.py
@@ -30,7 +30,16 @@ def print_match(self, match_info: dict, hide_filenames: bool, json_output: Optio
             print(f"{output.get('key_name', '')}: {line}" if not hide_filenames else line)
 
     def parse_logs(self, line: str, log_format: Optional[str]) -> Any:
-        if log_format == "json":
+        if log_format == "jsonl":
+            try:
+                # JSON Lines format: each line of the file is a separate JSON object.
+                # NOTE(review): the line is only split on newlines here and never passed to json.loads,
+                # so a list of raw strings is returned and the JSONDecodeError handler below cannot fire.
+                line_split = line.strip().split("\n")
+                return line_split
+            except json.JSONDecodeError as e:
+                logging.error(f"JSON decode error in line: {line} ({e})")
+        elif log_format == "json":
             try:
                 return json.loads(line)
             except json.JSONDecodeError as e:
@@ -84,13 +93,14 @@ def search_line(
         log_format: Optional[str],
         log_properties: List[str] = [],
         json_output: Optional[bool] = False,
+        convert_date: Optional[bool] = False,
     ) -> bool:
         """Regex search of the line"""
         found = False
         for regex in compiled_patterns:
             if regex.search(line):
                 if log_format:
-                    self.search_logs(line, key_name, regex.pattern, hide_filenames, log_format, log_properties, json_output)
+                    self.search_logs(line, key_name, regex.pattern, hide_filenames, log_format, log_properties, json_output, convert_date)
                 else:
                     self.print_match(
                         {"key_name": key_name, "query": regex.pattern, "line": line}, hide_filenames, json_output
@@ -122,6 +132,8 @@ def search_file(
         log_properties: List[str] = [],
         json_output: Optional[bool] = False,
         account_name: Optional[str] = None,
+        convert_date: Optional[bool] = False,
+        og_name: Optional[str] = None,
     ) -> bool:
         """Regex search of the file line by line"""
         logging.info(f"Searching {file_name} for patterns: {patterns}")
@@ -132,11 +144,10 @@ def search_file(
 
         def process_lines(lines: Iterable[str]) -> bool:
             return any(
-                self.search_line(key_name, compiled_patterns, hide_filenames, line, log_format, log_properties, json_output)
+                self.search_line(key_name, compiled_patterns, hide_filenames, line, log_format, log_properties, json_output, convert_date)
                 for line in lines
             )
-
-        if file_name.endswith(".gz"):
+        if file_name.endswith(".gz") or og_name.endswith(".gz"):
             try:
                 with gzip.open(file_name, "rt", encoding="utf-8", errors="ignore") as f:
                     if account_name:
@@ -147,7 +158,7 @@ def process_lines(lines: Iterable[str]) -> bool:
             except Exception:
                 logging.exception(f"Error processing gzip file: {file_name}")
                 return False
-        elif file_name.endswith(".zip"):
+        elif file_name.endswith(".zip") or og_name.endswith(".zip"):
             matched_any = False
             try:
                 with zipfile.ZipFile(file_name, "r") as zf:

diff --git a/requirements.txt b/requirements.txt
@@ -14,3 +14,4 @@ azure-identity==1.16.1
 google-cloud-storage==2.12.0
 setuptools==70.0.0
 yara-python-wheel==4.4.0
+pytz==2025.2