Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion cloudgrep/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,13 @@ def main() -> None:
"-lp", "--log_properties", type=list_of_strings, help="Comma-separated list of log properties to extract"
)
parser.add_argument("-jo", "--json_output", action="store_true", help="Output results in JSON format")
parser.add_argument(
"-cd", "--convert_date", action="store_true", help="Convert date to ISO format (YYYY-MM-DDTHH:MM:SS)"
)
parser.add_argument(
"-og", "--use_og_name", action="store_true", help="Decide if you want to use original key name or tmporary name for uncompress files"
)
args = parser.parse_args()

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
Expand Down Expand Up @@ -82,6 +87,8 @@ def main() -> None:
log_properties=args.log_properties,
profile=args.profile,
json_output=args.json_output,
convert_date=args.convert_date,
use_og_name=args.use_og_name,
)


Expand Down
20 changes: 16 additions & 4 deletions cloudgrep/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from typing import Iterator, Optional, List, Any, Tuple
import logging
from cloudgrep.search import Search

from pytz import timezone
class Cloud:
def __init__(self) -> None:
self.search = Search()
Expand Down Expand Up @@ -42,6 +42,8 @@ def download_from_s3_multithread(
log_format: Optional[str] = None,
log_properties: List[str] = [],
json_output: Optional[bool] = False,
convert_date: bool = False,
use_og_name : bool = False
) -> int:
"""Download and search files from AWS S3"""
if log_properties is None:
Expand All @@ -53,8 +55,9 @@ def _download_search_s3(key: str) -> int:
try:
logging.info(f"Downloading s3://{bucket}/{key} to {tmp_name}")
s3.download_file(bucket, key, tmp_name)
og_name = key if use_og_name else tmp_name
matched = self.search.search_file(
tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output
tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output, convert_date=convert_date, og_name=og_name
)
return 1 if matched else 0
except Exception:
Expand Down Expand Up @@ -166,7 +169,8 @@ def get_objects(
from_date: Optional[datetime],
end_date: Optional[datetime],
file_size: int,
max_matches: int = 1000000 # generous default
max_matches: int = 1000000, # generous default
convert_date: bool = False,
) -> Iterator[str]:
"""Yield a maximum of max_matches objects that match filter"""
# Reuse the S3 client if already created; otherwise, create one
Expand All @@ -180,7 +184,7 @@ def get_objects(
PaginationConfig={'PageSize': 1000}
):
for obj in page.get("Contents", []):
if self.filter_object(obj, key_contains, from_date, end_date, file_size):
if self.filter_object(obj, key_contains, from_date, end_date, file_size, convert_date=convert_date):
yield obj.get("Key")
count += 1
if count >= max_matches:
Expand Down Expand Up @@ -227,9 +231,17 @@ def filter_object(
from_date: Optional[datetime],
to_date: Optional[datetime],
file_size: int,
convert_date: bool = False,
) -> bool:
"""Filter an S3 object based on modification date, size, and key substring"""
last_modified = obj.get("LastModified")
# When convert_date is set, normalize both date bounds and last_modified to UTC to avoid "TypeError: can't compare offset-naive and offset-aware datetimes"
if convert_date:
from_date = from_date.astimezone(timezone("UTC")) if from_date else None
to_date = to_date.astimezone(timezone("UTC")) if to_date else None
# Convert last_modified to UTC if it's not already
if isinstance(last_modified, datetime):
last_modified = last_modified.astimezone(timezone("UTC"))
if last_modified:
if from_date and last_modified < from_date:
return False
Expand Down
12 changes: 9 additions & 3 deletions cloudgrep/cloudgrep.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def list_files(
from_date: Optional[datetime] = None,
end_date: Optional[datetime] = None,
file_size: int = 100_000_000, # 100MB
convert_date: Optional[bool] = False,
) -> Dict[str, List[Any]]:
"""
Returns a dictionary of matching files for each cloud provider.
Expand All @@ -37,7 +38,7 @@ def list_files(
"""
files = {}
if bucket:
files["s3"] = list(self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size))
files["s3"] = list(self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size, convert_date=convert_date))
if account_name and container_name:
files["azure"] = list(
self.cloud.get_azure_objects(
Expand Down Expand Up @@ -69,6 +70,8 @@ def search(
profile: Optional[str] = None,
json_output: bool = False,
files: Optional[Dict[str, List[Any]]] = None,
convert_date: Optional[bool] = False,
use_og_name: Optional[bool] = False,
) -> None:
"""
Searches the contents of files matching the given queries.
Expand Down Expand Up @@ -98,6 +101,9 @@ def search(
elif log_type.lower() == "azure":
log_format = "json"
log_properties = ["data"]
elif log_type.lower() == "waf":
log_format = "jsonl"
log_properties = None
else:
logging.error(f"Invalid log_type: {log_type}")
return
Expand All @@ -109,14 +115,14 @@ def search(
matching_keys = files["s3"]
else:
matching_keys = list(
self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size)
self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size, convert_date=convert_date)
)
s3_client = boto3.client("s3")
region = s3_client.get_bucket_location(Bucket=bucket).get("LocationConstraint", "unknown")
logging.warning(f"Bucket region: {region}. (Search from the same region to avoid egress charges.)")
logging.warning(f"Searching {len(matching_keys)} files in {bucket} for {query}...")
self.cloud.download_from_s3_multithread(
bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output
bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output, convert_date=convert_date, use_og_name=use_og_name
)

if account_name and container_name:
Expand Down
23 changes: 17 additions & 6 deletions cloudgrep/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,16 @@ def print_match(self, match_info: dict, hide_filenames: bool, json_output: Optio
print(f"{output.get('key_name', '')}: {line}" if not hide_filenames else line)

def parse_logs(self, line: str, log_format: Optional[str]) -> Any:
if log_format == "json":
if log_format == "jsonl":
try:
# JSON Lines format: each log entry is a separate JSON object on its own line,
# a common layout for logs in cloud environments.
# NOTE(review): the entries are split on newlines and returned as raw strings; no JSON decoding happens here — confirm this is intended.
line_split = line.strip().split("\n")
return line_split
except json.JSONDecodeError as e:
logging.error(f"JSON decode error in line: {line} ({e})")
elif log_format == "json":
try:
return json.loads(line)
except json.JSONDecodeError as e:
Expand Down Expand Up @@ -84,13 +93,14 @@ def search_line(
log_format: Optional[str],
log_properties: List[str] = [],
json_output: Optional[bool] = False,
convert_date: Optional[bool] = False,
) -> bool:
"""Regex search of the line"""
found = False
for regex in compiled_patterns:
if regex.search(line):
if log_format:
self.search_logs(line, key_name, regex.pattern, hide_filenames, log_format, log_properties, json_output)
self.search_logs(line, key_name, regex.pattern, hide_filenames, log_format, log_properties, json_output, convert_date)
else:
self.print_match(
{"key_name": key_name, "query": regex.pattern, "line": line}, hide_filenames, json_output
Expand Down Expand Up @@ -122,6 +132,8 @@ def search_file(
log_properties: List[str] = [],
json_output: Optional[bool] = False,
account_name: Optional[str] = None,
convert_date: Optional[bool] = False,
og_name: Optional[str] = None,
) -> bool:
"""Regex search of the file line by line"""
logging.info(f"Searching {file_name} for patterns: {patterns}")
Expand All @@ -132,11 +144,10 @@ def search_file(

def process_lines(lines: Iterable[str]) -> bool:
return any(
self.search_line(key_name, compiled_patterns, hide_filenames, line, log_format, log_properties, json_output)
self.search_line(key_name, compiled_patterns, hide_filenames, line, log_format, log_properties, json_output, convert_date)
for line in lines
)

if file_name.endswith(".gz"):
if file_name.endswith(".gz") or og_name.endswith(".gz"):
try:
with gzip.open(file_name, "rt", encoding="utf-8", errors="ignore") as f:
if account_name:
Expand All @@ -147,7 +158,7 @@ def process_lines(lines: Iterable[str]) -> bool:
except Exception:
logging.exception(f"Error processing gzip file: {file_name}")
return False
elif file_name.endswith(".zip"):
elif file_name.endswith(".zip") or og_name.endswith(".zip"):
matched_any = False
try:
with zipfile.ZipFile(file_name, "r") as zf:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ azure-identity==1.16.1
google-cloud-storage==2.12.0
setuptools==70.0.0
yara-python-wheel==4.4.0
pytz==2025.2