OpenTransitTools
diff --git a/‎README.md‎
Lines changed: 10 additions & 1 deletion b/‎README.md‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎docs/modsec_audit.log‎
Lines changed: 449 additions & 0 deletions b/‎docs/modsec_audit.log‎
Lines changed: 449 additions & 0 deletions
diff --git a/‎ott/log_parser/control/loader.py‎
Lines changed: 17 additions & 8 deletions b/‎ott/log_parser/control/loader.py‎
Lines changed: 17 additions & 8 deletions
diff --git a/‎ott/log_parser/control/parser.py‎
Lines changed: 1 addition & 1 deletion b/‎ott/log_parser/control/parser.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ott/log_parser/control/parser_modsec.py‎
Lines changed: 220 additions & 0 deletions b/‎ott/log_parser/control/parser_modsec.py‎
Lines changed: 220 additions & 0 deletions
diff --git a/‎ott/log_parser/control/reporter.py‎
Lines changed: 1 addition & 1 deletion b/‎ott/log_parser/control/reporter.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ott/log_parser/control/stats.py‎
Lines changed: 6 additions & 1 deletion b/‎ott/log_parser/control/stats.py‎
Lines changed: 6 additions & 1 deletion
@@ -1,4 +1,13 @@
 log_parser
 ====
-
 parse apache logs, pulling out OTP trip planner urls, and parsing that data into a .csv or .json file
+
+
+  quickstart
+  ====
+ - buildout
+ - scripts/db/create.sh
+ - bin/load_and_post_process -l ./docs -c
+ - (see entries in the new 'logs' database)
+ - bin/stats
+
@@ -1,4 +1,5 @@
 from ott.log_parser.control import parser
+from ott.log_parser.control import parser_modsec
 
 from ott.log_parser.db.processed_requests import ProcessedRequests
 from .. import utils
@@ -11,17 +12,25 @@
 
 def load_log_file(file, session):
     """ load a log file into the db """
-    logs = []
-    recs = parser.parse_log_file(file)
-    log.info("from file {}, parsed {} number of records".format(file, len(recs)))
-    for r in recs:
-        rawlog = RawLog(r)
-        logs.append(rawlog)
-    RawLog.persist_data(session, logs)
+    # first try the Apache access-log parser; a file it can't handle may raise
+    try:
+        recs = parser.parse_log_file(file)
+    except Exception as e:
+        # not fatal: note why, then fall through to the mod_security parser below
+        # (Exception rather than a bare except, so Ctrl-C / SystemExit still propagate)
+        log.info("access-log parser failed on file {}: {}".format(file, e))
+        recs = None
+
+    if not recs:
+        # with no recs from first parser, maybe this is a mod_security file containing trip plans
+        recs = parser_modsec.parse_log_file(file)
+
+    # nothing is written to the db when neither parser found any records
+    if recs:
+        log.info("from file {}, parsed {} number of records".format(file, len(recs)))
+        logs = [RawLog(r) for r in recs]
+        RawLog.persist_data(session, logs)
 
 
 def loader():
-    #import pdb; pdb.set_trace()
     files, cmdline = utils.cmd_line_loader()
     if len(files) == 0:
         if cmdline.log_directory == "CLEAR":
 
@@ -1,5 +1,5 @@
 """
-  Will parse Apache log file, looking for requests to the OpenTripPlanner (OTP)
+  Will parse Apache access log file, looking for requests to the OpenTripPlanner (OTP)
 
   :note: individual log entries look like this string
    172.25.102.10 "172.25.102.10" - - [26/Jan/2021:10:36:23 -0800] "GET /?sessionId=blah HTTP/1.1" 200 2972 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
 
@@ -0,0 +1,220 @@
+"""
+  Will parse mod_security log files
+  This is needed for OpenTripPlanner (OTP) v2.x, which uses HTTP POST (graphql) requests,
+  and with Apache, the mod_security2 module is how you see these requests
+"""
+
+import os
+import re
+from ott.utils.parse.cmdline.base_cmdline import file_cmdline
+
+
+def parse_modsec_audit_log(filename):
+    """
+    read a mod_security2 audit log, and return a list with one dict per logged request
+    (originally written by Claude (github) in Aug 2025, with a couple hacks by Frank)
+
+    each request in the log starts with a boundary line like '--9e7b8111-A--', followed by
+    further lettered sections ('--9e7b8111-B--', '--9e7b8111-C--', etc...) sharing that id.
+    each returned dict has an 'id' key holding the id, plus one key per section letter found,
+    whose value is the stripped text of that section.
+
+    For this project, we're mostly interested in section 'C', which contains the
+    graphql POST payload input into OTP.  We also want other sections, like the
+    referrer and device used, etc...
+    """
+    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
+        # hack: the boundary regex wants a newline in front, so pad the content to let
+        # a record that starts on the very first line of the file match too
+        content = "\n\n" + f.read()
+
+    # split on the 'A' boundaries: [junk before first entry, id1, entry1, id2, entry2, ...]
+    pieces = re.split(r"\n--([a-fA-F0-9]+)-A--\n", content)
+
+    entries = []
+    for unique_id, body in zip(pieces[1::2], pieces[2::2]):
+        # hack: put back the A section header that the split above consumed to obtain the id
+        entry = "\n--{}-A--\n{}".format(unique_id, body)
+
+        # split on this entry's own boundaries: [before, letter1, text1, letter2, text2, ...]
+        sections = re.split(rf"\n--{unique_id}-([A-Z])--\n", entry)
+
+        entry_dict = {"id": unique_id}
+        for section_letter, section_content in zip(sections[1::2], sections[2::2]):
+            entry_dict[section_letter] = section_content.strip()
+        entries.append(entry_dict)
+    return entries
+
+
+def parse_section_a(req):
+    """
+    section a has the audit log header: pull out the request timestamp and first IPv4 address
+
+    --9e7b8111-A--
+    [10/Aug/2025:15:39:41 --0700] xxx 172.25.90.86 55986 172.25.102.224 443
+
+    :param req: dict for one log entry, with the header text under key "A"
+    :return: tuple (date, ip), where date is the raw text between the first [ ] pair, and ip
+             is the first dotted-quad address in the section.  either is None when not found
+             (both are None when section A is missing or is not a string)
+    """
+    sec_a = req.get("A", None)
+    if not isinstance(sec_a, str):
+        # nothing to search: say so explicitly, rather than leaning on a swallowed TypeError
+        return None, None
+
+    # NOTE: date is returned as the raw log string; it is not converted to a datetime here
+    date_match = re.search(r"\[(.*?)\]", sec_a)
+    date = date_match.group(1) if date_match else None
+
+    # the first address in the section wins (presumably the client -- confirm against the
+    # mod_security audit log format); IPv6 addresses are not matched by this pattern
+    ip_match = re.search(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", sec_a)
+    ip = ip_match.group(0) if ip_match else None
+    return date, ip
+
+
+def parse_section_b(req):
+    """
+    section b has request headers: pull out the user agent, POST url and referer
+
+    POST /rtp/gtfs/v1 HTTP/1.1
+    Referer: https://labs-5.trimet.org/
+    User-Agent: Mozilla/5.0 (Win...
+
+    :param req: dict for one log entry, with the request header text under key "B"
+    :return: tuple (user_agent, url, referer); any element that is not found is ""
+             (all three are "" when section B is missing or is not a string)
+    """
+    sec_b = req.get("B", None)
+    if not isinstance(sec_b, str):
+        return "", "", ""
+
+    def first_group(pattern, flags=re.MULTILINE):
+        # patterns are anchored per line with ^...$ instead of ending in \n, so a value on
+        # the last line of the section (no newline after it once stripped) is still found
+        m = re.search(pattern, sec_b, flags)
+        return m.group(1).strip() if m else ""
+
+    # header names are matched case-insensitively; the request method is not
+    user_agent = first_group(r"^User-Agent: (.*)$", re.MULTILINE | re.IGNORECASE)
+    referer = first_group(r"^Referer: (.*)$", re.MULTILINE | re.IGNORECASE)
+    url = first_group(r"^POST (.*) HTTP.*$")
+    return user_agent, url, referer
+
+
+def parse_section_c(req):
+    """
+    section c has the POST payload: return the graphql variables portion of it
+
+    --9e7b8111-C--
+    ....description\ninputField\n}\n}\n}\n","variables":{"date":"2025-08-10","time":"15:23",...
+
+    :param req: dict for one log entry, with the request body text under key "C"
+    :return: - None when section C is missing, is not a string, or has no "query" in it
+             - the text right of the first variables marker, minus the final character
+               (the dangling bracket that closes the payload)
+             - the whole section when it has a query, but no variables marker
+    """
+    sec_c = req.get("C", None)
+    if not isinstance(sec_c, str) or "query" not in sec_c:
+        return None
+
+    # partition on the first marker only, so a nested key that is also named variables
+    # doesn't cut the returned payload short
+    _, marker, variables = sec_c.partition("variables\":")
+    if not marker:
+        return sec_c
+    return variables[:-1]  # return things right of the variables, except for dangling bracket
+
+
+def parse_section_f(req, def_code="520"):
+    """
+    section f has response headers
+    pull out the HTTP status code
+
+    --ac12e444-F--
+    HTTP/1.1 200 OK
+    Content-Encoding: gzip
+
+    :param req: dict for one log entry, with the response header text under key "F"
+    :param def_code: value returned when no status line is found
+    :return: the 3 digit status code as a string, or def_code when section F is missing,
+             is not a string, or has no HTTP status line
+    """
+    sec_f = req.get("F", None)
+    if not isinstance(sec_f, str):
+        return def_code
+
+    # anchor on the status line and take the number right after the protocol version;
+    # a greedy, unanchored 'HTTP.*(\d{3})' would return the last 3 digit run on the line
+    status = re.search(r"^HTTP/\S+\s+(\d{3})\b", sec_f, re.MULTILINE)
+    return status.group(1) if status else def_code
+
+
+def parse_raw_request(req):
+    """
+    build the 'raw' record (dict) for one mod_security2 log entry, by running each of the
+    section parsers over it
+
+    the keys follow the attribute names used by parser.py:
+    '{ip} - - [{apache_dt}] "{meth} {url} {http}" {code} {size} "{referer}" "{browser}"\n'
+    plus 'payload', which holds what parse_section_c() returns for the POST body
+    """
+    date, ip = parse_section_a(req)
+    user_agent, url, referer = parse_section_b(req)
+    return {
+        'ip': ip,
+        'apache_dt': date,
+        'browser': user_agent,
+        'url': url,
+        'referer': referer,
+        'payload': parse_section_c(req),
+        'code': parse_section_f(req),
+    }
+
+
+def parse_processed(req):
+    """
+    build the 'processed' record (dict) for a given raw mod_security2 record
+
+    NOTE: still a stub -- req is not read yet, so every value in the returned dict is ""
+
+    :param req: raw record (currently unused)
+    :return: dict with keys ip, date, url, code, referrer and browser
+    """
+    rec = {
+        'ip': "",
+        'date': "",
+        'url': "",
+        'code': "",
+        'referrer': "",
+        'browser': "",
+    }
+    return rec
+
+
+def parse_log_file(file: os.PathLike):
+    """
+    parse a mod_security2 audit log into a list of raw request records
+
+    entries whose record has no url are dropped, as are those whose url contains
+    'atisExe' or 'solr/select'
+    """
+    ret_val = []
+    for entry in parse_modsec_audit_log(file):
+        rec = parse_raw_request(entry)
+        url = rec.get('url', None) if rec else None
+        if not url:
+            continue
+        if 'atisExe' in url or 'solr/select' in url:
+            continue
+        ret_val.append(rec)
+    return ret_val
+
+
+def simple_test(parse=True):
+    """
+    cmdline test driver: parse a mod_security2 audit log and print what was found
+
+    :param parse: when True, print the raw record (see parse_raw_request) of each log entry;
+                  when False, dump each entry's id, section letters and section contents
+    """
+    # default to the sample audit log under docs/ (modsec_audit.log, not .txt)
+    cmd = file_cmdline("bin/parser_modsec_test", "docs/modsec_audit.log")
+    parsed_entries = parse_modsec_audit_log(cmd.file)
+    for e in parsed_entries:
+        if parse:
+            raw = parse_raw_request(e)
+            print(raw)
+            pro = parse_processed(raw)  # exercised only; the result is not printed
+        else:
+            print(f"ID: {e['id']}")
+            print("Sections:", list(e.keys()))
+            for k in list(e.keys()):
+                print(e.get(k, 'No section {}'.format(k)))
+                print()
+                #break
+            print("=" * 60)
+            print()
+        
@@ -1,5 +1,5 @@
 """
-  Will read log_parser report .json data, and work to sort and catogrize based on user and place
+  Will read log_parser report .json data, and work to sort and categorize based on user and place
 
   Measures:
    - requests with the same start and end lat,log coordinates
 
@@ -1,6 +1,7 @@
 """
 stats
 """
+from datetime import datetime, timedelta
 from urllib import request
 from ott.log_parser.db.processed_requests import ProcessedRequests
 from .. import utils
@@ -33,14 +34,18 @@ def __init__(self, session):
                 c['related'] += 1
 
     def print(self):
+        """
+        print a request-count report to stdout: a date line, the total and unique request
+        counts, then one table row per app name (sorted), showing that app's
+        'full', 'filtered' and 'related' counts
+        """
+        # weekday names indexed by datetime.weekday(), where Monday is 0
         days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
         now = datetime.today() - timedelta(days=1)  # offset for cron job processing yesterday's data
         names = sorted(self.app_counts)
-        print("\nTotal Requests: {}".format(self.total_plans))
+        # NOTE: despite its name, 'now' is yesterday, so the header shows yesterday's date
         print("\n{} {}".format(days[now.weekday()], now.strftime("%B %d, %Y")))
         print("Total Requests:  {}".format(self.total_plans))
         print("Unique Requests: {}\n".format(self.filtered_plans))
         print("  {:40} {:8} {:8} {:8}".format("APP NAME", "    total", "filtered", " related"))
         print("  {:40} {:8} {:8} {:8}".format("--------", "    -----", "--------", " -------"))
         for n in names:
             o = self.app_counts[n]
             print("  {:40}: {:8} {:8} {:8}".format(n, o['full'], o['filtered'], o['related']))
+        # trailing blank line after the table
         print()
 
 
 def main():