Skip to content

Commit 3656947

Browse files
authored
Merge pull request #2 from OpenTransitTools/rtp
rtp - graphql POST processor (from apache log)
2 parents 0b2d7e6 + 2340d0a commit 3656947

14 files changed

Lines changed: 826 additions & 55 deletions

File tree

README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,13 @@
11
log_parser
22
====
3-
43
parse apache logs, pulling out OTP trip planner urls, and parsing that data into a .csv or .json file
4+
5+
6+
quickstart
7+
====
8+
- buildout
9+
- scripts/db/create.sh
10+
- bin/load_and_post_process -l ./docs -c
11+
- (see entries in the new 'logs' database)
12+
- bin/stats
13+

docs/modsec_audit.log

Lines changed: 449 additions & 0 deletions
Large diffs are not rendered by default.

ott/log_parser/control/loader.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from ott.log_parser.control import parser
2+
from ott.log_parser.control import parser_modsec
23

34
from ott.log_parser.db.processed_requests import ProcessedRequests
45
from .. import utils
@@ -11,17 +12,25 @@
1112

1213
def load_log_file(file, session):
    """
    Load a log file into the db.

    The file is first handed to the Apache access-log parser. When that parser
    fails or yields no records, the file is retried with the mod_security
    audit-log parser. Any records found are wrapped in RawLog objects and
    persisted through the given session.

    :param file: path of the log file to load
    :param session: db session handed to RawLog.persist_data
    """
    try:
        recs = parser.parse_log_file(file)
    except Exception as e:
        # best-effort: a file the access-log parser chokes on may still be a mod_security file,
        # so note the failure and fall through (KeyboardInterrupt / SystemExit are not caught)
        log.warning("access log parser failed on file {}: {}".format(file, e))
        recs = None

    if not recs:
        # with no recs from first parser, maybe this is a mod_security file containing trip plans
        recs = parser_modsec.parse_log_file(file)

    if recs:
        log.info("from file {}, parsed {} number of records".format(file, len(recs)))
        logs = [RawLog(r) for r in recs]
        RawLog.persist_data(session, logs)
2131

2232

2333
def loader():
24-
#import pdb; pdb.set_trace()
2534
files, cmdline = utils.cmd_line_loader()
2635
if len(files) == 0:
2736
if cmdline.log_directory == "CLEAR":

ott/log_parser/control/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Will parse Apache log file, looking for requests to the OpenTripPlanner (OTP)
2+
Will parse Apache access log file, looking for requests to the OpenTripPlanner (OTP)
33
44
:note: individual log entries look like this string
55
172.25.102.10 "172.25.102.10" - - [26/Jan/2021:10:36:23 -0800] "GET /?sessionId=blah HTTP/1.1" 200 2972 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
"""
2+
Will parse mod_security log files
3+
This is needed for OpenTripPlanner (OTP) v2.x, which uses HTTP POST (graphql) requests,
4+
and with Apache, the mod_security2 module is how you see these requests
5+
"""
6+
7+
import os
8+
import re
9+
from ott.utils.parse.cmdline.base_cmdline import file_cmdline
10+
11+
12+
def parse_modsec_audit_log(filename):
    """
    Read a mod_security2 audit log and return a list of dicts, one per record.

    Originally written by Claude (github) in Aug 2025 (with a couple hacks by Frank).

    A record starts at a boundary line like '--abc123-A--' and is made up of
    lettered sections ('--abc123-B--', '--abc123-C--', ...). Each returned dict
    has the key 'id' (the record's hex id) plus one key per section letter,
    whose value is that section's text with surrounding whitespace stripped.

    For this project, we're mostly interested in section 'C', which contains the
    graphql POST payload input into OTP. We also want other sections, like the
    referrer and device used, etc...

    Boundaries are matched as whole lines, so the first record in the file,
    back-to-back boundaries (an empty section with no blank line) and a final
    boundary without a trailing newline are all recognized.

    :param filename: path to the audit log (read as utf-8, undecodable bytes ignored)
    :return: list of dicts, in file order
    """
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    boundary = re.compile(r"^--([a-fA-F0-9]+)-([A-Z])--$", re.MULTILINE)

    entries = []
    entry = None      # record currently being filled
    letter = None     # section letter currently open in that record
    body_start = 0    # offset where the open section's text begins

    for m in boundary.finditer(content):
        uid, sec = m.group(1), m.group(2)

        # only an 'A' boundary opens a record; any other boundary counts only when
        # it carries the id of the open record, otherwise it is plain section text
        if sec != "A" and (entry is None or uid != entry["id"]):
            continue

        if entry is not None:
            entry[letter] = content[body_start:m.start()].strip()

        if sec == "A":
            entry = {"id": uid}
            entries.append(entry)

        letter = sec
        body_start = m.end()

    if entry is not None:
        # close the last open section at end of file
        entry[letter] = content[body_start:].strip()

    return entries
49+
50+
51+
def parse_section_a(req):
    """
    Pull the timestamp and the first IP address out of section 'A' (audit log header).

    --9e7b8111-A--
    [10/Aug/2025:15:39:41 --0700] xxx 172.25.90.86 55986 172.25.102.224 443

    :param req: record dict; section text is read from key 'A'
    :return: tuple (date, ip); date is the raw text between the first pair of
             square brackets (not converted to a datetime), ip is the first
             dotted-quad in the section. Either is None when not found, and
             both are None when the section is missing or not a string.
    """
    sec_a = req.get("A")
    if not isinstance(sec_a, str):
        return None, None

    date_match = re.search(r"\[(.*?)\]", sec_a)
    ip_match = re.search(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", sec_a)

    date = date_match.group(1) if date_match else None
    ip = ip_match.group(0) if ip_match else None
    return date, ip
73+
74+
75+
def parse_section_b(req):
    """
    Pull the user agent, POST url and referer out of section 'B' (request headers).

    POST /rtp/gtfs/v1 HTTP/1.1
    Referer: https://labs-5.trimet.org/
    User-Agent: Mozilla/5.0 (Win...

    Patterns are anchored to whole lines rather than requiring a trailing
    newline, so a value on the last line of the section is still found.
    Header names are matched case-insensitively. Only POST request lines
    yield a url.

    :param req: record dict; section text is read from key 'B'
    :return: tuple (user_agent, url, referer); each is "" when not found,
             and all are "" when the section is missing or not a string
    """
    sec_b = req.get("B")
    if not isinstance(sec_b, str):
        return "", "", ""

    def first_group(pattern, flags):
        # first capture group of the first match, or "" when there is no match
        m = re.search(pattern, sec_b, flags)
        return m.group(1) if m else ""

    header_flags = re.MULTILINE | re.IGNORECASE
    user_agent = first_group(r"^User-Agent:[ \t]*(.*)$", header_flags)
    referer = first_group(r"^Referer:[ \t]*(.*)$", header_flags)
    url = first_group(r"^POST (.*) HTTP.*$", re.MULTILINE)

    return user_agent, url, referer
108+
109+
110+
def parse_section_c(req):
    """
    Pull the graphql variables out of section 'C' (the POST payload).

    --9e7b8111-C--
    ....description\ninputField\n}\n}\n}\n","variables":{"date":"2025-08-10","time":"15:23",...

    The text right of the first '"variables":' key is read as one JSON value and
    returned as the raw substring, so keys that follow it (e.g. an operation name)
    are not dragged along. When that text is not valid JSON, everything right of
    the key minus the final character (the payload's dangling bracket) is returned.

    NOTE(review): the nesting of the original else-branch is not recoverable from
    this view; it is assumed to belong to the 'variables' check, i.e. a payload
    with a query but no variables is returned whole -- confirm.

    :param req: record dict; section text is read from key 'C'
    :return: the variables JSON text; the whole payload when it has a query but no
             '"variables":' key; None when the section is missing, not a string,
             or does not contain "query"
    """
    import json  # function-scope import: json is not imported at file level

    sec_c = req.get("C")
    if not isinstance(sec_c, str) or "query" not in sec_c:
        return None

    _, found, tail = sec_c.partition("variables\":")
    if not found:
        return sec_c

    tail = tail.lstrip()
    try:
        _, end = json.JSONDecoder().raw_decode(tail)
    except ValueError:
        # not parseable JSON: return things right of the variables, except for dangling bracket
        return tail[:-1]
    return tail[:end]
131+
132+
133+
def parse_section_f(req, def_code="520"):
    """
    Pull the HTTP status code out of section 'F' (response headers).

    --ac12e444-F--
    HTTP/1.1 200 OK
    Content-Encoding: gzip

    The code is the three digits that directly follow the protocol token at the
    start of a line, so digits later in the status line are not mistaken for it.

    :param req: record dict; section text is read from key 'F'
    :param def_code: value returned when no status line is found
    :return: the status code as a string, or def_code when the section is
             missing, not a string, or has no status line
    """
    sec_f = req.get("F")
    if not isinstance(sec_f, str):
        return def_code

    status = re.search(r"^HTTP\S*[ \t]+(\d{3})\b", sec_f, re.MULTILINE)
    return status.group(1) if status else def_code
151+
152+
153+
def parse_raw_request(req):
    """
    Flatten one mod_security2 log record (dict of sections) into its 'raw' elements.

    Keys follow the attribute names used by parser.py:
    '{ip} - - [{apache_dt}] "{meth} {url} {http}" {code} {size} "{referer}" "{browser}"\n'

    :param req: record dict holding the lettered sections
    :return: dict with keys ip, apache_dt, browser, url, referer, payload, code
    """
    apache_dt, ip = parse_section_a(req)
    browser, url, referer = parse_section_b(req)
    payload = parse_section_c(req)
    code = parse_section_f(req)

    return {
        'ip': ip,
        'apache_dt': apache_dt,
        'browser': browser,
        'url': url,
        'referer': referer,
        'payload': payload,
        'code': code,
    }
176+
177+
178+
def parse_processed(req):
    """
    Build the 'processed' record skeleton for a given raw mod_security2 record.

    Every field is currently an empty-string placeholder; nothing is read from
    req yet.
    NOTE(review): the original docstring said this parses out the 'ul' elements
    (presumably 'url') -- confirm the intended field mapping before filling in.

    :param req: raw record dict (currently unused)
    :return: dict with keys ip, date, url, code, referrer, browser
    """
    rec = {
        'ip': "",
        'date': "",
        'url': "",
        'code': "",
        'referrer': "",
        'browser': "",
    }
    return rec
189+
190+
191+
def parse_log_file(file: os.PathLike):
    """
    Parse a mod_security2 audit log into a list of raw request records.

    Records without a url are dropped, as are records whose url contains
    'atisExe' or 'solr/select'.

    :param file: path to the audit log
    :return: list of record dicts as built by parse_raw_request
    """
    skip_tokens = ('atisExe', 'solr/select')
    records = []
    for entry in parse_modsec_audit_log(file):
        rec = parse_raw_request(entry)
        url = rec.get('url', None) if rec else None
        if not url:
            continue
        if any(token in url for token in skip_tokens):
            continue
        records.append(rec)
    return records
201+
202+
203+
def simple_test(parse=True):
""" command-line smoke test: parse an audit log and print what was found for each record """
204+
# presumably file_cmdline builds a command-line parser whose result exposes the chosen path as .file -- confirm
# NOTE(review): this commit adds docs/modsec_audit.log; confirm the .txt path below exists
cmd = file_cmdline("bin/parser_modsec_test", "docs/modsec_audit.txt")
205+
parsed_entries = parse_modsec_audit_log(cmd.file)
206+
for e in parsed_entries:
207+
# parse=True: print the flattened raw record built from the sections
if parse:
208+
raw = parse_raw_request(e)
209+
print(raw)
210+
# the result of parse_processed is assigned but not used here
pro = parse_processed(raw)
211+
# parse=False: dump the record id, its section keys and each section's text
else:
212+
print(f"ID: {e['id']}")
213+
print("Sections:", list(e.keys()))
214+
for k in list(e.keys()):
215+
print(e.get(k, 'No section {}'.format(k)))
216+
print()
217+
#break
218+
print("=" * 60)
219+
print()
220+

ott/log_parser/control/reporter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Will read log_parser report .json data, and work to sort and catogrize based on user and place
2+
Will read log_parser report .json data, and work to sort and categorize based on user and place
33
44
Measures:
55
- requests with the same start and end lat,lon coordinates

ott/log_parser/control/stats.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
stats
33
"""
4+
from datetime import datetime, timedelta
45
from urllib import request
56
from ott.log_parser.db.processed_requests import ProcessedRequests
67
from .. import utils
@@ -33,14 +34,18 @@ def __init__(self, session):
3334
c['related'] += 1
3435

3536
def print(self):
    """
    Write the request statistics report to stdout.

    The heading carries yesterday's weekday and date, followed by the total and
    unique request counts and one row per app name (sorted) with its
    full / filtered / related counts.
    """
    weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    # offset for cron job processing yesterday's data
    report_day = datetime.today() - timedelta(days=1)
    app_names = sorted(self.app_counts)

    heading = "\n{} {}".format(weekday_names[report_day.weekday()], report_day.strftime("%B %d, %Y"))
    print(heading)
    print("Total Requests: {}".format(self.total_plans))
    print("Unique Requests: {}\n".format(self.filtered_plans))

    column_row = " {:40} {:8} {:8} {:8}"
    print(column_row.format("APP NAME", " total", "filtered", " related"))
    print(column_row.format("--------", " -----", "--------", " -------"))

    for app_name in app_names:
        counts = self.app_counts[app_name]
        print(" {:40}: {:8} {:8} {:8}".format(app_name, counts['full'], counts['filtered'], counts['related']))
    print()
4449

4550

4651
def main():

0 commit comments

Comments
 (0)