# scrape-deploy.yml
# Workflow identity and the events that start it.
name: Scrape & Deploy Release Notes

on:
  # Weekly refresh: every Monday at 06:00 UTC.
  schedule:
    - cron: '0 6 * * 1'

  # Manual runs from the Actions tab or the API.
  workflow_dispatch:

  # Pushes to main that touch the scraper, the site, or the tests.
  push:
    branches:
      - main
    paths:
      - 'scraper/**'
      - 'web/**'
      - 'tests/**'
jobs:
  # Gate job: the parser tests must pass before anything is scraped or published.
  test:
    runs-on: ubuntu-latest
    # Only needs to check out the repository.
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install requests beautifulsoup4 lxml pytest
      - name: Run parser tests (no network needed)
        run: python -m pytest tests/ -v

  # Scrapes release notes into the SQLite databases, stamps the site with a
  # cache-busting version, persists the databases, and publishes ./web.
  scrape-and-deploy:
    needs: test
    runs-on: ubuntu-latest
    # A scheduled run and a push-triggered run can overlap; serialise them so
    # they do not race on the database cache or on the gh-pages branch.
    concurrency:
      group: ${{ github.workflow }}-scrape-and-deploy
      cancel-in-progress: false
    # peaceiris/actions-gh-pages publishes by pushing to a branch, which only
    # requires write access to repository contents.
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install requests beautifulsoup4 lxml

      - name: Restore cached databases
        id: db-cache
        # Restore-only variant: the combined actions/cache action would also
        # try to save in its post-step, duplicating the explicit save below.
        uses: actions/cache/restore@v4
        with:
          path: |
            db/release_notes.db
            db/workbook_release_notes.db
          # Per-run key: never matches exactly, so the restore always falls
          # back to the newest cache under the restore-keys prefix and every
          # run can save a fresh entry (cache entries are immutable).
          # run_attempt keeps the key unique when a run is re-run.
          # Changing the key prefix (e.g. v2→v3) forces a full re-scrape when needed.
          key: release-notes-db-v2-${{ github.run_id }}-${{ github.run_attempt }}
          restore-keys: |
            release-notes-db-v2-

      - name: Run Vantagepoint scraper
        # --resume        : skip already-scraped old releases
        # --recheck 10    : always re-fetch last 10 releases to detect retroactive amendments
        # --concurrency 4 : parallel fetches (polite — 0.5s delay per worker)
        run: python scraper/scraper.py --resume --recheck 10 --concurrency 4

      - name: Run WorkBook scraper
        run: python scraper/workbook_scraper.py --resume --recheck 10 --concurrency 4

      - name: Run DB changes scraper
        # NOTE(review): presumably writes into db/release_notes.db, since the
        # db-changes page below loads that file — confirm against the scraper.
        run: python scraper/db_changes_scraper.py --resume --recheck 10 --concurrency 4

      - name: Copy DBs to web directory
        run: |
          cp db/release_notes.db web/release_notes.db
          cp db/workbook_release_notes.db web/workbook_release_notes.db

      - name: Inject cache-busting version into HTML files
        # The version is a content hash of each database rather than the
        # commit SHA: scheduled runs reuse the same SHA when nothing was
        # committed, which would leave the URL unchanged for a refreshed DB.
        run: |
          set -euo pipefail
          main_ver="$(sha256sum web/release_notes.db | cut -c1-16)"
          workbook_ver="$(sha256sum web/workbook_release_notes.db | cut -c1-16)"
          sed -i "s|\"./release_notes.db\"|\"./release_notes.db?v=${main_ver}\"|" web/index.html
          sed -i "s|\"../workbook_release_notes.db\"|\"../workbook_release_notes.db?v=${workbook_ver}\"|" web/workbook/index.html
          sed -i "s|\"../release_notes.db\"|\"../release_notes.db?v=${main_ver}\"|" web/db-changes/index.html

      - name: Save updated databases
        # Saved before the deploy step so a failed deploy does not discard
        # the freshly scraped data.
        uses: actions/cache/save@v4
        with:
          path: |
            db/release_notes.db
            db/workbook_release_notes.db
          # Reuse the exact key computed by the restore step.
          key: ${{ steps.db-cache.outputs.cache-primary-key }}

      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@v4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./web
          publish_branch: gh-pages
          cname: ""  # set your custom domain here if needed