Skip to content
This repository was archived by the owner on May 5, 2025. It is now read-only.

Commit ed4b50d

Browse files
authored
Add regular Upload cleanup (#1219)
1 parent 9de1f0d commit ed4b50d

6 files changed

Lines changed: 259 additions & 7 deletions

services/cleanup/regular.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
import logging
22
import random
3+
from datetime import datetime, timedelta, timezone
4+
5+
from django.db.models.query import QuerySet
6+
from shared.django_apps.reports.models import ReportSession as Upload
37

48
from services.cleanup.cleanup import run_cleanup
59
from services.cleanup.utils import CleanupResult, CleanupSummary, cleanup_context
@@ -11,8 +15,7 @@ def run_regular_cleanup() -> CleanupSummary:
1115
log.info("Starting regular cleanup job")
1216
complete_summary = CleanupSummary(CleanupResult(0), summary={})
1317

14-
# Usage of these model was removed, and we should clean up all its data before dropping the table for good.
15-
cleanups_to_run = []
18+
cleanups_to_run = create_upload_cleanup_jobs()
1619

1720
# as we expect this job to have frequent retries, and cleanup to take a long time,
1821
# lets shuffle the various cleanups so that each one of those makes a little progress.
@@ -27,9 +30,51 @@ def run_regular_cleanup() -> CleanupSummary:
2730
complete_summary.add(summary)
2831

2932
# TODO:
30-
# - cleanup old `ReportSession`s (aka `Upload`s)
3133
# - cleanup `Commit`s that are `deleted`
3234
# - figure out a way how we can first mark, and then fully delete `Branch`es
3335

3436
log.info("Regular cleanup finished")
3537
return complete_summary
38+
39+
40+
# Retention window in days: only uploads created more than this many days ago
# are targeted by the cleanup querysets below.
UPLOAD_RETENTION_PERIOD = 150
# Number of consecutive 30-day windows generated behind the retention cutoff.
MONTH_SLOTS = 120


def create_upload_cleanup_jobs() -> list[QuerySet]:
    """
    Build the list of `Upload` querysets that together select all to-delete rows.

    The `Upload` table is one of our biggest tables, so a single (almost)
    unbounded `DELETE` would certainly cause problems. The (production) table
    has an index on `created_at`, so range queries on that field should be
    fairly quick, and we use that to divide the deletion workload into more
    manageable chunks.

    Chunks are 30 days wide and reach back ~10 years (`MONTH_SLOTS` windows)
    from the retention cutoff (`UPLOAD_RETENTION_PERIOD` days before now, UTC).
    One extra leading queryset has no lower bound and catches everything older
    than the oldest window, so `MONTH_SLOTS + 1` querysets are returned,
    ordered from oldest to newest.

    The main cleanup task shuffles its cleanups with `random.shuffle` and is
    restarted/retried on timeouts, so concurrently running tasks end up evenly
    spread over different chunks of this table.
    """
    window = timedelta(days=30)
    cutoff = datetime.now(timezone.utc) - timedelta(days=UPLOAD_RETENTION_PERIOD)

    # Upper bounds of every chunk, oldest first, ending with the cutoff itself.
    upper_bounds = [cutoff - window * step for step in range(MONTH_SLOTS, -1, -1)]
    # Each chunk starts where the previous one ended; the very first chunk is
    # open-ended towards the past.
    lower_bounds = [None, *upper_bounds[:-1]]

    jobs: list[QuerySet] = []
    for lower, upper in zip(lower_bounds, upper_bounds):
        if lower is None:
            chunk = Upload.objects.filter(created_at__lt=upper)
        else:
            # Keep the `>=` filter before the `<` filter so the generated SQL
            # (and its parameter order) stays the same.
            chunk = Upload.objects.filter(created_at__gte=lower).filter(
                created_at__lt=upper
            )
        jobs.append(chunk)

    return jobs
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
-- UploadError
2+
DELETE
3+
FROM "reports_uploaderror"
4+
WHERE "reports_uploaderror"."upload_id" IN
5+
(SELECT U0."id"
6+
FROM "reports_upload" U0
7+
WHERE (U0."created_at" >= %s
8+
AND U0."created_at" < %s));
9+
-- [2024-10-25 00:00:00+00:00, 2024-11-24 00:00:00+00:00]
10+
11+
12+
-- UploadFlagMembership
13+
DELETE
14+
FROM "reports_uploadflagmembership"
15+
WHERE "reports_uploadflagmembership"."upload_id" IN
16+
(SELECT U0."id"
17+
FROM "reports_upload" U0
18+
WHERE (U0."created_at" >= %s
19+
AND U0."created_at" < %s));
20+
-- [2024-10-25 00:00:00+00:00, 2024-11-24 00:00:00+00:00]
21+
22+
23+
-- UploadLevelTotals
24+
DELETE
25+
FROM "reports_uploadleveltotals"
26+
WHERE "reports_uploadleveltotals"."upload_id" IN
27+
(SELECT U0."id"
28+
FROM "reports_upload" U0
29+
WHERE (U0."created_at" >= %s
30+
AND U0."created_at" < %s));
31+
-- [2024-10-25 00:00:00+00:00, 2024-11-24 00:00:00+00:00]
32+
33+
34+
-- TestInstance
35+
DELETE
36+
FROM "reports_testinstance"
37+
WHERE "reports_testinstance"."upload_id" IN
38+
(SELECT U0."id"
39+
FROM "reports_upload" U0
40+
WHERE (U0."created_at" >= %s
41+
AND U0."created_at" < %s));
42+
-- [2024-10-25 00:00:00+00:00, 2024-11-24 00:00:00+00:00]
43+
44+
45+
-- ReportSession
46+
DELETE
47+
FROM "reports_upload"
48+
WHERE ("reports_upload"."created_at" >= %s
49+
AND "reports_upload"."created_at" < %s);
50+
-- [2024-10-25 00:00:00+00:00, 2024-11-24 00:00:00+00:00]

0 commit comments

Comments
 (0)