11import logging
22import random
3+ from datetime import datetime , timedelta , timezone
4+
5+ from django .db .models .query import QuerySet
6+ from shared .django_apps .reports .models import ReportSession as Upload
37
48from services .cleanup .cleanup import run_cleanup
59from services .cleanup .utils import CleanupResult , CleanupSummary , cleanup_context
@@ -11,8 +15,7 @@ def run_regular_cleanup() -> CleanupSummary:
1115 log .info ("Starting regular cleanup job" )
1216 complete_summary = CleanupSummary (CleanupResult (0 ), summary = {})
1317
14- # Usage of these models was removed, and we should clean up all their data before dropping the tables for good.
15- cleanups_to_run = []
18+ cleanups_to_run = create_upload_cleanup_jobs ()
1619
1720 # as we expect this job to have frequent retries, and cleanup to take a long time,
1821 # lets shuffle the various cleanups so that each one of those makes a little progress.
@@ -27,9 +30,51 @@ def run_regular_cleanup() -> CleanupSummary:
2730 complete_summary .add (summary )
2831
2932 # TODO:
30- # - cleanup old `ReportSession`s (aka `Upload`s)
3133 # - cleanup `Commit`s that are `deleted`
3234 # - figure out a way how we can first mark, and then fully delete `Branch`es
3335
3436 log .info ("Regular cleanup finished" )
3537 return complete_summary
38+
39+
# Days we keep `Upload` rows before they become eligible for deletion.
UPLOAD_RETENTION_PERIOD = 150
# Number of ~30-day chunks to generate, going back roughly 10 years.
MONTH_SLOTS = 120


def create_upload_cleanup_jobs() -> list[QuerySet]:
    """
    Return a list of `Upload` querysets, each targeting a subset of to-delete data.

    As the `Upload` table is one of our biggest tables, running an (almost)
    unbounded `DELETE` query would certainly cause problems.

    Fortunately though, the (production) table has an index on `created_at`,
    so queries targeting a range on that field should be fairly quick, and we
    can use that to divide up the deletion workload into more manageable chunks.

    We are targeting 30-day chunks, going back ~10 years.
    As the main cleanup task above is using a `random.shuffle`, and the cleanup
    task itself is being restarted/retried on timeouts, this will end up with
    an even distribution of cleanup tasks running concurrently, deleting different
    chunks of this table.
    """
    # Everything created before this timestamp is past retention.
    latest_timestamp = datetime.now(timezone.utc) - timedelta(
        days=UPLOAD_RETENTION_PERIOD
    )

    # Build the chunk boundaries going backwards in 30-day steps, then reverse
    # so they are in ascending order (oldest boundary first).
    timestamps = [latest_timestamp]
    for _ in range(MONTH_SLOTS):
        timestamps.append(timestamps[-1] - timedelta(days=30))
    timestamps.reverse()

    begin_timestamp = None
    queries: list[QuerySet] = []
    for timestamp in timestamps:
        query = Upload.objects
        # The first (oldest) chunk has no lower bound, so rows older than the
        # oldest boundary are swept up as well.
        if begin_timestamp is not None:
            query = query.filter(created_at__gte=begin_timestamp)
        query = query.filter(created_at__lt=timestamp)
        queries.append(query)

        begin_timestamp = timestamp

    return queries
0 commit comments