From 66ff7006132436e321ea8889ea79c637d305a29a Mon Sep 17 00:00:00 2001
From: D-VR <26770468+D-VR@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:09:37 +0200
Subject: [PATCH 1/5] Fix name-clash collision dropping same-named files

Files added with name_clash_id=None (direct external links, direct Moodle
content files, and embedded videojs nodes) all hashed md5("None") when
disambiguating same-named siblings, so two distinct same-named files
resolved to one path and one was silently lost.

Add Node._clash_suffix(), which falls back to the node's URL when
name_clash_id is None. At the rename branch the siblings differ precisely
because their URLs differ, so the URL is a present, distinct key.
---
 syncmymoodle/__main__.py             | 34 ++++++++++++----------------
 tests/test_course_prefix_handling.py | 29 ++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/syncmymoodle/__main__.py b/syncmymoodle/__main__.py
index fe47223..001d6ff 100755
--- a/syncmymoodle/__main__.py
+++ b/syncmymoodle/__main__.py
@@ -195,6 +195,17 @@ def go_to_path(self, target_path):
                 raise Exception("The path is not found in this root node. Wrong path?")
         return target_node[-1]
 
+    def _clash_suffix(self):
+        # Stable, distinct suffix used to disambiguate same-named siblings.
+        # Fall back to the URL when no name_clash_id is set (direct-link,
+        # embedded, and direct-content file nodes pass name_clash_id=None);
+        # otherwise such nodes would all hash to md5("None") and collide onto
+        # the same path, silently dropping all but one file.
+        key = self.name_clash_id if self.name_clash_id is not None else self.url
+        return base64.urlsafe_b64encode(
+            hashlib.md5(str(key).encode("utf-8")).hexdigest().encode("utf-8")
+        ).decode()[:10]
+
     def remove_children_nameclashes(self):
         # Check for duplicate filenames
 
@@ -248,30 +259,15 @@ def remove_children_nameclashes(self):
                 )
             ]
             if len(siblings) > 0:
-                # if a filename is still duplicate in its directory, we rename it by appending its id (urlsafe base64 so it also works for urls).
+                # if a filename is still duplicate in its directory, we rename
+                # it by appending a stable per-node key (works for ids and urls).
                 filename = Path(child.name)
                 child.name = (
-                    filename.stem
-                    + "_"
-                    + base64.urlsafe_b64encode(
-                        hashlib.md5(str(child.name_clash_id).encode("utf-8"))
-                        .hexdigest()
-                        .encode("utf-8")
-                    ).decode()[:10]
-                    + filename.suffix
+                    filename.stem + "_" + child._clash_suffix() + filename.suffix
                 )
                 for s in siblings:
                     filename = Path(s.name)
-                    s.name = (
-                        filename.stem
-                        + "_"
-                        + base64.urlsafe_b64encode(
-                            hashlib.md5(str(s.name_clash_id).encode("utf-8"))
-                            .hexdigest()
-                            .encode("utf-8")
-                        ).decode()[:10]
-                        + filename.suffix
-                    )
+                    s.name = filename.stem + "_" + s._clash_suffix() + filename.suffix
                     self.children.remove(s)
                 unclashed_children.extend(siblings)
 
diff --git a/tests/test_course_prefix_handling.py b/tests/test_course_prefix_handling.py
index 2cd9b98..a01ebef 100644
--- a/tests/test_course_prefix_handling.py
+++ b/tests/test_course_prefix_handling.py
@@ -116,3 +116,32 @@ def test_same_name_with_different_urls_still_gets_stable_suffixes():
     assert "Slides" not in names
     for name in names:
         assert name.startswith("Slides_")
+
+
+def test_clashing_files_without_name_clash_id_use_url_for_distinct_names():
+    # Direct-link / direct-content / embedded nodes pass name_clash_id=None.
+    # Two such same-named files with different URLs must still get distinct
+    # names (falling back to the URL) instead of both hashing md5("None").
+    root = Node("", -1, "Root", None)
+    section = root.add_child("General", None, "Section")
+    section.add_child(
+        "slides.pdf",
+        None,
+        "Linked file [application/pdf]",
+        url="https://a.example/slides.pdf",
+        name_clash_id=None,
+    )
+    section.add_child(
+        "slides.pdf",
+        None,
+        "Linked file [application/pdf]",
+        url="https://b.example/slides.pdf",
+        name_clash_id=None,
+    )
+
+    root.remove_children_nameclashes()
+
+    names = [child.name for child in section.children]
+    assert len(names) == 2
+    assert len(set(names)) == 2
+    assert "slides.pdf" not in names

From 810d1e8290c4fc27c7bcdc3bf9ddac501bb85fb1 Mon Sep 17 00:00:00 2001
From: D-VR <26770468+D-VR@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:11:05 +0200
Subject: [PATCH 2/5] Match selected_courses/skip_courses by exact id, with
 selected overriding skip

Course filtering matched configured URLs with `str(course_id) in url`, a
substring test: selecting course 12 also pulled in courses 1 and 2, and
skipping 12 silently dropped 1 and 2. Add _course_id_in_filter(), which
compares the parsed `id` query parameter (or a bare numeric entry) exactly.

Also make selected_courses a true allowlist that overrides skip_courses
(and only_sync_semester), as documented; previously skip_courses was applied
first and could not be overridden. Harden shortname/idnumber access so a
missing or empty field no longer aborts the sync or creates an empty-named
semester folder.
---
 syncmymoodle/__main__.py     | 58 +++++++++++++++++++-----------------
 tests/test_sync_filtering.py | 47 +++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 27 deletions(-)

diff --git a/syncmymoodle/__main__.py b/syncmymoodle/__main__.py
index 001d6ff..ab5e2c2 100755
--- a/syncmymoodle/__main__.py
+++ b/syncmymoodle/__main__.py
@@ -645,6 +645,23 @@ def _as_list(self, value):
             return value
         return [value]
 
+    def _course_id_in_filter(self, course_id, entries) -> bool:
+        """Return True if ``course_id`` is referenced by a configured entry.
+
+        Entries are course URLs (``.../course/view.php?id=NNN``). The ``id``
+        query parameter is compared exactly, so e.g. ``id=12`` does not also
+        match courses ``1`` or ``2``. A bare numeric id entry is also accepted.
+        """
+        course_id = str(course_id)
+        for entry in entries or []:
+            entry = str(entry)
+            parsed = urllib.parse.urlparse(entry)
+            if course_id in urllib.parse.parse_qs(parsed.query).get("id", []):
+                return True
+            if entry.strip() == course_id:
+                return True
+        return False
+
     def _configured_patterns(self, *keys, course_id=None):
         patterns = []
         for key in keys:
@@ -1371,39 +1388,26 @@ def sync(self):
 
         # Syncing all courses
         for course in self.get_all_courses():
-            course_name = self._format_course_name(course["shortname"])
+            course_name = self._format_course_name(
+                course.get("shortname") or f"course-{course.get('id')}"
+            )
             course_id = course["id"]
 
-            if (
-                len(
-                    [
-                        c
-                        for c in self.config.get("skip_courses", [])
-                        if str(course_id) in c
-                    ]
-                )
-                > 0
-            ):
-                continue
-
-            # Skip not selected courses
-            if (
-                len(self.config.get("selected_courses", [])) > 0
-                and len(
-                    [
-                        c
-                        for c in self.config.get("selected_courses", [])
-                        if str(course["id"]) in c
-                    ]
-                )
-                == 0
+            selected_courses = self.config.get("selected_courses", [])
+            if selected_courses:
+                # selected_courses is an explicit allowlist that overrides
+                # skip_courses (and, below, only_sync_semester).
+                if not self._course_id_in_filter(course_id, selected_courses):
+                    continue
+            elif self._course_id_in_filter(
+                course_id, self.config.get("skip_courses", [])
             ):
                 continue
 
-            semestername = course["idnumber"][:4]
-            # Skip not selected semesters
+            semestername = (course.get("idnumber") or "")[:4] or "unknown-semester"
+            # Skip not selected semesters (selected_courses overrides this)
             if (
-                len(self.config.get("selected_courses", [])) == 0
+                not selected_courses
                 and self.config.get("only_sync_semester", [])
                 and semestername not in self.config.get("only_sync_semester", [])
             ):
diff --git a/tests/test_sync_filtering.py b/tests/test_sync_filtering.py
index e9d7ee1..693456a 100644
--- a/tests/test_sync_filtering.py
+++ b/tests/test_sync_filtering.py
@@ -49,3 +49,50 @@ def test_skip_courses_and_semester_filter_limit_synced_courses():
         "Semester | 26ss |  |  | ",
         "Course | 26ss/Current Semester |  |  | ",
     ]
+
+
+# Course ids that are substrings of one another, to pin down exact-id matching.
+SUBSTRING_COURSES = [
+    {"id": 1, "shortname": "Course One", "idnumber": "26ss-1"},
+    {"id": 2, "shortname": "Course Two", "idnumber": "26ss-2"},
+    {"id": 12, "shortname": "Course Twelve", "idnumber": "26ss-12"},
+    {"id": 123, "shortname": "Course OneTwoThree", "idnumber": "26ss-123"},
+]
+
+
+def _run_filter(config):
+    synced = []
+    syncer = make_syncer(config)
+    syncer.get_all_courses = lambda: SUBSTRING_COURSES  # type: ignore[method-assign]
+    syncer.get_course = lambda course_id: synced.append(course_id) or []  # type: ignore[method-assign]
+    syncer.session = FakeSession()
+    syncer.sync()
+    return synced
+
+
+def test_selected_courses_match_by_exact_id_not_substring():
+    # Selecting course 12 must not also pull in courses 1 and 2.
+    synced = _run_filter(
+        {"selected_courses": ["https://moodle.rwth-aachen.de/course/view.php?id=12"]}
+    )
+    assert synced == [12]
+
+
+def test_skip_courses_match_by_exact_id_not_substring():
+    # Skipping course 12 must not silently drop courses 1 and 2.
+    synced = _run_filter(
+        {"skip_courses": ["https://moodle.rwth-aachen.de/course/view.php?id=12"]}
+    )
+    assert synced == [1, 2, 123]
+
+
+def test_bare_numeric_id_entry_is_accepted():
+    synced = _run_filter({"selected_courses": ["12"]})
+    assert synced == [12]
+
+
+def test_selected_courses_override_skip_courses():
+    # A course present in both lists is synced: selected_courses wins.
+    url = "https://moodle.rwth-aachen.de/course/view.php?id=12"
+    synced = _run_filter({"selected_courses": [url], "skip_courses": [url]})
+    assert synced == [12]

From bf4fb2713e89205a0ca7a90872860a66dd1cc835 Mon Sep 17 00:00:00 2001
From: D-VR <26770468+D-VR@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:11:56 +0200
Subject: [PATCH 3/5] Harden login flow against unexpected page structure

The SSO/TOTP login and wstoken parsing assumed Moodle's exact HTML, so a
changed or error page raised AttributeError/TypeError instead of the
intended clean exit. Null-check the sesskey <script> and the maintenance
<body>; route every login form-field extraction (csrf_token, RelayState,
SAMLResponse) through a require_input_value helper that exits with a clear
message when a field is missing; and parse the mobile-launch token
defensively (isolate the token value, guard base64 decode and the ":::"
split). Also use sys.exit consistently.
---
 syncmymoodle/__main__.py | 61 ++++++++++++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/syncmymoodle/__main__.py b/syncmymoodle/__main__.py
index ab5e2c2..297a170 100755
--- a/syncmymoodle/__main__.py
+++ b/syncmymoodle/__main__.py
@@ -1004,13 +1004,33 @@ def _check_moodle_availability(self):
     def login(self):
         def get_session_key(soup):
             script = soup.find("script", string=lambda text: text and "sesskey" in text)
-            js_text = script.text
-            match = re.search(r'"sesskey":"(.*?)"', js_text)
+            match = (
+                re.search(r'"sesskey":"(.*?)"', script.text)
+                if script is not None
+                else None
+            )
             if match:
                 return match.group(1)
             else:
                 logger.critical("Can't retrieve session key from JavaScript config")
-                exit(1)
+                sys.exit(1)
+
+        def require_input_value(soup, name, context):
+            value = self._get_input_value(soup, name)
+            if value is None:
+                logger.critical(
+                    "Failed to login: expected form field %r was missing at the "
+                    "%s. The RWTH login flow may have changed or the servers may "
+                    "have difficulties. For current service status, see %s.",
+                    name,
+                    context,
+                    RWTH_STATUS_URL,
+                )
+                self._check_rwth_status_page()
+                logger.info("-------Login-Error-Soup--------")
+                logger.info(soup)
+                sys.exit(1)
+            return value
 
         self.session = requests.Session()
         cookie_file = Path(self.config.get("cookie_file", "./session")).expanduser()
@@ -1046,7 +1066,8 @@ def get_session_key(soup):
             alert.decompose()
 
         # Extract body text after cleanup
-        body_text = soup_check.find("body").get_text(separator=" ", strip=True)
+        body = soup_check.find("body")
+        body_text = body.get_text(separator=" ", strip=True) if body else ""
 
         # Check for maintenance notice
         if "Wartungsarbeiten" in body_text:
@@ -1058,7 +1079,9 @@ def get_session_key(soup):
 
         soup = bs(resp.text, features="lxml")
         if soup.find("input", {"name": "RelayState"}) is None:
-            csrf_token = soup.find("input", {"name": "csrf_token"})["value"]
+            csrf_token = require_input_value(
+                soup, "csrf_token", "username/password form"
+            )
             login_data = {
                 "j_username": self.config["user"],
                 "j_password": self.config["password"],
@@ -1081,7 +1104,9 @@ def get_session_key(soup):
                 logger.info(soup)
                 sys.exit(1)
 
-            csrf_token = soup.find("input", {"name": "csrf_token"})["value"]
+            csrf_token = require_input_value(
+                soup, "csrf_token", "TOTP generator selection form"
+            )
 
             print("Setting TOTP generator")
             totp_selection_data = {
@@ -1106,7 +1131,7 @@ def get_session_key(soup):
                 logger.info(soup)
                 sys.exit(1)
 
-            csrf_token = soup.find("input", {"name": "csrf_token"})["value"]
+            csrf_token = require_input_value(soup, "csrf_token", "TOTP entry form")
             if not self.config.get("totpsecret"):
                 totp_input = input(f"Enter TOTP for generator {self.config['totp']}:\n")
             else:
@@ -1135,8 +1160,8 @@ def get_session_key(soup):
             logger.info(soup)
             sys.exit(1)
         data = {
-            "RelayState": soup.find("input", {"name": "RelayState"})["value"],
-            "SAMLResponse": soup.find("input", {"name": "SAMLResponse"})["value"],
+            "RelayState": require_input_value(soup, "RelayState", "SAML response"),
+            "SAMLResponse": require_input_value(soup, "SAMLResponse", "SAML response"),
         }
         resp = self.session.post(
             "https://moodle.rwth-aachen.de/Shibboleth.sso/SAML2/POST", data=data
@@ -1221,9 +1246,23 @@ def getCookies(cookie_jar, domain):
                 )
             sys.exit(1)
 
-        token_base64d = location.split("token=")[1]
+        # The redirect looks like moodlemobile://token=BASE64[&...]; isolate the
+        # token value and decode it defensively so a malformed redirect yields a
+        # clear message instead of a traceback.
+        token_base64d = location.split("token=", 1)[1].split("&")[0]
         conn.close()
-        self.wstoken = base64.b64decode(token_base64d).decode().split(":::")[1]
+        try:
+            token_parts = base64.b64decode(token_base64d).decode().split(":::")
+        except (ValueError, UnicodeDecodeError):
+            token_parts = []
+        if len(token_parts) < 2 or not token_parts[1]:
+            logger.critical(
+                "Failed to parse the Moodle webservice token from the mobile "
+                "launch redirect. Your saved session may be stale; delete the "
+                "cookie file and try again."
+            )
+            sys.exit(1)
+        self.wstoken = token_parts[1]
         return self.wstoken
 
     def get_all_courses(self):

From 8b710284a45ba66fd9435b7eee87c5d172740a87 Mon Sep 17 00:00:00 2001
From: D-VR <26770468+D-VR@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:18:10 +0200
Subject: [PATCH 4/5] Make file downloads safe against failed updates,
 corruption, and stale caches

Rework download_file and the per-course cache so an update never destroys or
silently abandons local data:

- Conflict "rename" mode now downloads the replacement to a temp file first and
  only moves the local file aside on success, so a failed/aborted download (an
  expired session returning an HTML error page) never empties the canonical
  path.
- Resume partial downloads only with a validating If-Range using a recorded
  etag sidecar, and use hidden, namespaced temp files, so a blind range request
  can no longer splice a newer version onto an old partial or clobber a user's
  own *.temp file.
- A failed ETag comparison is treated as "no cached etag" (fall back to the
  timestamp/HEAD heuristic) instead of silently overwriting the local file.
- Check filetype/name exclusions before any conflict handling so excluded files
  are never displaced.
- Trust cached version markers only when the previous run actually downloaded
  the file, and have cache_root_node preserve the on-disk version's markers for
  files that were not fetched this run, so a failed update is retried cleanly
  next run instead of being skipped forever or moved aside as a spurious
  conflict.
- Skip on matching timemodified only when it is meaningful, so Sciebo files
  (which have no timemodified) fall through to etag-based change detection and
  are actually re-downloaded when they change.
---
 syncmymoodle/__main__.py        | 222 +++++++++++++++------
 tests/test_download_behavior.py | 335 +++++++++++++++++++++++++++++++-
 tests/test_storage_safety.py    |   2 +
 3 files changed, 493 insertions(+), 66 deletions(-)

diff --git a/syncmymoodle/__main__.py b/syncmymoodle/__main__.py
index 297a170..6a14808 100755
--- a/syncmymoodle/__main__.py
+++ b/syncmymoodle/__main__.py
@@ -357,17 +357,56 @@ def _read_private_gzip_json(self, path: Path, description: str):
             )
             return None
 
-    def _node_to_cache_data(self, node: Node):
+    def _match_old_cache_child(self, old_node, child):
+        """Find the previous cache node corresponding to ``child``, if any."""
+        if old_node is None:
+            return None
+        candidates = [
+            c
+            for c in getattr(old_node, "children", [])
+            if c.name == child.name and c.type == child.type
+        ]
+        if not candidates:
+            return None
+        for candidate in candidates:
+            if candidate.url == child.url:
+                return candidate
+        return candidates[0]
+
+    def _node_to_cache_data(self, node: Node, old_node: Node | None = None):
+        timemodified = node.timemodified
+        etag = node.etag
+        is_downloaded = node.is_downloaded
+        # If this file was not (re)downloaded this run but a previously
+        # downloaded version is still on disk, keep the previously cached version
+        # markers. Otherwise the cache would record Moodle's new timemodified/etag
+        # for a file we never actually fetched, which either skips the file
+        # forever or moves the on-disk copy aside as a spurious conflict on the
+        # next run's retry.
+        if (
+            not node.is_downloaded
+            and old_node is not None
+            and getattr(old_node, "is_downloaded", False)
+            and self.get_sanitized_node_path(node).exists()
+        ):
+            timemodified = getattr(old_node, "timemodified", None)
+            etag = getattr(old_node, "etag", None)
+            is_downloaded = True
         return {
             "name": node.name,
             "id": node.id,
             "type": node.type,
             "url": node.url,
-            "timemodified": node.timemodified,
-            "etag": node.etag,
+            "timemodified": timemodified,
+            "etag": etag,
             "name_clash_id": node.name_clash_id,
-            "is_downloaded": node.is_downloaded,
-            "children": [self._node_to_cache_data(child) for child in node.children],
+            "is_downloaded": is_downloaded,
+            "children": [
+                self._node_to_cache_data(
+                    child, self._match_old_cache_child(old_node, child)
+                )
+                for child in node.children
+            ],
         }
 
     def _node_from_cache_data(self, data, parent=None):
@@ -448,13 +487,19 @@ def cache_root_node(self):
                 if course_node.type != "Course":
                     continue
                 course_path = self.get_sanitized_node_path(course_node)
+                # Read the previous course cache before overwriting it, so we can
+                # preserve version markers for files that were not downloaded
+                # this run (see _node_to_cache_data).
+                old_course_root = self._get_course_cache_root(course_node)
                 course_path.mkdir(parents=True, exist_ok=True)
                 cache_path = course_path / ".syncmymoodle_cache"
                 self._write_private_gzip_json(
                     cache_path,
                     {
                         "format": "syncmymoodle.course-cache.v1",
-                        "course": self._node_to_cache_data(course_node),
+                        "course": self._node_to_cache_data(
+                            course_node, old_course_root
+                        ),
                     },
                 )
 
@@ -2042,6 +2087,16 @@ def download_file(self, node):
         if self._should_skip_url(node.url, f"{node.type} file"):
             return True
 
+        # Respect filetype/name exclusions up front so that excluded files never
+        # trigger conflict handling, displace local files, or create temp files.
+        if node.name.split(".")[-1] in self.config.get("exclude_filetypes", []):
+            return True
+        if any(
+            fnmatchcase(node.name, pattern)
+            for pattern in self.config.get("exclude_files", [])
+        ):
+            return True
+
         # If we already downloaded this path during the current run, skip any
         # further processing. This avoids duplicate downloads and spurious
         # conflicts when the same remote file appears multiple times in the
@@ -2056,17 +2111,30 @@ def download_file(self, node):
         # Decide whether we need to (re-)download the file at all
         cached_timemodified = None
         old_node = None
+        conflict_rename_pending = False
         if downloadpath.exists():
             if not self.config.get("updatefiles"):
                 return True
 
             # Try to find a cached node for this file from the per-course cache.
             old_node = self._get_old_node_for(node)
+            # Only trust the cached version markers when the previous run
+            # actually downloaded the file. Otherwise an update that failed last
+            # time (e.g. an expired session) gets cached with Moodle's new
+            # timemodified and would be skipped forever, leaving a stale file.
+            # Treat a non-downloaded cache entry as if there were no cache at all.
+            if old_node is not None and not getattr(old_node, "is_downloaded", False):
+                old_node = None
             if old_node is not None:
                 cached_timemodified = getattr(old_node, "timemodified", None)
                 old_etag = getattr(old_node, "etag", None)
-                # If Moodle did not change the file, skip re-download.
-                if node.timemodified == cached_timemodified:
+                # If Moodle did not change the file, skip re-download. Only when
+                # timemodified is meaningful: Sciebo files have no timemodified
+                # (always None), so this must fall through to the etag check
+                # below instead of treating None == None as "unchanged".
+                if cached_timemodified is not None and (
+                    node.timemodified == cached_timemodified
+                ):
                     return True
                 # For Sciebo, we use the etag from the previous run as the
                 # remote version marker. If it matches the current etag from
@@ -2164,44 +2232,40 @@ def download_file(self, node):
                     )
                     return True
                 if conflict_mode == "rename":
-                    # Move the locally modified file out of the way before download
-                    conflict_path = self._make_conflict_path(downloadpath)
-                    try:
-                        downloadpath.rename(conflict_path)
-                        logger.warning(
-                            "Detected local changes for %s, moving to %s before "
-                            "downloading updated file from Moodle",
-                            downloadpath,
-                            conflict_path,
-                        )
-                    except OSError:
-                        logger.exception(
-                            "Failed to move locally modified file %s to %s, "
-                            "skipping Moodle update to avoid data loss",
-                            downloadpath,
-                            conflict_path,
-                        )
-                        return True
+                    # Defer moving the locally modified file aside until the
+                    # replacement has been fully downloaded, so an aborted or
+                    # failed download (e.g. an expired session returning an HTML
+                    # error page) never leaves the canonical path empty.
+                    conflict_rename_pending = True
                 # conflict_mode == "overwrite": fall through and overwrite
 
-        if len(node.name.split(".")) > 0 and node.name.split(".")[
-            -1
-        ] in self.config.get("exclude_filetypes", []):
-            return True
-
-        if any(
-            fnmatchcase(node.name, pattern)
-            for pattern in self.config.get("exclude_files")
-        ):
-            return True
-
-        tmp_downloadpath = downloadpath.with_suffix(downloadpath.suffix + ".temp")
+        # Hidden, namespaced temp/sidecar names so we never resume from or
+        # overwrite a file the user happens to own. The sidecar records the
+        # ETag a partial download was fetched against.
+        tmp_downloadpath = downloadpath.parent / f".{downloadpath.name}.smmpart"
+        etag_sidecar = tmp_downloadpath.with_name(tmp_downloadpath.name + ".etag")
+
+        # Only resume a previous partial when we recorded the ETag it was fetched
+        # against, so we can ask the server (via If-Range) to confirm the remote
+        # content is unchanged. Without that proof a blind range request could
+        # splice bytes from a newer version onto an older partial and silently
+        # corrupt the file.
+        resume_size = 0
+        partial_etag: str | None = None
+        header = dict()
         if tmp_downloadpath.exists():
-            resume_size = tmp_downloadpath.stat().st_size
-            header = {"Range": f"bytes={resume_size}-"}
-        else:
-            resume_size = 0
-            header = dict()
+            if etag_sidecar.exists():
+                try:
+                    partial_etag = etag_sidecar.read_text(encoding="utf-8").strip()
+                except OSError:
+                    partial_etag = None
+            if partial_etag:
+                resume_size = tmp_downloadpath.stat().st_size
+                header = {"Range": f"bytes={resume_size}-", "If-Range": partial_etag}
+            else:
+                # Cannot validate the partial; discard it and start fresh.
+                tmp_downloadpath.unlink(missing_ok=True)
+                etag_sidecar.unlink(missing_ok=True)
         if node.type.lower() == "sciebo file":
             header = {**header, **node.additional_info}
 
@@ -2209,25 +2273,27 @@ def download_file(self, node):
             self.session.get(node.url, headers=header, stream=True)
         ) as response:
             etag_header = response.headers.get("ETag")
-            # If we attempted to resume but the server did not honor the Range
-            # header (status != 206), fallback to a full download and ignore
-            # the existing partial file to avoid corrupting PDFs or other
-            # content by appending a second full copy.
-            if resume_size and response.status_code == 200:
-                resume_size = 0
-                tmp_downloadpath.unlink(missing_ok=True)
 
-            if not self._download_response_is_usable(node, response, downloadpath):
-                return False
-
-            if resume_size and response.status_code != 206:
-                logger.warning(
-                    "Skipping download of %s from %s because the server returned "
-                    "HTTP %s instead of partial content for a resumed download",
-                    downloadpath,
-                    node.url,
-                    response.status_code,
+            if resume_size:
+                # The remote content differs from our partial when the server
+                # ignores the range (any non-206) or, for servers that ignore
+                # If-Range, when the returned ETag no longer matches what the
+                # partial was fetched against.
+                version_changed = response.status_code != 206 or (
+                    etag_header is not None
+                    and partial_etag is not None
+                    and etag_header != partial_etag
                 )
+                if version_changed:
+                    resume_size = 0
+                    tmp_downloadpath.unlink(missing_ok=True)
+                    etag_sidecar.unlink(missing_ok=True)
+                    if response.status_code == 206:
+                        # This 206 body is only the tail of a now-changed file
+                        # and cannot be used; restart fresh on the next run.
+                        return False
+
+            if not self._download_response_is_usable(node, response, downloadpath):
                 return False
 
             content = response.iter_content(self.block_size)
@@ -2255,6 +2321,13 @@ def download_file(self, node):
             if resume_size:
                 progress_bar.update(resume_size)
             downloadpath.parent.mkdir(parents=True, exist_ok=True)
+            # Record the ETag this partial is being fetched against so an
+            # interrupted download can be safely resumed next time.
+            if etag_header:
+                try:
+                    etag_sidecar.write_text(etag_header, encoding="utf-8")
+                except OSError:
+                    pass
             mode = "ab" if resume_size else "wb"
             with tmp_downloadpath.open(mode) as file:
                 if first_chunk:
@@ -2264,7 +2337,34 @@ def download_file(self, node):
                     progress_bar.update(len(data))
                     file.write(data)
             progress_bar.close()
-            tmp_downloadpath.rename(downloadpath)
+
+            # The replacement is now fully on disk. Only at this point do we move
+            # a conflicting local file aside, so a failure above never empties
+            # the canonical path.
+            if conflict_rename_pending:
+                conflict_path = self._make_conflict_path(downloadpath)
+                try:
+                    downloadpath.rename(conflict_path)
+                    logger.warning(
+                        "Detected local changes for %s, moved to %s before "
+                        "installing the updated file from Moodle",
+                        downloadpath,
+                        conflict_path,
+                    )
+                except OSError:
+                    logger.exception(
+                        "Failed to move locally modified file %s to %s; keeping "
+                        "it and discarding the downloaded update to avoid data "
+                        "loss",
+                        downloadpath,
+                        conflict_path,
+                    )
+                    tmp_downloadpath.unlink(missing_ok=True)
+                    etag_sidecar.unlink(missing_ok=True)
+                    return True
+
+            os.replace(tmp_downloadpath, downloadpath)
+            etag_sidecar.unlink(missing_ok=True)
             # Align the local mtime with Moodle's timemodified to detect local
             # changes on subsequent runs.
             if getattr(node, "timemodified", None) is not None:
diff --git a/tests/test_download_behavior.py b/tests/test_download_behavior.py
index fb87852..596ee51 100644
--- a/tests/test_download_behavior.py
+++ b/tests/test_download_behavior.py
@@ -14,12 +14,18 @@ def sha1(data: bytes) -> str:
     return hashlib.sha1(data).hexdigest()
 
 
-def seed_course_cache(config, *, timemodified, etag):
-    """Write a per-course cache to disk describing a previously synced file."""
+def seed_course_cache(config, *, timemodified, etag, is_downloaded=True):
+    """Write a per-course cache to disk describing a previously synced file.
+
+    A real cache is written after the download walk, so a file that was
+    successfully fetched is marked is_downloaded=True; pass False to simulate a
+    previous run whose download failed.
+    """
     cache_syncer = make_syncer(config)
-    cached_root, _ = build_single_file_tree(
+    cached_root, cached_file = build_single_file_tree(
         "slides.pdf", URL, timemodified=timemodified, etag=etag
     )
+    cached_file.is_downloaded = is_downloaded
     cache_syncer.root_node = cached_root
     cache_syncer.cache_root_node()
 
@@ -55,8 +61,8 @@ def test_download_streams_chunks_to_disk_and_records_metadata(tmp_path):
     assert syncer.download_file(file_node) is True
 
     assert download_path.read_bytes() == b"".join(chunks)
-    # The temporary part-file is renamed away once the download completes.
-    assert not download_path.with_suffix(download_path.suffix + ".temp").exists()
+    # The temp part-file and its etag sidecar are cleaned up on completion.
+    assert list(download_path.parent.glob(".*.smmpart*")) == []
     # mtime is aligned with Moodle's timemodified so later runs detect changes.
     assert int(download_path.stat().st_mtime) == 1710000500
     # The ETag is persisted on the node for the next run's change detection.
@@ -209,6 +215,43 @@ def test_unchanged_timemodified_skips_download_despite_local_edit(tmp_path):
     assert list(download_path.parent.glob("*.syncconflict.*")) == []
 
 
+def test_failed_previous_download_is_retried_not_skipped(tmp_path):
+    # The cache records Moodle's timemodified even when the previous download
+    # failed (is_downloaded=False). Such an entry must not suppress a retry,
+    # otherwise a stale file would be kept forever.
+    config = {"basedir": str(tmp_path), "updatefiles": True}
+    seed_course_cache(config, timemodified=1710000300, etag=None, is_downloaded=False)
+    syncer, file_node = make_run_syncer(config, timemodified=1710000300)
+    download_path = syncer.get_sanitized_node_path(file_node)
+    download_path.parent.mkdir(parents=True, exist_ok=True)
+    download_path.write_bytes(b"OLD STALE VERSION")
+    syncer.session.add(
+        "GET",
+        URL,
+        FakeResponse(
+            headers={"Content-Type": "application/pdf"}, chunks=[b"NEW CORRECT VERSION"]
+        ),
+    )
+
+    assert syncer.download_file(file_node) is True
+    assert syncer.session.count("GET", URL) == 1
+    assert download_path.read_bytes() == b"NEW CORRECT VERSION"
+
+
+def test_successful_previous_download_with_same_timemodified_is_skipped(tmp_path):
+    # The complement: a downloaded cache entry with an unchanged timemodified is
+    # still skipped without contacting the server.
+    config = {"basedir": str(tmp_path), "updatefiles": True}
+    seed_course_cache(config, timemodified=1710000300, etag=None, is_downloaded=True)
+    syncer, file_node = make_run_syncer(config, timemodified=1710000300)
+    download_path = syncer.get_sanitized_node_path(file_node)
+    download_path.parent.mkdir(parents=True, exist_ok=True)
+    download_path.write_bytes(b"already downloaded")
+
+    assert syncer.download_file(file_node) is True
+    assert syncer.session.calls == []
+
+
 def test_etag_failure_falls_back_to_timestamp_heuristic_conflict(tmp_path, monkeypatch):
     # A faulty ETag cache is treated as if there were no cached ETag, so the
     # timestamp heuristic decides. Here the local mtime differs from the cached
@@ -250,3 +293,285 @@ def boom(path, etag):
     assert syncer.download_file(file_node) is True
     assert download_path.read_bytes() == new_body
     assert list(download_path.parent.glob("*.syncconflict.*")) == []
+
+
+# --------------------------------------------------------------------------
+# Failed/aborted update must not empty the canonical path (bug #1)
+# --------------------------------------------------------------------------
+
+
+def test_rename_conflict_failed_html_update_preserves_canonical_file(tmp_path):
+    # An expired session returns an HTML login page that masquerades as a new
+    # version. The download is rejected; the user's file must stay in place and
+    # not be displaced to a side-car (which would empty the canonical path).
+    syncer, file_node, download_path, local_modified = _setup_conflict(
+        tmp_path, "rename"
+    )
+    syncer.session.add(
+        "GET",
+        URL,
+        FakeResponse(
+            headers={"Content-Type": "text/html"},
+            text="<!doctype html><html>login</html>",
+        ),
+    )
+
+    assert syncer.download_file(file_node) is False
+    assert download_path.exists()
+    assert download_path.read_bytes() == local_modified
+    assert list(download_path.parent.glob("*.syncconflict.*")) == []
+
+
+def test_rename_conflict_non_2xx_update_preserves_canonical_file(tmp_path):
+    syncer, file_node, download_path, local_modified = _setup_conflict(
+        tmp_path, "rename"
+    )
+    syncer.session.add("GET", URL, FakeResponse(status_code=403, text="forbidden"))
+
+    assert syncer.download_file(file_node) is False
+    assert download_path.read_bytes() == local_modified
+    assert list(download_path.parent.glob("*.syncconflict.*")) == []
+
+
+def test_excluded_filetype_existing_file_is_not_touched(tmp_path):
+    # Exclusions are honored before any conflict handling, so an excluded file
+    # that already exists is never displaced or downloaded.
+    config = {
+        "basedir": str(tmp_path),
+        "updatefiles": True,
+        "update_files_conflict": "rename",
+        "exclude_filetypes": ["pdf"],
+    }
+    seed_course_cache(config, timemodified=1710000300, etag=sha1(b"original"))
+    syncer, file_node = make_run_syncer(config, timemodified=1710000400)
+    download_path = syncer.get_sanitized_node_path(file_node)
+    download_path.parent.mkdir(parents=True, exist_ok=True)
+    download_path.write_bytes(b"locally edited content")
+
+    # No GET route registered: a request would raise in the fake session.
+    assert syncer.download_file(file_node) is True
+    assert download_path.read_bytes() == b"locally edited content"
+    assert syncer.session.calls == []
+    assert list(download_path.parent.glob("*.syncconflict.*")) == []
+
+
+# --------------------------------------------------------------------------
+# Safe resume of partial downloads (bug #2)
+# --------------------------------------------------------------------------
+
+
+def _seed_partial(syncer, file_node, body, etag):
+    """Write a hidden partial download plus its etag sidecar for ``file_node``."""
+    download_path = syncer.get_sanitized_node_path(file_node)
+    download_path.parent.mkdir(parents=True, exist_ok=True)
+    partial = download_path.parent / f".{download_path.name}.smmpart"
+    partial.write_bytes(body)
+    partial.with_name(partial.name + ".etag").write_text(etag, encoding="utf-8")
+    return download_path
+
+
+def test_resume_appends_when_remote_unchanged(tmp_path):
+    config = {"basedir": str(tmp_path)}
+    syncer, file_node = make_run_syncer(config, timemodified=1710000500)
+    download_path = _seed_partial(syncer, file_node, b"HEAD-", '"v1"')
+    syncer.session.add(
+        "GET",
+        URL,
+        FakeResponse(
+            status_code=206,
+            headers={"Content-Type": "application/pdf", "ETag": '"v1"'},
+            chunks=[b"TAIL"],
+        ),
+    )
+
+    assert syncer.download_file(file_node) is True
+    # The partial head is kept and the resumed tail appended.
+    assert download_path.read_bytes() == b"HEAD-TAIL"
+    assert list(download_path.parent.glob(".*.smmpart*")) == []
+
+
+def test_resume_discards_partial_when_remote_served_full_content(tmp_path):
+    # If-Range honored: the remote changed, so the server sends a 200 with the
+    # full new body. The stale partial must be discarded, not appended to.
+    config = {"basedir": str(tmp_path)}
+    syncer, file_node = make_run_syncer(config, timemodified=1710000500)
+    download_path = _seed_partial(syncer, file_node, b"OLD-PARTIAL", '"v1"')
+    syncer.session.add(
+        "GET",
+        URL,
+        FakeResponse(
+            status_code=200,
+            headers={"Content-Type": "application/pdf", "ETag": '"v2"'},
+            chunks=[b"FULL-NEW-CONTENT"],
+        ),
+    )
+
+    assert syncer.download_file(file_node) is True
+    assert download_path.read_bytes() == b"FULL-NEW-CONTENT"
+    assert list(download_path.parent.glob(".*.smmpart*")) == []
+
+
+def test_resume_aborts_when_server_ignores_if_range(tmp_path):
+    # Some servers honor Range but ignore If-Range, returning a 206 tail of a
+    # changed file. The mismatched ETag must be detected so we discard the
+    # partial and retry fresh next run instead of corrupting the file.
+    config = {"basedir": str(tmp_path)}
+    syncer, file_node = make_run_syncer(config, timemodified=1710000500)
+    download_path = _seed_partial(syncer, file_node, b"OLD-PARTIAL", '"v1"')
+    syncer.session.add(
+        "GET",
+        URL,
+        FakeResponse(
+            status_code=206,
+            headers={"Content-Type": "application/pdf", "ETag": '"v2"'},
+            chunks=[b"TAIL-OF-NEW-VERSION"],
+        ),
+    )
+
+    assert syncer.download_file(file_node) is False
+    # Nothing corrupt is left behind; the stale partial is gone.
+    assert not download_path.exists()
+    assert list(download_path.parent.glob(".*.smmpart*")) == []
+
+
+def test_unrecognized_partial_without_sidecar_is_not_resumed(tmp_path):
+    # A leftover partial with no etag sidecar cannot be validated, so it is
+    # discarded and a fresh full download is performed.
+    config = {"basedir": str(tmp_path)}
+    syncer, file_node = make_run_syncer(config, timemodified=1710000500)
+    download_path = syncer.get_sanitized_node_path(file_node)
+    download_path.parent.mkdir(parents=True, exist_ok=True)
+    (download_path.parent / f".{download_path.name}.smmpart").write_bytes(b"STALE")
+    syncer.session.add(
+        "GET",
+        URL,
+        FakeResponse(headers={"Content-Type": "application/pdf"}, chunks=[b"FRESH"]),
+    )
+
+    assert syncer.download_file(file_node) is True
+    assert download_path.read_bytes() == b"FRESH"
+    assert list(download_path.parent.glob(".*.smmpart*")) == []
+
+
+# --------------------------------------------------------------------------
+# Sciebo change detection via ETag (no timemodified) (bug #8)
+# --------------------------------------------------------------------------
+
+SCIEBO_URL = "https://rwth-aachen.sciebo.de/public.php/webdav/notes.pdf"
+
+
+def _sciebo_tree(etag, is_downloaded=False):
+    root = Node("", -1, "Root", None)
+    semester = root.add_child("26ss", None, "Semester")
+    course = semester.add_child("Download Course", 301, "Course")
+    section = course.add_child("General", 401, "Section")
+    file_node = section.add_child(
+        "notes.pdf",
+        None,
+        "Sciebo File",
+        url=SCIEBO_URL,
+        additional_info={"Authorization": "Basic x"},
+        etag=etag,
+    )
+    file_node.is_downloaded = is_downloaded
+    return root, file_node
+
+
+def _seed_sciebo_cache(config, etag, content):
+    cache_syncer = make_syncer(config)
+    root, file_node = _sciebo_tree(etag, is_downloaded=True)
+    cache_syncer.root_node = root
+    cache_syncer.cache_root_node()
+    download_path = make_syncer(config).get_sanitized_node_path(file_node)
+    download_path.parent.mkdir(parents=True, exist_ok=True)
+    download_path.write_bytes(content)
+    return download_path
+
+
+def test_sciebo_changed_etag_triggers_redownload(tmp_path):
+    # Sciebo files have no timemodified, so a changed ETag is the only signal.
+    config = {"basedir": str(tmp_path), "updatefiles": True}
+    old = b"old sciebo content"
+    download_path = _seed_sciebo_cache(config, sha1(old), old)
+    syncer = make_syncer(config)
+    syncer.session = FakeSession()
+    new = b"new sciebo content"
+    syncer.session.add(
+        "GET",
+        SCIEBO_URL,
+        FakeResponse(headers={"Content-Type": "application/pdf"}, chunks=[new]),
+    )
+    _, current = _sciebo_tree(sha1(new))
+
+    assert syncer.download_file(current) is True
+    assert syncer.session.count("GET", SCIEBO_URL) == 1
+    assert download_path.read_bytes() == new
+    assert list(download_path.parent.glob("*.syncconflict.*")) == []
+
+
+def test_sciebo_unchanged_etag_skips_download(tmp_path):
+    config = {"basedir": str(tmp_path), "updatefiles": True}
+    content = b"sciebo content"
+    download_path = _seed_sciebo_cache(config, sha1(content), content)
+    syncer = make_syncer(config)
+    syncer.session = FakeSession()
+    _, current = _sciebo_tree(sha1(content))  # unchanged etag
+
+    assert syncer.download_file(current) is True
+    assert syncer.session.calls == []
+    assert download_path.read_bytes() == content
+
+
+# --------------------------------------------------------------------------
+# Cache reflects on-disk state, not optimistic Moodle markers (refinement)
+# --------------------------------------------------------------------------
+
+
+def _cached_file_node(config, course_node):
+    cached_course = make_syncer(config)._get_course_cache_root(course_node)
+    return cached_course.children[0].children[0]  # General -> slides.pdf
+
+
+def test_cache_preserves_markers_for_failed_download_over_existing_file(tmp_path):
+    config = {"basedir": str(tmp_path), "updatefiles": True}
+    v1 = b"version one"
+    seed_course_cache(config, timemodified=100, etag=sha1(v1), is_downloaded=True)
+    download_path = make_syncer(config).get_sanitized_node_path(
+        build_single_file_tree("slides.pdf", URL)[1]
+    )
+    download_path.parent.mkdir(parents=True, exist_ok=True)
+    download_path.write_bytes(v1)
+
+    # A run where Moodle reports a new version (200) but the download did not
+    # happen (is_downloaded=False) and the old file is still on disk.
+    syncer = make_syncer(config)
+    root, file_node = build_single_file_tree(
+        "slides.pdf", URL, timemodified=200, etag="poisoned"
+    )
+    file_node.is_downloaded = False
+    syncer.root_node = root
+    syncer.cache_root_node()
+
+    cached_file = _cached_file_node(config, root.children[0].children[0])
+    # The cache keeps the on-disk version's markers, not Moodle's new ones.
+    assert cached_file.timemodified == 100
+    assert cached_file.etag == sha1(v1)
+    assert cached_file.is_downloaded is True
+
+
+def test_cache_does_not_preserve_markers_when_file_absent(tmp_path):
+    config = {"basedir": str(tmp_path), "updatefiles": True}
+    seed_course_cache(config, timemodified=100, etag="old", is_downloaded=True)
+
+    # Failed download with no file on disk: nothing to preserve.
+    syncer = make_syncer(config)
+    root, file_node = build_single_file_tree(
+        "slides.pdf", URL, timemodified=200, etag="new"
+    )
+    file_node.is_downloaded = False
+    syncer.root_node = root
+    syncer.cache_root_node()
+
+    cached_file = _cached_file_node(config, root.children[0].children[0])
+    assert cached_file.timemodified == 200
+    assert cached_file.is_downloaded is False
diff --git a/tests/test_storage_safety.py b/tests/test_storage_safety.py
index 47b3be7..ea6f02f 100644
--- a/tests/test_storage_safety.py
+++ b/tests/test_storage_safety.py
@@ -50,6 +50,8 @@ def test_download_uses_course_cache_to_skip_unchanged_file(tmp_path):
         url="https://moodle.rwth-aachen.de/pluginfile.php/301/slides.pdf",
         timemodified=1710000300,
     )
+    # A real cache is written after a successful download.
+    cached_file.is_downloaded = True
     cached_syncer.root_node = cached_root
     cached_syncer.cache_root_node()
 

From d17263065db5337fc7f1dfcfca4785ff9405a095 Mon Sep 17 00:00:00 2001
From: D-VR <26770468+D-VR@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:47:43 +0200
Subject: [PATCH 5/5] Require matching ETag for resumed downloads

---
 syncmymoodle/__main__.py        | 17 ++++++++---------
 tests/test_download_behavior.py | 21 +++++++++++++++++++++
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/syncmymoodle/__main__.py b/syncmymoodle/__main__.py
index 6a14808..1e9c12e 100755
--- a/syncmymoodle/__main__.py
+++ b/syncmymoodle/__main__.py
@@ -2276,21 +2276,20 @@ def download_file(self, node):
 
             if resume_size:
                 # The remote content differs from our partial when the server
-                # ignores the range (any non-206) or, for servers that ignore
-                # If-Range, when the returned ETag no longer matches what the
-                # partial was fetched against.
-                version_changed = response.status_code != 206 or (
-                    etag_header is not None
-                    and partial_etag is not None
-                    and etag_header != partial_etag
+                # ignores the range (any non-206) or cannot prove that the
+                # returned tail belongs to the same ETag as the saved partial.
+                valid_resume = (
+                    response.status_code == 206 and etag_header == partial_etag
                 )
+                version_changed = not valid_resume
                 if version_changed:
                     resume_size = 0
                     tmp_downloadpath.unlink(missing_ok=True)
                     etag_sidecar.unlink(missing_ok=True)
                     if response.status_code == 206:
-                        # This 206 body is only the tail of a now-changed file
-                        # and cannot be used; restart fresh on the next run.
+                        # This 206 body is only a tail, and without an exact
+                        # ETag match it cannot be safely appended. Restart fresh
+                        # on the next run.
                         return False
 
             if not self._download_response_is_usable(node, response, downloadpath):
diff --git a/tests/test_download_behavior.py b/tests/test_download_behavior.py
index 596ee51..98bac6d 100644
--- a/tests/test_download_behavior.py
+++ b/tests/test_download_behavior.py
@@ -434,6 +434,27 @@ def test_resume_aborts_when_server_ignores_if_range(tmp_path):
     assert list(download_path.parent.glob(".*.smmpart*")) == []
 
 
+def test_resume_aborts_when_partial_response_has_no_etag(tmp_path):
+    # A 206 response without an ETag cannot prove that the returned tail belongs
+    # to the same remote version as the saved partial.
+    config = {"basedir": str(tmp_path)}
+    syncer, file_node = make_run_syncer(config, timemodified=1710000500)
+    download_path = _seed_partial(syncer, file_node, b"OLD-PARTIAL", '"v1"')
+    syncer.session.add(
+        "GET",
+        URL,
+        FakeResponse(
+            status_code=206,
+            headers={"Content-Type": "application/pdf"},
+            chunks=[b"UNVERIFIED-TAIL"],
+        ),
+    )
+
+    assert syncer.download_file(file_node) is False
+    assert not download_path.exists()
+    assert list(download_path.parent.glob(".*.smmpart*")) == []
+
+
 def test_unrecognized_partial_without_sidecar_is_not_resumed(tmp_path):
     # A leftover partial with no etag sidecar cannot be validated, so it is
     # discarded and a fresh full download is performed.