diff --git a/syncmymoodle/__main__.py b/syncmymoodle/__main__.py index fe47223..1e9c12e 100755 --- a/syncmymoodle/__main__.py +++ b/syncmymoodle/__main__.py @@ -195,6 +195,17 @@ def go_to_path(self, target_path): raise Exception("The path is not found in this root node. Wrong path?") return target_node[-1] + def _clash_suffix(self): + # Stable, distinct suffix used to disambiguate same-named siblings. + # Fall back to the URL when no name_clash_id is set (direct-link, + # embedded, and direct-content file nodes pass name_clash_id=None); + # otherwise such nodes would all hash to md5("None") and collide onto + # the same path, silently dropping all but one file. + key = self.name_clash_id if self.name_clash_id is not None else self.url + return base64.urlsafe_b64encode( + hashlib.md5(str(key).encode("utf-8")).hexdigest().encode("utf-8") + ).decode()[:10] + def remove_children_nameclashes(self): # Check for duplicate filenames @@ -248,30 +259,15 @@ def remove_children_nameclashes(self): ) ] if len(siblings) > 0: - # if a filename is still duplicate in its directory, we rename it by appending its id (urlsafe base64 so it also works for urls). + # if a filename is still duplicate in its directory, we rename + # it by appending a stable per-node key (works for ids and urls). filename = Path(child.name) child.name = ( - filename.stem - + "_" - + base64.urlsafe_b64encode( - hashlib.md5(str(child.name_clash_id).encode("utf-8")) - .hexdigest() - .encode("utf-8") - ).decode()[:10] - + filename.suffix + filename.stem + "_" + child._clash_suffix() + filename.suffix ) for s in siblings: filename = Path(s.name) - s.name = ( - filename.stem - + "_" - + base64.urlsafe_b64encode( - hashlib.md5(str(s.name_clash_id).encode("utf-8")) - .hexdigest() - .encode("utf-8") - ).decode()[:10] - + filename.suffix - ) + s.name = filename.stem + "_" + s._clash_suffix() + filename.suffix self.children.remove(s) unclashed_children.extend(siblings) @@ -361,17 +357,56 @@ def _read_private_gzip_json(self, path: Path, description: str): ) return None - def _node_to_cache_data(self, node: Node): + def _match_old_cache_child(self, old_node, child): + """Find the previous cache node corresponding to ``child``, if any.""" + if old_node is None: + return None + candidates = [ + c + for c in getattr(old_node, "children", []) + if c.name == child.name and c.type == child.type + ] + if not candidates: + return None + for candidate in candidates: + if candidate.url == child.url: + return candidate + return candidates[0] + + def _node_to_cache_data(self, node: Node, old_node: Node | None = None): + timemodified = node.timemodified + etag = node.etag + is_downloaded = node.is_downloaded + # If this file was not (re)downloaded this run but a previously + # downloaded version is still on disk, keep the previously cached version + # markers. Otherwise the cache would record Moodle's new timemodified/etag + # for a file we never actually fetched, which either skips the file + # forever or moves the on-disk copy aside as a spurious conflict on the + # next run's retry. + if ( + not node.is_downloaded + and old_node is not None + and getattr(old_node, "is_downloaded", False) + and self.get_sanitized_node_path(node).exists() + ): + timemodified = getattr(old_node, "timemodified", None) + etag = getattr(old_node, "etag", None) + is_downloaded = True return { "name": node.name, "id": node.id, "type": node.type, "url": node.url, - "timemodified": node.timemodified, - "etag": node.etag, + "timemodified": timemodified, + "etag": etag, "name_clash_id": node.name_clash_id, - "is_downloaded": node.is_downloaded, - "children": [self._node_to_cache_data(child) for child in node.children], + "is_downloaded": is_downloaded, + "children": [ + self._node_to_cache_data( + child, self._match_old_cache_child(old_node, child) + ) + for child in node.children + ], } def _node_from_cache_data(self, data, parent=None): @@ -452,13 +487,19 @@ def cache_root_node(self): if course_node.type != "Course": continue course_path = self.get_sanitized_node_path(course_node) + # Read the previous course cache before overwriting it, so we can + # preserve version markers for files that were not downloaded + # this run (see _node_to_cache_data). + old_course_root = self._get_course_cache_root(course_node) course_path.mkdir(parents=True, exist_ok=True) cache_path = course_path / ".syncmymoodle_cache" self._write_private_gzip_json( cache_path, { "format": "syncmymoodle.course-cache.v1", - "course": self._node_to_cache_data(course_node), + "course": self._node_to_cache_data( + course_node, old_course_root + ), }, ) @@ -649,6 +690,23 @@ def _as_list(self, value): return value return [value] + def _course_id_in_filter(self, course_id, entries) -> bool: + """Return True if ``course_id`` is referenced by a configured entry. + + Entries are course URLs (``.../course/view.php?id=NNN``). The ``id`` + query parameter is compared exactly, so e.g. ``id=12`` does not also + match courses ``1`` or ``2``. A bare numeric id entry is also accepted. + """ + course_id = str(course_id) + for entry in entries or []: + entry = str(entry) + parsed = urllib.parse.urlparse(entry) + if course_id in urllib.parse.parse_qs(parsed.query).get("id", []): + return True + if entry.strip() == course_id: + return True + return False + def _configured_patterns(self, *keys, course_id=None): patterns = [] for key in keys: @@ -991,13 +1049,33 @@ def _check_moodle_availability(self): def login(self): def get_session_key(soup): script = soup.find("script", string=lambda text: text and "sesskey" in text) - js_text = script.text - match = re.search(r'"sesskey":"(.*?)"', js_text) + match = ( + re.search(r'"sesskey":"(.*?)"', script.text) + if script is not None + else None + ) if match: return match.group(1) else: logger.critical("Can't retrieve session key from JavaScript config") - exit(1) + sys.exit(1) + + def require_input_value(soup, name, context): + value = self._get_input_value(soup, name) + if value is None: + logger.critical( + "Failed to login: expected form field %r was missing at the " + "%s. The RWTH login flow may have changed or the servers may " + "have difficulties. For current service status, see %s.", + name, + context, + RWTH_STATUS_URL, + ) + self._check_rwth_status_page() + logger.info("-------Login-Error-Soup--------") + logger.info(soup) + sys.exit(1) + return value self.session = requests.Session() cookie_file = Path(self.config.get("cookie_file", "./session")).expanduser() @@ -1033,7 +1111,8 @@ def get_session_key(soup): alert.decompose() # Extract body text after cleanup - body_text = soup_check.find("body").get_text(separator=" ", strip=True) + body = soup_check.find("body") + body_text = body.get_text(separator=" ", strip=True) if body else "" # Check for maintenance notice if "Wartungsarbeiten" in body_text: @@ -1045,7 +1124,9 @@ def get_session_key(soup): soup = bs(resp.text, features="lxml") if soup.find("input", {"name": "RelayState"}) is None: - csrf_token = soup.find("input", {"name": "csrf_token"})["value"] + csrf_token = require_input_value( + soup, "csrf_token", "username/password form" + ) login_data = { "j_username": self.config["user"], "j_password": self.config["password"], @@ -1068,7 +1149,9 @@ def get_session_key(soup): logger.info(soup) sys.exit(1) - csrf_token = soup.find("input", {"name": "csrf_token"})["value"] + csrf_token = require_input_value( + soup, "csrf_token", "TOTP generator selection form" + ) print("Setting TOTP generator") totp_selection_data = { @@ -1093,7 +1176,7 @@ def get_session_key(soup): logger.info(soup) sys.exit(1) - csrf_token = soup.find("input", {"name": "csrf_token"})["value"] + csrf_token = require_input_value(soup, "csrf_token", "TOTP entry form") if not self.config.get("totpsecret"): totp_input = input(f"Enter TOTP for generator {self.config['totp']}:\n") else: @@ -1122,8 +1205,8 @@ def get_session_key(soup): logger.info(soup) sys.exit(1) data = { - "RelayState": soup.find("input", {"name": "RelayState"})["value"], - "SAMLResponse": soup.find("input", {"name": "SAMLResponse"})["value"], + "RelayState": require_input_value(soup, "RelayState", "SAML response"), + "SAMLResponse": require_input_value(soup, "SAMLResponse", "SAML response"), } resp = self.session.post( "https://moodle.rwth-aachen.de/Shibboleth.sso/SAML2/POST", data=data @@ -1208,9 +1291,23 @@ def getCookies(cookie_jar, domain): ) sys.exit(1) - token_base64d = location.split("token=")[1] + # The redirect looks like moodlemobile://token=BASE64[&...]; isolate the + # token value and decode it defensively so a malformed redirect yields a + # clear message instead of a traceback. + token_base64d = location.split("token=", 1)[1].split("&")[0] conn.close() - self.wstoken = base64.b64decode(token_base64d).decode().split(":::")[1] + try: + token_parts = base64.b64decode(token_base64d).decode().split(":::") + except (ValueError, UnicodeDecodeError): + token_parts = [] + if len(token_parts) < 2 or not token_parts[1]: + logger.critical( + "Failed to parse the Moodle webservice token from the mobile " + "launch redirect. Your saved session may be stale; delete the " + "cookie file and try again." + ) + sys.exit(1) + self.wstoken = token_parts[1] return self.wstoken def get_all_courses(self): @@ -1375,39 +1472,26 @@ def sync(self): # Syncing all courses for course in self.get_all_courses(): - course_name = self._format_course_name(course["shortname"]) + course_name = self._format_course_name( + course.get("shortname") or f"course-{course.get('id')}" + ) course_id = course["id"] - if ( - len( - [ - c - for c in self.config.get("skip_courses", []) - if str(course_id) in c - ] - ) - > 0 - ): - continue - - # Skip not selected courses - if ( - len(self.config.get("selected_courses", [])) > 0 - and len( - [ - c - for c in self.config.get("selected_courses", []) - if str(course["id"]) in c - ] - ) - == 0 + selected_courses = self.config.get("selected_courses", []) + if selected_courses: + # selected_courses is an explicit allowlist that overrides + # skip_courses (and, below, only_sync_semester). + if not self._course_id_in_filter(course_id, selected_courses): + continue + elif self._course_id_in_filter( + course_id, self.config.get("skip_courses", []) ): continue - semestername = course["idnumber"][:4] - # Skip not selected semesters + semestername = (course.get("idnumber") or "")[:4] or "unknown-semester" + # Skip not selected semesters (selected_courses overrides this) if ( - len(self.config.get("selected_courses", [])) == 0 + not selected_courses and self.config.get("only_sync_semester", []) and semestername not in self.config.get("only_sync_semester", []) ): @@ -2003,6 +2087,16 @@ def download_file(self, node): if self._should_skip_url(node.url, f"{node.type} file"): return True + # Respect filetype/name exclusions up front so that excluded files never + # trigger conflict handling, displace local files, or create temp files. + if node.name.split(".")[-1] in self.config.get("exclude_filetypes", []): + return True + if any( + fnmatchcase(node.name, pattern) + for pattern in self.config.get("exclude_files", []) + ): + return True + # If we already downloaded this path during the current run, skip any # further processing. This avoids duplicate downloads and spurious # conflicts when the same remote file appears multiple times in the @@ -2017,17 +2111,30 @@ def download_file(self, node): # Decide whether we need to (re-)download the file at all cached_timemodified = None old_node = None + conflict_rename_pending = False if downloadpath.exists(): if not self.config.get("updatefiles"): return True # Try to find a cached node for this file from the per-course cache. old_node = self._get_old_node_for(node) + # Only trust the cached version markers when the previous run + # actually downloaded the file. Otherwise an update that failed last + # time (e.g. an expired session) gets cached with Moodle's new + # timemodified and would be skipped forever, leaving a stale file. + # Treat a non-downloaded cache entry as if there were no cache at all. + if old_node is not None and not getattr(old_node, "is_downloaded", False): + old_node = None if old_node is not None: cached_timemodified = getattr(old_node, "timemodified", None) old_etag = getattr(old_node, "etag", None) - # If Moodle did not change the file, skip re-download. - if node.timemodified == cached_timemodified: + # If Moodle did not change the file, skip re-download. Only when + # timemodified is meaningful: Sciebo files have no timemodified + # (always None), so this must fall through to the etag check + # below instead of treating None == None as "unchanged". + if cached_timemodified is not None and ( + node.timemodified == cached_timemodified + ): return True # For Sciebo, we use the etag from the previous run as the # remote version marker. If it matches the current etag from @@ -2125,44 +2232,40 @@ def download_file(self, node): ) return True if conflict_mode == "rename": - # Move the locally modified file out of the way before download - conflict_path = self._make_conflict_path(downloadpath) - try: - downloadpath.rename(conflict_path) - logger.warning( - "Detected local changes for %s, moving to %s before " - "downloading updated file from Moodle", - downloadpath, - conflict_path, - ) - except OSError: - logger.exception( - "Failed to move locally modified file %s to %s, " - "skipping Moodle update to avoid data loss", - downloadpath, - conflict_path, - ) - return True + # Defer moving the locally modified file aside until the + # replacement has been fully downloaded, so an aborted or + # failed download (e.g. an expired session returning an HTML + # error page) never leaves the canonical path empty. + conflict_rename_pending = True # conflict_mode == "overwrite": fall through and overwrite - if len(node.name.split(".")) > 0 and node.name.split(".")[ - -1 - ] in self.config.get("exclude_filetypes", []): - return True - - if any( - fnmatchcase(node.name, pattern) - for pattern in self.config.get("exclude_files") - ): - return True - - tmp_downloadpath = downloadpath.with_suffix(downloadpath.suffix + ".temp") + # Hidden, namespaced temp/sidecar names so we never resume from or + # overwrite a file the user happens to own. The sidecar records the + # ETag a partial download was fetched against. + tmp_downloadpath = downloadpath.parent / f".{downloadpath.name}.smmpart" + etag_sidecar = tmp_downloadpath.with_name(tmp_downloadpath.name + ".etag") + + # Only resume a previous partial when we recorded the ETag it was fetched + # against, so we can ask the server (via If-Range) to confirm the remote + # content is unchanged. Without that proof a blind range request could + # splice bytes from a newer version onto an older partial and silently + # corrupt the file. + resume_size = 0 + partial_etag: str | None = None + header = dict() if tmp_downloadpath.exists(): - resume_size = tmp_downloadpath.stat().st_size - header = {"Range": f"bytes={resume_size}-"} - else: - resume_size = 0 - header = dict() + if etag_sidecar.exists(): + try: + partial_etag = etag_sidecar.read_text(encoding="utf-8").strip() + except OSError: + partial_etag = None + if partial_etag: + resume_size = tmp_downloadpath.stat().st_size + header = {"Range": f"bytes={resume_size}-", "If-Range": partial_etag} + else: + # Cannot validate the partial; discard it and start fresh. + tmp_downloadpath.unlink(missing_ok=True) + etag_sidecar.unlink(missing_ok=True) if node.type.lower() == "sciebo file": header = {**header, **node.additional_info} @@ -2170,25 +2273,26 @@ def download_file(self, node): self.session.get(node.url, headers=header, stream=True) ) as response: etag_header = response.headers.get("ETag") - # If we attempted to resume but the server did not honor the Range - # header (status != 206), fallback to a full download and ignore - # the existing partial file to avoid corrupting PDFs or other - # content by appending a second full copy. - if resume_size and response.status_code == 200: - resume_size = 0 - tmp_downloadpath.unlink(missing_ok=True) - - if not self._download_response_is_usable(node, response, downloadpath): - return False - if resume_size and response.status_code != 206: - logger.warning( - "Skipping download of %s from %s because the server returned " - "HTTP %s instead of partial content for a resumed download", - downloadpath, - node.url, - response.status_code, + if resume_size: + # The remote content differs from our partial when the server + # ignores the range (any non-206) or cannot prove that the + # returned tail belongs to the same ETag as the saved partial. + valid_resume = ( + response.status_code == 206 and etag_header == partial_etag ) + version_changed = not valid_resume + if version_changed: + resume_size = 0 + tmp_downloadpath.unlink(missing_ok=True) + etag_sidecar.unlink(missing_ok=True) + if response.status_code == 206: + # This 206 body is only a tail, and without an exact + # ETag match it cannot be safely appended. Restart fresh + # on the next run. + return False + + if not self._download_response_is_usable(node, response, downloadpath): return False content = response.iter_content(self.block_size) @@ -2216,6 +2320,13 @@ def download_file(self, node): if resume_size: progress_bar.update(resume_size) downloadpath.parent.mkdir(parents=True, exist_ok=True) + # Record the ETag this partial is being fetched against so an + # interrupted download can be safely resumed next time. + if etag_header: + try: + etag_sidecar.write_text(etag_header, encoding="utf-8") + except OSError: + pass mode = "ab" if resume_size else "wb" with tmp_downloadpath.open(mode) as file: if first_chunk: @@ -2225,7 +2336,34 @@ def download_file(self, node): progress_bar.update(len(data)) file.write(data) progress_bar.close() - tmp_downloadpath.rename(downloadpath) + + # The replacement is now fully on disk. Only at this point do we move + # a conflicting local file aside, so a failure above never empties + # the canonical path. + if conflict_rename_pending: + conflict_path = self._make_conflict_path(downloadpath) + try: + downloadpath.rename(conflict_path) + logger.warning( + "Detected local changes for %s, moved to %s before " + "installing the updated file from Moodle", + downloadpath, + conflict_path, + ) + except OSError: + logger.exception( + "Failed to move locally modified file %s to %s; keeping " + "it and discarding the downloaded update to avoid data " + "loss", + downloadpath, + conflict_path, + ) + tmp_downloadpath.unlink(missing_ok=True) + etag_sidecar.unlink(missing_ok=True) + return True + + os.replace(tmp_downloadpath, downloadpath) + etag_sidecar.unlink(missing_ok=True) # Align the local mtime with Moodle's timemodified to detect local # changes on subsequent runs. if getattr(node, "timemodified", None) is not None: diff --git a/tests/test_course_prefix_handling.py b/tests/test_course_prefix_handling.py index 2cd9b98..a01ebef 100644 --- a/tests/test_course_prefix_handling.py +++ b/tests/test_course_prefix_handling.py @@ -116,3 +116,32 @@ def test_same_name_with_different_urls_still_gets_stable_suffixes(): assert "Slides" not in names for name in names: assert name.startswith("Slides_") + + +def test_clashing_files_without_name_clash_id_use_url_for_distinct_names(): + # Direct-link / direct-content / embedded nodes pass name_clash_id=None. + # Two such same-named files with different URLs must still get distinct + # names (falling back to the URL) instead of both hashing md5("None"). + root = Node("", -1, "Root", None) + section = root.add_child("General", None, "Section") + section.add_child( + "slides.pdf", + None, + "Linked file [application/pdf]", + url="https://a.example/slides.pdf", + name_clash_id=None, + ) + section.add_child( + "slides.pdf", + None, + "Linked file [application/pdf]", + url="https://b.example/slides.pdf", + name_clash_id=None, + ) + + root.remove_children_nameclashes() + + names = [child.name for child in section.children] + assert len(names) == 2 + assert len(set(names)) == 2 + assert "slides.pdf" not in names diff --git a/tests/test_download_behavior.py b/tests/test_download_behavior.py index fb87852..98bac6d 100644 --- a/tests/test_download_behavior.py +++ b/tests/test_download_behavior.py @@ -14,12 +14,18 @@ def sha1(data: bytes) -> str: return hashlib.sha1(data).hexdigest() -def seed_course_cache(config, *, timemodified, etag): - """Write a per-course cache to disk describing a previously synced file.""" +def seed_course_cache(config, *, timemodified, etag, is_downloaded=True): + """Write a per-course cache to disk describing a previously synced file. + + A real cache is written after the download walk, so a file that was + successfully fetched is marked is_downloaded=True; pass False to simulate a + previous run whose download failed. + """ cache_syncer = make_syncer(config) - cached_root, _ = build_single_file_tree( + cached_root, cached_file = build_single_file_tree( "slides.pdf", URL, timemodified=timemodified, etag=etag ) + cached_file.is_downloaded = is_downloaded cache_syncer.root_node = cached_root cache_syncer.cache_root_node() @@ -55,8 +61,8 @@ def test_download_streams_chunks_to_disk_and_records_metadata(tmp_path): assert syncer.download_file(file_node) is True assert download_path.read_bytes() == b"".join(chunks) - # The temporary part-file is renamed away once the download completes. - assert not download_path.with_suffix(download_path.suffix + ".temp").exists() + # The temp part-file and its etag sidecar are cleaned up on completion. + assert list(download_path.parent.glob(".*.smmpart*")) == [] # mtime is aligned with Moodle's timemodified so later runs detect changes. assert int(download_path.stat().st_mtime) == 1710000500 # The ETag is persisted on the node for the next run's change detection. @@ -209,6 +215,43 @@ def test_unchanged_timemodified_skips_download_despite_local_edit(tmp_path): assert list(download_path.parent.glob("*.syncconflict.*")) == [] +def test_failed_previous_download_is_retried_not_skipped(tmp_path): + # The cache records Moodle's timemodified even when the previous download + # failed (is_downloaded=False). Such an entry must not suppress a retry, + # otherwise a stale file would be kept forever. + config = {"basedir": str(tmp_path), "updatefiles": True} + seed_course_cache(config, timemodified=1710000300, etag=None, is_downloaded=False) + syncer, file_node = make_run_syncer(config, timemodified=1710000300) + download_path = syncer.get_sanitized_node_path(file_node) + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(b"OLD STALE VERSION") + syncer.session.add( + "GET", + URL, + FakeResponse( + headers={"Content-Type": "application/pdf"}, chunks=[b"NEW CORRECT VERSION"] + ), + ) + + assert syncer.download_file(file_node) is True + assert syncer.session.count("GET", URL) == 1 + assert download_path.read_bytes() == b"NEW CORRECT VERSION" + + +def test_successful_previous_download_with_same_timemodified_is_skipped(tmp_path): + # The complement: a downloaded cache entry with an unchanged timemodified is + # still skipped without contacting the server. + config = {"basedir": str(tmp_path), "updatefiles": True} + seed_course_cache(config, timemodified=1710000300, etag=None, is_downloaded=True) + syncer, file_node = make_run_syncer(config, timemodified=1710000300) + download_path = syncer.get_sanitized_node_path(file_node) + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(b"already downloaded") + + assert syncer.download_file(file_node) is True + assert syncer.session.calls == [] + + def test_etag_failure_falls_back_to_timestamp_heuristic_conflict(tmp_path, monkeypatch): # A faulty ETag cache is treated as if there were no cached ETag, so the # timestamp heuristic decides. Here the local mtime differs from the cached @@ -250,3 +293,306 @@ def boom(path, etag): assert syncer.download_file(file_node) is True assert download_path.read_bytes() == new_body assert list(download_path.parent.glob("*.syncconflict.*")) == [] + + +# -------------------------------------------------------------------------- +# Failed/aborted update must not empty the canonical path (bug #1) +# -------------------------------------------------------------------------- + + +def test_rename_conflict_failed_html_update_preserves_canonical_file(tmp_path): + # An expired session returns an HTML login page that masquerades as a new + # version. The download is rejected; the user's file must stay in place and + # not be displaced to a side-car (which would empty the canonical path). + syncer, file_node, download_path, local_modified = _setup_conflict( + tmp_path, "rename" + ) + syncer.session.add( + "GET", + URL, + FakeResponse( + headers={"Content-Type": "text/html"}, + text="login", + ), + ) + + assert syncer.download_file(file_node) is False + assert download_path.exists() + assert download_path.read_bytes() == local_modified + assert list(download_path.parent.glob("*.syncconflict.*")) == [] + + +def test_rename_conflict_non_2xx_update_preserves_canonical_file(tmp_path): + syncer, file_node, download_path, local_modified = _setup_conflict( + tmp_path, "rename" + ) + syncer.session.add("GET", URL, FakeResponse(status_code=403, text="forbidden")) + + assert syncer.download_file(file_node) is False + assert download_path.read_bytes() == local_modified + assert list(download_path.parent.glob("*.syncconflict.*")) == [] + + +def test_excluded_filetype_existing_file_is_not_touched(tmp_path): + # Exclusions are honored before any conflict handling, so an excluded file + # that already exists is never displaced or downloaded. + config = { + "basedir": str(tmp_path), + "updatefiles": True, + "update_files_conflict": "rename", + "exclude_filetypes": ["pdf"], + } + seed_course_cache(config, timemodified=1710000300, etag=sha1(b"original")) + syncer, file_node = make_run_syncer(config, timemodified=1710000400) + download_path = syncer.get_sanitized_node_path(file_node) + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(b"locally edited content") + + # No GET route registered: a request would raise in the fake session. + assert syncer.download_file(file_node) is True + assert download_path.read_bytes() == b"locally edited content" + assert syncer.session.calls == [] + assert list(download_path.parent.glob("*.syncconflict.*")) == [] + + +# -------------------------------------------------------------------------- +# Safe resume of partial downloads (bug #2) +# -------------------------------------------------------------------------- + + +def _seed_partial(syncer, file_node, body, etag): + """Write a hidden partial download plus its etag sidecar for ``file_node``.""" + download_path = syncer.get_sanitized_node_path(file_node) + download_path.parent.mkdir(parents=True, exist_ok=True) + partial = download_path.parent / f".{download_path.name}.smmpart" + partial.write_bytes(body) + partial.with_name(partial.name + ".etag").write_text(etag, encoding="utf-8") + return download_path + + +def test_resume_appends_when_remote_unchanged(tmp_path): + config = {"basedir": str(tmp_path)} + syncer, file_node = make_run_syncer(config, timemodified=1710000500) + download_path = _seed_partial(syncer, file_node, b"HEAD-", '"v1"') + syncer.session.add( + "GET", + URL, + FakeResponse( + status_code=206, + headers={"Content-Type": "application/pdf", "ETag": '"v1"'}, + chunks=[b"TAIL"], + ), + ) + + assert syncer.download_file(file_node) is True + # The partial head is kept and the resumed tail appended. + assert download_path.read_bytes() == b"HEAD-TAIL" + assert list(download_path.parent.glob(".*.smmpart*")) == [] + + +def test_resume_discards_partial_when_remote_served_full_content(tmp_path): + # If-Range honored: the remote changed, so the server sends a 200 with the + # full new body. The stale partial must be discarded, not appended to. + config = {"basedir": str(tmp_path)} + syncer, file_node = make_run_syncer(config, timemodified=1710000500) + download_path = _seed_partial(syncer, file_node, b"OLD-PARTIAL", '"v1"') + syncer.session.add( + "GET", + URL, + FakeResponse( + status_code=200, + headers={"Content-Type": "application/pdf", "ETag": '"v2"'}, + chunks=[b"FULL-NEW-CONTENT"], + ), + ) + + assert syncer.download_file(file_node) is True + assert download_path.read_bytes() == b"FULL-NEW-CONTENT" + assert list(download_path.parent.glob(".*.smmpart*")) == [] + + +def test_resume_aborts_when_server_ignores_if_range(tmp_path): + # Some servers honor Range but ignore If-Range, returning a 206 tail of a + # changed file. The mismatched ETag must be detected so we discard the + # partial and retry fresh next run instead of corrupting the file. + config = {"basedir": str(tmp_path)} + syncer, file_node = make_run_syncer(config, timemodified=1710000500) + download_path = _seed_partial(syncer, file_node, b"OLD-PARTIAL", '"v1"') + syncer.session.add( + "GET", + URL, + FakeResponse( + status_code=206, + headers={"Content-Type": "application/pdf", "ETag": '"v2"'}, + chunks=[b"TAIL-OF-NEW-VERSION"], + ), + ) + + assert syncer.download_file(file_node) is False + # Nothing corrupt is left behind; the stale partial is gone. + assert not download_path.exists() + assert list(download_path.parent.glob(".*.smmpart*")) == [] + + +def test_resume_aborts_when_partial_response_has_no_etag(tmp_path): + # A 206 response without an ETag cannot prove that the returned tail belongs + # to the same remote version as the saved partial. + config = {"basedir": str(tmp_path)} + syncer, file_node = make_run_syncer(config, timemodified=1710000500) + download_path = _seed_partial(syncer, file_node, b"OLD-PARTIAL", '"v1"') + syncer.session.add( + "GET", + URL, + FakeResponse( + status_code=206, + headers={"Content-Type": "application/pdf"}, + chunks=[b"UNVERIFIED-TAIL"], + ), + ) + + assert syncer.download_file(file_node) is False + assert not download_path.exists() + assert list(download_path.parent.glob(".*.smmpart*")) == [] + + +def test_unrecognized_partial_without_sidecar_is_not_resumed(tmp_path): + # A leftover partial with no etag sidecar cannot be validated, so it is + # discarded and a fresh full download is performed. + config = {"basedir": str(tmp_path)} + syncer, file_node = make_run_syncer(config, timemodified=1710000500) + download_path = syncer.get_sanitized_node_path(file_node) + download_path.parent.mkdir(parents=True, exist_ok=True) + (download_path.parent / f".{download_path.name}.smmpart").write_bytes(b"STALE") + syncer.session.add( + "GET", + URL, + FakeResponse(headers={"Content-Type": "application/pdf"}, chunks=[b"FRESH"]), + ) + + assert syncer.download_file(file_node) is True + assert download_path.read_bytes() == b"FRESH" + assert list(download_path.parent.glob(".*.smmpart*")) == [] + + +# -------------------------------------------------------------------------- +# Sciebo change detection via ETag (no timemodified) (bug #8) +# -------------------------------------------------------------------------- + +SCIEBO_URL = "https://rwth-aachen.sciebo.de/public.php/webdav/notes.pdf" + + +def _sciebo_tree(etag, is_downloaded=False): + root = Node("", -1, "Root", None) + semester = root.add_child("26ss", None, "Semester") + course = semester.add_child("Download Course", 301, "Course") + section = course.add_child("General", 401, "Section") + file_node = section.add_child( + "notes.pdf", + None, + "Sciebo File", + url=SCIEBO_URL, + additional_info={"Authorization": "Basic x"}, + etag=etag, + ) + file_node.is_downloaded = is_downloaded + return root, file_node + + +def _seed_sciebo_cache(config, etag, content): + cache_syncer = make_syncer(config) + root, file_node = _sciebo_tree(etag, is_downloaded=True) + cache_syncer.root_node = root + cache_syncer.cache_root_node() + download_path = make_syncer(config).get_sanitized_node_path(file_node) + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(content) + return download_path + + +def test_sciebo_changed_etag_triggers_redownload(tmp_path): + # Sciebo files have no timemodified, so a changed ETag is the only signal. + config = {"basedir": str(tmp_path), "updatefiles": True} + old = b"old sciebo content" + download_path = _seed_sciebo_cache(config, sha1(old), old) + syncer = make_syncer(config) + syncer.session = FakeSession() + new = b"new sciebo content" + syncer.session.add( + "GET", + SCIEBO_URL, + FakeResponse(headers={"Content-Type": "application/pdf"}, chunks=[new]), + ) + _, current = _sciebo_tree(sha1(new)) + + assert syncer.download_file(current) is True + assert syncer.session.count("GET", SCIEBO_URL) == 1 + assert download_path.read_bytes() == new + assert list(download_path.parent.glob("*.syncconflict.*")) == [] + + +def test_sciebo_unchanged_etag_skips_download(tmp_path): + config = {"basedir": str(tmp_path), "updatefiles": True} + content = b"sciebo content" + download_path = _seed_sciebo_cache(config, sha1(content), content) + syncer = make_syncer(config) + syncer.session = FakeSession() + _, current = _sciebo_tree(sha1(content)) # unchanged etag + + assert syncer.download_file(current) is True + assert syncer.session.calls == [] + assert download_path.read_bytes() == content + + +# -------------------------------------------------------------------------- +# Cache reflects on-disk state, not optimistic Moodle markers (refinement) +# -------------------------------------------------------------------------- + + +def _cached_file_node(config, course_node): + cached_course = make_syncer(config)._get_course_cache_root(course_node) + return cached_course.children[0].children[0] # General -> slides.pdf + + +def test_cache_preserves_markers_for_failed_download_over_existing_file(tmp_path): + config = {"basedir": str(tmp_path), "updatefiles": True} + v1 = b"version one" + seed_course_cache(config, timemodified=100, etag=sha1(v1), is_downloaded=True) + download_path = make_syncer(config).get_sanitized_node_path( + build_single_file_tree("slides.pdf", URL)[1] + ) + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(v1) + + # A run where Moodle reports a new version (200) but the download did not + # happen (is_downloaded=False) and the old file is still on disk. + syncer = make_syncer(config) + root, file_node = build_single_file_tree( + "slides.pdf", URL, timemodified=200, etag="poisoned" + ) + file_node.is_downloaded = False + syncer.root_node = root + syncer.cache_root_node() + + cached_file = _cached_file_node(config, root.children[0].children[0]) + # The cache keeps the on-disk version's markers, not Moodle's new ones. + assert cached_file.timemodified == 100 + assert cached_file.etag == sha1(v1) + assert cached_file.is_downloaded is True + + +def test_cache_does_not_preserve_markers_when_file_absent(tmp_path): + config = {"basedir": str(tmp_path), "updatefiles": True} + seed_course_cache(config, timemodified=100, etag="old", is_downloaded=True) + + # Failed download with no file on disk: nothing to preserve. + syncer = make_syncer(config) + root, file_node = build_single_file_tree( + "slides.pdf", URL, timemodified=200, etag="new" + ) + file_node.is_downloaded = False + syncer.root_node = root + syncer.cache_root_node() + + cached_file = _cached_file_node(config, root.children[0].children[0]) + assert cached_file.timemodified == 200 + assert cached_file.is_downloaded is False diff --git a/tests/test_storage_safety.py b/tests/test_storage_safety.py index 47b3be7..ea6f02f 100644 --- a/tests/test_storage_safety.py +++ b/tests/test_storage_safety.py @@ -50,6 +50,8 @@ def test_download_uses_course_cache_to_skip_unchanged_file(tmp_path): url="https://moodle.rwth-aachen.de/pluginfile.php/301/slides.pdf", timemodified=1710000300, ) + # A real cache is written after a successful download. + cached_file.is_downloaded = True cached_syncer.root_node = cached_root cached_syncer.cache_root_node() diff --git a/tests/test_sync_filtering.py b/tests/test_sync_filtering.py index e9d7ee1..693456a 100644 --- a/tests/test_sync_filtering.py +++ b/tests/test_sync_filtering.py @@ -49,3 +49,50 @@ def test_skip_courses_and_semester_filter_limit_synced_courses(): "Semester | 26ss | | | ", "Course | 26ss/Current Semester | | | ", ] + + +# Course ids that are substrings of one another, to pin down exact-id matching. +SUBSTRING_COURSES = [ + {"id": 1, "shortname": "Course One", "idnumber": "26ss-1"}, + {"id": 2, "shortname": "Course Two", "idnumber": "26ss-2"}, + {"id": 12, "shortname": "Course Twelve", "idnumber": "26ss-12"}, + {"id": 123, "shortname": "Course OneTwoThree", "idnumber": "26ss-123"}, +] + + +def _run_filter(config): + synced = [] + syncer = make_syncer(config) + syncer.get_all_courses = lambda: SUBSTRING_COURSES # type: ignore[method-assign] + syncer.get_course = lambda course_id: synced.append(course_id) or [] # type: ignore[method-assign] + syncer.session = FakeSession() + syncer.sync() + return synced + + +def test_selected_courses_match_by_exact_id_not_substring(): + # Selecting course 12 must not also pull in courses 1 and 2. + synced = _run_filter( + {"selected_courses": ["https://moodle.rwth-aachen.de/course/view.php?id=12"]} + ) + assert synced == [12] + + +def test_skip_courses_match_by_exact_id_not_substring(): + # Skipping course 12 must not silently drop courses 1 and 2. + synced = _run_filter( + {"skip_courses": ["https://moodle.rwth-aachen.de/course/view.php?id=12"]} + ) + assert synced == [1, 2, 123] + + +def test_bare_numeric_id_entry_is_accepted(): + synced = _run_filter({"selected_courses": ["12"]}) + assert synced == [12] + + +def test_selected_courses_override_skip_courses(): + # A course present in both lists is synced: selected_courses wins. + url = "https://moodle.rwth-aachen.de/course/view.php?id=12" + synced = _run_filter({"selected_courses": [url], "skip_courses": [url]}) + assert synced == [12]