diff --git a/.env.example b/.env.example index 95def211..0177549f 100644 --- a/.env.example +++ b/.env.example @@ -15,4 +15,6 @@ OPT_TAG_GITHUB=true # Tag Link with "GitHub" OPT_TAG_GITHUBSTARS=true # Tag Link with "GitHub Stars" OPT_TAG_LANGUAGE=false # Tag Link with Language of repo (e.g. Python or JavaScript) OPT_TAG_USERNAME=false # Tag GitHub username -OPT_TAG_CUSTOM=false # Add custom tags, separated by commas (e.g. tag1,tag2) \ No newline at end of file +OPT_TAG_CUSTOM=false # Add custom tags, separated by commas (e.g. tag1,tag2) +OPT_DELETE_DUPLICATE=false # Delete existing duplicate links from Collection (COLLECTION_ID) +DEBUG=false # Enable/Disable Debug Mode \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 26910d2f..cab9f268 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,6 +16,8 @@ ENV OPT_TAG_GITHUBSTARS=true ENV OPT_TAG_LANGUAGE=false ENV OPT_TAG_USERNAME=false ENV OPT_TAG_CUSTOM=false +ENV OPT_DELETE_DUPLICATE=false +ENV DEBUG=false WORKDIR /app diff --git a/README.md b/README.md index 1a16dc35..dbcc4af1 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ curl -LsSf https://astral.sh/uv/install.sh | sh OPT_TAG_GITHUBSTARS=true OPT_TAG_LANGUAGE=false OPT_TAG_USERNAME=false + OPT_DELETE_DUPLICATE=false + DEBUG=false ``` ## Usage @@ -75,6 +77,8 @@ uv run starwarden.py -id YOUR_COLLECTION_ID | OPT_TAG_LANGUAGE | false | Tag Link with Language of repo (e.g. Python or JavaScript) | | OPT_TAG_USERNAME | false | Tag GitHub username | | OPT_TAG_CUSTOM | | Add custom tags, separated by commas (e.g. tag1,tag2) | +| OPT_DELETE_DUPLICATE| false | Delete existing duplicate links from Collection (COLLECTION_ID) | +| DEBUG | false | Enable/Disable debug mode | ## Unsupervised Updates @@ -97,6 +101,8 @@ For automated, unsupervised updates, you can use Docker with the provided docker OPT_TAG_GITHUBSTARS=true OPT_TAG_LANGUAGE=false OPT_TAG_USERNAME=false + OPT_DELETE_DUPLICATE=false + DEBUG=false ``` 3. Use the following `docker-compose.yml` file: diff --git a/starwarden/config.py b/starwarden/config.py index 8c9ea0d7..c1cba73e 100644 --- a/starwarden/config.py +++ b/starwarden/config.py @@ -28,12 +28,14 @@ def load_env(): "github_username": os.getenv("GITHUB_USERNAME"), "linkwarden_url": os.getenv("LINKWARDEN_URL"), "linkwarden_token": os.getenv("LINKWARDEN_TOKEN"), + "debug": os.getenv("DEBUG", "false").lower() in ("true", "1"), "opt_tag": os.getenv("OPT_TAG", "false").lower() in ("true", "1"), "opt_tag_github": os.getenv("OPT_TAG_GITHUB", "false").lower() in ("true", "1"), "opt_tag_githubStars": os.getenv("OPT_TAG_GITHUBSTARS", "false").lower() in ("true", "1"), "opt_tag_language": os.getenv("OPT_TAG_LANGUAGE", "false").lower() in ("true", "1"), "opt_tag_username": os.getenv("OPT_TAG_USERNAME", "false").lower() in ("true", "1"), "opt_tag_custom": os.getenv("OPT_TAG_CUSTOM", ""), + "opt_delete_duplicate": os.getenv("OPT_DELETE_DUPLICATE", "false").lower() in ("true", "1"), "APPRISE_URLS": os.getenv("APPRISE_URLS"), "DOCKERIZED": os.getenv("DOCKERIZED", "false").lower() in ("true", "1"), } diff --git a/starwarden/linkwarden_api.py b/starwarden/linkwarden_api.py index 45eb8450..265627da 100644 --- a/starwarden/linkwarden_api.py +++ b/starwarden/linkwarden_api.py @@ -10,38 +10,59 @@ LINK_EXISTS_GLOBALLY = object() -def get_existing_links(linkwarden_url, linkwarden_token, collection_id): - url = f"{linkwarden_url.rstrip('/')}/api/v1/links" +def get_existing_links(linkwarden_url, linkwarden_token, collection_id, delete_duplicate=False): + url = f"{linkwarden_url.rstrip('/')}/api/v1/search" headers = { "Authorization": f"Bearer {linkwarden_token}", "Content-Type": "application/json", } - cursor = 0 - seen_links = set() + cursor = None + seen_urls = set() + duplicate_link_ids = [] + total_links_processed = 0 + while True: try: logger.debug(f"Fetching links from cursor {cursor} for collection {collection_id}") + params = {"collectionId": collection_id, "sort": 1} + if cursor is not None: + params["cursor"] = cursor + response = requests.get( url, - params={"collectionId": collection_id, "cursor": cursor, "sort": 1}, + params=params, headers=headers, timeout=30, ) response.raise_for_status() - data = response.json() - links = data.get("response", []) + data = response.json() + links = data.get("data", {}).get("links", []) + next_cursor = data.get("data", {}).get("nextCursor") + logger.debug(f"Fetched {len(links)} links from cursor {cursor}") - - new_links = [link["url"] for link in links if link["url"] not in seen_links] - if not new_links: - logger.info(f"No new links found from cursor {cursor}. Stopping pagination.") - break - - seen_links.update(new_links) - yield from new_links - if not links: + total_links_processed += len(links) + + for link in links: + link_url = link["url"] + link_id = link["id"] + + if link_url in seen_urls: + # Found a duplicate + logger.debug(f"Found duplicate link: {link_url} (ID: {link_id})") + duplicate_link_ids.append(link_id) + # Do not yield duplicates since they are queued for deletion + continue + else: + seen_urls.add(link_url) + # Only yield URLs that are not marked as duplicates + yield link_url + + if next_cursor is None: + logger.info("Reached end of pagination (no nextCursor in response)") break - cursor = links[-1].get("id") + else: + cursor = next_cursor + logger.debug(f"Advancing to next cursor: {cursor}") except requests.RequestException as e: logger.error(f"Error fetching links from cursor {cursor}: {str(e)}") @@ -49,6 +70,26 @@ def get_existing_links(linkwarden_url, linkwarden_token, collection_id): logger.error(f"Response status code: {e.response.status_code}") logger.error(f"Response content: {e.response.text}") break + + # Handle duplicate deletion if requested + if delete_duplicate and duplicate_link_ids: + logger.info(f"Found {len(duplicate_link_ids)} duplicate links to delete: {duplicate_link_ids}") + + batch_size = 100 + total_deleted = 0 + + for i in range(0, len(duplicate_link_ids), batch_size): + batch = duplicate_link_ids[i:i + batch_size] + logger.debug(f"Deleting batch {i//batch_size + 1}: {len(batch)} links") + + deleted_count = delete_links(linkwarden_url, linkwarden_token, batch) + if deleted_count is not None: + total_deleted += deleted_count + else: + logger.error(f"Failed to delete batch {i//batch_size + 1}") + + logger.info(f"Successfully deleted {total_deleted} duplicate links out of {len(duplicate_link_ids)} found") + logger.info(f"Processed {total_links_processed} total links in collection {collection_id}") def get_collections(linkwarden_url, linkwarden_token): @@ -161,3 +202,54 @@ def upload_link(linkwarden_url, linkwarden_token, collection_id, repo, tags): logger.error(f"Response content: {e.response.text}") return None + + +def delete_links(linkwarden_url, linkwarden_token, link_ids): + url = f"{linkwarden_url.rstrip('/')}/api/v1/links" + headers = { + "Authorization": f"Bearer {linkwarden_token}", + "Content-Type": "application/json", + } + + request_data = { + "linkIds": link_ids + } + + logger.debug(f"Attempting to delete {len(link_ids)} links: {link_ids}") + + try: + response = requests.delete( + url, + headers=headers, + json=request_data, + timeout=30, + ) + + logger.debug(f"Delete response status code: {response.status_code}") + logger.debug(f"Delete response content: {response.text}") + + if response.status_code == 401: + logger.error("Unauthorized: Invalid or expired token for delete operation") + return None + + response.raise_for_status() + response_json = response.json() + + deleted_count = response_json.get("response", {}).get("count", 0) + + logger.info(f"Successfully deleted {deleted_count} links out of {len(link_ids)} requested") + + if deleted_count != len(link_ids): + logger.warning(f"Expected to delete {len(link_ids)} links, but only {deleted_count} were deleted") + + return deleted_count + + except Timeout: + logger.error("Request timed out while deleting links") + return None + except requests.RequestException as e: + logger.error(f"Error deleting links from Linkwarden: {str(e)}") + if hasattr(e, "response") and e.response is not None: + logger.error(f"Response status code: {e.response.status_code}") + logger.error(f"Response content: {e.response.text}") + return None \ No newline at end of file