diff --git a/README.md b/README.md index 8d41745..92c6061 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,17 @@ so the shape of your library is version-controlled even if the contents are not. ## Requirements ```bash -pip3 install pymupdf +pip3 install -r requirements.txt ``` -Python 3.10+. No other dependencies. +Or install individually: + +```bash +pip3 install pymupdf # required for convert.py +pip3 install internetarchive # required for archive.org downloads +``` + +Python 3.10+. --- @@ -51,6 +58,10 @@ Python 3.10+. No other dependencies. ### 1. Download a collection +The source is auto-detected from the URL. Both modes share `--output-dir`, `--delay`, and `--dry-run`. + +**World Radio History** — scrapes PDF links from an archive page: + ```bash # Preview what would be downloaded python3 download.py "https://www.worldradiohistory.com/ETI_Magazine.htm" --dry-run @@ -64,6 +75,36 @@ python3 download.py "https://www.worldradiohistory.com/ETI_Magazine.htm" \ --filter "1970" --output-dir collections/eti/pdfs ``` +**archive.org** — downloads files from a single archive.org item by identifier. +Each issue typically has two PDF variants: a plain image PDF and a `_text.pdf` with an +Abbyy OCR text layer. 
The `--pdf-format` flag controls which variant is downloaded +(`text` is the default since `convert.py` extracts from the OCR layer): + +```bash +# Download all OCR PDFs from an archive.org item +python3 download.py "https://archive.org/details/ElektorMagazine" \ + --output-dir collections/elektor/pdfs + +# Download only issues from a specific range of years +python3 download.py "https://archive.org/details/ElektorMagazine" \ + --output-dir collections/elektor/pdfs \ + --year-from 1974 --year-to 1989 + +# Download image-only PDFs (no OCR layer) +python3 download.py "https://archive.org/details/ElektorMagazine" \ + --pdf-format image --output-dir collections/elektor/pdfs + +# Preview without downloading +python3 download.py "https://archive.org/details/ElektorMagazine" \ + --year-from 1980 --dry-run +``` + +| Flag | Description | Default | +| --- | --- | --- | +| `--pdf-format` | `text` (_text.pdf, OCR), `image` (plain PDF), `both` | `text` | +| `--year-from` | Only download files whose filename contains a year >= this value (files with no year in the name are always included) | — | +| `--year-to` | Only download files whose filename contains a year <= this value (files with no year in the name are always included) | — | + ### 2. Probe the collection structure ```bash diff --git a/download.py b/download.py index fa94992..4daa425 100644 --- a/download.py +++ b/download.py @@ -1,26 +1,45 @@ #!/usr/bin/env python3 """ -Download all magazine PDFs from a World Radio History archive page. +Download magazine PDFs from a World Radio History archive page or an archive.org item. -Fetches the given page, extracts all PDF links, and downloads them into a -local directory — preserving the remote subdirectory structure so issues and -special collections land in separate folders. Already-downloaded files are -skipped, making the script safe to re-run. +Source is auto-detected from the URL: + - archive.org/details/... 
→ archive.org item download (requires internetarchive) + - all other URLs → World Radio History link-scrape mode Usage: python3 download.py URL [--output-dir DIR] [--delay SECONDS] [--dry-run] + [--filter STRING] + [--pdf-format {text,image,both}] + [--year-from YEAR] [--year-to YEAR] Args: - url World Radio History page URL to scrape PDF links from - --output-dir Local directory to download into (default: ./Downloads/) - --delay Seconds to wait between downloads (default: 2) - --dry-run List what would be downloaded without downloading anything - --filter Only download URLs containing this string (e.g. "1979") + url Archive page URL or archive.org item URL + --output-dir Local directory to download into (default: ./Downloads/HOSTNAME/) + --delay Seconds to wait between downloads (default: 2) + --dry-run List what would be downloaded without downloading anything + --filter Only download URLs/filenames containing this string (WRH mode only) + --pdf-format Which PDF variant to download from archive.org: + text → *_text.pdf only — Abbyy OCR overlay (default) + image → plain *.pdf only — image container, may lack OCR + both → download both variants + --year-from Only download files whose filename contains a year >= this value + --year-to Only download files whose filename contains a year <= this value Examples: - python3 download.py https://www.worldradiohistory.com/ETI_Magazine.htm - python3 download.py https://www.worldradiohistory.com/ETI_Magazine.htm --dry-run - python3 download.py https://www.worldradiohistory.com/ETI_Magazine.htm --filter "UK/Electronics-Today-UK" + # World Radio History + python3 download.py "https://www.worldradiohistory.com/ETI_Magazine.htm" + python3 download.py "https://www.worldradiohistory.com/ETI_Magazine.htm" --dry-run + python3 download.py "https://www.worldradiohistory.com/ETI_Magazine.htm" \\ + --filter "UK/Electronics-Today-UK" + + # archive.org + python3 download.py "https://archive.org/details/ElektorMagazine" \\ + --output-dir 
collections/elektor/pdfs \\ + --year-from 1974 --year-to 1989 + python3 download.py "https://archive.org/details/ElektorMagazine" \\ + --pdf-format both --dry-run + +Author: Alister Lewis-Bowen """ import argparse @@ -40,6 +59,80 @@ } +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +def download_file(url: str, dest: Path) -> bool: + """Download a single file, returning True on success. + + @param url: URL to download + @param dest: Local destination path + @return: True if downloaded, False if skipped (already exists) + """ + if dest.exists(): + return False + + dest.parent.mkdir(parents=True, exist_ok=True) + tmp = dest.with_suffix(".tmp") + + try: + req = urllib.request.Request(url, headers=HEADERS) + with urllib.request.urlopen(req, timeout=60) as resp, open(tmp, "wb") as f: + while chunk := resp.read(65536): + f.write(chunk) + tmp.rename(dest) + return True + except Exception as e: + if tmp.exists(): + tmp.unlink() + raise RuntimeError(f"Failed to download {url}: {e}") from e + + +def format_size(path: Path) -> str: + """Format a file's size as a human-readable string. + + @param path: Path to an existing file + @return: Human-readable size string, e.g. "4.2 MB" + """ + size = path.stat().st_size + for unit in ("B", "KB", "MB", "GB"): + if size < 1024: + return f"{size:.1f} {unit}" + size /= 1024 + return f"{size:.1f} TB" + + +def format_size_bytes(size_bytes: int | None) -> str: + """Format a byte count as a human-readable string. + + @param size_bytes: File size in bytes, or None if unknown + @return: Human-readable size string + """ + if size_bytes is None: + return "? B" + size = float(size_bytes) + for unit in ("B", "KB", "MB", "GB"): + if size < 1024: + return f"{size:.1f} {unit}" + size /= 1024 + return f"{size:.1f} TB" + + +def extract_year(filename: str) -> int | None: + """Extract the first four-digit year from a filename. 
+ + @param filename: File name to search + @return: Year as int, or None if not found + """ + m = re.search(r"(\d{4})", filename) + return int(m.group(1)) if m else None + + +# --------------------------------------------------------------------------- +# World Radio History mode +# --------------------------------------------------------------------------- + def fetch_page(url: str) -> str: """Fetch a web page and return its HTML content. @@ -63,7 +156,7 @@ def extract_pdf_links(html: str, base_url: str) -> list[str]: for href in hrefs: full = urllib.parse.urljoin(base_url, href) parsed = urllib.parse.urlparse(full) - encoded_path = urllib.parse.quote(parsed.path, safe='/:@!$&\'()*+,;=') + encoded_path = urllib.parse.quote(parsed.path, safe="/:@!$&'()*+,;=") full = parsed._replace(path=encoded_path).geturl() absolute.add(full) return sorted(absolute) @@ -78,68 +171,16 @@ def url_to_local_path(pdf_url: str, output_dir: Path, base_url: str) -> Path: @return: Local Path where the file should be saved """ parsed = urllib.parse.urlparse(pdf_url) - # Strip leading slash from path rel_path = parsed.path.lstrip("/") - # URL-decode the path (handles %20 etc.) rel_path = urllib.parse.unquote(rel_path) return output_dir / rel_path -def download_file(url: str, dest: Path) -> bool: - """Download a single file, returning True on success. +def run_worldradiohistory(args: argparse.Namespace) -> None: + """Download PDFs from a World Radio History archive page. 
- @param url: URL to download - @param dest: Local destination path - @return: True if downloaded, False if skipped (already exists) + @param args: Parsed command-line arguments """ - if dest.exists(): - return False # Already downloaded - - dest.parent.mkdir(parents=True, exist_ok=True) - tmp = dest.with_suffix(".tmp") - - try: - req = urllib.request.Request(url, headers=HEADERS) - with urllib.request.urlopen(req, timeout=60) as resp, open(tmp, "wb") as f: - while chunk := resp.read(65536): - f.write(chunk) - tmp.rename(dest) - return True - except Exception as e: - if tmp.exists(): - tmp.unlink() - raise RuntimeError(f"Failed to download {url}: {e}") from e - - -def format_size(path: Path) -> str: - """Format a file's size as a human-readable string. - - @param path: Path to an existing file - @return: Human-readable size string, e.g. "4.2 MB" - """ - size = path.stat().st_size - for unit in ("B", "KB", "MB", "GB"): - if size < 1024: - return f"{size:.1f} {unit}" - size /= 1024 - return f"{size:.1f} TB" - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Download PDFs from a World Radio History archive page" - ) - parser.add_argument("url", help="Archive page URL") - parser.add_argument("--output-dir", type=Path, help="Download root directory (default: ./Downloads)") - parser.add_argument("--delay", type=float, default=2.0, help="Seconds between downloads (default: 2)") - parser.add_argument("--dry-run", action="store_true", help="List files without downloading") - parser.add_argument("--filter", help="Only download URLs containing this string") - args = parser.parse_args() - - parsed_url = urllib.parse.urlparse(args.url) - if not args.output_dir: - args.output_dir = Path("Downloads") / parsed_url.hostname - print(f"Fetching index: {args.url}") try: html = fetch_page(args.url) @@ -193,5 +234,193 @@ def main() -> None: print(f"Files saved to: {args.output_dir}") +# --------------------------------------------------------------------------- 
+# archive.org mode +# --------------------------------------------------------------------------- + +def get_archive_org_item_id(url: str) -> str: + """Extract the archive.org item identifier from a /details/ URL. + + @param url: archive.org URL containing /details/ + @return: Item identifier string + @example: "https://archive.org/details/ElektorMagazine" → "ElektorMagazine" + """ + parsed = urllib.parse.urlparse(url) + parts = [p for p in parsed.path.split("/") if p] + try: + idx = parts.index("details") + return urllib.parse.unquote(parts[idx + 1]) + except (ValueError, IndexError): + print(f"ERROR: Cannot extract item identifier from URL: {url}") + sys.exit(1) + + +def select_archive_files(files: list, pdf_format: str, year_from: int | None, year_to: int | None) -> list: + """Filter archive.org file list by PDF format and optional year range. + + @param files: List of internetarchive File objects + @param pdf_format: One of "text", "image", or "both" + @param year_from: Lower year bound (inclusive), or None + @param year_to: Upper year bound (inclusive), or None + @return: Filtered and sorted list of File objects + """ + # Keep only PDFs + selected = [f for f in files if f.name.lower().endswith(".pdf")] + + # Filter by format variant + if pdf_format == "text": + selected = [f for f in selected if f.name.lower().endswith("_text.pdf")] + elif pdf_format == "image": + selected = [f for f in selected if not f.name.lower().endswith("_text.pdf")] + # "both" keeps all PDFs + + # Filter by year range + if year_from is not None or year_to is not None: + filtered = [] + for f in selected: + year = extract_year(f.name) + if year is None: + filtered.append(f) # no year in name — include by default + continue + if year_from is not None and year < year_from: + continue + if year_to is not None and year > year_to: + continue + filtered.append(f) + selected = filtered + + return sorted(selected, key=lambda f: f.name) + + +def run_archive_org(args: argparse.Namespace) -> None: 
+ """Download PDFs from an archive.org item. + + @param args: Parsed command-line arguments + """ + try: + import internetarchive as ia + except ImportError: + print("ERROR: The 'internetarchive' package is required for archive.org downloads.") + print(" Install it with: pip3 install internetarchive") + sys.exit(1) + + item_id = get_archive_org_item_id(args.url) + print(f"Fetching archive.org item: {item_id}") + + item = ia.get_item(item_id) + all_files = list(item.get_files()) + + selected = select_archive_files( + all_files, + pdf_format=args.pdf_format, + year_from=args.year_from, + year_to=args.year_to, + ) + + year_range = "" + if args.year_from or args.year_to: + lo = str(args.year_from) if args.year_from else "any" + hi = str(args.year_to) if args.year_to else "any" + year_range = f" (years {lo}–{hi})" + + print(f"Found {len(selected)} PDFs{year_range} [format: {args.pdf_format}]") + print(f"Output directory: {args.output_dir}") + print() + + if args.dry_run: + print("DRY RUN — files that would be downloaded:") + for f in selected: + dest = args.output_dir / f.name + status = "EXISTS" if dest.exists() else "NEW" + size = format_size_bytes(f.size) + print(f" [{status}] {f.name} ({size})") + return + + downloaded = 0 + skipped = 0 + errors = 0 + total = len(selected) + + for i, f in enumerate(selected, 1): + dest = args.output_dir / f.name + + if dest.exists(): + print(f"[{i}/{total}] SKIP {f.name}") + skipped += 1 + continue + + file_url = f"https://archive.org/download/{urllib.parse.quote(item_id)}/{urllib.parse.quote(f.name)}" + + try: + download_file(file_url, dest) + size = format_size(dest) + print(f"[{i}/{total}] {f.name} ({size})") + downloaded += 1 + time.sleep(args.delay) + except RuntimeError as e: + print(f"[{i}/{total}] ERROR: {e}") + errors += 1 + + print() + print(f"Done. 
Downloaded: {downloaded} Skipped: {skipped} Errors: {errors}") + print(f"Files saved to: {args.output_dir}") + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main() -> None: + """Parse arguments, detect source, and dispatch to the appropriate downloader.""" + parser = argparse.ArgumentParser( + description="Download PDFs from a World Radio History page or an archive.org item" + ) + parser.add_argument("url", help="Archive page URL or archive.org item URL") + parser.add_argument( + "--output-dir", type=Path, help="Download root directory (default: ./Downloads/)" + ) + parser.add_argument( + "--delay", type=float, default=2.0, help="Seconds between downloads (default: 2)" + ) + parser.add_argument("--dry-run", action="store_true", help="List files without downloading") + + # World Radio History options + parser.add_argument("--filter", help="Only download URLs containing this string (WRH mode only)") + + # archive.org options + parser.add_argument( + "--pdf-format", + choices=["text", "image", "both"], + default="text", + help=( + "Which PDF variant to download from archive.org: " + "text=*_text.pdf (OCR, default), image=plain *.pdf, both=all variants" + ), + ) + parser.add_argument( + "--year-from", + type=int, + metavar="YEAR", + help="Only download files whose filename contains a year >= YEAR", + ) + parser.add_argument( + "--year-to", + type=int, + metavar="YEAR", + help="Only download files whose filename contains a year <= YEAR", + ) + + args = parser.parse_args() + + parsed_url = urllib.parse.urlparse(args.url) + if not args.output_dir: + args.output_dir = Path("Downloads") / parsed_url.hostname + + if "archive.org/details/" in args.url: + run_archive_org(args) + else: + run_worldradiohistory(args) + + if __name__ == "__main__": main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f4a3025 
--- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +pymupdf +internetarchive