diff --git a/convert.py b/convert.py index fbeb2fb..bd63bfe 100644 --- a/convert.py +++ b/convert.py @@ -1,28 +1,33 @@ #!/usr/bin/env python3 -""" -Convert magazine/book PDFs from World Radio History (or similar archives) to -searchable Markdown with rendered page images. +"""Convert magazine/book PDFs to searchable Markdown with rendered page images. Extracts OCR text and renders each page as a PNG, producing per-publication markdown files that embed page images alongside the extracted text. A master index links all publications. Usage: - # Probe a directory to understand PDF structure before converting python3 convert.py --analyze [--input-dir DIR] - - # Convert all PDFs in a directory python3 convert.py [--input-dir DIR] [--output-dir DIR] [--dpi DPI] [--force] + python3 convert.py --global-index COLLECTIONS_ROOT Args: - --analyze Probe PDFs and report structure without converting - --input-dir Directory containing PDFs (required) - --output-dir Output directory for markdown and images (default: ./converted) - --pattern Glob pattern to select PDFs (default: **/*.pdf) - --dpi Render resolution for page images (default: 200) - --force Re-process publications even if output already exists + --analyze Probe PDFs and report structure without converting + --input-dir Directory containing PDFs + --output-dir Output directory for markdown and images (default: ./converted) + --pattern Glob pattern to select PDFs (default: **/*.pdf) + --dpi Render resolution for page images (default: 200) + --force Re-process publications even if output already exists --write-collection-md Auto-generate COLLECTION.md alongside the output directory - --global-index ROOT Generate CATALOGUE.md from all collections under ROOT (standalone mode) + --global-index ROOT Generate CATALOGUE.md from all collections under ROOT + +Author: Alister Lewis-Bowen +Version: 1.0.0 +Date: 2026-04-04 +License: MIT +Dependencies: pymupdf (fitz) +Exit codes: + 0: Success + 1: Error (missing dependency, invalid arguments, or input not found) """ import argparse @@ -67,9 +72,13 @@ def parse_slug(filename: str) -> tuple[str, str]: Tries date, volume, issue, and bare-number patterns in order, falling back to a cleaned version of the filename stem. - @param filename: PDF filename (basename only) - @return: (slug, label) e.g. ("1979-01", "January 1979") or ("vol-3", "Vol. 3") - @example: + Args: + filename: PDF filename (basename only). + + Returns: + Tuple of (slug, label), e.g. ("1979-01", "January 1979") or ("vol-3", "Vol. 3"). + + Example: parse_slug("Hobby-Electronics-1979-01-S-OCR.pdf") # ("1979-01", "January 1979") parse_slug("Bernards-Babani-BP042.pdf") # ("BP042", "BP042") """ @@ -109,9 +118,13 @@ def resolve_slugs(pdfs: list[Path]) -> dict[Path, str]: When multiple PDFs resolve to the same slug, appends the parent directory name to each slug to make them unique. - @param pdfs: List of PDF paths to map - @return: Dict mapping each PDF path to its unique slug - @example: + Args: + pdfs: List of PDF paths to map. + + Returns: + Dict mapping each PDF path to its unique slug. + + Example: # 70s/ETI-1985-08.pdf and 80s/ETI-1985-08.pdf both parse to "1985-08" # resolve_slugs returns {70s/...: "1985-08-70s", 80s/...: "1985-08-80s"} """ @@ -143,9 +156,13 @@ def infer_publication_name(stem: str) -> str: Strips trailing date/number/OCR artefacts, converts hyphens to spaces. - @param stem: Filename without extension - @return: Human-readable publication name guess - @example: + Args: + stem: Filename without extension. + + Returns: + Human-readable publication name guess. + + Example: infer_publication_name("Hobby-Electronics-1979-01-S-OCR") # "Hobby Electronics" infer_publication_name("Practical-Wireless-1965-03") # "Practical Wireless" """ @@ -164,8 +181,11 @@ def infer_publication_name(stem: str) -> str: def probe_pdf(pdf_path: Path) -> dict: """Gather structural information about a PDF without converting it. - @param pdf_path: Path to the PDF file - @return: Dict with keys: filename, pages, has_text, image_pages, text_sample, slug, label + Args: + pdf_path: Path to the PDF file. + + Returns: + Dict with keys: filename, pages, has_text, image_pages, text_sample, slug, label. """ try: doc = fitz.open(str(pdf_path)) @@ -208,8 +228,9 @@ def probe_pdf(pdf_path: Path) -> dict: def analyze_directory(input_dir: Path, pattern: str) -> None: """Print a structural report for all PDFs in a directory. - @param input_dir: Directory to scan - @param pattern: Glob pattern to filter files + Args: + input_dir: Directory to scan. + pattern: Glob pattern to filter files. """ pdfs = sorted(input_dir.glob(pattern)) if not pdfs: @@ -278,8 +299,11 @@ def analyze_directory(input_dir: Path, pattern: str) -> None: def clean_text(text: str) -> str: """Normalise OCR text for markdown output. - @param text: Raw OCR text from a PDF page - @return: Cleaned text suitable for markdown + Args: + text: Raw OCR text from a PDF page. + + Returns: + Cleaned text suitable for markdown. """ text = re.sub(r"\n{3,}", "\n\n", text) lines = [line.rstrip() for line in text.splitlines()] @@ -289,9 +313,10 @@ def clean_text(text: str) -> str: def render_page_png(page: fitz.Page, output_path: Path, dpi: int) -> None: """Render a PDF page to a PNG file. - @param page: PyMuPDF page object - @param output_path: Destination PNG path - @param dpi: Render resolution in dots per inch + Args: + page: PyMuPDF page object. + output_path: Destination PNG path. + dpi: Render resolution in dots per inch. """ matrix = fitz.Matrix(dpi / 72, dpi / 72) pixmap = page.get_pixmap(matrix=matrix, colorspace=fitz.csRGB) @@ -301,12 +326,15 @@ def render_page_png(page: fitz.Page, output_path: Path, dpi: int) -> None: def convert_publication(pdf_path: Path, output_dir: Path, dpi: int, force: bool, slug_override: str | None = None) -> dict: """Convert a single PDF to markdown with rendered page images. - @param pdf_path: Path to the source PDF - @param output_dir: Root output directory - @param dpi: Page render resolution - @param force: Re-process even if output exists - @param slug_override: Optional pre-resolved slug (used when collision disambiguation is applied) - @return: Dict with slug, title, pages, articles for index building + Args: + pdf_path: Path to the source PDF. + output_dir: Root output directory. + dpi: Page render resolution. + force: Re-process even if output exists. + slug_override: Optional pre-resolved slug (used when collision disambiguation is applied). + + Returns: + Dict with slug, title, pages, articles for index building. """ slug, label = parse_slug(pdf_path.name) if slug_override is not None: @@ -380,8 +408,9 @@ def convert_publication(pdf_path: Path, output_dir: Path, dpi: int, force: bool, def write_publication_index(info: dict, output_dir: Path) -> None: """Write a concise index.md for a single publication. - @param info: Dict from convert_publication() - @param output_dir: Root output directory + Args: + info: Dict from convert_publication(). + output_dir: Root output directory. """ pub_dir = output_dir / info["slug"] lines = [ @@ -403,8 +432,9 @@ def write_publication_index(info: dict, output_dir: Path) -> None: def write_master_index(all_publications: list[dict], output_dir: Path) -> None: """Write the master index.md linking all converted publications. - @param all_publications: List of info dicts from convert_publication() - @param output_dir: Root output directory + Args: + all_publications: List of info dicts from convert_publication(). + output_dir: Root output directory. """ lines = [ "# Magazine / Book Archive", @@ -453,9 +483,10 @@ def write_collection_md(output_dir: Path, all_publications: list[dict], input_di Writes to output_dir.parent/COLLECTION.md. Existing files are not overwritten unless the user removes them manually; this function skips if the file exists. - @param output_dir: Indexed output directory (e.g. collections/NAME/indexed) - @param all_publications: List of publication info dicts from convert_publication() - @param input_dir: Source PDF directory + Args: + output_dir: Indexed output directory (e.g. collections/NAME/indexed). + all_publications: List of publication info dicts from convert_publication(). + input_dir: Source PDF directory. """ dest = output_dir.parent / "COLLECTION.md" if dest.exists(): @@ -521,8 +552,9 @@ def write_global_index(collections_root: Path, output_path: Path) -> None: Scans each subdirectory of collections_root for an indexed/ subdirectory and an optional COLLECTION.md, then writes a markdown table to output_path. - @param collections_root: Root directory containing collection subdirectories - @param output_path: Destination file path for the generated index + Args: + collections_root: Root directory containing collection subdirectories. + output_path: Destination file path for the generated index. """ lines = [ "# Library Catalogue", diff --git a/download.py b/download.py index 4daa425..c00d6a8 100644 --- a/download.py +++ b/download.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -""" -Download magazine PDFs from a World Radio History archive page or an archive.org item. +"""Download magazine PDFs from a World Radio History archive page or an archive.org item. Source is auto-detected from the URL: - archive.org/details/... → archive.org item download (requires internetarchive) @@ -40,6 +39,13 @@ --pdf-format both --dry-run Author: Alister Lewis-Bowen +Version: 1.0.0 +Date: 2026-04-04 +License: MIT +Dependencies: internetarchive (optional, for archive.org downloads) +Exit codes: + 0: Success + 1: Error (invalid URL, download failure, or missing dependency) """ import argparse @@ -66,9 +72,12 @@ def download_file(url: str, dest: Path) -> bool: """Download a single file, returning True on success. - @param url: URL to download - @param dest: Local destination path - @return: True if downloaded, False if skipped (already exists) + Args: + url: URL to download. + dest: Local destination path. + + Returns: + True if downloaded, False if skipped (already exists). """ if dest.exists(): return False @@ -92,8 +101,11 @@ def download_file(url: str, dest: Path) -> bool: def format_size(path: Path) -> str: """Format a file's size as a human-readable string. - @param path: Path to an existing file - @return: Human-readable size string, e.g. "4.2 MB" + Args: + path: Path to an existing file. + + Returns: + Human-readable size string, e.g. "4.2 MB". """ size = path.stat().st_size for unit in ("B", "KB", "MB", "GB"): @@ -106,8 +118,11 @@ def format_size(path: Path) -> str: def format_size_bytes(size_bytes: int | None) -> str: """Format a byte count as a human-readable string. - @param size_bytes: File size in bytes, or None if unknown - @return: Human-readable size string + Args: + size_bytes: File size in bytes, or None if unknown. + + Returns: + Human-readable size string. """ if size_bytes is None: return "? B" @@ -122,8 +137,11 @@ def format_size_bytes(size_bytes: int | None) -> str: def extract_year(filename: str) -> int | None: """Extract the first four-digit year from a filename. - @param filename: File name to search - @return: Year as int, or None if not found + Args: + filename: File name to search. + + Returns: + Year as int, or None if not found. """ m = re.search(r"(\d{4})", filename) return int(m.group(1)) if m else None @@ -136,8 +154,11 @@ def extract_year(filename: str) -> int | None: def fetch_page(url: str) -> str: """Fetch a web page and return its HTML content. - @param url: Page URL to fetch - @return: HTML content as string + Args: + url: Page URL to fetch. + + Returns: + HTML content as string. """ req = urllib.request.Request(url, headers=HEADERS) with urllib.request.urlopen(req, timeout=30) as resp: @@ -147,9 +168,12 @@ def fetch_page(url: str) -> str: def extract_pdf_links(html: str, base_url: str) -> list[str]: """Extract all PDF hrefs from a page and resolve them to absolute URLs. - @param html: Raw HTML content - @param base_url: Base URL of the page for resolving relative links - @return: Sorted list of absolute PDF URLs + Args: + html: Raw HTML content. + base_url: Base URL of the page for resolving relative links. + + Returns: + Sorted list of absolute PDF URLs. """ hrefs = re.findall(r'href="([^"]*\.pdf)"', html, re.IGNORECASE) absolute = set() @@ -165,10 +189,13 @@ def extract_pdf_links(html: str, base_url: str) -> list[str]: def url_to_local_path(pdf_url: str, output_dir: Path, base_url: str) -> Path: """Convert a PDF URL to a local file path, preserving subdirectory structure. - @param pdf_url: Absolute URL of the PDF - @param output_dir: Root local download directory - @param base_url: Base URL of the archive page (used to strip the hostname prefix) - @return: Local Path where the file should be saved + Args: + pdf_url: Absolute URL of the PDF. + output_dir: Root local download directory. + base_url: Base URL of the archive page (used to strip the hostname prefix). + + Returns: + Local Path where the file should be saved. """ parsed = urllib.parse.urlparse(pdf_url) rel_path = parsed.path.lstrip("/") @@ -179,7 +206,8 @@ def url_to_local_path(pdf_url: str, output_dir: Path, base_url: str) -> Path: def run_worldradiohistory(args: argparse.Namespace) -> None: """Download PDFs from a World Radio History archive page. - @param args: Parsed command-line arguments + Args: + args: Parsed command-line arguments. """ print(f"Fetching index: {args.url}") try: @@ -241,9 +269,14 @@ def run_worldradiohistory(args: argparse.Namespace) -> None: def get_archive_org_item_id(url: str) -> str: """Extract the archive.org item identifier from a /details/ URL. - @param url: archive.org URL containing /details/ - @return: Item identifier string - @example: "https://archive.org/details/ElektorMagazine" → "ElektorMagazine" + Args: + url: archive.org URL containing /details/. + + Returns: + Item identifier string. + + Example: + "https://archive.org/details/ElektorMagazine" → "ElektorMagazine" """ parsed = urllib.parse.urlparse(url) parts = [p for p in parsed.path.split("/") if p] @@ -258,11 +291,14 @@ def get_archive_org_item_id(url: str) -> str: def select_archive_files(files: list, pdf_format: str, year_from: int | None, year_to: int | None) -> list: """Filter archive.org file list by PDF format and optional year range. - @param files: List of internetarchive File objects - @param pdf_format: One of "text", "image", or "both" - @param year_from: Lower year bound (inclusive), or None - @param year_to: Upper year bound (inclusive), or None - @return: Filtered and sorted list of File objects + Args: + files: List of internetarchive File objects. + pdf_format: One of "text", "image", or "both". + year_from: Lower year bound (inclusive), or None. + year_to: Upper year bound (inclusive), or None. + + Returns: + Filtered and sorted list of File objects. """ # Keep only PDFs selected = [f for f in files if f.name.lower().endswith(".pdf")] @@ -295,7 +331,8 @@ def select_archive_files(files: list, pdf_format: str, year_from: int | None, ye def run_archive_org(args: argparse.Namespace) -> None: """Download PDFs from an archive.org item. - @param args: Parsed command-line arguments + Args: + args: Parsed command-line arguments. """ try: import internetarchive as ia diff --git a/init-findings.sh b/init-findings.sh index 87c4a29..214f104 100755 --- a/init-findings.sh +++ b/init-findings.sh @@ -6,7 +6,7 @@ # # Name: init-findings.sh # Description: Scaffold findings/ directory for publication-library research -# Author: ali5ter +# Author: Alister Lewis-Bowen # Usage: ./init-findings.sh [--cloud dropbox|icloud|gdrive|local] # Dependencies: bash 4+ # Exit codes: 0 success, 1 error @@ -20,6 +20,18 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" FINDINGS_DIR="${SCRIPT_DIR}/findings" CLOUD_TYPE="${1:-}" +# Load pfb for terminal output +PFB_SCRIPT="${SCRIPT_DIR}/lib/pfb/pfb.sh" +if [[ -f "${PFB_SCRIPT}" ]]; then + # shellcheck source=lib/pfb/pfb.sh + source "${PFB_SCRIPT}" +elif command -v pfb &>/dev/null; then + : # pfb already on PATH +else + echo "ERROR: pfb not found. Run: git submodule update --init lib/pfb" >&2 + exit 1 +fi + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -86,13 +98,13 @@ while [[ $# -gt 0 ]]; do esac done -echo "publication-library — findings/ setup" +pfb heading "publication-library — findings/ setup" "📁" echo # Determine where findings will live if [[ -n "${CLOUD}" ]]; then CLOUD_DEST="$(cloud_path "${CLOUD}")" - echo "Cloud storage: ${CLOUD} → ${CLOUD_DEST}" + pfb info "Cloud storage: ${CLOUD} → ${CLOUD_DEST}" # Create the cloud directory and scaffold inside it mkdir -p "${CLOUD_DEST}/topics" @@ -101,24 +113,23 @@ if [[ -n "${CLOUD}" ]]; then # Create symlink if findings/ doesn't already exist if [[ -e "${FINDINGS_DIR}" ]]; then - echo "INFO: ${FINDINGS_DIR} already exists — skipping symlink" + pfb info "${FINDINGS_DIR} already exists — skipping symlink" else ln -s "${CLOUD_DEST}" "${FINDINGS_DIR}" - echo "Symlink created: ${FINDINGS_DIR} → ${CLOUD_DEST}" + pfb success "Symlink created: ${FINDINGS_DIR} → ${CLOUD_DEST}" fi else # Local only mkdir -p "${FINDINGS_DIR}/topics" mkdir -p "${FINDINGS_DIR}/projects" mkdir -p "${FINDINGS_DIR}/sessions" - echo "Created: ${FINDINGS_DIR}/" + pfb success "Created: ${FINDINGS_DIR}/" fi echo -echo "Directory structure:" -echo " findings/" -echo " ├── topics/ ← topic reference notes (e.g. synthesisers.md)" -echo " ├── projects/ ← project research notes" -echo " └── sessions/ ← dated session logs (YYYY-MM-DD-topic.md)" +pfb subheading "findings/" +pfb subheading " ├── topics/ — topic reference notes (e.g. synthesisers.md)" +pfb subheading " ├── projects/ — project research notes" +pfb subheading " └── sessions/ — dated session logs (YYYY-MM-DD-topic.md)" echo -echo "Done. findings/ is gitignored and will not be committed." +pfb success "Done. findings/ is gitignored and will not be committed." diff --git a/search.py b/search.py index edf4e8a..c255c71 100755 --- a/search.py +++ b/search.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -""" -Search across indexed library collections and display formatted results. +"""Search across indexed library collections and display formatted results. Wraps grep to provide structured output grouped by publication. Results show the collection name, publication slug, and matching lines with context. @@ -20,6 +19,15 @@ python3 search.py "VCA" --collection eti python3 search.py synthesiser --files-only python3 search.py "guitar" --context 3 + +Author: Alister Lewis-Bowen +Version: 1.0.0 +Date: 2026-04-05 +License: MIT +Dependencies: grep (system) +Exit codes: + 0: Success + 1: Error (collections directory not found, or no matches) """ import argparse @@ -32,10 +40,13 @@ def search_indexed(term: str, indexed_dir: Path, context: int) -> list[dict]: """Search a collection's content.md files for a term. - @param term: Search term (case-insensitive) - @param indexed_dir: Path to the collection's indexed directory - @param context: Lines of context around each match - @return: List of dicts with keys: slug, line_num, content + Args: + term: Search term (case-insensitive). + indexed_dir: Path to the collection's indexed directory. + context: Lines of context around each match. + + Returns: + List of dicts with keys: slug, line_num, content. """ cmd = ["grep", "-rin", f"--include=content.md", f"-C{context}", term, str(indexed_dir) + "/"] result = subprocess.run(cmd, capture_output=True, text=True) @@ -69,9 +80,12 @@ def search_indexed(term: str, indexed_dir: Path, context: int) -> list[dict]: def files_matching(term: str, indexed_dir: Path) -> list[str]: """Return a sorted list of publication slugs containing the term. - @param term: Search term (case-insensitive) - @param indexed_dir: Path to the collection's indexed directory - @return: Sorted list of unique publication slugs + Args: + term: Search term (case-insensitive). + indexed_dir: Path to the collection's indexed directory. + + Returns: + Sorted list of unique publication slugs. """ cmd = ["grep", "-ril", f"--include=content.md", term, str(indexed_dir) + "/"] result = subprocess.run(cmd, capture_output=True, text=True) @@ -89,8 +103,11 @@ def files_matching(term: str, indexed_dir: Path) -> list[str]: def group_by_slug(matches: list[dict]) -> dict[str, list[dict]]: """Group a flat list of match dicts by publication slug. - @param matches: List of match dicts from search_indexed() - @return: Dict mapping slug to list of its matches + Args: + matches: List of match dicts from search_indexed(). + + Returns: + Dict mapping slug to list of its matches. """ groups: dict[str, list[dict]] = {} for m in matches: