diff --git a/src/cratedb_about/cli.py b/src/cratedb_about/cli.py
index 9c28d89..c2c55f8 100644
--- a/src/cratedb_about/cli.py
+++ b/src/cratedb_about/cli.py
@@ -1,11 +1,13 @@
 import logging
 import typing as t
 from pathlib import Path
+from pprint import pprint
 
 import click
 from pueblo.util.cli import boot_click
 
 from cratedb_about.bundle.llmstxt import LllmsTxtBuilder
+from cratedb_about.hub.model import LLMsTxtHub
 from cratedb_about.outline import CrateDbKnowledgeOutline
 from cratedb_about.query.core import CrateDbKnowledgeConversation
 from cratedb_about.query.model import Example
@@ -99,6 +101,17 @@ def bundle(ctx: click.Context, url: str, format_: str, outdir: Path) -> None:
     logger.info("Ready.")
 
 
+@cli.command()
+@click.pass_context
+def hub(ctx: click.Context) -> None:
+    """
+    Inquire information from https://llmtxt.dev/hub.
+    """
+    txt_hub = LLMsTxtHub().fetch()
+    pprint(txt_hub.items)
+    logger.info("Ready.")
+
+
 @cli.command()
 @click.argument("question", type=str, required=False)
 @click.option("--backend", type=click.Choice(["openai", "claude"]), default="openai")
diff --git a/src/cratedb_about/hub/__init__.py b/src/cratedb_about/hub/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/cratedb_about/hub/model.py b/src/cratedb_about/hub/model.py
new file mode 100644
index 0000000..0966ba5
--- /dev/null
+++ b/src/cratedb_about/hub/model.py
@@ -0,0 +1,110 @@
+"""
+Data model and scraper for the llms.txt hub at https://llmtxt.dev/hub.
+"""
+
+import dataclasses
+import logging
+import typing as t
+
+from bs4 import BeautifulSoup
+
+from cratedb_about.util import get_cache_client
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class Resource:
+    """A link found on a hub card, plus the length of the text behind it."""
+
+    url: str
+    # Number of characters in the response text; -1 until successfully acquired.
+    size: int = -1
+
+
+@dataclasses.dataclass
+class LLMsTxtHubItem:
+    """One entry ("website card") scraped from the hub index page."""
+
+    title: str
+    website: str
+    description: str
+    logo: str
+    tags: t.List[str] = dataclasses.field(default_factory=list)
+    resources: t.List[Resource] = dataclasses.field(default_factory=list)
+
+
+class LLMsTxtHub:
+    """
+    Scrape the hub index page into a list of `LLMsTxtHubItem` objects.
+
+    Usage: `LLMsTxtHub().fetch().items`.
+    """
+
+    url: str = "https://llmtxt.dev/hub"
+
+    def __init__(self):
+        self.items: t.List[LLMsTxtHubItem] = []
+        # Presumably a TTL of one day in seconds -- confirm against `get_cache_client`.
+        self.client = get_cache_client(ttl=60 * 60 * 24)
+
+    def fetch(self) -> "LLMsTxtHub":
+        """
+        Download the index page, parse all cards, and acquire resource sizes.
+
+        Returns `self`, to permit call chaining.
+        """
+        response = self.client.get(self.url)
+        # Hand the decoded text to the parser, not the response object itself;
+        # this matches how `acquire_sizes` reads responses.
+        bs = BeautifulSoup(response.text, "html.parser")
+        cards = bs.find_all(attrs={"class": "website-card"})
+        self.items = [self.card_to_model(card) for card in cards]
+        self.acquire_sizes()
+        return self
+
+    def acquire_sizes(self) -> None:
+        """
+        Fetch each resource and record the length of its text as `size`.
+
+        Best effort: a failing resource is logged and keeps its current size.
+        """
+        logger.info("Acquiring sizes for %s items", len(self.items))
+        for item in self.items:
+            logger.info("Acquiring size for %s", item)
+            for resource in item.resources:
+                try:
+                    response = self.client.get(resource.url)
+                    resource.size = len(response.text)
+                except Exception as e:
+                    # Name the failing resource: an item may carry several.
+                    logger.warning("Failed to acquire size for %s: %s", resource.url, e)
+
+    @staticmethod
+    def card_to_model(card) -> LLMsTxtHubItem:
+        """
+        Convert one "website-card" element into an `LLMsTxtHubItem`.
+
+        Elements missing from the card yield empty strings instead of raising,
+        so a single malformed card does not abort the whole `fetch`.
+        """
+
+        def text_of(node) -> str:
+            return node.text if node is not None else ""
+
+        # Lookups are scoped to the first <div> of the card, when there is one.
+        root = card.find(name="div")
+        if root is None:
+            root = card
+        image = root.find(name="img")
+        # Anchors without `href` are skipped: `Resource.url` must be a string.
+        links = [anchor.get("href") for anchor in root.find_all(name="a")]
+        return LLMsTxtHubItem(
+            title=text_of(root.find(name="h3")),
+            website=text_of(root.find(name="p", attrs={"class": "text-sm"})),
+            description=text_of(root.find(name="p", attrs={"class": "text-sm", "title": True})),
+            logo=(image.get("src") or "") if image is not None else "",
+            tags=[span.text for span in root.find_all(name="span")],
+            resources=[Resource(url=link) for link in links if link],
+        )