Skip to content

Commit 30cf5fb

Browse files
committed
ref(resolver)
1 parent db37daa commit 30cf5fb

12 files changed

Lines changed: 150 additions & 116 deletions

File tree

app/business/info_base/block.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def get_resolver(cls, block_id: BlockID) -> Opt["Resolver"]:
7272
block = cls.get(block_id)
7373
if block is None:
7474
return None
75-
return ResolverManager.new_resolver(block)
75+
return ResolverManager.get(block)
7676

7777
@classmethod
7878
def create(
@@ -111,6 +111,7 @@ def create(
111111
async def refresh_embeddings(cls):
112112
"""Rebuild all blocks' embeddings - delegates to sink embedding service"""
113113
from app.business.sink.embedding import EmbeddingManager
114+
114115
await EmbeddingManager.refresh_all_block_embeddings()
115116

116117
@classmethod
@@ -120,8 +121,8 @@ async def fetchsert(cls, block: BlockModel, db_session: sqlmodel.Session) -> Blo
120121
Will NOT commit the session.
121122
"""
122123
from app.business.sink.embedding import EmbeddingManager
123-
124-
resolver = ResolverManager.new_resolver(block)
124+
125+
resolver = ResolverManager.get(block)
125126
existing = resolver.get_existing(db_session)
126127
if existing is not None:
127128
logger.debug(
@@ -149,7 +150,7 @@ async def organize(cls, block: BlockModel):
149150
FIXME
150151
"""
151152
with SessionLocal() as db_session:
152-
resolver = ResolverManager.new_resolver(block)
153+
resolver = ResolverManager.get(block)
153154
generator = (await resolver.breakdown())()
154155
try:
155156
i = generator.send(None)
@@ -179,6 +180,7 @@ def query_by_embedding(
179180
:param resolver: Filter by resolver type, None means no filter
180181
"""
181182
from app.business.sink.embedding import EmbeddingManager
183+
182184
return EmbeddingManager.query_blocks_by_embedding(
183185
block_id=block_id,
184186
embedding=embedding,

app/business/info_base/resolver/main.py

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,33 @@
44

55
import sqlmodel
66

7+
from app.business.info_base.relation import RelationManager
78
from app.business.info_base.storage.main import StorageManager
89

910
from app.schemas.info_base.main import SubGraphForm
10-
from app.schemas.info_base.block import ResolverType, BlockModel
11+
from app.schemas.info_base.block import BlockID, ResolverType, BlockModel
1112
from app.schemas.info_base.relation import RelationModel
1213
from app.schemas.info_base.storage import StorageID
1314

1415

1516
class ResolverManager:
16-
RESOLVERS: dict[ResolverType, type["Resolver"]] = {}
17+
RESOLVER_CLS: dict[ResolverType, type["Resolver"]] = {}
1718
"""Global resolver registry.
1819
19-
Map ResolverType to Resolver class
20-
"""
20+
Map ResolverType to Resolver class
21+
"""
2122

2223
@classmethod
2324
def register_resolver(cls, resolver_cls: type["Resolver"]):
24-
cls.RESOLVERS[resolver_cls.__rsotype__] = resolver_cls
25+
cls.RESOLVER_CLS[resolver_cls.__rsotype__] = resolver_cls
2526

2627
@classmethod
27-
def new_resolver(cls, block: BlockModel) -> "Resolver":
28+
def get(cls, block: BlockModel) -> "Resolver":
2829
"""Create resolver instance from block."""
2930
try:
30-
resolver_cls = cls.RESOLVERS[block.resolver]
31+
resolver_cls = cls.RESOLVER_CLS[block.resolver]
3132
except KeyError:
32-
raise NotImplementedError(f"Resolver {block.resolver} not implemented.")
33+
raise NotImplementedError(f"Resolver {block.resolver} not implemented/registered.")
3334
return resolver_cls(block)
3435

3536

@@ -38,7 +39,12 @@ def new_resolver(cls, block: BlockModel) -> "Resolver":
3839

3940

4041
class Resolver(abc.ABC, typing.Generic[SolvedContentTV, RawContentTV]):
41-
"""Resolver resolves a star graph (a block and its direct relations)"""
42+
"""Resolver resolves a star graph (a block and its direct relations)
43+
44+
Generic parameters:
45+
- SolvedContentTV: The type of the solved content
46+
- RawContentTV: The type of the raw content
47+
"""
4248

4349
__rsotype__: ResolverType
4450
"""Resolver type"""
@@ -55,18 +61,67 @@ def __init__(self, block: BlockModel, relations: Opt[tuple[RelationModel, ...]]
5561
:param relations: Relations of the block.
5662
"""
5763
self._block = block
58-
self._relations = relations or tuple()
59-
self._raw_content: RawContentTV
60-
self._solved_content: SolvedContentTV
61-
self.__post_init__()
64+
self.__relations = relations or None
65+
self.__raw_content: RawContentTV | None = None
66+
"""The (real) content of the block, commonly fetched from storage.
67+
If storage is None, uses block.content
68+
"""
69+
if self._block.storage is None:
70+
self.__raw_content = typing.cast(RawContentTV, self._block.content)
71+
self.__solved_content: SolvedContentTV | None = None
72+
"""Solved content is the content the resolver really works with,
73+
commonly derived from raw content.
74+
"""
75+
self.__post_init__(self.__raw_content)
6276

63-
def __post_init__(self):
77+
def __post_init__(self, raw_content: Opt[RawContentTV] = None) -> None:
6478
"""Subclass post-initialization hook.
6579
66-
It's suggest to resolve the block content to _solved_content here.
80+
It's suggested to set the solved content here (via set_solved_content) if possible:
81+
```python
82+
def __post_init__(self, raw_content):
83+
if raw_content is not None:
84+
... # anyhow from raw_content
85+
self.set_solved_content(solved_content)
86+
```
6787
"""
6888
...
6989

90+
@property
91+
def block_id(self) -> BlockID:
92+
"""Get the block ID."""
93+
return typing.cast(BlockID, self._block.id)
94+
95+
async def get_raw_content(self) -> RawContentTV:
96+
"""Get the raw content of the block."""
97+
if self.__raw_content is None:
98+
if self._block.storage is None:
99+
self.__raw_content = typing.cast(RawContentTV, self._block.content)
100+
else:
101+
storage = StorageManager.get_storage(self._block.storage)
102+
self.__raw_content = typing.cast(
103+
RawContentTV, await storage.get_raw_content(self._block.content)
104+
)
105+
106+
return self.__raw_content
107+
108+
async def get_solved_content(self) -> SolvedContentTV:
109+
"""Get the solved content of the block."""
110+
if self.__solved_content is None:
111+
raise NotImplementedError(
112+
f"{self.__class__.__name__}.get_solved_content() must be implemented by subclasses."
113+
)
114+
return self.__solved_content
115+
116+
def set_solved_content(self, content: SolvedContentTV) -> None:
117+
self.__solved_content = content
118+
119+
async def get_relations(self) -> tuple[RelationModel, ...]:
120+
"""Get relations of the block."""
121+
if self.__relations is None:
122+
self.__relations = RelationManager.get(block_id=self.block_id)
123+
return self.__relations
124+
70125
@classmethod
71126
# @abc.abstractmethod TODO
72127
def create_block(cls, content, storage: Opt[StorageID] = None) -> BlockModel: ...
@@ -75,14 +130,15 @@ def create_block(cls, content, storage: Opt[StorageID] = None) -> BlockModel: ..
75130
# @abc.abstractmethod TODO
76131
def create_graph(cls, *args, **kwargs) -> SubGraphForm: ...
77132

133+
@abc.abstractmethod
78134
async def get_text(self) -> str:
79135
"""Get block content in text format."""
80-
storage = StorageManager.new_storage(self._block)
81-
return await storage.get_content(self._block)
136+
...
82137

83-
def get_str_for_embedding(self) -> str:
138+
@abc.abstractmethod
139+
async def get_str_for_embedding(self) -> str:
84140
"""Get string representation for embedding generation."""
85-
return self._block.content
141+
...
86142

87143
def get_existing(self, db_session: sqlmodel.Session) -> Opt[BlockModel]:
88144
"""Check if a block with the same content already exists in the database.

app/business/info_base/storage/http.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ async def _fetch_url(self, url: str) -> aiohttp.ClientResponse:
4747
response.raise_for_status()
4848
return response
4949

50-
async def get_content(self, block: BlockModel) -> str | bytes:
50+
async def get_raw_content(self, block: BlockModel) -> str | bytes:
5151
"""Fetch content from the URL in block.content.
5252
5353
:param block: Block containing the URL in its content field
@@ -59,7 +59,7 @@ async def get_content(self, block: BlockModel) -> str | bytes:
5959
class HTTPImageStorage(HTTPStorage):
6060
"""HTTP storage for image content (returns base64-encoded data)."""
6161

62-
async def get_content(self, block: BlockModel) -> str:
62+
async def get_raw_content(self, block: BlockModel) -> str:
6363
"""Fetch and encode image content as base64."""
6464
url = block.content
6565
config = self.get_config()
@@ -76,7 +76,7 @@ async def get_content(self, block: BlockModel) -> str:
7676
class HTTPVideoStorage(HTTPStorage):
7777
"""HTTP storage for video content (returns raw bytes)."""
7878

79-
async def get_content(self, block: BlockModel) -> bytes:
79+
async def get_raw_content(self, block: BlockModel) -> bytes:
8080
"""Fetch video content as bytes."""
8181
url = block.content
8282
config = self.get_config()
@@ -93,7 +93,7 @@ async def get_content(self, block: BlockModel) -> bytes:
9393
class HTTPTextStorage(HTTPStorage):
9494
"""HTTP storage for plain text content."""
9595

96-
async def get_content(self, block: BlockModel) -> str:
96+
async def get_raw_content(self, block: BlockModel) -> str:
9797
"""Fetch text content from URL."""
9898
url = block.content
9999
config = self.get_config()
@@ -109,7 +109,7 @@ async def get_content(self, block: BlockModel) -> str:
109109
class HTTPJsonStorage(HTTPStorage):
110110
"""HTTP storage for JSON content."""
111111

112-
async def get_content(self, block: BlockModel) -> str:
112+
async def get_raw_content(self, block: BlockModel) -> str:
113113
"""Fetch JSON content from URL."""
114114
url = block.content
115115
config = self.get_config()
@@ -125,7 +125,7 @@ async def get_content(self, block: BlockModel) -> str:
125125
class HTTPBinaryStorage(HTTPStorage):
126126
"""HTTP storage for binary content (returns raw bytes)."""
127127

128-
async def get_content(self, block: BlockModel) -> bytes:
128+
async def get_raw_content(self, block: BlockModel) -> bytes:
129129
"""Fetch binary content as bytes."""
130130
url = block.content
131131
config = self.get_config()
@@ -142,7 +142,7 @@ async def get_content(self, block: BlockModel) -> bytes:
142142
class HTTPHtmlStorage(HTTPStorage):
143143
"""HTTP storage for HTML content (returns raw bytes)."""
144144

145-
async def get_content(self, block: BlockModel) -> bytes:
145+
async def get_raw_content(self, block: BlockModel) -> bytes:
146146
"""Fetch HTML content as bytes."""
147147
url = block.content
148148
config = self.get_config()

app/business/info_base/storage/main.py

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def register_storage(cls, storage_cls: type["Storage"]):
5959
cls._STORAGE_CLASSES[storage_type] = storage_cls
6060

6161
@classmethod
62-
def _get_storage_ins(cls, storage_id: StorageID) -> "Storage":
62+
def get_storage(cls, storage_id: StorageID) -> "Storage":
6363
"""Create a storage instance."""
6464
with SessionLocal() as db:
6565
storage_record = db.exec(
@@ -74,15 +74,6 @@ def _get_storage_ins(cls, storage_id: StorageID) -> "Storage":
7474

7575
return storage_class(storage_record)
7676

77-
@classmethod
78-
def new_storage(cls, block: BlockModel) -> "Storage":
79-
"""Create storage instance from block."""
80-
if block.storage is None:
81-
return Storage(None)
82-
83-
storage_id = typing.cast(StorageID, block.storage)
84-
return cls._get_storage_ins(storage_id)
85-
8677
@classmethod
8778
def create(
8879
cls,
@@ -199,32 +190,18 @@ def __init_subclass__(cls, config_cls: type[ConfigTV], **kwargs) -> None:
199190
StorageManager.register_storage(cls)
200191
return super().__init_subclass__(**kwargs)
201192

202-
def __init__(self, storage_record: Opt[StorageModel]):
203-
if storage_record is not None:
204-
self._type = storage_record.type
205-
self._config = storage_record.config
206-
else:
207-
self._type = None # Inline content storage
208-
self._config = {}
193+
def __init__(self, storage_record: StorageModel):
194+
self._type = storage_record.type
195+
self._config = storage_record.config
209196

210197
self.__post_init__()
211198

212199
def __post_init__(self):
213200
"""Post-initialization hook for subclasses."""
214201
pass
215202

216-
async def get_content(self, block_content: str) -> ContentTV:
217-
"""Get the actual content of the block."""
218-
if self._type is None:
219-
# Inline content storage
220-
return typing.cast(ContentTV, block_content)
221-
return await self._get_content(block_content)
222-
223-
async def _get_content(self, block_content: str) -> ContentTV:
224-
"""Get the actual content from the concrete storage implementation.
225-
226-
Subclasses must override this method.
227-
"""
203+
async def get_raw_content(self, block_content: str) -> ContentTV:
204+
"""Get the raw content of the block."""
228205
raise NotImplementedError(
229-
f"{self.__class__.__name__}._get_content() must be implemented by subclasses."
206+
f"{self.__class__.__name__}.get_raw_content() must be implemented by subclasses."
230207
)

app/business/sink/embedding.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,12 @@ async def upsert_block_embedding(
4747
if block is None:
4848
raise ValueError(f"Block with id {block_id} not found")
4949

50-
resolver = ResolverManager.new_resolver(block)
50+
resolver = ResolverManager.get(block)
5151
embedding = BlockEmbeddingModel(
5252
id=block.id, # type: ignore[arg-type]
53-
embedding=Embedding("", "text-embedding-v3").embed(resolver.get_str_for_embedding()),
53+
embedding=Embedding("", "text-embedding-v3").embed(
54+
await resolver.get_str_for_embedding()
55+
),
5456
)
5557
if db_session:
5658
db_session.merge(embedding)

app/schemas/info_base/block.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,5 @@ class BlockModel(sqlmodel.SQLModel, table=True):
6060
async def get_context_as_text(self) -> str:
6161
from app.business.info_base.resolver import ResolverManager
6262

63-
resolver = ResolverManager.new_resolver(self)
63+
resolver = ResolverManager.get(self)
6464
return await resolver.get_text()

docs/todo.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,7 @@
4444
- [x] Resolver relies on Storage to get the actual content (don't do it yourself, never considering what storage is)
4545
- [x] 改进加载模式
4646
- 在未找到时,按照类型(和 Python 导入路径语法一致)尝试从插件中导入 (否则插件就需要在初始化时导入)
47-
- [ ] Resolver 和 Storage 应该解耦;比如 ImageResolver 要的就是 content 为图片二进制的 block,storage就负责搞定这件事,resolver不应该在乎
48-
-
47+
- [ ] 规范化 Resolver,其负责解析 Block 的 StarGraph
4948

5049
## Sink
5150

extensions/github/resolver.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@
77
from app.business.info_base.resolver import Resolver
88
from app.schemas.info_base.block import BlockModel
99
from app.schemas.info_base.relation import RelationModel
10-
from app.schemas.info_base.main import ArcForm, SubGraphForm
10+
from app.schemas.info_base.main import InArcForm, SubGraphForm
1111
from utils.sql import find_by_json_contains
1212
from .schema import GithubRepo, GithubUser
1313

1414

1515
class GithubRepoResolver(Resolver, rso_type="github_repo"):
1616
"""Resolver for GitHub repository blocks."""
1717

18-
def __post_init__(self):
19-
"""Parse GitHub repo content after initialization."""
20-
self.content = GithubRepo.model_validate_json(self._block.content)
18+
def __post_init__(self, raw_content):
19+
if raw_content is not None:
20+
self.set_solved_content(GithubRepo.model_validate_json(raw_content))
2121

2222
@classmethod
2323
def create_graph(
@@ -38,7 +38,7 @@ def create_graph(
3838
in_relations = ()
3939
if owner:
4040
in_relations = (
41-
ArcForm(
41+
InArcForm(
4242
relation=RelationModel(content="owns"),
4343
from_subgraph=GithubUserResolver.create_graph(owner),
4444
),
@@ -73,7 +73,7 @@ async def get_text(self) -> str:
7373
text += f": {self.content.description}"
7474
return text
7575

76-
def get_str_for_embedding(self) -> str:
76+
async def get_str_for_embedding(self) -> str:
7777
"""Get text for embedding generation.
7878
7979
Combines name, description, topics and language for better semantic search.

0 commit comments

Comments
 (0)