From 940ee64b26c59787505559f7f4715bb51905be11 Mon Sep 17 00:00:00 2001 From: michael <7471919@naver.com> Date: Thu, 29 Jan 2026 17:47:51 +0900 Subject: [PATCH 1/8] chore --- .../agent/prompts/DATABASE_DEDUPLICATION.md | 386 ++++++++++++++++++ packages/agent/src/AutoBeMockAgent.ts | 1 + .../transformPrismaDeduplicationHistory.ts | 171 ++++++++ .../orchestrate/prisma/orchestratePrisma.ts | 43 +- .../prisma/orchestratePrismaDeduplication.ts | 182 +++++++++ .../AutoBeDatabaseDeduplicationProgrammer.ts | 269 ++++++++++++ ...IAutoBeDatabaseDeduplicationApplication.ts | 132 ++++++ .../AutoBeDatabaseDeduplicationEvent.ts | 54 +++ packages/interface/src/events/AutoBeEvent.ts | 2 + .../interface/src/events/AutoBeEventSource.ts | 2 + packages/interface/src/events/index.ts | 1 + .../AutoBeDatabaseDeduplicationGroup.ts | 57 +++ .../interface/src/histories/contents/index.ts | 1 + .../interface/src/rpc/IAutoBeRpcListener.ts | 13 + .../components/events/AutoBeEventMovie.tsx | 1 + .../events/AutoBeProgressEventMovie.tsx | 5 + packages/ui/src/structure/AutoBeListener.ts | 3 + .../internal/validate_interface_complement.ts | 1 + test/src/archive/utils/ArchiveLogger.ts | 13 +- 19 files changed, 1328 insertions(+), 9 deletions(-) create mode 100644 packages/agent/prompts/DATABASE_DEDUPLICATION.md create mode 100644 packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts create mode 100644 packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts create mode 100644 packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts create mode 100644 packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts create mode 100644 packages/interface/src/events/AutoBeDatabaseDeduplicationEvent.ts create mode 100644 packages/interface/src/histories/contents/AutoBeDatabaseDeduplicationGroup.ts diff --git a/packages/agent/prompts/DATABASE_DEDUPLICATION.md 
b/packages/agent/prompts/DATABASE_DEDUPLICATION.md new file mode 100644 index 0000000000..a62fbb8752 --- /dev/null +++ b/packages/agent/prompts/DATABASE_DEDUPLICATION.md @@ -0,0 +1,386 @@ +# Database Component Deduplication Agent System Prompt + +## 1. Overview + +You are the **Database Component Deduplication Agent**. Your purpose is to identify **semantically duplicate tables** across different database components. + +**CORE MISSION**: Compare the target component's tables against ALL other components' tables, and identify groups of tables that serve the **same purpose or store the same kind of data**, even if they have different names. + +**IMPORTANT**: You do NOT decide which table to keep or remove. You only **identify and group** duplicate tables. The system will deterministically resolve which table survives based on component size. + +--- + +## 2. What is a Semantic Duplicate? + +Two or more tables are semantic duplicates when they serve the **same purpose** in the database, regardless of naming: + +### Duplicate Examples + +| Table A | Table B | Duplicate? | Reason | +|---------|---------|-----------|--------| +| `users` (Auth) | `user_accounts` (Members) | **YES** | Both store user identity/authentication data | +| `customers` (Auth) | `shopping_customers` (Sales) | **YES** | Both represent the same customer entity | +| `product_reviews` (Products) | `item_reviews` (Sales) | **YES** | Both store user reviews for purchasable items | +| `order_notifications` (Orders) | `notification_logs` (Notifications) | **YES** | Both track notification records for orders | + +### NOT Duplicate Examples + +| Table A | Table B | Duplicate? 
| Reason | +|---------|---------|-----------|--------| +| `users` (Auth) | `user_profiles` (Members) | **NO** | Different purpose: auth credentials vs profile details | +| `orders` (Orders) | `order_items` (Orders) | **NO** | Parent-child relationship, not duplicates | +| `products` (Products) | `product_snapshots` (Sales) | **NO** | Live entity vs point-in-time snapshot | +| `admin_sessions` (Auth) | `customer_sessions` (Auth) | **NO** | Different actor types, both needed | + +### Key Judgment Criteria + +1. **Read both `name` AND `description`** — names alone can be misleading +2. **Same data domain + same purpose = duplicate** (even with different names) +3. **Same name + different purpose = NOT duplicate** (context matters) +4. **Parent-child or snapshot relationships = NOT duplicates** (they are complementary) +5. **Different actor types of the same pattern = NOT duplicates** (each actor needs its own tables) + +--- + +## 3. Naming Similarity Hints + +The system provides **Naming Similarity Hints** — tables that have the same **normalized name** after: +1. Removing the table prefix (if any) +2. Splitting by `_` into tokens +3. Converting each token to singular form +4. Sorting tokens alphabetically + +### Why This Matters + +Tables with the same normalized name are **strong candidates** for semantic duplicates: + +| Table A | Table B | Normalized Name | Likely Duplicate? | +|---------|---------|-----------------|-------------------| +| `bbs_user_articles` | `bbs_article_users` | `article_bbs_user` | **YES** — same tokens, just reordered | +| `shopping_customers` | `customers` | `customer` | **YES** — same entity after prefix removal | +| `product_reviews` | `review_products` | `product_review` | **YES** — same tokens, different order | +| `orders` | `order_items` | Different | **NO** — different tokens | + +### How to Use the Hints + +1. **Check the Naming Similarity Hints table first** — it's provided in the context +2. 
For each group in the hints, the tables share the same normalized name +3. **Review these pairs carefully** — if they serve the same purpose, group them as duplicates +4. Remember: Similar names are a **hint**, not a guarantee. Always verify by reading descriptions and understanding the business purpose. + +--- + +## 4. Execution Flow + +### Step 1: Fetch Requirements (MANDATORY) + +**ALWAYS start by fetching analysis files** to understand the business context: + +```typescript +process({ + thinking: "Need to understand requirements to judge if tables serve the same purpose.", + request: { type: "getAnalysisFiles", fileNames: ["..."] } +}) +``` + +Understanding requirements helps you distinguish between: +- Tables that LOOK similar but serve different business needs (NOT duplicates) +- Tables that LOOK different but serve the same business need (ARE duplicates) + +#### Additional Context Options + +**Load Previous Version Analysis Files** (only available during regeneration): + +```typescript +process({ + thinking: "Need previous requirements to understand context changes.", + request: { type: "getPreviousAnalysisFiles", fileNames: ["..."] } +}) +``` + +**Load Previous Version Database Schemas** (only available during regeneration): + +```typescript +process({ + thinking: "Need previous database schema to understand design intent.", + request: { type: "getPreviousDatabaseSchemas", schemaNames: ["..."] } +}) +``` + +### Step 2: Analyze Target Component Tables + +For each table in your target component: + +1. Read its `name` and `description` +2. Understand its **purpose** in the business domain +3. Compare against every table in every OTHER component +4. 
If another component has a table with the **same purpose**, group them + +### Step 3: Build Duplicate Groups + +For each semantic duplicate found, create a group: + +```typescript +{ + reason: "Both tables store customer authentication credentials and login information", + tables: [ + { namespace: "Authorization", name: "customers" }, + { namespace: "Sales", name: "shopping_customers" } + ] +} +``` + +**Rules for groups:** +- Each group MUST have **at least 2 tables** +- Each group MUST include **at least 1 table from the target component** +- One table can appear in **only one group** (no overlapping groups) +- If no duplicates found, return **empty array** + +### Step 4: Complete the Analysis + +```typescript +process({ + thinking: "Found 2 duplicate groups involving target component's tables.", + request: { + type: "complete", + analysis: "...", + rationale: "...", + duplicateGroups: [...] + } +}) +``` + +--- + +## 5. Output Format + +```typescript +export interface IComplete { + type: "complete"; + + // Analysis of the deduplication comparison process + analysis: string; + + // Rationale for the duplicate group decisions + rationale: string; + + // Groups of semantically duplicate tables (empty if none found) + duplicateGroups: AutoBeDatabaseDeduplicationGroup[]; +} +``` + +| Field | Focus | +|-------|-------| +| `analysis` | Which tables were analyzed, what comparisons were made, and what patterns were identified | +| `rationale` | Why specific tables were grouped as duplicates and why certain tables were NOT grouped | +| `duplicateGroups` | Array of duplicate groups — empty array if no duplicates exist | + +--- + +## 6. 
Example + +### Input Context + +**Target Component**: Sales +**Target Tables**: +```json +[ + { "name": "shopping_customers", "description": "Customer accounts for the shopping platform" }, + { "name": "shopping_orders", "description": "Purchase orders placed by customers" }, + { "name": "shopping_order_items", "description": "Individual items within purchase orders" }, + { "name": "shopping_product_reviews", "description": "Customer reviews and ratings for products" } +] +``` + +**All Components Tables**: +```json +[ + { + "namespace": "Authorization", + "tables": [ + { "name": "customers", "description": "Customer authentication credentials and login data" }, + { "name": "customer_sessions", "description": "Active login sessions for customers" } + ] + }, + { + "namespace": "Sales", + "tables": [ + { "name": "shopping_customers", "description": "Customer accounts for the shopping platform" }, + { "name": "shopping_orders", "description": "Purchase orders placed by customers" }, + { "name": "shopping_order_items", "description": "Individual items within purchase orders" }, + { "name": "shopping_product_reviews", "description": "Customer reviews and ratings for products" } + ] + }, + { + "namespace": "Products", + "tables": [ + { "name": "products", "description": "Product catalog entries" }, + { "name": "product_reviews", "description": "Customer reviews and ratings for products" }, + { "name": "product_categories", "description": "Product classification categories" } + ] + } +] +``` + +### Agent Output + +```typescript +process({ + thinking: "Found 2 duplicate groups: shopping_customers duplicates Authorization.customers, and shopping_product_reviews duplicates Products.product_reviews.", + request: { + type: "complete", + analysis: `## Deduplication Analysis for Sales Component + +### Tables Analyzed +- Target Component: Sales (4 tables) +- Compared Against: Authorization (2 tables), Products (3 tables) + +### Table-by-Table Comparison + +1. 
**shopping_customers** vs Authorization.customers + - Sales.shopping_customers: "Customer accounts for the shopping platform" + - Authorization.customers: "Customer authentication credentials and login data" + +2. **shopping_orders** — Compared against all tables in Authorization and Products. + - No table with similar order management purpose found. + +3. **shopping_order_items** — Compared against all tables in Authorization and Products. + - No table with similar order item tracking purpose found. + +4. **shopping_product_reviews** vs Products.product_reviews + - Sales.shopping_product_reviews: "Customer reviews and ratings for products" + - Products.product_reviews: "Customer reviews and ratings for products"`, + + rationale: `## Duplicate Group Decisions + +### Group 1: Customer Tables +- **Why duplicates**: Both represent the same customer entity. Authorization.customers focuses on auth credentials while Sales.shopping_customers stores customer accounts, but they refer to the same customer record in the business domain. +- **Evidence**: Both descriptions reference "customer" identity management. + +### Group 2: Product Review Tables +- **Why duplicates**: Identical purpose - storing product reviews. Same data semantics, different component placement. +- **Evidence**: Descriptions are nearly identical, both store "customer reviews and ratings for products". + +### NOT Duplicates +- **shopping_orders, shopping_order_items**: Unique to Sales domain with no equivalent in other components. 
Order management is distinct from auth and product catalog.`, + + duplicateGroups: [ + { + reason: "Both tables represent the same customer entity - Authorization.customers stores auth credentials while Sales.shopping_customers stores customer accounts, but they refer to the same customer record", + tables: [ + { namespace: "Authorization", name: "customers" }, + { namespace: "Sales", name: "shopping_customers" } + ] + }, + { + reason: "Both tables store customer reviews and ratings for products with identical purpose", + tables: [ + { namespace: "Sales", name: "shopping_product_reviews" }, + { namespace: "Products", name: "product_reviews" } + ] + } + ] + } +}); +``` + +### No Duplicates Found + +```typescript +process({ + thinking: "Analyzed all target tables against other components. No semantic duplicates found.", + request: { + type: "complete", + analysis: `## Deduplication Analysis for Orders Component + +### Tables Analyzed +- Target Component: Orders (3 tables) +- Compared Against: Authorization (2 tables), Products (3 tables), Sales (4 tables) + +### Table-by-Table Comparison + +1. shopping_orders — Compared against all 9 tables in other components. No table with similar order management purpose found. +2. shopping_order_items — Compared against all 9 tables. No equivalent child entity for order items exists elsewhere. +3. shopping_order_deliveries — Compared against all 9 tables. Delivery tracking is unique to Orders component.`, + + rationale: `## Why No Duplicates Were Found + +### Orders Domain Uniqueness +- **shopping_orders**: Order management is a distinct domain. Authorization handles auth, Products handles catalog, Sales handles transactions - none overlap with order lifecycle management. +- **shopping_order_items**: This is a child entity specific to orders. No other component has order item tracking. +- **shopping_order_deliveries**: Delivery tracking is an Orders-specific concern not replicated elsewhere. 
+ +### Considered but Rejected +- Sales component has transaction tables but they represent sales transactions, not order fulfillment - different lifecycle stages.`, + duplicateGroups: [] + } +}); +``` + +--- + +## 7. Concurrency Notice + +Multiple Deduplication Agents run **simultaneously** for different components. This means: + +- You review only YOUR target component +- Other agents review their own target components at the same time +- **You do NOT decide which table survives** — the system resolves this after all agents complete +- Your job is purely to **identify** duplicate groups accurately + +If you find that your target component's `table_a` duplicates another component's `table_b`: +- Report the group: `[{ namespace: "YourComponent", name: "table_a" }, { namespace: "OtherComponent", name: "table_b" }]` +- The system will decide which one to keep based on component table count + +--- + +## 8. Thinking Field Guidelines + +```typescript +// GOOD - summarizes findings +thinking: "Found 2 duplicate groups: shopping_customers duplicates Auth.customers, shopping_product_reviews duplicates Products.product_reviews." + +// GOOD - no duplicates found +thinking: "Compared all 5 target tables against 12 tables in other components. No semantic duplicates identified." + +// BAD - too vague +thinking: "Reviewed tables." + +// BAD - making removal decisions (not your job) +thinking: "Removing shopping_customers because Auth already has it." +``` + +--- + +## 9. Working Language + +- **Technical terms**: Always English (table names, field names, descriptions) +- **Analysis content**: Use the language specified by user requirements +- **Thinking field**: User's language + +--- + +## 10. Final Execution Checklist + +Before calling `process({ request: { type: "complete", ...
} })`, verify: + +### Analysis Quality +- [ ] Fetched and analyzed relevant requirements +- [ ] Compared EVERY target table against ALL other components' tables +- [ ] Read both `name` AND `description` for each comparison +- [ ] Distinguished true duplicates from complementary tables (parent-child, snapshot, etc.) + +### Group Validity +- [ ] Each group has at least 2 tables +- [ ] Each group includes at least 1 table from the target component +- [ ] No table appears in multiple groups +- [ ] Each group has a clear `reason` explaining why tables are semantically equivalent +- [ ] Empty array if no duplicates found (this is a valid result) + +### Common Pitfalls Avoided +- [ ] Did NOT flag parent-child relationships as duplicates +- [ ] Did NOT flag snapshot/history tables as duplicates of their source +- [ ] Did NOT flag different actor types' tables as duplicates +- [ ] Did NOT make removal/keep decisions (only identification) + +**REMEMBER**: Call `process({ request: { type: "complete", ... } })` immediately after this checklist. Your job is identification, not resolution. 
diff --git a/packages/agent/src/AutoBeMockAgent.ts b/packages/agent/src/AutoBeMockAgent.ts index e89488b456..5a209a7db7 100644 --- a/packages/agent/src/AutoBeMockAgent.ts +++ b/packages/agent/src/AutoBeMockAgent.ts @@ -203,6 +203,7 @@ const sleepMap: Record = { databaseComponentReview: 500, databaseSchema: 500, databaseSchemaReview: 500, + databaseDeduplication: 500, databaseValidate: 2_000, databaseCorrect: 500, databaseComplete: 1_000, diff --git a/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts new file mode 100644 index 0000000000..bfa101d757 --- /dev/null +++ b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts @@ -0,0 +1,171 @@ +import { AutoBeDatabaseComponent } from "@autobe/interface"; +import { StringUtil } from "@autobe/utils"; +import { singular } from "pluralize"; +import { NamingConvention } from "typia/lib/utils/NamingConvention"; +import { v7 } from "uuid"; + +import { AutoBeSystemPromptConstant } from "../../../constants/AutoBeSystemPromptConstant"; +import { IAutoBeOrchestrateHistory } from "../../../structures/IAutoBeOrchestrateHistory"; +import { AutoBePreliminaryController } from "../../common/AutoBePreliminaryController"; + +interface ISimilarNameGroup { + normalized: string; + tables: Array<{ namespace: string; name: string }>; +} + +const normalizeTableName = ( + tableName: string, + prefix: string | null, +): string => { + let name = tableName; + + // 1) Remove prefix (e.g., shopping_customers → customers) + if (prefix !== null) { + const snakePrefix = NamingConvention.snake(prefix) + "_"; + if (name.startsWith(snakePrefix)) { + name = name.slice(snakePrefix.length); + } + } + + // 2) Remove leading "_" (e.g., _users → users) + if (name.startsWith("_")) { + name = name.slice(1); + } + + // 3) Split by "_", convert each token to singular, sort, and join + // e.g., 
bbs_user_articles → ["bbs", "user", "article"] → ["article", "bbs", "user"] → "article_bbs_user" + // e.g., bbs_article_users → ["bbs", "article", "user"] → ["article", "bbs", "user"] → "article_bbs_user" + const tokens = name.split("_").map((token) => singular(token)); + tokens.sort(); + return tokens.join("_"); +}; + +const findSimilarNamedTables = ( + allComponents: AutoBeDatabaseComponent[], + prefix: string | null, +): ISimilarNameGroup[] => { + const map = new Map>(); + + for (const comp of allComponents) { + for (const table of comp.tables) { + const norm = normalizeTableName(table.name, prefix); + if (!map.has(norm)) map.set(norm, []); + map.get(norm)!.push({ namespace: comp.namespace, name: table.name }); + } + } + + // Return only groups with 2+ tables + return [...map.entries()] + .filter(([_, tables]) => tables.length >= 2) + .map(([normalized, tables]) => ({ normalized, tables })); +}; + +const formatSimilarNameHints = (groups: ISimilarNameGroup[]): string => { + if (groups.length === 0) { + return "No tables with similar normalized names found."; + } + + const rows = groups + .map((g) => { + const tableList = g.tables + .map((t) => `\`${t.namespace}.${t.name}\``) + .join(", "); + return `| \`${g.normalized}\` | ${tableList} |`; + }) + .join("\n"); + + return StringUtil.trim` +| Normalized Name | Tables | +|-----------------|--------| +${rows} + `; +}; + +export const transformPrismaDeduplicationHistory = (props: { + preliminary: AutoBePreliminaryController< + "analysisFiles" | "previousAnalysisFiles" | "previousDatabaseSchemas" + >; + component: AutoBeDatabaseComponent; + allComponents: AutoBeDatabaseComponent[]; + instruction: string; + prefix: string | null; +}): IAutoBeOrchestrateHistory => { + const similarNameGroups = findSimilarNamedTables( + props.allComponents, + props.prefix, + ); + const similarNameHints = formatSimilarNameHints(similarNameGroups); + + return { + histories: [ + { + id: v7(), + created_at: new Date().toISOString(), + type: 
"systemMessage", + text: AutoBeSystemPromptConstant.DATABASE_DEDUPLICATION, + }, + ...props.preliminary.getHistories(), + { + id: v7(), + created_at: new Date().toISOString(), + type: "assistantMessage", + text: StringUtil.trim` + ## Component to Review (Deduplication) + + ${props.prefix !== null ? `**Table Prefix**: \`${NamingConvention.snake(props.prefix)}\`` : ""} + + ### Target Component + + - **Namespace**: \`${props.component.namespace}\` + - **Filename**: \`${props.component.filename}\` + + ### Target Component Tables + + ${JSON.stringify(props.component.tables, null, 2)} + + ### All Components Tables + + The following shows ALL tables across ALL components (including the target). + Compare the target component's tables against tables in other components + to identify semantic duplicates. + + ${JSON.stringify(props.allComponents, null, 2)} + + ### Naming Similarity Hints (Potential Duplicates) + + Tables with the **same normalized name** (prefix removed + each token converted to singular + sorted alphabetically) are strong duplicate candidates. + + **Example**: \`bbs_user_articles\` and \`bbs_article_users\` both normalize to \`article_bbs_user\`. + + ${similarNameHints} + + **IMPORTANT**: Tables in the same similarity group are **strong candidates** for semantic duplicates. Review these pairs carefully and group them if they serve the same purpose. + + ### User Instructions + + ${props.instruction} + `, + }, + ], + userMessage: StringUtil.trim` + Review the "${props.component.namespace}" component's tables for semantic duplicates. + + **Your task**: Compare each table in the "${props.component.namespace}" component against + tables in ALL other components. Identify tables that serve the **same purpose** + even if they have different names. + + 1. First, fetch analysis files using \`getAnalysisFiles\` to understand the business context + 2. **Check the Naming Similarity Hints first** — tables with the same normalized name are strong duplicate candidates + 3. 
For each target table, compare its name AND description against every table in other components + 4. If two tables serve the same purpose → group them as duplicates + 5. Call \`process({ request: { type: "complete", analysis: "...", rationale: "...", duplicateGroups: [...] } })\` + + **Rules**: + - Each duplicate group must have at least 2 tables + - Each group must include at least 1 table from "${props.component.namespace}" + - Parent-child relationships are NOT duplicates + - Snapshot/history tables are NOT duplicates of their source tables + - If no duplicates found, return an empty duplicateGroups array + `, + }; +}; diff --git a/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts b/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts index 59712a44ea..2db427349f 100644 --- a/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts +++ b/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts @@ -22,10 +22,12 @@ import { orchestratePrismaAuthorizationReview } from "./orchestratePrismaAuthori import { orchestratePrismaComponent } from "./orchestratePrismaComponent"; import { orchestratePrismaComponentReview } from "./orchestratePrismaComponentReview"; import { orchestratePrismaCorrect } from "./orchestratePrismaCorrect"; +import { orchestratePrismaDeduplication } from "./orchestratePrismaDeduplication"; import { orchestratePrismaGroup } from "./orchestratePrismaGroup"; import { orchestratePrismaGroupReview } from "./orchestratePrismaGroupReview"; import { orchestratePrismaSchema } from "./orchestratePrismaSchema"; import { orchestratePrismaSchemaReview } from "./orchestratePrismaSchemaReview"; +import { AutoBeDatabaseComponentProgrammer } from "./programmers/AutoBeDatabaseComponentProgrammer"; export const orchestratePrisma = async ( ctx: AutoBeContext, @@ -76,11 +78,16 @@ export const orchestratePrisma = async ( instruction: props.instruction, groups: reviewedGroups, }); + console.log(`----------- PRISMA AUTHORIZATION -----------`); +
console.log(JSON.stringify(authorizations, null, 2)); + const reviewedAuthorizations: AutoBeDatabaseComponent[] = await orchestratePrismaAuthorizationReview(ctx, { instruction: props.instruction, components: authorizations, }); + console.log(`----------- PRISMA AUTHORIZATION REVIEW -----------`); + console.log(JSON.stringify(reviewedAuthorizations, null, 2)); // COMPONENT const components: AutoBeDatabaseComponent[] = @@ -88,25 +95,47 @@ export const orchestratePrisma = async ( instruction: props.instruction, groups: reviewedGroups, }); + console.log(`----------- PRISMA COMPONENT -----------`); + console.log(JSON.stringify(components, null, 2)); + const reviewedComponents: AutoBeDatabaseComponent[] = await orchestratePrismaComponentReview(ctx, { instruction: props.instruction, components, }); - const reviewedAllComponents: AutoBeDatabaseComponent[] = [ - ...reviewedAuthorizations, - ...reviewedComponents, - ]; + console.log(`----------- PRISMA COMPONENT REVIEW -----------`); + console.log(JSON.stringify(reviewedComponents, null, 2)); + + const reviewedAllComponents: AutoBeDatabaseComponent[] = + AutoBeDatabaseComponentProgrammer.removeDuplicatedTable([ + ...reviewedAuthorizations, + ...reviewedComponents, + ]); + + // DEDUPLICATION (semantic) + const deduplicatedComponents: AutoBeDatabaseComponent[] = + await orchestratePrismaDeduplication(ctx, { + instruction: props.instruction, + components: reviewedAllComponents, + }); + console.log(`----------- PRISMA DEDUPLICATION -----------`); + console.log(JSON.stringify(deduplicatedComponents, null, 2)); + console.log( + `before Tables: ${reviewedAllComponents.flatMap((c) => c.tables).length}`, + ); + console.log( + `after Tables: ${deduplicatedComponents.flatMap((c) => c.tables).length}`, + ); // CONSTRUCT AST DATA const schemaEvents: AutoBeDatabaseSchemaEvent[] = await orchestratePrismaSchema( ctx, props.instruction, - reviewedAllComponents, + deduplicatedComponents, ); const application: AutoBeDatabase.IApplication = { - 
files: reviewedAllComponents.map((comp) => ({ + files: deduplicatedComponents.map((comp) => ({ filename: comp.filename, namespace: comp.namespace, models: schemaEvents @@ -120,7 +149,7 @@ export const orchestratePrisma = async ( await orchestratePrismaSchemaReview( ctx, application, - reviewedAllComponents, + deduplicatedComponents, ); for (const event of reviewEvents) { if (event.content === null) continue; diff --git a/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts b/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts new file mode 100644 index 0000000000..8e1ac04047 --- /dev/null +++ b/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts @@ -0,0 +1,182 @@ +import { IAgenticaController } from "@agentica/core"; +import { + AutoBeDatabaseComponent, + AutoBeDatabaseDeduplicationEvent, + AutoBeEventSource, + AutoBeProgressEventBase, +} from "@autobe/interface"; +import { ILlmApplication, IValidation } from "@samchon/openapi"; +import { IPointer } from "tstl"; +import typia from "typia"; +import { v7 } from "uuid"; + +import { AutoBeContext } from "../../context/AutoBeContext"; +import { executeCachedBatch } from "../../utils/executeCachedBatch"; +import { AutoBePreliminaryController } from "../common/AutoBePreliminaryController"; +import { transformPrismaDeduplicationHistory } from "./histories/transformPrismaDeduplicationHistory"; +import { AutoBeDatabaseDeduplicationProgrammer } from "./programmers/AutoBeDatabaseDeduplicationProgrammer"; +import { IAutoBeDatabaseDeduplicationApplication } from "./structures/IAutoBeDatabaseDeduplicationApplication"; + +export async function orchestratePrismaDeduplication( + ctx: AutoBeContext, + props: { + instruction: string; + components: AutoBeDatabaseComponent[]; + }, +): Promise { + const prefix: string | null = ctx.state().analyze?.prefix ?? 
null; + const progress: AutoBeProgressEventBase = { + completed: 0, + total: props.components.length, + }; + const events: AutoBeDatabaseDeduplicationEvent[] = await executeCachedBatch( + ctx, + props.components.map((component) => async (promptCacheKey) => { + const event: AutoBeDatabaseDeduplicationEvent = await process(ctx, { + target: component, + allComponents: props.components, + instruction: props.instruction, + prefix, + progress, + promptCacheKey, + }); + ctx.dispatch(event); + return event; + }), + ); + // Resolve duplicates + const results: AutoBeDatabaseComponent[] = + AutoBeDatabaseDeduplicationProgrammer.resolve(props.components, events); + + return results; +} + +async function process( + ctx: AutoBeContext, + props: { + target: AutoBeDatabaseComponent; + allComponents: AutoBeDatabaseComponent[]; + instruction: string; + prefix: string | null; + progress: AutoBeProgressEventBase; + promptCacheKey: string; + }, +): Promise { + const preliminary: AutoBePreliminaryController< + "analysisFiles" | "previousAnalysisFiles" | "previousDatabaseSchemas" + > = new AutoBePreliminaryController({ + application: + typia.json.application(), + source: SOURCE, + kinds: [ + "analysisFiles", + "previousAnalysisFiles", + "previousDatabaseSchemas", + ], + state: ctx.state(), + }); + + return await preliminary.orchestrate(ctx, async (out) => { + const pointer: IPointer = + { + value: null, + }; + + const result: AutoBeContext.IResult = await ctx.conversate({ + source: SOURCE, + controller: createController({ + preliminary, + target: props.target, + allComponents: props.allComponents, + build: (next) => { + pointer.value = next; + }, + }), + enforceFunctionCall: true, + promptCacheKey: props.promptCacheKey, + ...transformPrismaDeduplicationHistory({ + component: props.target, + allComponents: props.allComponents, + instruction: props.instruction, + prefix: props.prefix, + preliminary, + }), + }); + if (pointer.value === null) return out(result)(null); + + return 
out(result)({ + type: SOURCE, + id: v7(), + created_at: new Date().toISOString(), + step: ctx.state().analyze?.step ?? 0, + metric: result.metric, + tokenUsage: result.tokenUsage, + completed: ++props.progress.completed, + total: props.progress.total, + analysis: pointer.value.analysis, + rationale: pointer.value.rationale, + duplicateGroups: pointer.value.duplicateGroups, + namespace: props.target.namespace, + }); + }); +} + +function createController(props: { + preliminary: AutoBePreliminaryController< + "analysisFiles" | "previousAnalysisFiles" | "previousDatabaseSchemas" + >; + target: AutoBeDatabaseComponent; + allComponents: AutoBeDatabaseComponent[]; + build: (next: IAutoBeDatabaseDeduplicationApplication.IComplete) => void; +}): IAgenticaController.IClass { + const validate: Validator = (input) => { + const result: IValidation = + typia.validate(input); + if (result.success === false) return result; + + if (result.data.request.type !== "complete") + return props.preliminary.validate({ + thinking: result.data.thinking, + request: result.data.request, + }); + + const errors: IValidation.IError[] = []; + AutoBeDatabaseDeduplicationProgrammer.validate({ + errors, + path: "$input.request.duplicateGroups", + target: props.target, + allComponents: props.allComponents, + duplicateGroups: result.data.request.duplicateGroups, + }); + if (errors.length > 0) + return { + success: false, + errors, + data: result.data, + }; + return result; + }; + const application: ILlmApplication = props.preliminary.fixApplication( + typia.llm.application({ + validate: { + process: validate, + }, + }), + ); + return { + protocol: "class", + name: SOURCE, + application, + execute: { + process: (next) => { + if (next.request.type === "complete") props.build(next.request); + }, + } satisfies IAutoBeDatabaseDeduplicationApplication, + }; +} + +type Validator = ( + input: unknown, +) => IValidation; + +const SOURCE = "databaseDeduplication" satisfies AutoBeEventSource; diff --git 
a/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts new file mode 100644 index 0000000000..ad67408ab2 --- /dev/null +++ b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts @@ -0,0 +1,269 @@ +import { + AutoBeDatabaseComponent, + AutoBeDatabaseDeduplicationEvent, + AutoBeDatabaseDeduplicationGroup, +} from "@autobe/interface"; +import { StringUtil } from "@autobe/utils"; +import { Pair } from "tstl"; +import { IValidation } from "typia"; + +export namespace AutoBeDatabaseDeduplicationProgrammer { + /** Validate duplicate groups reported by the agent. */ + export const validate = (props: { + errors: IValidation.IError[]; + path: string; + target: AutoBeDatabaseComponent; + allComponents: AutoBeDatabaseComponent[]; + duplicateGroups: AutoBeDatabaseDeduplicationGroup[]; + }): void => { + props.duplicateGroups.forEach((group, i) => { + // Each group must have at least 2 tables + if (group.tables.length < 2) + props.errors.push({ + path: `${props.path}[${i}].tables`, + expected: "at least 2 tables per group", + value: group.tables.length, + description: StringUtil.trim` + Duplicate group must contain at least 2 tables to be meaningful. + + Fix: Add more tables to this group, or remove the group entirely + if there are no actual duplicates. + `, + }); + + // Each table must exist in actual components + group.tables.forEach((table, j) => { + const component: AutoBeDatabaseComponent | undefined = + props.allComponents.find((c) => c.namespace === table.namespace); + if (component === undefined) + props.errors.push({ + path: `${props.path}[${i}].tables[${j}].namespace`, + expected: "existing component namespace", + value: table.namespace, + description: StringUtil.trim` + Component namespace "${table.namespace}" does not exist. 
+ + Fix: Use one of the existing component namespaces: + - ${props.allComponents.map((c) => c.namespace).join(", ")} + `, + }); + else if (!component.tables.some((t) => t.name === table.name)) + props.errors.push({ + path: `${props.path}[${i}].tables[${j}].name`, + expected: `existing table in "${table.namespace}" component`, + value: table.name, + description: StringUtil.trim` + Table "${table.name}" does not exist in component "${table.namespace}". + + Fix: Use one of the existing tables: + - ${component.tables.map((t) => t.name).join(", ")} + `, + }); + }); + + // Each group must include at least 1 table from target component + const hasTargetTable = group.tables.some( + (t) => t.namespace === props.target.namespace, + ); + if (!hasTargetTable) + props.errors.push({ + path: `${props.path}[${i}].tables`, + expected: `at least 1 table from target component "${props.target.namespace}"`, + value: group.tables.map((t) => t.namespace), + description: StringUtil.trim` + This agent is responsible for finding duplicates in component + "${props.target.namespace}", but this group contains no tables + from that component. + + Fix: Include at least one table from "${props.target.namespace}" + in this duplicate group. + `, + }); + }); + }; + /** + * Resolve semantic duplicate groups by deterministically keeping the table + * from the component with the fewest total tables. + */ + export const resolve = ( + components: AutoBeDatabaseComponent[], + events: AutoBeDatabaseDeduplicationEvent[], + ): AutoBeDatabaseComponent[] => { + // 1. Collect all duplicate groups from events + const duplicatedGroups: AutoBeDatabaseDeduplicationGroup[] = events.flatMap( + (e) => e.duplicateGroups, + ); + if (duplicatedGroups.length === 0) return components; + + // 2. Merge overlapping groups into clusters using Union-Find + const clusters: AutoBeDatabaseDeduplicationGroup.ITable[][] = + mergeGroups(duplicatedGroups); + + // 3. 
Remove duplicates, keeping table from smallest component + return removeDuplicates(components, clusters); + }; + + /** + * Merge overlapping duplicate groups into clusters using Union-Find. + * + * If group1 = [A, B] and group2 = [B, C], they merge into one cluster [A, B, + * C]. + * + * @returns Array of clusters, where each cluster is a set of duplicate + * tables. + */ + const mergeGroups = ( + groups: AutoBeDatabaseDeduplicationGroup[], + ): AutoBeDatabaseDeduplicationGroup.ITable[][] => { + // Build table key → index mapping + const tableKeys: string[] = []; + const tableKeyToIndex: Map = new Map(); + + const getOrCreateIndex = (namespace: string, name: string): number => { + const key: string = `${namespace}::${name}`; + let index: number | undefined = tableKeyToIndex.get(key); + if (index === undefined) { + index = tableKeys.length; + tableKeys.push(key); + tableKeyToIndex.set(key, index); + } + return index; + }; + + // Register all tables + for (const group of groups) { + for (const table of group.tables) { + getOrCreateIndex(table.namespace, table.name); + } + } + + // Union-Find: each table starts as its own parent + const parent: number[] = tableKeys.map((_, i) => i); + const rank: number[] = tableKeys.map(() => 0); + + const find = (x: number): number => { + while (parent[x] !== x) { + parent[x] = parent[parent[x]]; // path compression + x = parent[x]; + } + return x; + }; + + const union = (a: number, b: number): void => { + const rootA: number = find(a); + const rootB: number = find(b); + if (rootA === rootB) return; + + // Union by rank: attach smaller tree under larger tree + if (rank[rootA] < rank[rootB]) { + parent[rootA] = rootB; + } else if (rank[rootA] > rank[rootB]) { + parent[rootB] = rootA; + } else { + parent[rootB] = rootA; + rank[rootA]++; + } + }; + + // Union all tables within each group + for (const group of groups) { + if (group.tables.length < 2) continue; + const firstIndex: number = getOrCreateIndex( + group.tables[0].namespace, + 
group.tables[0].name, + ); + for (let i = 1; i < group.tables.length; i++) { + const idx: number = getOrCreateIndex( + group.tables[i].namespace, + group.tables[i].name, + ); + union(firstIndex, idx); + } + } + + // Group tables by their root → clusters + const clusterMap = new Map< + number, + AutoBeDatabaseDeduplicationGroup.ITable[] + >(); + for (const [key, index] of tableKeyToIndex) { + const root: number = find(index); + let cluster = clusterMap.get(root); + if (cluster === undefined) { + cluster = []; + clusterMap.set(root, cluster); + } + const [namespace, name] = key.split("::"); + cluster.push({ namespace: namespace!, name: name! }); + } + + return [...clusterMap.values()]; + }; + + /** + * Remove duplicate tables from components, keeping one per cluster. + * + * Rule: Keep the table from the component with fewest total tables. + * Tie-break: Keep the table from the component that appears first. + * + * Algorithm (similar to removeDuplicatedTable): + * + * 1. Build tableKey → clusterId mapping + * 2. Sort components by table count (ascending) + * 3. Traverse and keep first table encountered per cluster + * 4. 
Restore original order + */ + const removeDuplicates = ( + components: AutoBeDatabaseComponent[], + clusters: AutoBeDatabaseDeduplicationGroup.ITable[][], + ): AutoBeDatabaseComponent[] => { + // Build tableKey → clusterId mapping + const tableToCluster: Map = new Map(); + clusters.forEach((cluster, clusterId) => { + for (const table of cluster) { + tableToCluster.set(`${table.namespace}::${table.name}`, clusterId); + } + }); + + // Track which clusters already have a kept table + const clusterSet: Set = new Set(); + + // Sort by table count (smallest first), keep original index + const sorted: Pair[] = components + .map((c, i) => new Pair(c, i)) + .sort((a, b) => a.first.tables.length - b.first.tables.length); + + // Filter tables: keep first encountered per cluster + const processed: Pair[] = sorted.map( + (p) => + new Pair( + { + ...p.first, + tables: p.first.tables.filter((t) => { + const key: string = `${p.first.namespace}::${t.name}`; + const clusterId: number | undefined = tableToCluster.get(key); + + // Not in any cluster → keep + if (clusterId === undefined) return true; + + // First in cluster → keep and mark + if (!clusterSet.has(clusterId)) { + clusterSet.add(clusterId); + return true; + } + + // Already have one from this cluster → remove + return false; + }), + }, + p.second, + ), + ); + + // Restore original order and filter empty components + return processed + .sort((a, b) => a.second - b.second) + .map((p) => p.first) + .filter((c) => c.tables.length > 0); + }; +} diff --git a/packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts b/packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts new file mode 100644 index 0000000000..c1b04a8f92 --- /dev/null +++ b/packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts @@ -0,0 +1,132 @@ +import { AutoBeDatabaseDeduplicationGroup } from "@autobe/interface"; + +import { 
IAutoBePreliminaryGetAnalysisFiles } from "../../common/structures/IAutoBePreliminaryGetAnalysisFiles"; +import { IAutoBePreliminaryGetPreviousAnalysisFiles } from "../../common/structures/IAutoBePreliminaryGetPreviousAnalysisFiles"; +import { IAutoBePreliminaryGetPreviousDatabaseSchemas } from "../../common/structures/IAutoBePreliminaryGetPreviousDatabaseSchemas"; + +export interface IAutoBeDatabaseDeduplicationApplication { + /** + * Analyze tables for semantic duplicates across components. + * + * Your PRIMARY task is to compare the target component's tables against all + * other components' tables and identify groups of tables that serve the same + * purpose, even if they have different names. + * + * ALWAYS fetch analysis files first using `getAnalysisFiles` to understand + * the business context, then systematically compare tables and build + * duplicate groups. + * + * @param props Request containing either preliminary data request or complete + * task with duplicate groups + */ + process(props: IAutoBeDatabaseDeduplicationApplication.IProps): void; +} + +export namespace IAutoBeDatabaseDeduplicationApplication { + export interface IProps { + /** + * Reflect on the deduplication analysis before acting. + * + * For preliminary requests (getAnalysisFiles, getPreviousAnalysisFiles, + * getPreviousDatabaseSchemas): + * + * - What requirements documents do you need to understand table purposes? + * - Which business domains need to be understood for comparison? + * + * For completion (complete): + * + * - How many duplicate groups did you find? + * - Which tables are duplicated and why? + * - Summarize the comparison results. + */ + thinking: string; + + /** + * Request type discriminator. + * + * Use preliminary requests (getAnalysisFiles, etc.) to fetch requirements + * documents for understanding table purposes. Use complete to submit + * duplicate group identification results. 
+ */ + request: + | IComplete + | IAutoBePreliminaryGetAnalysisFiles + | IAutoBePreliminaryGetPreviousAnalysisFiles + | IAutoBePreliminaryGetPreviousDatabaseSchemas; + } + + /** + * Submit duplicate group identification results. + * + * Call this after you have: + * + * 1. Fetched and analyzed requirements documents + * 2. Compared each target component table against all other tables + * 3. Identified groups of semantically equivalent tables + */ + export interface IComplete { + /** + * Type discriminator. Value "complete" indicates final submission. + */ + type: "complete"; + + /** + * Analysis of the deduplication comparison process. + * + * Documents the agent's understanding and comparison approach: + * + * - What tables in the target component were analyzed? + * - What tables in other components were compared against? + * - What semantic patterns were identified across components? + * - How were table purposes determined from names and descriptions? + */ + analysis: string; + + /** + * Rationale for the duplicate group decisions. + * + * Explains why specific tables were grouped as duplicates: + * + * - Why are identified groups considered semantically equivalent? + * - What evidence supports each grouping decision? + * - Why were certain similar-looking tables NOT grouped? + * - What distinguishes true duplicates from related but distinct tables? + */ + rationale: string; + + /** + * Groups of semantically duplicate tables. + * + * Each group contains tables from different components that serve the + * same purpose. Empty array if no duplicates are found. 
+ * + * ## Group Rules: + * + * - Each group must have at least 2 tables + * - Each group must include at least 1 table from the target component + * - Each table can appear in only one group + * + * ## Example: + * + * ```typescript + * [ + * { + * reason: "Both tables store customer authentication data", + * tables: [ + * { namespace: "Authorization", name: "customers" }, + * { namespace: "Sales", name: "shopping_customers" } + * ] + * } + * ] + * ``` + * + * ## Judgment Criteria: + * + * - Read both name AND description to determine purpose + * - Same purpose = duplicate (even with different names) + * - Different purpose = NOT duplicate (even with same name) + * - Parent-child or snapshot relationships = NOT duplicates + */ + duplicateGroups: AutoBeDatabaseDeduplicationGroup[]; + } +} diff --git a/packages/interface/src/events/AutoBeDatabaseDeduplicationEvent.ts b/packages/interface/src/events/AutoBeDatabaseDeduplicationEvent.ts new file mode 100644 index 0000000000..0e29884790 --- /dev/null +++ b/packages/interface/src/events/AutoBeDatabaseDeduplicationEvent.ts @@ -0,0 +1,54 @@ +import { AutoBeDatabaseDeduplicationGroup } from "../histories/contents"; +import { AutoBeAggregateEventBase } from "./base/AutoBeAggregateEventBase"; +import { AutoBeEventBase } from "./base/AutoBeEventBase"; +import { AutoBeProgressEventBase } from "./base/AutoBeProgressEventBase"; + +/** + * Event fired when an agent completes reviewing a single component for semantic + * duplicates during the Database Component Deduplication phase. + * + * This event occurs after both Authorization Review and Component Review phases, + * where deduplication agents compare each component's tables against all other + * components' tables to identify semantically equivalent tables that serve the + * same purpose. + * + * Multiple events of this type are emitted (one per component) as the + * deduplication agents process each component in parallel. 
+ * + * @author Michael + */ +export interface AutoBeDatabaseDeduplicationEvent + extends AutoBeEventBase<"databaseDeduplication">, + AutoBeAggregateEventBase, + AutoBeProgressEventBase { + /** Requirements analysis iteration step number. */ + step: number; + + /** + * Analysis of the deduplication comparison process. + * + * Documents the agent's understanding of which tables were analyzed in the + * target component and how they were compared against tables in other + * components. + */ + analysis: string; + + /** + * Rationale for the duplicate group decisions. + * + * Explains why specific tables were grouped as duplicates and why certain + * similar-looking tables were NOT grouped. + */ + rationale: string; + + /** + * Groups of semantically duplicate tables identified by the agent. + * + * Each group contains tables from different components that serve the same + * purpose. May be empty if no duplicates were found for this component. + */ + duplicateGroups: AutoBeDatabaseDeduplicationGroup[]; + + /** Namespace of the component that was reviewed for duplicates. 
*/ + namespace: string; +} diff --git a/packages/interface/src/events/AutoBeEvent.ts b/packages/interface/src/events/AutoBeEvent.ts index 86459e9636..b6152b40fe 100644 --- a/packages/interface/src/events/AutoBeEvent.ts +++ b/packages/interface/src/events/AutoBeEvent.ts @@ -10,6 +10,7 @@ import { AutoBeDatabaseAuthorizationReviewEvent } from "./AutoBeDatabaseAuthoriz import { AutoBeDatabaseCompleteEvent } from "./AutoBeDatabaseCompleteEvent"; import { AutoBeDatabaseComponentEvent } from "./AutoBeDatabaseComponentEvent"; import { AutoBeDatabaseComponentReviewEvent } from "./AutoBeDatabaseComponentReviewEvent"; +import { AutoBeDatabaseDeduplicationEvent } from "./AutoBeDatabaseDeduplicationEvent"; import { AutoBeDatabaseCorrectEvent } from "./AutoBeDatabaseCorrectEvent"; import { AutoBeDatabaseGroupEvent } from "./AutoBeDatabaseGroupEvent"; import { AutoBeDatabaseGroupReviewEvent } from "./AutoBeDatabaseGroupReviewEvent"; @@ -116,6 +117,7 @@ export type AutoBeEvent = | AutoBeDatabaseAuthorizationReviewEvent | AutoBeDatabaseComponentEvent | AutoBeDatabaseComponentReviewEvent + | AutoBeDatabaseDeduplicationEvent | AutoBeDatabaseSchemaEvent | AutoBeDatabaseSchemaReviewEvent | AutoBeDatabaseValidateEvent diff --git a/packages/interface/src/events/AutoBeEventSource.ts b/packages/interface/src/events/AutoBeEventSource.ts index 93c4f752c2..781ce7ec94 100644 --- a/packages/interface/src/events/AutoBeEventSource.ts +++ b/packages/interface/src/events/AutoBeEventSource.ts @@ -5,6 +5,7 @@ import { AutoBeDatabaseAuthorizationEvent } from "./AutoBeDatabaseAuthorizationE import { AutoBeDatabaseAuthorizationReviewEvent } from "./AutoBeDatabaseAuthorizationReviewEvent"; import { AutoBeDatabaseComponentEvent } from "./AutoBeDatabaseComponentEvent"; import { AutoBeDatabaseComponentReviewEvent } from "./AutoBeDatabaseComponentReviewEvent"; +import { AutoBeDatabaseDeduplicationEvent } from "./AutoBeDatabaseDeduplicationEvent"; import { AutoBeDatabaseCorrectEvent } from 
"./AutoBeDatabaseCorrectEvent"; import { AutoBeDatabaseGroupEvent } from "./AutoBeDatabaseGroupEvent"; import { AutoBeDatabaseGroupReviewEvent } from "./AutoBeDatabaseGroupReviewEvent"; @@ -83,6 +84,7 @@ export type AutoBeEventSource = | AutoBeDatabaseAuthorizationReviewEvent["type"] | AutoBeDatabaseComponentEvent["type"] | AutoBeDatabaseComponentReviewEvent["type"] + | AutoBeDatabaseDeduplicationEvent["type"] | AutoBeDatabaseSchemaEvent["type"] | AutoBeDatabaseSchemaReviewEvent["type"] | AutoBeDatabaseCorrectEvent["type"] diff --git a/packages/interface/src/events/index.ts b/packages/interface/src/events/index.ts index d7b704dc93..1adafdfed4 100644 --- a/packages/interface/src/events/index.ts +++ b/packages/interface/src/events/index.ts @@ -37,6 +37,7 @@ export * from "./AutoBeDatabaseAuthorizationReviewEvent"; export * from "./AutoBeDatabaseCompleteEvent"; export * from "./AutoBeDatabaseComponentEvent"; export * from "./AutoBeDatabaseComponentReviewEvent"; +export * from "./AutoBeDatabaseDeduplicationEvent"; export * from "./AutoBeDatabaseCorrectEvent"; export * from "./AutoBeDatabaseGroupEvent"; export * from "./AutoBeDatabaseGroupReviewEvent"; diff --git a/packages/interface/src/histories/contents/AutoBeDatabaseDeduplicationGroup.ts b/packages/interface/src/histories/contents/AutoBeDatabaseDeduplicationGroup.ts new file mode 100644 index 0000000000..d6784014c8 --- /dev/null +++ b/packages/interface/src/histories/contents/AutoBeDatabaseDeduplicationGroup.ts @@ -0,0 +1,57 @@ +/** + * Represents a group of semantically duplicate tables identified across + * different database components. + * + * Each group contains tables from different components that serve the same + * purpose or store the same kind of data, even if they have different names. + * The deduplication agent identifies these groups by analyzing both table names + * and descriptions to determine semantic equivalence. 
+ * + * After identification, the system resolves each group by keeping only the + * table from the component with the fewest total tables (most specialized), + * ensuring deterministic and fair deduplication. + * + * @author Michael + */ +export interface AutoBeDatabaseDeduplicationGroup { + /** + * Explanation of why these tables are considered semantically duplicate. + * + * Should describe the shared purpose or functionality that makes these tables + * redundant, referencing their names and descriptions. + */ + reason: string; + + /** + * List of tables that serve the same purpose across different components. + * + * Must contain at least 2 tables, and at least one must belong to the target + * component being reviewed. + */ + tables: AutoBeDatabaseDeduplicationGroup.ITable[]; +} + +export namespace AutoBeDatabaseDeduplicationGroup { + /** + * Reference to a specific table within a specific component. + * + * Used to uniquely identify a table by its component namespace and table + * name. + */ + export interface ITable { + /** + * The namespace of the component that owns this table. + * + * Must match an existing component's namespace (e.g., "Authorization", + * "Sales", "Orders"). + */ + namespace: string; + + /** + * The snake_case name of the table. + * + * Must match an existing table name within the specified component. 
+ */ + name: string; + } +} diff --git a/packages/interface/src/histories/contents/index.ts b/packages/interface/src/histories/contents/index.ts index a394303fbf..56559d78df 100644 --- a/packages/interface/src/histories/contents/index.ts +++ b/packages/interface/src/histories/contents/index.ts @@ -15,6 +15,7 @@ export * from "./AutoBeDatabaseComponentTableRevise"; export * from "./AutoBeDatabaseComponentTableCreate"; export * from "./AutoBeDatabaseComponentTableUpdate"; export * from "./AutoBeDatabaseComponentTableErase"; +export * from "./AutoBeDatabaseDeduplicationGroup"; export * from "./AutoBeDatabaseGroup"; export * from "./AutoBeDatabaseGroupRevise"; export * from "./AutoBeDatabaseGroupReviseCreate"; diff --git a/packages/interface/src/rpc/IAutoBeRpcListener.ts b/packages/interface/src/rpc/IAutoBeRpcListener.ts index cc31645c45..f3e4938a16 100644 --- a/packages/interface/src/rpc/IAutoBeRpcListener.ts +++ b/packages/interface/src/rpc/IAutoBeRpcListener.ts @@ -10,6 +10,7 @@ import { AutoBeDatabaseCompleteEvent, AutoBeDatabaseComponentEvent, AutoBeDatabaseComponentReviewEvent, + AutoBeDatabaseDeduplicationEvent, AutoBeDatabaseCorrectEvent, AutoBeDatabaseGroupEvent, AutoBeDatabaseGroupReviewEvent, @@ -266,6 +267,18 @@ export interface IAutoBeRpcListener { event: AutoBeDatabaseComponentReviewEvent, ): Promise; + /** + * Optional handler for database component deduplication events. + * + * Called when the Database Component Deduplication Agent identifies + * semantically duplicate tables across components. Each event represents + * the deduplication review of a single component, containing groups of + * tables that serve the same purpose. + */ + databaseDeduplication?( + event: AutoBeDatabaseDeduplicationEvent, + ): Promise; + /** * Optional handler for database schema creation progress events. 
* diff --git a/packages/ui/src/components/events/AutoBeEventMovie.tsx b/packages/ui/src/components/events/AutoBeEventMovie.tsx index f40719cfb3..0e9828348e 100644 --- a/packages/ui/src/components/events/AutoBeEventMovie.tsx +++ b/packages/ui/src/components/events/AutoBeEventMovie.tsx @@ -66,6 +66,7 @@ export function AutoBeEventMovie( case "databaseAuthorizationReview": case "databaseComponent": case "databaseComponentReview": + case "databaseDeduplication": case "databaseSchema": case "databaseSchemaReview": case "interfaceOperation": diff --git a/packages/ui/src/components/events/AutoBeProgressEventMovie.tsx b/packages/ui/src/components/events/AutoBeProgressEventMovie.tsx index 56b8e0fd35..c976c3d72f 100644 --- a/packages/ui/src/components/events/AutoBeProgressEventMovie.tsx +++ b/packages/ui/src/components/events/AutoBeProgressEventMovie.tsx @@ -79,6 +79,11 @@ function getState(event: AutoBeProgressEventMovie.IProps["event"]): IState { title: "Prisma Review", description: "Reviewing the Prisma schemas", }; + case "databaseDeduplication": + return { + title: "Database Deduplication", + description: "Reviewing component tables for semantic duplicates", + }; case "interfaceEndpoint": return { title: "Interface Endpoints", diff --git a/packages/ui/src/structure/AutoBeListener.ts b/packages/ui/src/structure/AutoBeListener.ts index ac7b1df909..da36069664 100644 --- a/packages/ui/src/structure/AutoBeListener.ts +++ b/packages/ui/src/structure/AutoBeListener.ts @@ -102,6 +102,9 @@ export class AutoBeListener { databaseSchemaReview: async (event) => { this.accumulate(event); }, + databaseDeduplication: async (event) => { + this.accumulate(event); + }, databaseValidate: async (event) => { this.insert(event); }, diff --git a/test/src/agent/internal/validate_interface_complement.ts b/test/src/agent/internal/validate_interface_complement.ts index 150fcb57fb..f9e92e5402 100644 --- a/test/src/agent/internal/validate_interface_complement.ts +++ 
b/test/src/agent/internal/validate_interface_complement.ts @@ -74,6 +74,7 @@ export const validate_interface_complement = async (props: { instruction: "Design API specs carefully considering the security.", progress: complementProgress, document, + failures: new Map(), }); // Get only newly added schemas diff --git a/test/src/archive/utils/ArchiveLogger.ts b/test/src/archive/utils/ArchiveLogger.ts index ff988c3ec7..7f3b967da6 100644 --- a/test/src/archive/utils/ArchiveLogger.ts +++ b/test/src/archive/utils/ArchiveLogger.ts @@ -141,7 +141,9 @@ export namespace ArchiveLogger { ` - update: ${event.revises.filter((r) => r.type === "update").length}`, ...event.revises .filter((r) => r.type === "update") - .map((r) => ` - ${r.original_namespace} => ${r.group.namespace}`), + .map( + (r) => ` - ${r.original_namespace} => ${r.group.namespace}`, + ), ` - erase: ${event.revises.filter((r) => r.type === "erase").length}`, ...event.revises .filter((r) => r.type === "erase") @@ -198,7 +200,14 @@ export namespace ArchiveLogger { .filter((r) => r.type === "erase") .map((r) => ` - ${r.table}`), ); - else if (event.type === "databaseSchema") + else if (event.type === "databaseDeduplication") { + content.push( + ` - process: progress`, + ` - progress: (${event.completed} of ${event.total})`, + ` - namespace: ${event.namespace}`, + ` - duplicated tables: ${event.duplicateGroups.map((g) => g.tables.map((t) => t.name).join(", ")).join(", ")}`, + ); + } else if (event.type === "databaseSchema") content.push( ` - model: ${event.model.name} (stance: ${event.model.stance})`, ); From 31fac3e8682af1318cf3694bee9866e8af6eb06f Mon Sep 17 00:00:00 2001 From: Jeongho Nam Date: Thu, 29 Jan 2026 20:38:05 +0900 Subject: [PATCH 2/8] Update packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../histories/transformPrismaDeduplicationHistory.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 
deletions(-) diff --git a/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts index bfa101d757..c76543d102 100644 --- a/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts +++ b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts @@ -32,10 +32,13 @@ const normalizeTableName = ( name = name.slice(1); } - // 3) Split by "_", convert each token to singular, sort, and join + // 3) Split by "_", remove empty tokens, convert each token to singular, sort, and join // e.g., bbs_user_articles → ["bbs", "user", "article"] → ["article", "bbs", "user"] → "article_bbs_user" // e.g., bbs_article_users → ["bbs", "article", "user"] → ["article", "bbs", "user"] → "article_bbs_user" - const tokens = name.split("_").map((token) => singular(token)); + const tokens = name + .split("_") + .filter((token) => token.length > 0) + .map((token) => singular(token)); tokens.sort(); return tokens.join("_"); }; From 3b1533d2c06c59a6238c6e37341914bc5b93028d Mon Sep 17 00:00:00 2001 From: michael <7471919@naver.com> Date: Tue, 3 Feb 2026 14:09:24 +0900 Subject: [PATCH 3/8] chore: update databaseDeduplication ArchiveLogger --- test/src/archive/utils/ArchiveLogger.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/src/archive/utils/ArchiveLogger.ts b/test/src/archive/utils/ArchiveLogger.ts index f6dff02d76..499f02c50d 100644 --- a/test/src/archive/utils/ArchiveLogger.ts +++ b/test/src/archive/utils/ArchiveLogger.ts @@ -202,8 +202,6 @@ export namespace ArchiveLogger { ); else if (event.type === "databaseDeduplication") { content.push( - ` - process: progress`, - ` - progress: (${event.completed} of ${event.total})`, ` - namespace: ${event.namespace}`, ` - duplicated tables: ${event.duplicateGroups.map((g) => g.tables.map((t) => t.name).join(", ")).join(", ")}`, ); From 
a5ed15b1b9582f90466ce2a265288ad5bfc2b40b Mon Sep 17 00:00:00 2001 From: michael <7471919@naver.com> Date: Tue, 3 Feb 2026 14:19:44 +0900 Subject: [PATCH 4/8] fix: Update Database Authorization, Component Description and Deduplication Agent Prompt --- .../agent/prompts/DATABASE_AUTHORIZATION.md | 162 ++++- packages/agent/prompts/DATABASE_COMPONENT.md | 168 +++++- .../prompts/DATABASE_COMPONENT_REVIEW.md | 125 +++- .../agent/prompts/DATABASE_DEDUPLICATION.md | 557 +++++++++++++----- .../transformPrismaDeduplicationHistory.ts | 132 +---- .../orchestrate/prisma/orchestratePrisma.ts | 23 +- .../prisma/orchestratePrismaDeduplication.ts | 28 +- .../AutoBeDatabaseDeduplicationProgrammer.ts | 181 +++++- ...IAutoBeDatabaseDeduplicationApplication.ts | 162 ++++- 9 files changed, 1182 insertions(+), 356 deletions(-) diff --git a/packages/agent/prompts/DATABASE_AUTHORIZATION.md b/packages/agent/prompts/DATABASE_AUTHORIZATION.md index 1321aaa367..cc15a99c0a 100644 --- a/packages/agent/prompts/DATABASE_AUTHORIZATION.md +++ b/packages/agent/prompts/DATABASE_AUTHORIZATION.md @@ -212,6 +212,80 @@ Same authentication pattern as member but may have additional security considera --- +## 📝 TABLE DESCRIPTION REQUIREMENTS FOR DEDUPLICATION + +**CRITICAL**: Table descriptions are the PRIMARY source for deduplication analysis. +Brief descriptions cause duplicate detection failures. Write RICH descriptions. + +### Required Elements (ALL 5 must be included) + +| Element | Purpose | Example | +|---------|---------|---------| +| **1. Role Tag** | Quick classification | `[MASTER DATA]`, `[INPUT]`, `[AUDIT]`, `[CONFIG]`, `[SNAPSHOT]`, `[JUNCTION]` | +| **2. Core Entity** | What specific business entity is stored | "customer authentication credentials" | +| **3. Key Data Fields** | Main data this table contains | "stores email, password_hash, 2FA settings" | +| **4. Business Context** | What workflow/process uses this | "used in login, password reset, session creation flows" | +| **5. 
Distinguishing Characteristics** | How it differs from similar tables | "does NOT store profile data - see customer_profiles" | + +### Role Tag Definitions for Authorization Tables + +| Tag | Meaning | Typical Authorization Use | +|-----|---------|---------------------------| +| `[MASTER DATA]` | Core actor identity | Actor tables (users, customers, administrators) | +| `[MASTER DATA]` | Session management | Session tables (user_sessions, customer_sessions) | +| `[INPUT]` | Auth requests | Password reset requests, verification requests | +| `[AUDIT]` | Auth logging | Login attempts, security events | +| `[CONFIG]` | Auth settings | 2FA settings, notification preferences | +| `[JUNCTION]` | Auth relationships | OAuth connections, role assignments | + +### Description Examples for Authorization + +#### ❌ BAD - Too vague, causes deduplication failures + +```typescript +{ name: "shopping_customers", description: "Customer accounts" } +// → Cannot distinguish from customer tables in other components +``` + +#### ✅ GOOD - Rich descriptions enable accurate deduplication + +```typescript +// [MASTER DATA] - Actor identity with explicit scope separation +{ + name: "shopping_customers", + description: "[MASTER DATA] Customer actor identity for authentication. Stores authentication credentials (email, password_hash), 2FA settings, and account status. Created during customer registration. Used exclusively in authentication flow (login, password reset, session validation). Does NOT store personal profile (name, address) - those belong in business domain tables that reference this actor." +} + +// [INPUT] - Auth request with lifecycle distinction +{ + name: "shopping_customer_password_resets", + description: "[INPUT] Password reset request tokens for customers. Stores reset token (token_hash, expires_at), customer reference, and request metadata (requested_ip). Created when customer requests password reset. Consumed and invalidated after use. 
Part of password recovery workflow - different from customer_sessions which are login sessions." +} + +// [AUDIT] - Compliance log distinguished from active sessions +{ + name: "shopping_administrator_audit_logs", + description: "[AUDIT] Immutable record of administrator actions for security compliance. Stores action details (action_type, target_entity, changes_made), admin reference, timestamp, and request context (ip, session). Created automatically on any admin action. Used for security auditing and compliance. Different from administrator_sessions which tracks active logins." +} +``` + +### Why Rich Descriptions Matter for Authorization Tables + +Authorization tables are particularly prone to duplication across components because: +- Multiple components might create their own "users" or "customers" tables +- Session tables might be duplicated if domain boundaries are unclear + +**With vague descriptions:** +- "Customer accounts" vs "Shopping customers" → Cannot determine if duplicate +- "User sessions" vs "Customer sessions" → Looks like duplicate but might be different actors + +**With rich descriptions:** +- Actor type clearly identified (customer vs admin vs guest) +- Authentication scope explicitly stated +- Relationship to profile/business data clarified + +--- + ## Table Naming Conventions ### Required Naming Rules @@ -492,19 +566,46 @@ process({ rationale: "Created main actor + session tables for each actor. Added password_resets for user/admin since requirements specify password recovery. Added audit_logs for admin per security requirements. Guest has minimal tables without password support.", tables: [ // User (member) tables - { name: "shopping_users", description: "Registered user accounts with email/password authentication credentials and profile information." }, - { name: "shopping_user_sessions", description: "JWT session tokens for user authentication with access and refresh token support." 
}, - { name: "shopping_user_password_resets", description: "Password reset tokens with expiration for secure user password recovery workflow." }, + { + name: "shopping_users", + description: "[MASTER DATA] Registered user actor for authentication. Stores credentials (email, password_hash), 2FA settings, account status, and registration timestamp. Created during user signup. Used in login, password reset, and session validation flows. Does NOT store profile details (name, avatar) - those belong in user_profiles table in business domain." + }, + { + name: "shopping_user_sessions", + description: "[MASTER DATA] Active authentication sessions for users. Stores session context (access_token, refresh_token, device_id, ip_address, user_agent), creation and expiration timestamps. Created on successful login. Used for authenticating all user requests. Multiple concurrent sessions supported per user." + }, + { + name: "shopping_user_password_resets", + description: "[INPUT] Password reset request tokens for users. Stores reset token (token_hash, expires_at), user reference, request metadata (requested_ip, requested_at). Created when user initiates password reset. Single-use token consumed after password change. Different from sessions which are for authenticated access." + }, // Admin tables - { name: "shopping_administrators", description: "Administrator accounts with elevated privileges for platform management." }, - { name: "shopping_administrator_sessions", description: "JWT session tokens for administrator authentication with access and refresh token support." }, - { name: "shopping_administrator_password_resets", description: "Password reset tokens with expiration for secure administrator password recovery." }, - { name: "shopping_administrator_audit_logs", description: "Audit trail of administrator actions for security compliance and accountability." }, + { + name: "shopping_administrators", + description: "[MASTER DATA] Administrator actor for platform management. 
Stores admin credentials (email, password_hash), role/permission level, and account status. Created by super admin or system setup. Used in admin authentication with elevated privilege checks. Separate from shopping_users due to different security requirements and access patterns." + }, + { + name: "shopping_administrator_sessions", + description: "[MASTER DATA] Active authentication sessions for administrators. Stores session context (access_token, ip_address, user_agent), security metadata, and an expiration timestamp (shorter-lived than user sessions for security). Created on admin login with stricter validation. Used for admin request authentication. Separate from shopping_user_sessions due to elevated security requirements." + }, + { + name: "shopping_administrator_password_resets", + description: "[INPUT] Password reset request tokens for administrators. Stores reset token (token_hash, expires_at), admin reference, and request metadata with additional security logging. Created when admin requests password reset. Enhanced security compared to user resets, including notification to other admins." + }, + { + name: "shopping_administrator_audit_logs", + description: "[AUDIT] Immutable compliance record of all administrator actions. Stores action type, target entity, before/after state, admin reference, timestamp, and request context (ip, session_id). Created automatically on any admin modification. Used for security auditing, compliance reporting, and incident investigation. Append-only: never modified or deleted." + }, // Guest tables - { name: "shopping_guests", description: "Anonymous guest entities representing unauthenticated visitors. Stores identity only, no credentials or session data." }, - { name: "shopping_guest_sessions", description: "Session records for guest access containing device_id, token, IP, and connection context with expiration." } + { + name: "shopping_guests", + description: "[MASTER DATA] Anonymous guest actor for unauthenticated visitors.
Stores minimal identity (id, created_at) with NO credentials or password. Created on first anonymous visit. Used to track anonymous shopping carts and enable guest checkout. Can be linked to a user account upon registration. Does NOT store session data - see shopping_guest_sessions." + }, + { + name: "shopping_guest_sessions", + description: "[MASTER DATA] Session tracking for anonymous guests. Stores session context (device_id, token, ip_address, href, referrer, user_agent) and expiration timestamp. Created when guest is identified. Used for cart persistence and anonymous user tracking. Shorter expiration than authenticated sessions. Multiple sessions per guest supported." + } ] } }) @@ -523,15 +624,36 @@ process({ rationale: "Both actors need main + session tables with full auth fields. Added email_verifications for both per requirements. Added oauth_connections only for customer since requirements specify social login for buyers only.", tables: [ // Customer tables - { name: "customers", description: "Customer accounts for buyers with email/password authentication." }, - { name: "customer_sessions", description: "JWT session tokens for customer authentication." }, - { name: "customer_email_verifications", description: "Email verification tokens for customer registration confirmation." }, - { name: "customer_oauth_connections", description: "OAuth provider connections for customer social login." }, + { + name: "customers", + description: "[MASTER DATA] Customer actor for buyers on the marketplace. Stores authentication credentials (email, password_hash), verification status, and account state. Created during customer registration. Used in customer login, checkout, and order flows. Does NOT store shipping addresses or payment methods - those are in customer_addresses and customer_payment_methods in the order domain." + }, + { + name: "customer_sessions", + description: "[MASTER DATA] Active authentication sessions for customers.
Stores session tokens (access_token, refresh_token), device info (device_id, ip_address, user_agent), and expiration. Created on customer login. Used to authenticate all customer API requests. Supports multiple concurrent sessions for cross-device shopping." + }, + { + name: "customer_email_verifications", + description: "[INPUT] Email verification tokens for customer registration. Stores verification token (token_hash, expires_at), customer reference, and email being verified. Created during registration. Single-use token consumed when customer clicks verification link. Required before customer can place orders." + }, + { + name: "customer_oauth_connections", + description: "[JUNCTION] OAuth provider links for customer social login. Stores provider info (provider_name, provider_user_id, access_token), customer reference. Created when customer connects social account. Enables login via Google, Facebook, etc. One customer can have multiple OAuth connections for different providers." + }, // Seller tables - { name: "sellers", description: "Seller accounts for merchants with email/password authentication." }, - { name: "seller_sessions", description: "JWT session tokens for seller authentication." }, - { name: "seller_email_verifications", description: "Email verification tokens for seller registration confirmation." } + { + name: "sellers", + description: "[MASTER DATA] Seller actor for merchants on the marketplace. Stores authentication credentials (email, password_hash), verification status, seller tier/status. Created during seller application approval. Used in seller dashboard login and product management flows. Does NOT store store details or bank info - those are in seller_profiles and seller_payment_accounts in seller domain." + }, + { + name: "seller_sessions", + description: "[MASTER DATA] Active authentication sessions for sellers. Stores session tokens (access_token, refresh_token), device info, and expiration. Created on seller login. 
Used to authenticate seller dashboard and API requests. Separate from customer_sessions due to different permission scopes and security requirements." + }, + { + name: "seller_email_verifications", + description: "[INPUT] Email verification tokens for seller registration. Stores verification token (token_hash, expires_at), seller reference, and email being verified. Created during seller application. Single-use token consumed on verification. Part of seller onboarding workflow which includes additional business verification steps." + } ] } }) @@ -573,8 +695,12 @@ Before calling `process({ request: { type: "complete", ... } })`, verify: - [ ] **Guest actors have minimal fields**: only id and created_at, NO device_id or token ### Table Content Quality -- [ ] Each table has clear, concise description -- [ ] Descriptions explain purpose and what data is stored +- [ ] **TABLE DESCRIPTIONS - ALL 5 ELEMENTS**: Every description includes: + - [ ] Role Tag: `[MASTER DATA]`, `[INPUT]`, `[AUDIT]`, `[CONFIG]`, or `[JUNCTION]` + - [ ] Core Entity: What specific authentication entity is stored + - [ ] Key Data Fields: Main data this table contains + - [ ] Business Context: What authentication workflow uses this table + - [ ] Distinguishing Characteristics: How it differs from similar tables (especially across components) - [ ] Descriptions do NOT imply session fields in actor tables - [ ] No duplicate tables - [ ] All required tables included for EACH actor diff --git a/packages/agent/prompts/DATABASE_COMPONENT.md b/packages/agent/prompts/DATABASE_COMPONENT.md index eee29cb26d..d199a49972 100644 --- a/packages/agent/prompts/DATABASE_COMPONENT.md +++ b/packages/agent/prompts/DATABASE_COMPONENT.md @@ -274,9 +274,9 @@ Consistency across components indicates completeness. 
request: { type: "complete", tables: [ - { name: "sales", description: "Main sale listings" }, - { name: "sale_snapshots", description: "Audit trail for sales" }, - { name: "sale_units", description: "Individual units within a sale" } + { name: "sales", description: "[MASTER DATA] Main sale listings. Stores product info and pricing." }, + { name: "sale_snapshots", description: "[SNAPSHOT] Audit trail for sales. Stores point-in-time copies." }, + { name: "sale_units", description: "[MASTER DATA] Individual units within a sale. Stores stock info." } ] } } @@ -306,24 +306,60 @@ Consistency across components indicates completeness. type: "complete", tables: [ // Core sale entities - { name: "sales", description: "Main sale listings with product, pricing, seller" }, - { name: "sale_snapshots", description: "Point-in-time snapshots for audit trail" }, - { name: "sale_units", description: "Individual stock units within a sale" }, + { + name: "sales", + description: "[MASTER DATA] Main sale listings representing products for sale. Stores sale metadata (title, description, price, status, seller_id), inventory info, and timestamps. Created when seller lists a product. Used in product browsing, cart, checkout, and order workflows. Parent entity for sale_images, sale_units, sale_reviews." + }, + { + name: "sale_snapshots", + description: "[SNAPSHOT] Point-in-time copy of sale state for audit and order integrity. Stores complete sale data (price, description, seller info) at moment of purchase. Created when order is placed. Used for order history display, refund calculation, and dispute resolution. Immutable after creation - different from sales which can be updated." + }, + { + name: "sale_units", + description: "[MASTER DATA] Individual stock units within a sale for inventory tracking. Stores unit-specific data (SKU, stock_quantity, variant_options like size/color). Created alongside sale. Used by inventory management and cart validation. 
One sale can have multiple units for different variants." + }, // Sale content - { name: "sale_images", description: "Multiple images per sale for product display" }, - { name: "sale_specifications", description: "Product specifications and technical details" }, + { + name: "sale_images", + description: "[MASTER DATA] Product images for sale listings. Stores image metadata (url, display_order, alt_text, is_primary). Created when seller uploads images. Used in product display across all channels. Multiple images per sale with ordering. Different from sale_snapshots which captures entire sale state." + }, + { + name: "sale_specifications", + description: "[MASTER DATA] Technical specifications and attributes for sale products. Stores key-value pairs (spec_name, spec_value, display_order). Created when seller adds product details. Used for product comparison and filtering. Separate from sale description which is free-form text." + }, // Customer interaction - { name: "sale_reviews", description: "Customer reviews and ratings for sales" }, - { name: "sale_review_votes", description: "Helpful votes on reviews" }, - { name: "sale_questions", description: "Customer questions about sales" }, - { name: "sale_question_answers", description: "Seller answers to customer questions" }, + { + name: "sale_reviews", + description: "[INPUT] Customer reviews and ratings for purchased sales. Stores review content (rating, title, body, images), customer reference, and verified_purchase flag. Created after customer receives order. Used in product page display and seller rating calculation. Does NOT store review responses - see sale_review_replies for seller responses." + }, + { + name: "sale_review_votes", + description: "[INPUT] Customer votes on review helpfulness. Stores vote data (review_id, customer_id, is_helpful). Created when customer votes on a review. Used for sorting reviews by helpfulness. One vote per customer per review. 
Different from sale_reviews which contains the review content itself." + }, + { + name: "sale_questions", + description: "[INPUT] Customer inquiries about sale listings before purchase. Stores question content (title, body), customer reference, and target sale. Created when customer asks question on sale page. Part of Q&A workflow - awaits seller response. Answers stored in sale_question_answers (different owner: seller creates answers)." + }, + { + name: "sale_question_answers", + description: "[OUTPUT] Seller responses to customer questions. Stores answer content (body), seller reference, and parent question link. Created when seller responds to a question. Completes Q&A workflow started by sale_questions. Separate table because different actor (seller) owns this data with different creation lifecycle." + }, // Sale management - { name: "sale_promotions", description: "Active promotions and discounts on sales" }, - { name: "sale_favorites", description: "User favorites/wishlists for sales" }, - { name: "sale_view_stats", description: "View count and analytics for sales" } + { + name: "sale_promotions", + description: "[MASTER DATA] Active promotional campaigns and discounts on sales. Stores promotion rules (discount_type, discount_value, start_date, end_date, conditions). Created by seller or admin. Used during cart calculation and checkout. Different from discount_codes which are customer-entered codes." + }, + { + name: "sale_favorites", + description: "[JUNCTION] Customer wishlists linking customers to favorite sales. Stores customer_id, sale_id, and added_at timestamp. Created when customer favorites a sale. Used for wishlist display and back-in-stock notifications. Many-to-many relationship between customers and sales." + }, + { + name: "sale_view_stats", + description: "[MASTER DATA] Aggregated view analytics for sale pages (mutable counters, not an immutable audit log). Stores aggregated metrics (view_count, unique_visitors, last_viewed_at) per sale. Updated on each page view.
Used for seller analytics dashboard and trending products algorithm. Does NOT store individual view events - see sale_view_logs for detailed tracking." + } ] } } @@ -412,6 +448,82 @@ Consistency across components indicates completeness. --- +## 📝 TABLE DESCRIPTION REQUIREMENTS FOR DEDUPLICATION + +**CRITICAL**: Table descriptions are the PRIMARY source for deduplication analysis. +Brief descriptions cause duplicate detection failures. Write RICH descriptions. + +### Required Elements (ALL 5 must be included) + +| Element | Purpose | Example | +|---------|---------|---------| +| **1. Role Tag** | Quick classification | `[MASTER DATA]`, `[INPUT]`, `[OUTPUT]`, `[AUDIT]`, `[CONFIG]`, `[SNAPSHOT]`, `[JUNCTION]` | +| **2. Core Entity** | What specific business entity is stored | "customer identity and authentication credentials" | +| **3. Key Data Fields** | Main data this table contains | "stores email, password_hash, name, phone, and address" | +| **4. Business Context** | What workflow/process uses this | "used in registration, login, and profile management flows" | +| **5. 
Distinguishing Characteristics** | How it differs from similar tables | "does NOT store order history - see customer_orders for that" | + +### Role Tag Definitions + +| Tag | Meaning | Lifecycle | Examples | +|-----|---------|-----------|----------| +| `[MASTER DATA]` | Core business entities | Long-lived, frequently updated | users, products, orders | +| `[INPUT]` | Data triggering processes | Created by user action | reports, requests, questions | +| `[OUTPUT]` | Results of processing | Created by system/admin | decisions, approvals, answers | +| `[AUDIT]` | Immutable compliance records | Write-once, never modified | logs, histories, audit trails | +| `[CONFIG]` | System/entity settings | Rarely changed | preferences, feature flags | +| `[SNAPSHOT]` | Point-in-time copies | Created at specific moments | order_snapshots, price_histories | +| `[JUNCTION]` | Many-to-many relationships | Linking records | product_categories, user_roles | + +### Description Examples + +#### ❌ BAD - Too vague, causes deduplication failures + +```typescript +{ name: "shopping_customers", description: "Customer accounts for shopping" } +{ name: "customers", description: "Customer data" } +// → Cannot determine if these are duplicates or intentionally separate +``` + +#### ✅ GOOD - Rich descriptions enable accurate deduplication + +```typescript +// Pair 1: Same role [MASTER DATA], same entity (customer) but explicitly separated +{ + name: "shopping_customers", + description: "[MASTER DATA] Customer identity for the shopping platform. Stores personal profile (name, phone, address) and shopping preferences. Created during customer registration. Used by order placement, delivery, and customer service workflows. Does NOT store authentication credentials - see shopping_customer_authentications for login data." +} +{ + name: "customers", + description: "[MASTER DATA] Customer authentication credentials for the general platform. Stores email, password_hash, and 2FA settings. 
Created during signup. Used exclusively in authentication flow (login, password reset, session creation). Does NOT store profile data - see customer_profiles for personal information." +} + +// Pair 2: Different roles [INPUT] vs [OUTPUT] — NOT duplicates +{ + name: "sale_questions", + description: "[INPUT] Customer inquiries about sale listings. Stores question text, customer reference, and target sale. Created when customer submits question on sale page. Part of Q&A workflow - awaits seller response. Answers stored separately in sale_question_answers (different owner: seller vs customer)." +} +{ + name: "sale_question_answers", + description: "[OUTPUT] Seller responses to customer questions. Stores answer text, seller reference, and parent question link. Created when seller responds to a question. Completes Q&A workflow. Separate from questions because different actor (seller) owns this data with different lifecycle." +} +``` + +### Why Rich Descriptions Matter for Deduplication + +The Database Deduplication Agent compares tables across components by reading descriptions. + +**With vague descriptions:** +- "Customer accounts" vs "Customer data" → Cannot determine if duplicate +- "Order information" vs "Purchase records" → Looks like duplicate but might not be + +**With rich descriptions:** +- Role tags immediately show if tables serve same role +- Business context shows if they're in same workflow +- Distinguishing characteristics explicitly state differences + +--- + ## ABSOLUTE PROHIBITION: Actor and Authorization Tables **CRITICAL RULE**: You MUST NEVER create any actor or authentication-related tables. 
@@ -866,7 +978,7 @@ Each table must follow the `AutoBeDatabaseComponentTableDesign` structure: ```typescript interface AutoBeDatabaseComponentTableDesign { name: string & tags.Pattern<"^[a-z][a-z0-9_]*$">; // snake_case, plural - description: string; // Brief, concise explanation of why this table is needed and what it stores + description: string; // Rich description with 5 elements: [ROLE TAG] + Core Entity + Key Data + Business Context + Distinguishing Characteristics } ``` @@ -875,7 +987,7 @@ interface AutoBeDatabaseComponentTableDesign { - **Using Component Skeleton**: Use EXACT namespace and filename from the component skeleton provided - **Table Completeness**: Include ALL tables required for THIS COMPONENT'S domain based on its rationale - **Pattern Compliance**: All table names must match the regex pattern `^[a-z][a-z0-9_]*$` -- **Table Descriptions**: Each table MUST include a clear and **concise** description explaining its purpose and what data it stores (keep it brief - one or two sentences maximum) +- **Table Descriptions**: Each table MUST include a RICH description with ALL 5 elements: [ROLE TAG], Core Entity, Key Data Fields, Business Context, and Distinguishing Characteristics. See "TABLE DESCRIPTION REQUIREMENTS FOR DEDUPLICATION" section above. - **Thinking Field**: Brief summary of what tables you designed (in IProps.thinking field) - **Request Structure**: Provide `{ type: "complete", analysis: "...", rationale: "...", tables: [...] }` - analysis and rationale document TABLE DESIGN reasoning @@ -903,9 +1015,18 @@ const output: IAutoBeDatabaseComponentApplication.IProps = { request: { type: "complete", tables: [ - { name: "channels", description: "Sales channels (e.g., online store, mobile app) with branding and configuration." }, - { name: "sections", description: "Sections within a channel for organizing content and products hierarchically." }, - { name: "configurations", description: "System-wide configuration settings and feature flags." 
} + { + name: "channels", + description: "[MASTER DATA] Sales channels representing different storefronts (online store, mobile app, kiosk). Stores channel metadata (name, code, branding settings, timezone, currency). Created during system setup. Used by all customer-facing workflows to determine display settings and business rules. Each channel operates independently with its own configurations." + }, + { + name: "sections", + description: "[MASTER DATA] Hierarchical content sections within a channel. Stores section metadata (name, parent_section_id, display_order, visibility settings). Created by administrators. Used for organizing products, articles, and navigation menus. Supports nested structure via parent_section_id. Different from categories which classify products - sections organize UI layout." + }, + { + name: "configurations", + description: "[CONFIG] System-wide configuration settings and feature flags. Stores key-value pairs (config_key, config_value, config_type, last_modified_by). Created during deployment, updated by administrators. Used by all system components to control behavior (payment gateways, notification settings, rate limits). Does NOT store per-user preferences - see user_settings for that." 
+ } ] } }; @@ -1452,7 +1573,12 @@ Before calling `process({ request: { type: "complete", analysis: "...", rational - [ ] Using the EXACT namespace and filename from the component skeleton - [ ] No duplicate table names within this component - [ ] All table names match the required regex pattern `^[a-z][a-z0-9_]*$` -- [ ] **TABLE DESCRIPTIONS**: Every table has a meaningful description explaining its purpose +- [ ] **TABLE DESCRIPTIONS - ALL 5 ELEMENTS**: Every description includes: + - [ ] Role Tag: `[MASTER DATA]`, `[INPUT]`, `[OUTPUT]`, `[AUDIT]`, `[CONFIG]`, `[SNAPSHOT]`, or `[JUNCTION]` + - [ ] Core Entity: What specific business entity is stored + - [ ] Key Data Fields: Main data this table contains + - [ ] Business Context: What workflow/process uses this table + - [ ] Distinguishing Characteristics: How it differs from similar tables - [ ] **NO PREFIX DUPLICATION**: No table name has duplicated domain prefixes (e.g., `prefix_prefix_tablename`) - [ ] All descriptions written in English diff --git a/packages/agent/prompts/DATABASE_COMPONENT_REVIEW.md b/packages/agent/prompts/DATABASE_COMPONENT_REVIEW.md index 607bf1ec6a..c44ef8eab3 100644 --- a/packages/agent/prompts/DATABASE_COMPONENT_REVIEW.md +++ b/packages/agent/prompts/DATABASE_COMPONENT_REVIEW.md @@ -289,31 +289,31 @@ Verify the existing tables follow normalization patterns: type: "create", reason: "Requirement 3.5 specifies customer reviews on sales, but no review table exists", table: "sale_reviews", - description: "Customer reviews and ratings for sales with helpful votes" + description: "[INPUT] Customer reviews and ratings for purchased sales. Stores review content (rating, title, body, images), customer reference, verified_purchase flag, timestamps. Created after customer receives order. Used in product page display and seller rating calculation. Does NOT store review responses - see sale_review_replies for seller responses." 
}, { type: "create", reason: "Requirement 3.7 specifies Q&A functionality for sales, but no question table exists", table: "sale_questions", - description: "Customer questions about sales" + description: "[INPUT] Customer inquiries about sale listings before purchase. Stores question text, customer reference, target sale. Created when customer asks question on sale page. Part of Q&A workflow - awaits seller response. Answers stored separately in sale_question_answers (different owner: seller)." }, { type: "create", reason: "Requirement 3.7 specifies Q&A functionality for sales, answers need separate table for normalization", table: "sale_question_answers", - description: "Seller answers to customer questions about sales" + description: "[OUTPUT] Seller responses to customer questions. Stores answer text, seller reference, parent question link, timestamps. Created when seller responds. Completes Q&A workflow. Separate from questions because different actor (seller) owns this data with different creation lifecycle." }, { type: "create", reason: "Requirement 2.4 specifies multiple images per sale, but no image table exists", table: "sale_images", - description: "Multiple images per sale for product display" + description: "[MASTER DATA] Product images for sale listings. Stores image URL, display_order, alt_text, is_primary flag. Created when seller uploads images. Used in product display across all channels. Multiple images per sale with ordering. Different from sale_snapshots which captures entire sale state." }, { type: "create", reason: "Requirement 4.2 specifies promotional campaigns on sales, but no promotion table exists", table: "sale_promotions", - description: "Active promotions and discounts on sales" + description: "[MASTER DATA] Active promotional campaigns on sales. Stores discount_type, discount_value, start_date, end_date, conditions. Created by seller or admin. Used during cart calculation and checkout. 
Different from discount_codes which are customer-entered codes requiring validation." } ``` @@ -347,6 +347,56 @@ process({ --- +## 📝 TABLE DESCRIPTION REQUIREMENTS FOR DEDUPLICATION + +**CRITICAL**: When creating or updating tables, descriptions MUST enable accurate deduplication. + +### Required Elements (ALL 5 must be included in CREATE/UPDATE descriptions) + +| Element | Purpose | Example | +|---------|---------|---------| +| **1. Role Tag** | Quick classification | `[MASTER DATA]`, `[INPUT]`, `[OUTPUT]`, `[AUDIT]`, `[CONFIG]`, `[SNAPSHOT]`, `[JUNCTION]` | +| **2. Core Entity** | What specific business entity is stored | "order cancellation records" | +| **3. Key Data Fields** | Main data this table contains | "stores cancellation reason, refund status, timestamps" | +| **4. Business Context** | What workflow/process uses this | "part of order cancellation workflow" | +| **5. Distinguishing Characteristics** | How it differs from similar tables | "different from order_refunds which tracks refund processing" | + +### Role Tag Definitions + +| Tag | Meaning | Examples | +|-----|---------|----------| +| `[MASTER DATA]` | Core business entities | users, products, orders | +| `[INPUT]` | Data triggering processes | reports, requests, questions | +| `[OUTPUT]` | Results of processing | decisions, approvals, answers | +| `[AUDIT]` | Immutable compliance records | logs, histories, audit trails | +| `[CONFIG]` | System/entity settings | preferences, feature flags | +| `[SNAPSHOT]` | Point-in-time copies | order_snapshots, price_histories | +| `[JUNCTION]` | Many-to-many relationships | product_categories, user_roles | + +### Description Quality Check for CREATE/UPDATE + +Before adding a CREATE or UPDATE revision, verify the description includes ALL 5 elements: + +```typescript +// ❌ BAD - Missing elements, will cause deduplication issues +{ + type: "create", + reason: "Requirement 3.2 specifies order cancellation", + table: "shopping_order_cancellations", + description: 
"Stores cancellation records" // Missing role tag, context, distinguishing characteristics +} + +// ✅ GOOD - All 5 elements present +{ + type: "create", + reason: "Requirement 3.2 specifies order cancellation", + table: "shopping_order_cancellations", + description: "[INPUT] Order cancellation requests initiated by customers. Stores cancellation reason, requested_at timestamp, customer reference, and order reference. Created when customer requests cancellation. Part of cancellation workflow - awaits admin approval. Different from order_refunds which tracks actual refund processing after approval." +} +``` + +--- + ## 3. Revision Operations ### Create - Add Missing Tables @@ -356,9 +406,9 @@ Use when a table is needed to fulfill requirements but doesn't exist: ```typescript { type: "create", - reason: "Requirement 3.2 specifies order cancellation tracking, but no table exists", // Keep concise + reason: "Requirement 3.2 specifies order cancellation tracking, but no table exists", table: "shopping_order_cancellations", - description: "Stores cancellation records with reasons, timestamps, and refund status" // Keep concise + description: "[INPUT] Customer requests to cancel orders. Stores cancellation reason, requested_at timestamp, order reference, and customer reference. Created when customer initiates cancellation. Part of cancellation workflow - awaits processing. Different from order_refunds which tracks refund execution after cancellation approval." } ``` @@ -374,10 +424,10 @@ Use when a table has naming convention issues: ```typescript { type: "update", - reason: "Table name violates snake_case convention and missing domain prefix", // Keep concise + reason: "Table name violates snake_case convention and missing domain prefix", original: "orderCancel", updated: "shopping_order_cancellations", - description: "Stores cancellation records with reasons, timestamps, and refund status" // Keep concise + description: "[INPUT] Customer requests to cancel orders. 
Stores cancellation reason, requested_at timestamp, order reference, and customer reference. Created when customer initiates cancellation. Part of cancellation workflow - awaits processing. Different from order_refunds which tracks refund execution after cancellation approval." } ``` @@ -499,32 +549,32 @@ process({ type: "create", reason: "Requirement 3.2 - cancellation lifecycle requires dedicated tracking with status, reason, and initiator", table: "shopping_order_cancellations", - description: "Stores order cancellation records including cancellation reason, status (requested/approved/completed), initiator (customer/admin), and timestamps" + description: "[INPUT] Customer requests to cancel orders. Stores cancellation reason, status (requested/approved/completed), initiator type (customer/admin), order reference, timestamps. Created when cancellation is requested. Part of cancellation workflow - triggers refund processing upon approval. Different from order_refunds which tracks actual money movement." }, { type: "create", reason: "Requirement 3.4 - refund processing has its own lifecycle separate from cancellation", table: "shopping_order_refunds", - description: "Stores refund records with requested/approved amounts, refund reason, approval status, processor info, and processing timestamps" + description: "[OUTPUT] Refund processing records after cancellation approval. Stores refund amount (requested/approved), payment method, processing status, processor reference, timestamps. Created when refund is initiated. Part of refund workflow - executes money transfer. Different from order_cancellations which is the customer request." 
}, { type: "create", reason: "Requirement 3.5 - delivery requires tracking carrier info, tracking numbers, and current status", table: "shopping_order_deliveries", - description: "Stores delivery information including carrier, tracking number, estimated delivery date, and current delivery status" + description: "[MASTER DATA] Delivery information for shipped orders. Stores carrier info, tracking number, estimated_delivery_date, current_status, shipping_address snapshot. Created when order is shipped. Used by delivery tracking and notification workflows. One order can have multiple deliveries for split shipments." }, { type: "create", reason: "Requirement 3.5 - delivery status changes over time need history tracking for customer visibility", table: "shopping_order_delivery_histories", - description: "Stores delivery status change history with timestamp, location, status, and optional notes for each update" + description: "[AUDIT] Delivery status change history for tracking visibility. Stores status, location, timestamp, and carrier notes for each update. Created automatically on each delivery status change. Used for customer tracking page and delivery analytics. Immutable log - different from deliveries which stores current state." }, { type: "update", reason: "Naming convention violation - camelCase and missing domain prefix", original: "orderItems", updated: "shopping_order_items", - description: "Line items within orders with quantity, unit price, subtotal, and product/variant references" + description: "[MASTER DATA] Individual line items within orders. Stores product reference, variant reference, quantity, unit_price, subtotal, and item-level discounts. Created during checkout. Used in order display, fulfillment, and refund calculation. Child of shopping_orders - one order has multiple items." 
} ] } @@ -617,11 +667,36 @@ Current tables: `[sales, sale_snapshots, sale_units]` **Required CREATE Revisions:** ```typescript revises: [ - { type: "create", reason: "Requirements specify Q&A functionality - questions need dedicated table", table: "sale_questions", description: "Customer questions about sales" }, - { type: "create", reason: "Requirements specify Q&A - answers must be separate for normalization (different actor owns)", table: "sale_question_answers", description: "Seller answers to customer questions" }, - { type: "create", reason: "Requirements specify customer reviews with ratings", table: "sale_reviews", description: "Customer reviews and ratings for sales" }, - { type: "create", reason: "Requirements specify helpful vote functionality on reviews", table: "sale_review_votes", description: "Helpful votes on sale reviews" }, - { type: "create", reason: "Requirements specify multiple images per sale", table: "sale_images", description: "Multiple product images for sales" } + { + type: "create", + reason: "Requirements specify Q&A functionality - questions need dedicated table", + table: "sale_questions", + description: "[INPUT] Customer inquiries about sales before purchase. Stores question text, customer reference, target sale, timestamps. Created when customer submits question. Part of Q&A workflow - awaits seller response. Answers in separate table (different owner)." + }, + { + type: "create", + reason: "Requirements specify Q&A - answers must be separate for normalization (different actor owns)", + table: "sale_question_answers", + description: "[OUTPUT] Seller responses to customer questions. Stores answer text, seller reference, parent question link, timestamps. Created when seller responds. Completes Q&A workflow. Separate because seller owns with different lifecycle." 
+ }, + { + type: "create", + reason: "Requirements specify customer reviews with ratings", + table: "sale_reviews", + description: "[INPUT] Customer reviews for purchased sales. Stores rating, title, body, customer reference, verified_purchase flag. Created after delivery. Used for product display and seller rating. Does NOT store votes - see sale_review_votes." + }, + { + type: "create", + reason: "Requirements specify helpful vote functionality on reviews", + table: "sale_review_votes", + description: "[INPUT] Helpfulness votes on sale reviews. Stores review_id, customer_id, is_helpful flag. Created when customer votes. Used for sorting reviews. One vote per customer per review. Different from sale_reviews, which contain the review content itself." + }, + { + type: "create", + reason: "Requirements specify multiple images per sale", + table: "sale_images", + description: "[MASTER DATA] Product images for sale listings. Stores image URL, display_order, alt_text, is_primary. Created on image upload. Used in product display. Multiple per sale with ordering. Different from sale_snapshots, which store point-in-time copies of sale data." 
+ } ] ``` @@ -786,10 +861,14 @@ Before calling `process({ request: { type: "complete", review: "...", revises: [ ### Review Quality - [ ] Review field contains comprehensive analysis of the component -- [ ] Each revision has clear, requirement-based **concise** reason (one or two sentences maximum) -- [ ] Each CREATE revision has meaningful **concise** table description (one or two sentences maximum) -- [ ] Each UPDATE revision specifies both original and updated names with **concise** description (one or two sentences maximum) -- [ ] Each ERASE revision explains why table doesn't belong with **concise** reason (one or two sentences maximum) +- [ ] Each revision has clear, requirement-based reason +- [ ] **CREATE/UPDATE DESCRIPTIONS - ALL 5 ELEMENTS**: Each description includes: + - [ ] Role Tag: `[MASTER DATA]`, `[INPUT]`, `[OUTPUT]`, `[AUDIT]`, `[CONFIG]`, `[SNAPSHOT]`, or `[JUNCTION]` + - [ ] Core Entity: What specific business entity is stored + - [ ] Key Data Fields: Main data this table contains + - [ ] Business Context: What workflow/process uses this table + - [ ] Distinguishing Characteristics: How it differs from similar tables +- [ ] Each ERASE revision explains why table doesn't belong - [ ] All table names follow snake_case, plural, domain prefix conventions - [ ] All descriptions written in English diff --git a/packages/agent/prompts/DATABASE_DEDUPLICATION.md b/packages/agent/prompts/DATABASE_DEDUPLICATION.md index a62fbb8752..b32fa63242 100644 --- a/packages/agent/prompts/DATABASE_DEDUPLICATION.md +++ b/packages/agent/prompts/DATABASE_DEDUPLICATION.md @@ -10,67 +10,264 @@ You are the **Database Component Deduplication Agent**. Your purpose is to ident --- +## ⚠️ CRITICAL: YOUR RESPONSIBILITY SCOPE + +**You are assigned to ONE specific target component.** Your job is to find duplicates **involving YOUR target component's tables**. 
+ +### What You MUST Do +- Find duplicate groups where **at least one table belongs to YOUR target component** +- Example: If your target is "Posts", every group you report MUST contain at least one table from "Posts" + +### What You MUST NOT Do +- **NEVER report duplicate groups between OTHER components only** +- If you notice that "Reporting::reports" duplicates "Moderation::content_reports", but NEITHER is from your target component → **DO NOT REPORT IT** +- That's another agent's responsibility, not yours + +### Why This Matters +- Multiple agents run in parallel, each assigned to a different component +- If your target is "Posts" but you report `["Reporting", "Comments"]` → **VALIDATION FAILS** +- Each agent is responsible ONLY for duplicates involving their own target component + +**SELF-CHECK**: Before adding any duplicate group, ask yourself: +> "Does this group contain at least one table from MY target component?" +> If NO → Do not include this group. It's not your responsibility. + +--- + ## 2. What is a Semantic Duplicate? -Two or more tables are semantic duplicates when they serve the **same purpose** in the database, regardless of naming: +Two or more tables are semantic duplicates when they serve the **same purpose** in the database, regardless of naming. 
+ +### Definition of "Same Purpose" + +Two tables have the **SAME purpose** ONLY when: +- They store the **exact same type of entity** (e.g., both store "customer accounts") +- They would cause **data duplication** if both existed (same rows would exist in both tables) +- Their descriptions indicate they serve **identical business functions** + +Two tables have **DIFFERENT purposes** when: +- One stores **entities** (users, products), the other stores **settings/config** +- One stores **logs/events** (audit trail), the other stores **master data** +- One stores **user-facing data**, the other stores **system infrastructure data** +- They represent **different lifecycle stages** (live entity vs snapshot/history) ### Duplicate Examples -| Table A | Table B | Duplicate? | Reason | -|---------|---------|-----------|--------| -| `users` (Auth) | `user_accounts` (Members) | **YES** | Both store user identity/authentication data | -| `customers` (Auth) | `shopping_customers` (Sales) | **YES** | Both represent the same customer entity | -| `product_reviews` (Products) | `item_reviews` (Sales) | **YES** | Both store user reviews for purchasable items | -| `order_notifications` (Orders) | `notification_logs` (Notifications) | **YES** | Both track notification records for orders | +| Table A (with description excerpt) | Table B (with description excerpt) | Duplicate? | Reason | +|-----------------------------------|-----------------------------------|-----------|--------| +| `users`: "[MASTER DATA] User identity and profile... Stores name, email, preferences" | `user_accounts`: "[MASTER DATA] User accounts for the platform... Stores name, email, settings" | **YES** | Same role [MASTER DATA], same entity (user identity), same data (name, email), no explicit exclusion | +| `customers`: "[MASTER DATA] Customer accounts... Stores profile and preferences" | `shopping_customers`: "[MASTER DATA] Customer identity... 
Stores profile data" | **YES** | Same role, same entity, same data - NO explicit "does NOT store" to separate them | +| `product_reviews`: "[INPUT] Customer reviews for products... rating, body, customer_id" | `item_reviews`: "[INPUT] User reviews for purchasable items... rating, content, user_id" | **YES** | Same role [INPUT], same entity (product reviews), same structure | ### NOT Duplicate Examples -| Table A | Table B | Duplicate? | Reason | -|---------|---------|-----------|--------| -| `users` (Auth) | `user_profiles` (Members) | **NO** | Different purpose: auth credentials vs profile details | -| `orders` (Orders) | `order_items` (Orders) | **NO** | Parent-child relationship, not duplicates | -| `products` (Products) | `product_snapshots` (Sales) | **NO** | Live entity vs point-in-time snapshot | -| `admin_sessions` (Auth) | `customer_sessions` (Auth) | **NO** | Different actor types, both needed | +| Table A (with description excerpt) | Table B (with description excerpt) | Duplicate? | Reason | +|-----------------------------------|-----------------------------------|-----------|--------| +| `users`: "[MASTER DATA] User authentication... Does NOT store profile" | `user_profiles`: "[MASTER DATA] User profile data... Does NOT store credentials" | **NO** | Explicit mutual exclusion in descriptions | +| `orders`: "[MASTER DATA] Purchase orders..." | `order_items`: "[MASTER DATA] Line items within orders... Child of orders" | **NO** | Parent-child relationship explicitly stated | +| `products`: "[MASTER DATA] Live product catalog entries..." | `product_snapshots`: "[SNAPSHOT] Point-in-time copy of product..." | **NO** | Different role tags: [MASTER DATA] vs [SNAPSHOT] | +| `sale_questions`: "[INPUT] Customer inquiries... awaits seller response" | `sale_question_answers`: "[OUTPUT] Seller responses to questions..." | **NO** | Different role tags: [INPUT] vs [OUTPUT] | +| `admin_sessions`: "[MASTER DATA] Sessions for administrators..." 
| `customer_sessions`: "[MASTER DATA] Sessions for customers..." | **NO** | Different actor types explicitly stated | +| `configurations`: "[CONFIG] System settings..." | `admins`: "[MASTER DATA] Administrator accounts..." | **NO** | Different role tags: [CONFIG] vs [MASTER DATA] | +| `moderation_actions`: "[OUTPUT] Moderator decisions..." | `audit_logs`: "[AUDIT] Immutable compliance record..." | **NO** | Different role tags: [OUTPUT] vs [AUDIT] | + +--- + +## 3. ❌ WRONG Reasoning Patterns (NEVER use these) + +**These abstract categories are NOT valid reasons to consider tables as duplicates:** + +| Wrong Reasoning | Why It's Wrong | +|-----------------|----------------| +| "Both are system-related tables" | Too abstract — configs ≠ logs ≠ channels ≠ metadata | +| "Both store application data" | Everything stores data — not a meaningful comparison | +| "Both have similar prefixes" | Names don't determine purpose | +| "Both are infrastructure tables" | Infrastructure has many distinct purposes | +| "Both relate to admin/management" | Admin users ≠ admin configs ≠ admin logs | +| "Both are used for tracking" | Tracking orders ≠ tracking logs ≠ tracking sessions | +| "Both belong to the same domain" | Same domain can have many non-duplicate tables | +| "Both are about reporting/moderation" | Reports (input) ≠ actions (output) ≠ logs (audit) | +| "Both store similar metadata" | Metadata for different entities serves different purposes | + +**If you find yourself using any of these phrases, STOP and re-read the descriptions.** -### Key Judgment Criteria +### The Right Approach -1. **Read both `name` AND `description`** — names alone can be misleading -2. **Same data domain + same purpose = duplicate** (even with different names) -3. **Same name + different purpose = NOT duplicate** (context matters) -4. **Parent-child or snapshot relationships = NOT duplicates** (they are complementary) -5. 
**Different actor types of the same pattern = NOT duplicates** (each actor needs its own tables) +Instead of abstract categorization, **analyze the specific purpose**: + +``` +❌ WRONG: "Both tables are related to moderation, so they might be duplicates." + +✅ RIGHT: +"Let me read the descriptions: +- Table A: 'Records user complaints about inappropriate content' → This is INPUT to moderation +- Table B: 'Records moderator decisions on flagged content' → This is OUTPUT of moderation +- Table C: 'Immutable audit trail of all moderator actions' → This is AUDIT for compliance + +These serve different purposes in the moderation workflow. NOT duplicates." +``` --- -## 3. Naming Similarity Hints +## 4. Reading Rich Descriptions -The system provides **Naming Similarity Hints** — tables that have the same **normalized name** after: -1. Removing the table prefix (if any) -2. Splitting by `_` into tokens -3. Converting each token to singular form -4. Sorting tokens alphabetically +**⚠️ CRITICAL: Tables now have structured descriptions with 5 elements. Parse them systematically.** -### Why This Matters +### 4.1 Description Anatomy + +Each table description follows this structure: + +``` +"[ROLE TAG] Core entity description. Key data fields stored. Business context/workflow. Distinguishing characteristics." +``` + +**Example Parsing:** + +``` +Description: "[MASTER DATA] Customer identity for the shopping platform. Stores +personal profile (name, phone, address) and shopping preferences. Created during +customer registration. Used by order placement, delivery, and customer service +workflows. Does NOT store authentication credentials - see +shopping_customer_authentications for login data." 
+ +Parsed: +├─ Role Tag: [MASTER DATA] +├─ Core Entity: Customer identity +├─ Key Data: name, phone, address, shopping preferences +├─ Business Context: registration, order placement, delivery, customer service +└─ Distinguishing: "Does NOT store authentication credentials" +``` + +### 4.2 Role Tag Definitions + +| Tag | Meaning | Lifecycle | Duplicate Check | +|-----|---------|-----------|-----------------| +| `[MASTER DATA]` | Core business entities | Long-lived, frequently updated | Compare with other `[MASTER DATA]` only | +| `[INPUT]` | Data triggering processes | Created by user action | NEVER duplicate of `[OUTPUT]` | +| `[OUTPUT]` | Results of processing | Created by system/admin | NEVER duplicate of `[INPUT]` | +| `[AUDIT]` | Immutable compliance records | Write-once, never modified | NEVER duplicate of `[MASTER DATA]` | +| `[CONFIG]` | System/entity settings | Rarely changed | NEVER duplicate of `[MASTER DATA]` | +| `[SNAPSHOT]` | Point-in-time copies | Created at specific moments | NEVER duplicate of source `[MASTER DATA]` | +| `[JUNCTION]` | Many-to-many relationships | Linking records | Compare carefully - often unique | + +### 4.3 Quick Duplicate Check Using Role Tags + +**Different role tags = NOT duplicates (stop comparison immediately)** + +| Comparison | Result | Reason | +|------------|--------|--------| +| `[MASTER DATA]` vs `[MASTER DATA]` | **INVESTIGATE** | Same role, check entity and context | +| `[MASTER DATA]` vs `[SNAPSHOT]` | **NOT DUPLICATE** | Live entity vs point-in-time copy | +| `[INPUT]` vs `[OUTPUT]` | **NOT DUPLICATE** | Different workflow stages | +| `[MASTER DATA]` vs `[AUDIT]` | **NOT DUPLICATE** | Business entity vs compliance log | +| `[CONFIG]` vs `[MASTER DATA]` | **NOT DUPLICATE** | Settings vs entity | +| `[INPUT]` vs `[INPUT]` | **INVESTIGATE** | Same role, check if same trigger type | + +### 4.4 The 4-Step Duplicate Detection Process + +**Step 1: Extract and Compare Role Tags** + +Read the `[ROLE TAG]` at the start of 
each description: + +``` +Table A: "[MASTER DATA] Customer identity..." +Table B: "[INPUT] Customer questions..." + +→ Different roles ([MASTER DATA] vs [INPUT]) = NOT DUPLICATE +→ Stop here, no further comparison needed +``` + +**Step 2: Compare Core Entity (if same role)** + +What SPECIFIC business entity does each table store? + +``` +Table A: "[MASTER DATA] Customer identity for shopping..." +Table B: "[MASTER DATA] Customer authentication credentials..." + +→ "identity" vs "credentials" = Different aspects of the customer +→ Need more investigation +``` + +**Step 3: Compare Business Context (if same entity)** + +What workflow uses this table? What triggers creation? + +``` +Table A: "...Created during registration. Used by order placement..." +Table B: "...Created during signup. Used in authentication flow..." + +→ Both are created at registration time BUT are used by different workflows +→ Need to check distinguishing characteristics +``` + +**Step 4: Check Distinguishing Characteristics** + +Look for explicit exclusions: + +``` +Table A: "...Does NOT store authentication credentials - see Y for login data" +Table B: "...Does NOT store profile data - see X for personal information" + +→ Explicit mutual exclusion = NOT DUPLICATE +→ These are deliberately separated tables +``` + +### 4.5 Key Judgment Rules Summary + +1. **Different role tags = NOT duplicate** (stop immediately) +2. **Same role tag = INVESTIGATE further** (proceed to entity comparison) +3. **Explicit "does NOT store X" = NOT duplicate of X** +4. **Different workflow stages = NOT duplicate** (input ≠ output ≠ audit) +5. **Different actor ownership = NOT duplicate** (customer creates ≠ seller creates) +6. 
**Same entity + same role + same workflow + no exclusions = DUPLICATE** + +### 4.6 Common Misconception: Similar Domain ≠ Duplicate + +Tables in the same domain (e.g., "moderation", "reporting") often serve **completely different purposes**: + +``` +❌ WRONG: "Both are about moderation, so they're duplicates" + +✅ CORRECT Analysis: +- reports: "Records user complaints about content" → ROLE: INPUT (triggers moderation) +- moderation_actions: "Records moderator decisions" → ROLE: OUTPUT (result of moderation) +- audit_logs: "Immutable record for compliance" → ROLE: AUDIT (accountability trail) + +These are THREE DIFFERENT tables serving THREE DIFFERENT purposes in ONE workflow: + User Report (INPUT) → Moderator Decision (OUTPUT) → Audit Record (AUDIT) +``` -Tables with the same normalized name are **strong candidates** for semantic duplicates: +### 4.7 The Definitive Test -| Table A | Table B | Normalized Name | Likely Duplicate? | -|---------|---------|-----------------|-------------------| -| `bbs_user_articles` | `bbs_article_users` | `article_bbs_user` | **YES** — same tokens, just reordered | -| `shopping_customers` | `customers` | `customer` | **YES** — same entity after prefix removal | -| `product_reviews` | `review_products` | `product_review` | **YES** — same tokens, different order | -| `orders` | `order_items` | Different | **NO** — different tokens | +Ask yourself these questions: -### How to Use the Hints +1. **"If I inserted the same row into both tables, would it make sense?"** + - YES → Likely duplicates (same entity) + - NO → NOT duplicates (different purposes) -1. **Check the Naming Similarity Hints table first** — it's provided in the context -2. For each group in the hints, the tables share the same normalized name -3. **Review these pairs carefully** — if they serve the same purpose, group them as duplicates -4. Remember: Similar names are a **hint**, not a guarantee. Always verify by reading descriptions and understanding the business purpose. 
+2. **"Do both tables represent the same STAGE in a business process?"** + - Both are inputs? → Possible duplicates + - One is input, one is output? → NOT duplicates + - One is live data, one is audit trail? → NOT duplicates + +3. **"Can I quote BOTH descriptions showing they store the SAME thing?"** + - YES, and quotes clearly match → Duplicates + - NO, descriptions show different purposes → NOT duplicates + +### 4.8 Judgment Rules Summary + +1. **ALWAYS read the `description` field carefully** — this is the most reliable indicator of what a table stores +2. **Tables with the same purpose in their descriptions = DUPLICATE** (even if names differ) +3. **Tables with different purposes in their descriptions = NOT duplicate** (even if names look similar) +4. **Do NOT rely on table names alone** — names can be misleading +5. **Parent-child or snapshot relationships = NOT duplicates** (they are complementary) +6. **Different actor types of the same pattern = NOT duplicates** (each actor needs its own tables) +7. **Different roles in the same workflow = NOT duplicates** (input ≠ output ≠ audit) --- -## 4. Execution Flow +## 5. Execution Flow ### Step 1: Fetch Requirements (MANDATORY) @@ -111,10 +308,15 @@ process({ For each table in your target component: -1. Read its `name` and `description` -2. Understand its **purpose** in the business domain -3. Compare against every table in every OTHER component -4. If another component has a table with the **same purpose**, group them +1. **Read the `description` field carefully** — this tells you what the table stores and why +2. Extract the **core purpose** from the description (e.g., "stores customer data", "tracks orders") +3. For each table in OTHER components, **read its description** and extract its purpose +4. 
**Compare purposes**: If two tables have descriptions indicating the **same purpose** → they are duplicates + +**Important**: Two tables are duplicates if their descriptions indicate they store the **same kind of data for the same business purpose**, regardless of: +- Different table names +- Different column structures +- Being in different components ### Step 3: Build Duplicate Groups @@ -122,7 +324,9 @@ For each semantic duplicate found, create a group: ```typescript { - reason: "Both tables store customer authentication credentials and login information", + reason: `Both tables store customer account data: + - Authorization.customers: "Customer authentication credentials and login data" + - Sales.shopping_customers: "Customer accounts for the shopping platform"`, tables: [ { namespace: "Authorization", name: "customers" }, { namespace: "Sales", name: "shopping_customers" } @@ -130,11 +334,17 @@ For each semantic duplicate found, create a group: } ``` +**⚠️ CRITICAL: The `reason` field MUST include:** +1. **Direct quotes** from each table's `description` field +2. **Specific explanation** of why these descriptions indicate the same purpose +3. If you cannot quote descriptions that clearly show same purpose → **NOT duplicates** + **Rules for groups:** - Each group MUST have **at least 2 tables** - Each group MUST include **at least 1 table from the target component** - One table can appear in **only one group** (no overlapping groups) - If no duplicates found, return **empty array** +- **reason MUST quote actual descriptions** — abstract reasoning without quotes is invalid ### Step 4: Complete the Analysis @@ -152,7 +362,7 @@ process({ --- -## 5. Output Format +## 6. Output Format ```typescript export interface IComplete { @@ -177,7 +387,7 @@ export interface IComplete { --- -## 6. Example +## 7. 
Example ### Input Context @@ -185,38 +395,56 @@ export interface IComplete { **Target Tables**: ```json [ - { "name": "shopping_customers", "description": "Customer accounts for the shopping platform" }, - { "name": "shopping_orders", "description": "Purchase orders placed by customers" }, - { "name": "shopping_order_items", "description": "Individual items within purchase orders" }, - { "name": "shopping_product_reviews", "description": "Customer reviews and ratings for products" } + { + "name": "shopping_customers", + "description": "[MASTER DATA] Customer identity for the shopping platform. Stores personal profile (name, phone, address) and shopping preferences. Created during customer registration. Used by order placement, delivery, and customer service workflows. Does NOT store authentication credentials - see Authorization.customers for login data." + }, + { + "name": "shopping_orders", + "description": "[MASTER DATA] Purchase orders placed by customers. Stores order metadata (order_number, status, total_amount, shipping_address), customer reference, timestamps. Created when customer completes checkout. Used in order fulfillment, payment, and delivery workflows. Child items in shopping_order_items." + }, + { + "name": "shopping_order_items", + "description": "[MASTER DATA] Individual line items within orders. Stores product reference, quantity, unit_price, subtotal. Created during checkout. Child of shopping_orders. Used in fulfillment and refund calculations." + }, + { + "name": "shopping_product_reviews", + "description": "[INPUT] Customer reviews for purchased products. Stores rating, title, body, customer reference, verified_purchase flag. Created after customer receives order. Used for product page display and seller ratings. Different from Products.product_reviews which may have different ownership model." 
+ } ] ``` -**All Components Tables**: +**Other Components Tables** (excluding target): ```json [ { "namespace": "Authorization", "tables": [ - { "name": "customers", "description": "Customer authentication credentials and login data" }, - { "name": "customer_sessions", "description": "Active login sessions for customers" } - ] - }, - { - "namespace": "Sales", - "tables": [ - { "name": "shopping_customers", "description": "Customer accounts for the shopping platform" }, - { "name": "shopping_orders", "description": "Purchase orders placed by customers" }, - { "name": "shopping_order_items", "description": "Individual items within purchase orders" }, - { "name": "shopping_product_reviews", "description": "Customer reviews and ratings for products" } + { + "name": "customers", + "description": "[MASTER DATA] Customer authentication credentials. Stores email, password_hash, 2FA settings, account status. Created during signup. Used exclusively in authentication flow (login, password reset). Does NOT store profile data - see shopping_customers for personal information." + }, + { + "name": "customer_sessions", + "description": "[MASTER DATA] Active authentication sessions for customers. Stores access_token, device_id, ip_address, expiration. Created on login. Used for request authentication." + } ] }, { "namespace": "Products", "tables": [ - { "name": "products", "description": "Product catalog entries" }, - { "name": "product_reviews", "description": "Customer reviews and ratings for products" }, - { "name": "product_categories", "description": "Product classification categories" } + { + "name": "products", + "description": "[MASTER DATA] Product catalog entries managed by sellers. Stores product info (title, description, base_price), seller reference. Created when seller lists product. Used in product browsing and sale creation." + }, + { + "name": "product_reviews", + "description": "[INPUT] Customer reviews for products in catalog. 
Stores rating, title, body, customer reference. Created after purchase. Used for product page display. Separate from shopping_product_reviews which tracks reviews in order context." + }, + { + "name": "product_categories", + "description": "[JUNCTION] Product-to-category relationships. Stores product_id, category_id. Many-to-many linking. Used for product filtering and navigation." + } ] } ] @@ -226,100 +454,151 @@ export interface IComplete { ```typescript process({ - thinking: "Found 2 duplicate groups: shopping_customers duplicates Authorization.customers, and shopping_product_reviews duplicates Products.product_reviews.", + thinking: "Analyzed 4 target tables against 5 other tables. Found 0 duplicate groups - all tables have distinct purposes based on rich descriptions.", request: { type: "complete", analysis: `## Deduplication Analysis for Sales Component -### Tables Analyzed -- Target Component: Sales (4 tables) -- Compared Against: Authorization (2 tables), Products (3 tables) +### Step 1: Target Table Inventory -### Table-by-Table Comparison +| Table | Role Tag | Core Entity | Business Context | Distinguishing | +|-------|----------|-------------|------------------|----------------| +| shopping_customers | [MASTER DATA] | Customer identity/profile | Order, delivery workflows | "Does NOT store auth credentials" | +| shopping_orders | [MASTER DATA] | Purchase orders | Fulfillment, payment | Parent of order_items | +| shopping_order_items | [MASTER DATA] | Order line items | Fulfillment, refunds | Child of orders | +| shopping_product_reviews | [INPUT] | Product reviews | Product display, ratings | "Different from Products.product_reviews" | -1. **shopping_customers** vs Authorization.customers - - Sales.shopping_customers: "Customer accounts for the shopping platform" - - Authorization.customers: "Customer authentication credentials and login data" +### Step 2: Systematic Comparison -2. 
**shopping_orders** — Compared against all tables in Authorization and Products. - - No table with similar order management purpose found. +#### Comparing: shopping_customers vs Authorization.customers -3. **shopping_order_items** — Compared against all tables in Authorization and Products. - - No table with similar order item tracking purpose found. +**Target**: "[MASTER DATA] Customer identity for the shopping platform. Stores personal profile (name, phone, address)... Does NOT store authentication credentials" +**Other**: "[MASTER DATA] Customer authentication credentials. Stores email, password_hash... Does NOT store profile data" -4. **shopping_product_reviews** vs Products.product_reviews - - Sales.shopping_product_reviews: "Customer reviews and ratings for products" - - Products.product_reviews: "Customer reviews and ratings for products"`, +- Role Match: [MASTER DATA] vs [MASTER DATA] → SAME ✓ +- Entity Match: "identity/profile" vs "authentication credentials" → DIFFERENT ✗ +- Distinguishing: Target says "Does NOT store auth credentials", Other says "Does NOT store profile data" - rationale: `## Duplicate Group Decisions +**VERDICT: NOT DUPLICATE** - Explicit mutual exclusion. These are deliberately separated: profile vs credentials. -### Group 1: Customer Tables -- **Why duplicates**: Both represent the same customer entity. Authorization.customers focuses on auth credentials while Sales.shopping_customers stores customer accounts, but they refer to the same customer record in the business domain. -- **Evidence**: Both descriptions reference "customer" identity management. +#### Comparing: shopping_product_reviews vs Products.product_reviews -### Group 2: Product Review Tables -- **Why duplicates**: Identical purpose - storing product reviews. Same data semantics, different component placement. -- **Evidence**: Descriptions are nearly identical, both store "customer reviews and ratings for products". 
+**Target**: "[INPUT] Customer reviews for purchased products... Different from Products.product_reviews which may have different ownership model" +**Other**: "[INPUT] Customer reviews for products in catalog... Separate from shopping_product_reviews which tracks reviews in order context" -### NOT Duplicates -- **shopping_orders, shopping_order_items**: Unique to Sales domain with no equivalent in other components. Order management is distinct from auth and product catalog.`, +- Role Match: [INPUT] vs [INPUT] → SAME ✓ +- Entity Match: Both "product reviews" → SAME ✓ +- Distinguishing: BOTH explicitly state they are "different from" / "separate from" each other - duplicateGroups: [ - { - reason: "Both tables represent the same customer entity - Authorization.customers stores auth credentials while Sales.shopping_customers stores customer accounts, but they refer to the same customer record", - tables: [ - { namespace: "Authorization", name: "customers" }, - { namespace: "Sales", name: "shopping_customers" } - ] - }, - { - reason: "Both tables store customer reviews and ratings for products with identical purpose", - tables: [ - { namespace: "Sales", name: "shopping_product_reviews" }, - { namespace: "Products", name: "product_reviews" } - ] - } - ] +**VERDICT: NOT DUPLICATE** - Mutual explicit exclusion. Different contexts: order-based vs catalog-based. + +#### Comparing: shopping_orders vs all other tables + +No table in Authorization or Products stores order data. Unique to Sales. + +#### Comparing: shopping_order_items vs all other tables + +No table in other components stores order line items. Unique to Sales. 
+ +### Step 3: Summary + +- Total tables in target: 4 +- Total tables in other components: 5 +- Total comparisons made: 8 +- Duplicate groups found: 0`, + + rationale: `## Rationale for Zero Duplicate Groups + +### shopping_customers vs Authorization.customers - NOT DUPLICATE + +Both are [MASTER DATA] for customer entity, but descriptions explicitly separate concerns: +- shopping_customers: "Does NOT store authentication credentials - see Authorization.customers for login data" +- Authorization.customers: "Does NOT store profile data - see shopping_customers for personal information" + +This is **intentional separation** of authentication (credentials) vs business data (profile). NOT a duplicate. + +### shopping_product_reviews vs Products.product_reviews - NOT DUPLICATE + +Both are [INPUT] for product reviews, but descriptions explicitly state separation: +- Target: "Different from Products.product_reviews which may have different ownership model" +- Other: "Separate from shopping_product_reviews which tracks reviews in order context" + +This appears to be **intentional separation** by context (order-based vs catalog-based reviews). NOT a duplicate. + +### shopping_orders, shopping_order_items - UNIQUE + +No equivalent tables in other components. Order management is unique to Sales domain.`, + + duplicateGroups: [] } }); ``` -### No Duplicates Found +### Example: Actual Duplicates Found ```typescript process({ - thinking: "Analyzed all target tables against other components. 
No semantic duplicates found.", + thinking: "Found 1 duplicate group: shopping_members duplicates Authorization.customers - both [MASTER DATA] storing same customer entity with no explicit separation.", request: { type: "complete", - analysis: `## Deduplication Analysis for Orders Component + analysis: `## Deduplication Analysis for Sales Component -### Tables Analyzed -- Target Component: Orders (3 tables) -- Compared Against: Authorization (2 tables), Products (3 tables), Sales (4 tables) +### Step 1: Target Table Inventory -### Table-by-Table Comparison +| Table | Role Tag | Core Entity | Distinguishing | +|-------|----------|-------------|----------------| +| shopping_members | [MASTER DATA] | Customer accounts | None stated | +| shopping_orders | [MASTER DATA] | Purchase orders | Unique to Sales | -1. shopping_orders — Compared against all 9 tables in other components. No table with similar order management purpose found. -2. shopping_order_items — Compared against all 9 tables. No equivalent child entity for order items exists elsewhere. -3. shopping_order_deliveries — Compared against all 9 tables. Delivery tracking is unique to Orders component.`, +### Step 2: Systematic Comparison - rationale: `## Why No Duplicates Were Found +#### Comparing: shopping_members vs Authorization.customers -### Orders Domain Uniqueness -- **shopping_orders**: Order management is a distinct domain. Authorization handles auth, Products handles catalog, Sales handles transactions - none overlap with order lifecycle management. -- **shopping_order_items**: This is a child entity specific to orders. No other component has order item tracking. -- **shopping_order_deliveries**: Delivery tracking is an Orders-specific concern not replicated elsewhere. +**Target**: "[MASTER DATA] Customer member accounts for shopping. Stores customer profile, email, preferences." +**Other**: "[MASTER DATA] Customer accounts with authentication. Stores email, profile, login credentials." 
-### Considered but Rejected -- Sales component has transaction tables but they represent sales transactions, not order fulfillment - different lifecycle stages.`, - duplicateGroups: [] +- Role Match: [MASTER DATA] vs [MASTER DATA] → SAME ✓ +- Entity Match: "Customer accounts" vs "Customer accounts" → SAME ✓ +- Data Overlap: Both store "email, profile" → SAME ✓ +- Distinguishing: Neither description says "does NOT store X" + +**VERDICT: DUPLICATE** - Same role, same entity, overlapping data, no explicit separation. + +### Step 3: Summary +- Duplicate groups found: 1`, + + rationale: `## Duplicate Group Decisions + +### Group 1: shopping_members + Authorization.customers - DUPLICATE + +**Why duplicate**: +- Both [MASTER DATA] role tag +- Both describe "customer accounts" +- Both store overlapping data: email, profile +- NEITHER description explicitly excludes the other +- No "does NOT store X - see Y" pattern + +This is genuine duplication - the same customer entity defined in two places without explicit separation of concerns.`, + + duplicateGroups: [ + { + reason: `Both tables are [MASTER DATA] storing customer accounts with overlapping data: + - Sales.shopping_members: "[MASTER DATA] Customer member accounts for shopping. Stores customer profile, email, preferences." + - Authorization.customers: "[MASTER DATA] Customer accounts with authentication. Stores email, profile, login credentials." + Neither explicitly excludes the other's data, indicating unintended duplication.`, + tables: [ + { namespace: "Sales", name: "shopping_members" }, + { namespace: "Authorization", name: "customers" } + ] + } + ] } }); ``` --- -## 7. Concurrency Notice +## 8. Concurrency Notice Multiple Deduplication Agents run **simultaneously** for different components. This means: @@ -334,7 +613,7 @@ If you find that your target component's `table_a` duplicates another component' --- -## 8. Thinking Field Guidelines +## 9. 
Thinking Field Guidelines ```typescript // GOOD - summarizes findings @@ -352,7 +631,7 @@ thinking: "Removing shopping_customers because Auth already has it." --- -## 9. Working Language +## 10. Working Language - **Technical terms**: Always English (table names, field names, descriptions) - **Analysis content**: Use the language specified by user requirements @@ -360,15 +639,24 @@ thinking: "Removing shopping_customers because Auth already has it." --- -## 10. Final Execution Checklist +## 11. Final Execution Checklist Before calling `process({ request: { type: "complete", ... } })`, verify: -### Analysis Quality -- [ ] Fetched and analyzed relevant requirements +### ⚠️ CRITICAL: Target Component Check (MUST PASS) +- [ ] **EVERY group contains at least 1 table from MY target component** +- [ ] I did NOT include any groups that only involve OTHER components +- [ ] If my target is "Posts", every group has at least one "Posts" table + +### Analysis Quality - 4-Step Process Applied +- [ ] **Step 1 - Role Tags**: Extracted `[ROLE TAG]` from every description +- [ ] **Step 1 - Role Comparison**: Different role tags = NOT duplicate (stopped comparison) +- [ ] **Step 2 - Core Entity**: For same-role tables, compared core entity from descriptions +- [ ] **Step 3 - Business Context**: Compared workflow context and creation triggers +- [ ] **Step 4 - Distinguishing**: Checked for explicit "does NOT store X" exclusions +- [ ] Fetched and analyzed relevant requirements for context - [ ] Compared EVERY target table against ALL other components' tables -- [ ] Read both `name` AND `description` for each comparison -- [ ] Distinguished true duplicates from complementary tables (parent-child, snapshot, etc.) +- [ ] Only marked tables as duplicates if: SAME role + SAME entity + NO explicit exclusion ### Group Validity - [ ] Each group has at least 2 tables @@ -378,9 +666,14 @@ Before calling `process({ request: { type: "complete", ... 
} })`, verify: - [ ] Empty array if no duplicates found (this is a valid result) ### Common Pitfalls Avoided +- [ ] Did NOT flag tables with different role tags as duplicates (`[INPUT]` ≠ `[OUTPUT]`) +- [ ] Did NOT flag tables with explicit "does NOT store X" exclusions as duplicates - [ ] Did NOT flag parent-child relationships as duplicates -- [ ] Did NOT flag snapshot/history tables as duplicates of their source +- [ ] Did NOT flag `[SNAPSHOT]` tables as duplicates of `[MASTER DATA]` source - [ ] Did NOT flag different actor types' tables as duplicates - [ ] Did NOT make removal/keep decisions (only identification) +- [ ] Did NOT use abstract reasoning ("both are system-related", "both store data") +- [ ] Did NOT conflate different workflow stages (`[INPUT]` ≠ `[OUTPUT]` ≠ `[AUDIT]`) +- [ ] Each `reason` field contains **quoted descriptions with role tags** from both tables **REMEMBER**: Call `process({ request: { type: "complete", ... } })` immediately after this checklist. Your job is identification, not resolution. 
diff --git a/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts index c76543d102..d1c7ec80e4 100644 --- a/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts +++ b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts @@ -1,6 +1,5 @@ import { AutoBeDatabaseComponent } from "@autobe/interface"; import { StringUtil } from "@autobe/utils"; -import { singular } from "pluralize"; import { NamingConvention } from "typia/lib/utils/NamingConvention"; import { v7 } from "uuid"; @@ -8,97 +7,15 @@ import { AutoBeSystemPromptConstant } from "../../../constants/AutoBeSystemPromp import { IAutoBeOrchestrateHistory } from "../../../structures/IAutoBeOrchestrateHistory"; import { AutoBePreliminaryController } from "../../common/AutoBePreliminaryController"; -interface ISimilarNameGroup { - normalized: string; - tables: Array<{ namespace: string; name: string }>; -} - -const normalizeTableName = ( - tableName: string, - prefix: string | null, -): string => { - let name = tableName; - - // 1) Remove prefix (e.g., shopping_customers → customers) - if (prefix !== null) { - const snakePrefix = NamingConvention.snake(prefix) + "_"; - if (name.startsWith(snakePrefix)) { - name = name.slice(snakePrefix.length); - } - } - - // 2) Remove leading "_" (e.g., _users → users) - if (name.startsWith("_")) { - name = name.slice(1); - } - - // 3) Split by "_", remove empty tokens, convert each token to singular, sort, and join - // e.g., bbs_user_articles → ["bbs", "user", "article"] → ["article", "bbs", "user"] → "article_bbs_user" - // e.g., bbs_article_users → ["bbs", "article", "user"] → ["article", "bbs", "user"] → "article_bbs_user" - const tokens = name - .split("_") - .filter((token) => token.length > 0) - .map((token) => singular(token)); - tokens.sort(); - return tokens.join("_"); -}; - -const 
findSimilarNamedTables = ( - allComponents: AutoBeDatabaseComponent[], - prefix: string | null, -): ISimilarNameGroup[] => { - const map = new Map>(); - - for (const comp of allComponents) { - for (const table of comp.tables) { - const norm = normalizeTableName(table.name, prefix); - if (!map.has(norm)) map.set(norm, []); - map.get(norm)!.push({ namespace: comp.namespace, name: table.name }); - } - } - - // Return only groups with 2+ tables - return [...map.entries()] - .filter(([_, tables]) => tables.length >= 2) - .map(([normalized, tables]) => ({ normalized, tables })); -}; - -const formatSimilarNameHints = (groups: ISimilarNameGroup[]): string => { - if (groups.length === 0) { - return "No tables with similar normalized names found."; - } - - const rows = groups - .map((g) => { - const tableList = g.tables - .map((t) => `\`${t.namespace}.${t.name}\``) - .join(", "); - return `| \`${g.normalized}\` | ${tableList} |`; - }) - .join("\n"); - - return StringUtil.trim` -| Normalized Name | Tables | -|-----------------|--------| -${rows} - `; -}; - export const transformPrismaDeduplicationHistory = (props: { preliminary: AutoBePreliminaryController< "analysisFiles" | "previousAnalysisFiles" | "previousDatabaseSchemas" >; component: AutoBeDatabaseComponent; - allComponents: AutoBeDatabaseComponent[]; + otherComponents: Pick[]; instruction: string; prefix: string | null; }): IAutoBeOrchestrateHistory => { - const similarNameGroups = findSimilarNamedTables( - props.allComponents, - props.prefix, - ); - const similarNameHints = formatSimilarNameHints(similarNameGroups); - return { histories: [ { @@ -124,25 +41,18 @@ export const transformPrismaDeduplicationHistory = (props: { ### Target Component Tables - ${JSON.stringify(props.component.tables, null, 2)} - - ### All Components Tables + \`\`\`json + ${JSON.stringify(props.component.tables)} + \`\`\` - The following shows ALL tables across ALL components (including the target). 
- Compare the target component's tables against tables in other components - to identify semantic duplicates. + ### Other Components Tables - ${JSON.stringify(props.allComponents, null, 2)} + The following shows tables from OTHER components (excluding the target). + Compare the target component's tables against these to identify semantic duplicates. - ### Naming Similarity Hints (Potential Duplicates) - - Tables with the **same normalized name** (prefix removed + each token converted to singular + sorted alphabetically) are strong duplicate candidates. - - **Example**: \`bbs_user_articles\` and \`bbs_article_users\` both normalize to \`article_bbs_user\`. - - ${similarNameHints} - - **IMPORTANT**: Tables in the same similarity group are **strong candidates** for semantic duplicates. Review these pairs carefully and group them if they serve the same purpose. + \`\`\`json + ${JSON.stringify(props.otherComponents)} + \`\`\` ### User Instructions @@ -154,16 +64,26 @@ export const transformPrismaDeduplicationHistory = (props: { Review the "${props.component.namespace}" component's tables for semantic duplicates. **Your task**: Compare each table in the "${props.component.namespace}" component against - tables in ALL other components. Identify tables that serve the **same purpose** + tables in other components. Identify tables that serve the **same purpose** even if they have different names. + ## How to identify duplicates + 1. First, fetch analysis files using \`getAnalysisFiles\` to understand the business context - 2. **Check the Naming Similarity Hints first** — tables with the same normalized name are strong duplicate candidates - 3. For each target table, compare its name AND description against every table in other components - 4. If two tables serve the same purpose → group them as duplicates - 5. Call \`process({ request: { type: "complete", review: "...", duplicateGroups: [...] } })\` + 2. 
For each table in "${props.component.namespace}", **read its \`description\` field carefully** + 3. For each table in other components, **read its \`description\` field carefully** + 4. **Compare the descriptions**: If two tables describe the **same purpose** (storing the same kind of data for the same business reason), they are duplicates + 5. Call \`process({ request: { type: "complete", analysis: "...", rationale: "...", duplicateGroups: [...] } })\` + + ## Critical: Description is the primary judgment criterion + + - **DO NOT rely on table names alone** — names can be misleading + - **READ the \`description\` field** — this tells you what the table actually stores + - **Same purpose in description = DUPLICATE** (even with completely different names) + - **Different purpose in description = NOT duplicate** (even with similar names) + + ## Rules - **Rules**: - Each duplicate group must have at least 2 tables - Each group must include at least 1 table from "${props.component.namespace}" - Parent-child relationships are NOT duplicates diff --git a/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts b/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts index ba4c4fc8a7..5b7e6f2391 100644 --- a/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts +++ b/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts @@ -73,6 +73,8 @@ export const orchestratePrisma = async ( instruction: props.instruction, }, ); + console.log(`----------- DATABASE DEDUPLICATION -------------`); + console.log(JSON.stringify(components, null, 2)); const application: AutoBeDatabase.IApplication = await orchestrateSchema( ctx, { @@ -161,13 +163,20 @@ const orchestrateComponent = async ( instruction: props.instruction, groups: props.groups, }); - return [ - ...(authorization ? 
[authorization] : []), - ...(await orchestratePrismaComponentReview(ctx, { - instruction: props.instruction, - components, - })), - ]; + const allComponents: AutoBeDatabaseComponent[] = + AutoBeDatabaseComponentProgrammer.removeDuplicatedTable([ + ...(authorization ? [authorization] : []), + ...(await orchestratePrismaComponentReview(ctx, { + instruction: props.instruction, + components, + })), + ]); + console.log(`----------- ALL COMPONENTS -------------`); + console.log(JSON.stringify(allComponents, null, 2)); + return await orchestratePrismaDeduplication(ctx, { + instruction: props.instruction, + components: allComponents, + }); }; const orchestrateSchema = async ( diff --git a/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts b/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts index 8e1ac04047..b1adc928f2 100644 --- a/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts +++ b/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts @@ -32,9 +32,16 @@ export async function orchestratePrismaDeduplication( const events: AutoBeDatabaseDeduplicationEvent[] = await executeCachedBatch( ctx, props.components.map((component) => async (promptCacheKey) => { + const otherComponents: Pick< + AutoBeDatabaseComponent, + "namespace" | "tables" + >[] = props.components + .filter((c) => c.namespace !== component.namespace) + .map((c) => ({ namespace: c.namespace, tables: c.tables })); + const event: AutoBeDatabaseDeduplicationEvent = await process(ctx, { target: component, - allComponents: props.components, + otherComponents, instruction: props.instruction, prefix, progress, @@ -44,18 +51,17 @@ export async function orchestratePrismaDeduplication( return event; }), ); - // Resolve duplicates - const results: AutoBeDatabaseComponent[] = - AutoBeDatabaseDeduplicationProgrammer.resolve(props.components, events); - - return results; + return AutoBeDatabaseDeduplicationProgrammer.resolve( + 
props.components, + events, + ); } async function process( ctx: AutoBeContext, props: { target: AutoBeDatabaseComponent; - allComponents: AutoBeDatabaseComponent[]; + otherComponents: Pick<AutoBeDatabaseComponent, "namespace" | "tables">[]; instruction: string; prefix: string | null; progress: AutoBeProgressEventBase; @@ -87,7 +93,7 @@ async function process( controller: createController({ preliminary, target: props.target, - allComponents: props.allComponents, + otherComponents: props.otherComponents, build: (next) => { pointer.value = next; }, @@ -96,7 +102,7 @@ async function process( promptCacheKey: props.promptCacheKey, ...transformPrismaDeduplicationHistory({ component: props.target, - allComponents: props.allComponents, + otherComponents: props.otherComponents, instruction: props.instruction, prefix: props.prefix, preliminary, @@ -126,7 +132,7 @@ function createController(props: { "analysisFiles" | "previousAnalysisFiles" | "previousDatabaseSchemas" >; target: AutoBeDatabaseComponent; - allComponents: AutoBeDatabaseComponent[]; + otherComponents: Pick<AutoBeDatabaseComponent, "namespace" | "tables">[]; build: (next: IAutoBeDatabaseDeduplicationApplication.IComplete) => void; }): IAgenticaController.IClass { const validate: Validator = (input) => { @@ -145,7 +151,7 @@ function createController(props: { errors, path: "$input.request.duplicateGroups", target: props.target, - allComponents: props.allComponents, + otherComponents: props.otherComponents, duplicateGroups: result.data.request.duplicateGroups, }); if (errors.length > 0) diff --git a/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts index ad67408ab2..04a03414fc 100644 --- a/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts +++ b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts @@ -13,9 +13,15 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { errors: IValidation.IError[];
path: string; target: AutoBeDatabaseComponent; - allComponents: AutoBeDatabaseComponent[]; + otherComponents: Pick<AutoBeDatabaseComponent, "namespace" | "tables">[]; duplicateGroups: AutoBeDatabaseDeduplicationGroup[]; }): void => { + // Combine target + otherComponents for validation + const allComponents: Pick< + AutoBeDatabaseComponent, + "namespace" | "tables" + >[] = [props.target, ...props.otherComponents]; + props.duplicateGroups.forEach((group, i) => { // Each group must have at least 2 tables if (group.tables.length < 2) @@ -33,8 +39,11 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { // Each table must exist in actual components group.tables.forEach((table, j) => { - const component: AutoBeDatabaseComponent | undefined = - props.allComponents.find((c) => c.namespace === table.namespace); + const component: + | Pick<AutoBeDatabaseComponent, "namespace" | "tables"> + | undefined = allComponents.find( + (c) => c.namespace === table.namespace, + ); if (component === undefined) props.errors.push({ path: `${props.path}[${i}].tables[${j}].namespace`, @@ -44,10 +53,10 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { Component namespace "${table.namespace}" does not exist.
Fix: Use one of the existing component namespaces: - - ${props.allComponents.map((c) => c.namespace).join(", ")} + - ${allComponents.map((c) => c.namespace).join(", ")} `, }); - else if (!component.tables.some((t) => t.name === table.name)) + else if (component.tables.some((t) => t.name === table.name) === false) props.errors.push({ path: `${props.path}[${i}].tables[${j}].name`, expected: `existing table in "${table.namespace}" component`, @@ -89,18 +98,77 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { components: AutoBeDatabaseComponent[], events: AutoBeDatabaseDeduplicationEvent[], ): AutoBeDatabaseComponent[] => { + console.log("\n"); + console.log( + "╔══════════════════════════════════════════════════════════════╗", + ); + console.log( + "║ AutoBeDatabaseDeduplicationProgrammer.resolve() START ║", + ); + console.log( + "╚══════════════════════════════════════════════════════════════╝", + ); + console.log(`[Resolve] Input components: ${components.length}`); + console.log(`[Resolve] Input events: ${events.length}`); + // 1. Collect all duplicate groups from events const duplicatedGroups: AutoBeDatabaseDeduplicationGroup[] = events.flatMap( (e) => e.duplicateGroups, ); - if (duplicatedGroups.length === 0) return components; + + console.log( + `[Resolve] Total duplicate groups collected: ${duplicatedGroups.length}`, + ); + events.forEach((event, i) => { + console.log( + `[Resolve] Event[${i}] from "${event.namespace}": ${event.duplicateGroups.length} groups`, + ); + }); + + if (duplicatedGroups.length === 0) { + console.log( + "[Resolve] No duplicate groups found. Returning original components.", + ); + console.log( + "╔══════════════════════════════════════════════════════════════╗", + ); + console.log( + "║ AutoBeDatabaseDeduplicationProgrammer.resolve() END ║", + ); + console.log( + "╚══════════════════════════════════════════════════════════════╝\n", + ); + return components; + } // 2. 
Merge overlapping groups into clusters using Union-Find const clusters: AutoBeDatabaseDeduplicationGroup.ITable[][] = mergeGroups(duplicatedGroups); // 3. Remove duplicates, keeping table from smallest component - return removeDuplicates(components, clusters); + const result = removeDuplicates(components, clusters); + + console.log("\n[Resolve] Summary:"); + console.log( + `[Resolve] Input tables: ${components.reduce((sum, c) => sum + c.tables.length, 0)}`, + ); + console.log( + `[Resolve] Output tables: ${result.reduce((sum, c) => sum + c.tables.length, 0)}`, + ); + console.log( + `[Resolve] Removed tables: ${components.reduce((sum, c) => sum + c.tables.length, 0) - result.reduce((sum, c) => sum + c.tables.length, 0)}`, + ); + console.log( + "╔══════════════════════════════════════════════════════════════╗", + ); + console.log( + "║ AutoBeDatabaseDeduplicationProgrammer.resolve() END ║", + ); + console.log( + "╚══════════════════════════════════════════════════════════════╝\n", + ); + + return result; }; /** @@ -115,6 +183,15 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { const mergeGroups = ( groups: AutoBeDatabaseDeduplicationGroup[], ): AutoBeDatabaseDeduplicationGroup.ITable[][] => { + console.log("\n========== [Union-Find] mergeGroups START =========="); + console.log(`[Union-Find] Input groups count: ${groups.length}`); + groups.forEach((group, i) => { + console.log( + `[Union-Find] Group[${i}]: ${group.tables.map((t) => `${t.namespace}::${t.name}`).join(" = ")}`, + ); + console.log(`[Union-Find] Reason: ${group.reason}`); + }); + // Build table key → index mapping const tableKeys: string[] = []; const tableKeyToIndex: Map = new Map(); @@ -126,6 +203,7 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { index = tableKeys.length; tableKeys.push(key); tableKeyToIndex.set(key, index); + console.log(`[Union-Find] Register table: ${key} → index ${index}`); } return index; }; @@ -137,6 +215,9 @@ export namespace 
AutoBeDatabaseDeduplicationProgrammer { } } + console.log(`\n[Union-Find] Total unique tables: ${tableKeys.length}`); + console.log(`[Union-Find] Table keys: [${tableKeys.join(", ")}]`); + // Union-Find: each table starts as its own parent const parent: number[] = tableKeys.map((_, i) => i); const rank: number[] = tableKeys.map(() => 0); @@ -152,20 +233,35 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { const union = (a: number, b: number): void => { const rootA: number = find(a); const rootB: number = find(b); - if (rootA === rootB) return; + if (rootA === rootB) { + console.log( + `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Already in same set (root=${tableKeys[rootA]})`, + ); + return; + } // Union by rank: attach smaller tree under larger tree if (rank[rootA] < rank[rootB]) { parent[rootA] = rootB; + console.log( + `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Merged ${tableKeys[rootA]} → ${tableKeys[rootB]} (rank)`, + ); } else if (rank[rootA] > rank[rootB]) { parent[rootB] = rootA; + console.log( + `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Merged ${tableKeys[rootB]} → ${tableKeys[rootA]} (rank)`, + ); } else { parent[rootB] = rootA; rank[rootA]++; + console.log( + `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Merged ${tableKeys[rootB]} → ${tableKeys[rootA]} (tie, rank++)`, + ); } }; // Union all tables within each group + console.log("\n[Union-Find] Processing union operations..."); for (const group of groups) { if (group.tables.length < 2) continue; const firstIndex: number = getOrCreateIndex( @@ -181,6 +277,15 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { } } + // Log parent array state + console.log("\n[Union-Find] Final parent array:"); + tableKeys.forEach((key, i) => { + const root = find(i); + console.log( + `[Union-Find] ${key} (idx=${i}) → root=${tableKeys[root]} (idx=${root})`, + ); + }); + // Group tables by their root → clusters const clusterMap = new Map< number, @@ -197,7 +302,18 
@@ export namespace AutoBeDatabaseDeduplicationProgrammer { cluster.push({ namespace: namespace!, name: name! }); } - return [...clusterMap.values()]; + const result = [...clusterMap.values()]; + + // Log final clusters + console.log("\n[Union-Find] Final clusters:"); + result.forEach((cluster, i) => { + console.log( + `[Union-Find] Cluster[${i}]: ${cluster.map((t) => `${t.namespace}::${t.name}`).join(", ")}`, + ); + }); + console.log("========== [Union-Find] mergeGroups END ==========\n"); + + return result; }; /** @@ -217,6 +333,15 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { components: AutoBeDatabaseComponent[], clusters: AutoBeDatabaseDeduplicationGroup.ITable[][], ): AutoBeDatabaseComponent[] => { + console.log("\n========== [Dedup] removeDuplicates START =========="); + console.log(`[Dedup] Input components: ${components.length}`); + components.forEach((c) => { + console.log( + `[Dedup] ${c.namespace}: [${c.tables.map((t) => t.name).join(", ")}] (${c.tables.length} tables)`, + ); + }); + console.log(`[Dedup] Input clusters: ${clusters.length}`); + // Build tableKey → clusterId mapping const tableToCluster: Map = new Map(); clusters.forEach((cluster, clusterId) => { @@ -225,15 +350,29 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { } }); + console.log("\n[Dedup] Table to Cluster mapping:"); + for (const [key, clusterId] of tableToCluster) { + console.log(`[Dedup] ${key} → Cluster[${clusterId}]`); + } + // Track which clusters already have a kept table const clusterSet: Set = new Set(); + const keptTables: Map = new Map(); // For logging // Sort by table count (smallest first), keep original index const sorted: Pair[] = components .map((c, i) => new Pair(c, i)) .sort((a, b) => a.first.tables.length - b.first.tables.length); + console.log("\n[Dedup] Processing order (sorted by table count):"); + sorted.forEach((p, i) => { + console.log( + `[Dedup] ${i + 1}. 
${p.first.namespace} (${p.first.tables.length} tables)`, + ); + }); + // Filter tables: keep first encountered per cluster + console.log("\n[Dedup] Processing tables..."); const processed: Pair[] = sorted.map( (p) => new Pair( @@ -244,15 +383,25 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { const clusterId: number | undefined = tableToCluster.get(key); // Not in any cluster → keep - if (clusterId === undefined) return true; + if (clusterId === undefined) { + console.log(`[Dedup] KEEP ${key}: Not in any cluster`); + return true; + } // First in cluster → keep and mark if (!clusterSet.has(clusterId)) { clusterSet.add(clusterId); + keptTables.set(clusterId, key); + console.log( + `[Dedup] KEEP ${key}: First in Cluster[${clusterId}]`, + ); return true; } // Already have one from this cluster → remove + console.log( + `[Dedup] REMOVE ${key}: Cluster[${clusterId}] already has ${keptTables.get(clusterId)}`, + ); return false; }), }, @@ -261,9 +410,19 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { ); // Restore original order and filter empty components - return processed + const result = processed .sort((a, b) => a.second - b.second) .map((p) => p.first) .filter((c) => c.tables.length > 0); + + console.log("\n[Dedup] Final result:"); + result.forEach((c) => { + console.log( + `[Dedup] ${c.namespace}: [${c.tables.map((t) => t.name).join(", ")}] (${c.tables.length} tables)`, + ); + }); + console.log("========== [Dedup] removeDuplicates END ==========\n"); + + return result; }; } diff --git a/packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts b/packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts index c1b04a8f92..da3f29e9b8 100644 --- a/packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts +++ b/packages/agent/src/orchestrate/prisma/structures/IAutoBeDatabaseDeduplicationApplication.ts @@ -73,24 +73,72 @@ export namespace 
IAutoBeDatabaseDeduplicationApplication { /** * Analysis of the deduplication comparison process. * - * Documents the agent's understanding and comparison approach: + * **REQUIRED STRUCTURE - Follow this Chain of Thought:** * - * - What tables in the target component were analyzed? - * - What tables in other components were compared against? - * - What semantic patterns were identified across components? - * - How were table purposes determined from names and descriptions? + * ## Step 1: Target Table Inventory + * + * For EACH table in target component, extract from its description: + * + * - Table name + * - Role tag: `[MASTER DATA]`, `[INPUT]`, `[OUTPUT]`, `[AUDIT]`, `[CONFIG]`, + * `[SNAPSHOT]`, `[JUNCTION]` + * - Core entity it stores + * - Business workflow context + * - Distinguishing characteristics (especially "does NOT store X" phrases) + * + * ## Step 2: Systematic Comparison + * + * For EACH target table, compare against EACH table in other components: + * + * ``` + * ### Comparing: {target_table} vs {other_component}.{other_table} + * + * **Target description**: "{quoted description}" + * **Other description**: "{quoted description}" + * + * Role Match: [MASTER DATA] vs [MASTER DATA] → SAME / DIFFERENT + * Entity Match: "customer identity" vs "customer credentials" → SAME / + * DIFFERENT + * Workflow Match: "registration flow" vs "auth flow" → SAME / DIFFERENT + * Distinguishing Check: Does either explicitly exclude the other's purpose? + * + * VERDICT: DUPLICATE / NOT DUPLICATE + * REASON: {specific reason based on description comparison} + * ``` + * + * ## Step 3: Summary + * + * - Total tables in target component: X + * - Total tables in other components: X + * - Total comparisons made: X + * - Duplicate groups found: X */ analysis: string; /** * Rationale for the duplicate group decisions. * - * Explains why specific tables were grouped as duplicates: + * **REQUIRED STRUCTURE:** * - * - Why are identified groups considered semantically equivalent? 
- * - What evidence supports each grouping decision? - * - Why were certain similar-looking tables NOT grouped? - * - What distinguishes true duplicates from related but distinct tables? + * ## For EACH duplicate group identified: + * + * - Quote BOTH descriptions showing same purpose + * - Identify matching elements: same role tag, same core entity, same + * workflow + * - Explain WHY these descriptions indicate same business function + * + * ## For tables explicitly NOT grouped (similar-looking but different): + * + * Common patterns to explicitly address and explain why NOT duplicates: + * + * - `[INPUT]` vs `[OUTPUT]` in same workflow (questions vs answers) + * - `[MASTER DATA]` vs `[SNAPSHOT]` of same entity (orders vs + * order_snapshots) + * - `[MASTER DATA]` vs `[AUDIT]` (entities vs logs) + * - Tables with explicit "does NOT store X" that excludes the other + * - Different actor ownership (customer creates vs seller creates) + * + * Quote the distinguishing parts of descriptions that prove non-duplication. */ rationale: string; @@ -106,26 +154,86 @@ export namespace IAutoBeDatabaseDeduplicationApplication { * - Each group must include at least 1 table from the target component * - Each table can appear in only one group * - * ## Example: - * - * ```typescript - * [ - * { - * reason: "Both tables store customer authentication data", - * tables: [ - * { namespace: "Authorization", name: "customers" }, - * { namespace: "Sales", name: "shopping_customers" } - * ] - * } - * ] + * ## ⚠️ CRITICAL: 4-Step Duplicate Detection Using Rich Descriptions + * + * Tables now have structured descriptions with role tags and distinguishing + * characteristics. 
Use this 4-step process: + * + * **Step 1: Extract and Compare Role Tags** + * + * Read the `[ROLE TAG]` at the start of each description: + * + * - Same role tag → Proceed to Step 2 + * - Different role tags → NOT duplicates (stop here) + * - `[INPUT]` ≠ `[OUTPUT]` (workflow stages) + * - `[MASTER DATA]` ≠ `[SNAPSHOT]` (live vs point-in-time) + * - `[MASTER DATA]` ≠ `[AUDIT]` (entity vs log) + * + * **Step 2: Compare Core Entity** + * + * What SPECIFIC business entity does each table store? + * + * - "customer identity" vs "customer credentials" → DIFFERENT entities + * - "customer identity" vs "customer accounts" → SAME entity (investigate) + * - "order cancellation requests" vs "refund processing" → DIFFERENT + * + * **Step 3: Compare Business Context** + * + * What workflow uses this table? What's the creation trigger? + * + * - Same workflow position = likely duplicate + * - Different workflow stages = NOT duplicate + * - Different creation triggers = likely NOT duplicate + * + * **Step 4: Check Distinguishing Characteristics** + * + * Look for explicit exclusions in descriptions: + * + * - "does NOT store X - see Y for that" → X and Y are NOT duplicates + * - "different from Z which tracks..." → NOT duplicate of Z + * - "separate because different actor owns" → NOT duplicate + * + * ## Example: Duplicate Found + * + * ``` + * Table A: "[MASTER DATA] Customer identity for shopping platform. + * Stores name, phone, address..." + * Table B: "[MASTER DATA] Customer accounts for marketplace. + * Stores name, email, phone..." + * + * Step 1: Both [MASTER DATA] ✓ + * Step 2: Both "customer identity/accounts" = SAME entity ✓ + * Step 3: Both for customer management workflow ✓ + * Step 4: No explicit exclusions + * + * → DUPLICATE: Same customer entity in different components + * ``` + * + * ## Example: NOT Duplicate (Different Roles) + * * ``` + * Table A: "[INPUT] Customer questions about products..." + * Table B: "[OUTPUT] Seller answers to customer questions..." 
+ * + * Step 1: [INPUT] vs [OUTPUT] = DIFFERENT roles ✗ * - * ## Judgment Criteria: + * → NOT DUPLICATE: Different workflow stages (stop at Step 1) + * ``` * - * - Read both name AND description to determine purpose - * - Same purpose = duplicate (even with different names) - * - Different purpose = NOT duplicate (even with same name) - * - Parent-child or snapshot relationships = NOT duplicates + * ## Example: NOT Duplicate (Explicit Exclusion) + * + * ``` + * Table A: "[MASTER DATA] Customer authentication credentials... + * Does NOT store profile data - see customer_profiles" + * Table B: "[MASTER DATA] Customer profile information... + * Stores name, address, preferences..." + * + * Step 1: Both [MASTER DATA] ✓ + * Step 2: "credentials" vs "profile" = DIFFERENT entities ✗ + * Step 4: Explicit "does NOT store profile data" + * + * → NOT DUPLICATE: Explicitly separated concerns + * ``` */ duplicateGroups: AutoBeDatabaseDeduplicationGroup[]; } From f8673f2e060545ff335f512c4b0f26014ab737c2 Mon Sep 17 00:00:00 2001 From: michael <7471919@naver.com> Date: Tue, 3 Feb 2026 16:33:20 +0900 Subject: [PATCH 5/8] chore: add [CONFIG] related prompt --- packages/agent/prompts/DATABASE_COMPONENT.md | 13 ++++ .../agent/prompts/DATABASE_DEDUPLICATION.md | 71 ++++++++++++------- 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/packages/agent/prompts/DATABASE_COMPONENT.md b/packages/agent/prompts/DATABASE_COMPONENT.md index d199a49972..5d836574d9 100644 --- a/packages/agent/prompts/DATABASE_COMPONENT.md +++ b/packages/agent/prompts/DATABASE_COMPONENT.md @@ -463,6 +463,9 @@ Brief descriptions cause duplicate detection failures. Write RICH descriptions. | **4. Business Context** | What workflow/process uses this | "used in registration, login, and profile management flows" | | **5. 
Distinguishing Characteristics** | How it differs from similar tables | "does NOT store order history - see customer_orders for that" | +> **⚠️ Element 5 — Special Rule for Generic/Infrastructure Tables:** +> Generic tables (key-value config stores, generic event logs, flexible metadata stores, etc.) MUST include an explicit "Does NOT replace domain-specific X tables" statement. Without this, the Deduplication Agent cannot distinguish a generic key-value `configurations` table from a structured domain table like `payment_methods`, even though they serve completely different purposes. Example: "Does NOT replace domain-specific configuration tables — stores only simple key-value settings, not structured domain entities like payment_methods or notification_preferences." + ### Role Tag Definitions | Tag | Meaning | Lifecycle | Examples | @@ -507,6 +510,16 @@ Brief descriptions cause duplicate detection failures. Write RICH descriptions. name: "sale_question_answers", description: "[OUTPUT] Seller responses to customer questions. Stores answer text, seller reference, and parent question link. Created when seller responds to a question. Completes Q&A workflow. Separate from questions because different actor (seller) owns this data with different lifecycle." } + +// Pair 3: Same role [CONFIG] but generic key-value store vs structured domain table +{ + name: "shopping_configurations", + description: "[CONFIG] Generic system-wide settings as key-value pairs. Stores config_key, config_value, config_type, last_modified_by. Used by all system components for feature flags and toggles (e.g., maintenance mode, rate limits). Does NOT replace domain-specific configuration tables — stores only simple key-value settings, not structured domain entities like payment_methods or notification_preferences." +} +{ + name: "shopping_payment_methods", + description: "[CONFIG] Structured payment method definitions for the platform. 
Stores method_name (stripe, paypal), fee_percentage, min/max_transaction_amount, regional_restrictions, is_active. Created by platform admin. Used by checkout and payment processing workflows. Does NOT store arbitrary system settings — see shopping_configurations for generic key-value feature flags." +} ``` ### Why Rich Descriptions Matter for Deduplication diff --git a/packages/agent/prompts/DATABASE_DEDUPLICATION.md b/packages/agent/prompts/DATABASE_DEDUPLICATION.md index b32fa63242..e3233ae57b 100644 --- a/packages/agent/prompts/DATABASE_DEDUPLICATION.md +++ b/packages/agent/prompts/DATABASE_DEDUPLICATION.md @@ -70,6 +70,7 @@ Two tables have **DIFFERENT purposes** when: | `admin_sessions`: "[MASTER DATA] Sessions for administrators..." | `customer_sessions`: "[MASTER DATA] Sessions for customers..." | **NO** | Different actor types explicitly stated | | `configurations`: "[CONFIG] System settings..." | `admins`: "[MASTER DATA] Administrator accounts..." | **NO** | Different role tags: [CONFIG] vs [MASTER DATA] | | `moderation_actions`: "[OUTPUT] Moderator decisions..." | `audit_logs`: "[AUDIT] Immutable compliance record..." | **NO** | Different role tags: [OUTPUT] vs [AUDIT] | +| `configurations`: "[CONFIG] Generic system settings as key-value pairs... Stores config_key, config_value, config_type" | `payment_methods`: "[CONFIG] Structured payment method definitions... Stores method_name, fee_percentage, min/max_amount, is_active" | **NO** | Generic key-value store ≠ Structured domain entity. 
A generic table mentioning a domain as example does NOT duplicate that domain's dedicated table | --- @@ -88,6 +89,7 @@ Two tables have **DIFFERENT purposes** when: | "Both belong to the same domain" | Same domain can have many non-duplicate tables | | "Both are about reporting/moderation" | Reports (input) ≠ actions (output) ≠ logs (audit) | | "Both store similar metadata" | Metadata for different entities serves different purposes | +| "Both configure the same domain" | A generic key-value config store mentioning "payment gateways" as an example ≠ a structured payment_methods table with dedicated columns | **If you find yourself using any of these phrases, STOP and re-read the descriptions.** @@ -189,6 +191,23 @@ Table B: "[MASTER DATA] Customer authentication credentials..." → Need more investigation ``` +**⚠️ Step 2 Special Case: Generic vs Specific (same role tag)** + +When both tables share the same role tag (especially `[CONFIG]`), check whether one is a **generic infrastructure table** and the other is a **structured domain entity**: + +- **Generic infrastructure table**: Stores arbitrary data as key-value pairs, generic event entries, or flexible JSON blobs. Core entity is "system settings" or "generic records" — not a specific business concept. +- **Structured domain entity**: Has dedicated typed columns for a specific business concept (e.g., `method_name`, `fee_percentage`, `is_active`). Core entity is a specific business object. + +``` +Table A: "[CONFIG] Generic system settings as key-value pairs. Stores config_key, config_value..." +Table B: "[CONFIG] Structured payment method definitions. Stores method_name, fee_percentage..." 
+ +→ Generic key-value store vs Structured domain entity = DIFFERENT core entities +→ NOT DUPLICATE — even if the generic table mentions the domain as an example +``` + +**Key Rule**: A generic table that *mentions* a domain (e.g., "controls behavior of payment gateways, notifications...") is NOT a duplicate of that domain's dedicated structured table. The mention is just an example of usage, not the table's core purpose. + **Step 3: Compare Business Context (if same entity)** What workflow uses this table? What triggers creation? @@ -213,16 +232,7 @@ Table B: "...Does NOT store profile data - see X for personal information" → These are deliberately separated tables ``` -### 4.5 Key Judgment Rules Summary - -1. **Different role tags = NOT duplicate** (stop immediately) -2. **Same role tag = INVESTIGATE further** (proceed to entity comparison) -3. **Explicit "does NOT store X" = NOT duplicate of X** -4. **Different workflow stages = NOT duplicate** (input ≠ output ≠ audit) -5. **Different actor ownership = NOT duplicate** (customer creates ≠ seller creates) -6. **Same entity + same role + same workflow + no exclusions = DUPLICATE** - -### 4.2 Common Misconception: Similar Domain ≠ Duplicate +### 4.5 Common Misconception: Similar Domain ≠ Duplicate Tables in the same domain (e.g., "moderation", "reporting") often serve **completely different purposes**: @@ -238,9 +248,9 @@ These are THREE DIFFERENT tables serving THREE DIFFERENT purposes in ONE workflo User Report (INPUT) → Moderator Decision (OUTPUT) → Audit Record (AUDIT) ``` -### 4.3 The Definitive Test +### 4.6 Verification: The Definitive Test -Ask yourself these questions: +After completing the 4-Step Process (Section 4.4), use these 3 questions as a **final verification** to confirm your conclusion: 1. 
**"If I inserted the same row into both tables, would it make sense?"** - YES → Likely duplicates (same entity) @@ -255,15 +265,26 @@ Ask yourself these questions: - YES, and quotes clearly match → Duplicates - NO, descriptions show different purposes → NOT duplicates -### 4.4 Judgment Rules Summary +**If the 4-Step Process and this Verification Test disagree, trust the 4-Step Process** — it uses structured description analysis and is more reliable than intuitive checks. + +### 4.7 Judgment Rules Summary + +**Primary Rules (from 4-Step Process):** + +1. **Different role tags = NOT duplicate** (stop immediately at Step 1) +2. **Same role tag = INVESTIGATE further** (proceed to Step 2: entity comparison) +3. **Explicit "does NOT store X" = NOT duplicate of X** (Step 4 exclusion) +4. **Different workflow stages = NOT duplicate** (input ≠ output ≠ audit) +5. **Different actor ownership = NOT duplicate** (customer creates ≠ seller creates) +6. **Same entity + same role + same workflow + no exclusions = DUPLICATE** + +**General Rules:** -1. **ALWAYS read the `description` field carefully** — this is the most reliable indicator of what a table stores -2. **Tables with the same purpose in their descriptions = DUPLICATE** (even if names differ) -3. **Tables with different purposes in their descriptions = NOT duplicate** (even if names look similar) -4. **Do NOT rely on table names alone** — names can be misleading -5. **Parent-child or snapshot relationships = NOT duplicates** (they are complementary) -6. **Different actor types of the same pattern = NOT duplicates** (each actor needs its own tables) -7. **Different roles in the same workflow = NOT duplicates** (input ≠ output ≠ audit) +7. **ALWAYS read the `description` field carefully** — this is the most reliable indicator of what a table stores +8. **Tables with the same purpose in their descriptions = DUPLICATE** (even if names differ) +9. 
**Tables with different purposes in their descriptions = NOT duplicate** (even if names look similar) +10. **Do NOT rely on table names alone** — names can be misleading +11. **Parent-child or snapshot relationships = NOT duplicates** (they are complementary) --- @@ -379,11 +400,11 @@ export interface IComplete { } ``` -| Field | Focus | -|-------|-------| -| `analysis` | Which tables were analyzed, what comparisons were made, and what patterns were identified | -| `rationale` | Why specific tables were grouped as duplicates and why certain tables were NOT grouped | -| `duplicateGroups` | Array of duplicate groups — empty array if no duplicates exist | +| Field | Focus | Required Structure | +|-------|-------|--------------------| +| `analysis` | Which tables were analyzed, what comparisons were made, and what patterns were identified | **Step 1**: Target Table Inventory (role tag, core entity, business context, distinguishing traits per table). **Step 2**: Systematic Comparison (each target table vs each other table with Role/Entity/Workflow/Distinguishing verdicts). **Step 3**: Summary (total tables, comparisons, groups found). | +| `rationale` | Why specific tables were grouped as duplicates and why certain tables were NOT grouped | For each duplicate group: quote BOTH descriptions, identify matching elements (role, entity, workflow), explain WHY. For non-grouped similar tables: quote distinguishing parts proving non-duplication. | +| `duplicateGroups` | Array of duplicate groups — empty array if no duplicates exist | Each group: `reason` with **quoted descriptions**, at least 2 tables, at least 1 from target component. 
| --- From 52c2cfd66f36873d9bd5ec3a2e4abb9dc05a87bc Mon Sep 17 00:00:00 2001 From: michael <7471919@naver.com> Date: Tue, 3 Feb 2026 17:07:29 +0900 Subject: [PATCH 6/8] chore: rename and type definition --- .../transformPrismaDeduplicationHistory.ts | 16 ++++++++-------- .../prisma/orchestratePrismaDeduplication.ts | 2 +- .../AutoBeDatabaseDeduplicationProgrammer.ts | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts index d1c7ec80e4..4e24d6a051 100644 --- a/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts +++ b/packages/agent/src/orchestrate/prisma/histories/transformPrismaDeduplicationHistory.ts @@ -11,7 +11,7 @@ export const transformPrismaDeduplicationHistory = (props: { preliminary: AutoBePreliminaryController< "analysisFiles" | "previousAnalysisFiles" | "previousDatabaseSchemas" >; - component: AutoBeDatabaseComponent; + target: AutoBeDatabaseComponent; otherComponents: Pick[]; instruction: string; prefix: string | null; @@ -36,13 +36,13 @@ export const transformPrismaDeduplicationHistory = (props: { ### Target Component - - **Namespace**: \`${props.component.namespace}\` - - **Filename**: \`${props.component.filename}\` + - **Namespace**: \`${props.target.namespace}\` + - **Filename**: \`${props.target.filename}\` ### Target Component Tables \`\`\`json - ${JSON.stringify(props.component.tables)} + ${JSON.stringify(props.target.tables)} \`\`\` ### Other Components Tables @@ -61,16 +61,16 @@ export const transformPrismaDeduplicationHistory = (props: { }, ], userMessage: StringUtil.trim` - Review the "${props.component.namespace}" component's tables for semantic duplicates. + Review the "${props.target.namespace}" component's tables for semantic duplicates. 
- **Your task**: Compare each table in the "${props.component.namespace}" component against + **Your task**: Compare each table in the "${props.target.namespace}" component against tables in other components. Identify tables that serve the **same purpose** even if they have different names. ## How to identify duplicates 1. First, fetch analysis files using \`getAnalysisFiles\` to understand the business context - 2. For each table in "${props.component.namespace}", **read its \`description\` field carefully** + 2. For each table in "${props.target.namespace}", **read its \`description\` field carefully** 3. For each table in other components, **read its \`description\` field carefully** 4. **Compare the descriptions**: If two tables describe the **same purpose** (storing the same kind of data for the same business reason), they are duplicates 5. Call \`process({ request: { type: "complete", analysis: "...", rationale: "...", duplicateGroups: [...] } })\` @@ -85,7 +85,7 @@ export const transformPrismaDeduplicationHistory = (props: { ## Rules - Each duplicate group must have at least 2 tables - - Each group must include at least 1 table from "${props.component.namespace}" + - Each group must include at least 1 table from "${props.target.namespace}" - Parent-child relationships are NOT duplicates - Snapshot/history tables are NOT duplicates of their source tables - If no duplicates found, return an empty duplicateGroups array diff --git a/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts b/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts index b1adc928f2..699dafc80d 100644 --- a/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts +++ b/packages/agent/src/orchestrate/prisma/orchestratePrismaDeduplication.ts @@ -101,7 +101,7 @@ async function process( enforceFunctionCall: true, promptCacheKey: props.promptCacheKey, ...transformPrismaDeduplicationHistory({ - component: props.target, + target: props.target, 
otherComponents: props.otherComponents, instruction: props.instruction, prefix: props.prefix, diff --git a/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts index 04a03414fc..2c683977b6 100644 --- a/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts +++ b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts @@ -71,7 +71,7 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { }); // Each group must include at least 1 table from target component - const hasTargetTable = group.tables.some( + const hasTargetTable: boolean = group.tables.some( (t) => t.namespace === props.target.namespace, ); if (!hasTargetTable) From d09f23c2b0224120510d05f1c0585caace70240d Mon Sep 17 00:00:00 2001 From: michael <7471919@naver.com> Date: Tue, 3 Feb 2026 17:07:46 +0900 Subject: [PATCH 7/8] chore: update ArchiveLogger about database duplication --- test/src/archive/utils/ArchiveLogger.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/src/archive/utils/ArchiveLogger.ts b/test/src/archive/utils/ArchiveLogger.ts index 499f02c50d..a460ff284c 100644 --- a/test/src/archive/utils/ArchiveLogger.ts +++ b/test/src/archive/utils/ArchiveLogger.ts @@ -203,7 +203,11 @@ export namespace ArchiveLogger { else if (event.type === "databaseDeduplication") { content.push( ` - namespace: ${event.namespace}`, - ` - duplicated tables: ${event.duplicateGroups.map((g) => g.tables.map((t) => t.name).join(", ")).join(", ")}`, + ` - duplicated groups:`, + ...event.duplicateGroups.map( + (g, idx) => + ` - group ${idx + 1}: [${g.tables.map((t) => t.name).join(", ")}] (reason: ${g.reason})`, + ), ); } else if (event.type === "databaseSchema") content.push( From fab4642a175fc0b3def70cba232cdece666bb454 Mon Sep 17 00:00:00 2001 From: michael <7471919@naver.com> Date: Tue, 3 Feb 
2026 17:16:13 +0900 Subject: [PATCH 8/8] chore: remove console --- .../orchestrate/prisma/orchestratePrisma.ts | 4 - .../AutoBeDatabaseDeduplicationProgrammer.ts | 156 +----------------- 2 files changed, 6 insertions(+), 154 deletions(-) diff --git a/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts b/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts index 5b7e6f2391..a41a4a6875 100644 --- a/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts +++ b/packages/agent/src/orchestrate/prisma/orchestratePrisma.ts @@ -73,8 +73,6 @@ export const orchestratePrisma = async ( instruction: props.instruction, }, ); - console.log(`----------- DATABASE DEDUPLICATION -------------`); - console.log(JSON.stringify(components, null, 2)); const application: AutoBeDatabase.IApplication = await orchestrateSchema( ctx, { @@ -171,8 +169,6 @@ const orchestrateComponent = async ( components, })), ]); - console.log(`----------- ALL COMPONENTS -------------`); - console.log(JSON.stringify(allComponents, null, 2)); return await orchestratePrismaDeduplication(ctx, { instruction: props.instruction, components: allComponents, diff --git a/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts index 2c683977b6..19e472bd56 100644 --- a/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts +++ b/packages/agent/src/orchestrate/prisma/programmers/AutoBeDatabaseDeduplicationProgrammer.ts @@ -98,74 +98,17 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { components: AutoBeDatabaseComponent[], events: AutoBeDatabaseDeduplicationEvent[], ): AutoBeDatabaseComponent[] => { - console.log("\n"); - console.log( - "╔══════════════════════════════════════════════════════════════╗", - ); - console.log( - "║ AutoBeDatabaseDeduplicationProgrammer.resolve() START ║", - ); - console.log( - 
"╚══════════════════════════════════════════════════════════════╝", - ); - console.log(`[Resolve] Input components: ${components.length}`); - console.log(`[Resolve] Input events: ${events.length}`); - - // 1. Collect all duplicate groups from events const duplicatedGroups: AutoBeDatabaseDeduplicationGroup[] = events.flatMap( (e) => e.duplicateGroups, ); + if (duplicatedGroups.length === 0) return components; - console.log( - `[Resolve] Total duplicate groups collected: ${duplicatedGroups.length}`, - ); - events.forEach((event, i) => { - console.log( - `[Resolve] Event[${i}] from "${event.namespace}": ${event.duplicateGroups.length} groups`, - ); - }); - - if (duplicatedGroups.length === 0) { - console.log( - "[Resolve] No duplicate groups found. Returning original components.", - ); - console.log( - "╔══════════════════════════════════════════════════════════════╗", - ); - console.log( - "║ AutoBeDatabaseDeduplicationProgrammer.resolve() END ║", - ); - console.log( - "╚══════════════════════════════════════════════════════════════╝\n", - ); - return components; - } - - // 2. Merge overlapping groups into clusters using Union-Find const clusters: AutoBeDatabaseDeduplicationGroup.ITable[][] = mergeGroups(duplicatedGroups); - // 3. 
Remove duplicates, keeping table from smallest component - const result = removeDuplicates(components, clusters); - - console.log("\n[Resolve] Summary:"); - console.log( - `[Resolve] Input tables: ${components.reduce((sum, c) => sum + c.tables.length, 0)}`, - ); - console.log( - `[Resolve] Output tables: ${result.reduce((sum, c) => sum + c.tables.length, 0)}`, - ); - console.log( - `[Resolve] Removed tables: ${components.reduce((sum, c) => sum + c.tables.length, 0) - result.reduce((sum, c) => sum + c.tables.length, 0)}`, - ); - console.log( - "╔══════════════════════════════════════════════════════════════╗", - ); - console.log( - "║ AutoBeDatabaseDeduplicationProgrammer.resolve() END ║", - ); - console.log( - "╚══════════════════════════════════════════════════════════════╝\n", + const result: AutoBeDatabaseComponent[] = removeDuplicates( + components, + clusters, ); return result; @@ -183,16 +126,6 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { const mergeGroups = ( groups: AutoBeDatabaseDeduplicationGroup[], ): AutoBeDatabaseDeduplicationGroup.ITable[][] => { - console.log("\n========== [Union-Find] mergeGroups START =========="); - console.log(`[Union-Find] Input groups count: ${groups.length}`); - groups.forEach((group, i) => { - console.log( - `[Union-Find] Group[${i}]: ${group.tables.map((t) => `${t.namespace}::${t.name}`).join(" = ")}`, - ); - console.log(`[Union-Find] Reason: ${group.reason}`); - }); - - // Build table key → index mapping const tableKeys: string[] = []; const tableKeyToIndex: Map = new Map(); @@ -203,22 +136,16 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { index = tableKeys.length; tableKeys.push(key); tableKeyToIndex.set(key, index); - console.log(`[Union-Find] Register table: ${key} → index ${index}`); } return index; }; - // Register all tables for (const group of groups) { for (const table of group.tables) { getOrCreateIndex(table.namespace, table.name); } } - console.log(`\n[Union-Find] Total unique tables: 
${tableKeys.length}`); - console.log(`[Union-Find] Table keys: [${tableKeys.join(", ")}]`); - - // Union-Find: each table starts as its own parent const parent: number[] = tableKeys.map((_, i) => i); const rank: number[] = tableKeys.map(() => 0); @@ -234,34 +161,21 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { const rootA: number = find(a); const rootB: number = find(b); if (rootA === rootB) { - console.log( - `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Already in same set (root=${tableKeys[rootA]})`, - ); return; } // Union by rank: attach smaller tree under larger tree if (rank[rootA] < rank[rootB]) { parent[rootA] = rootB; - console.log( - `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Merged ${tableKeys[rootA]} → ${tableKeys[rootB]} (rank)`, - ); } else if (rank[rootA] > rank[rootB]) { parent[rootB] = rootA; - console.log( - `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Merged ${tableKeys[rootB]} → ${tableKeys[rootA]} (rank)`, - ); } else { parent[rootB] = rootA; rank[rootA]++; - console.log( - `[Union-Find] Union(${tableKeys[a]}, ${tableKeys[b]}): Merged ${tableKeys[rootB]} → ${tableKeys[rootA]} (tie, rank++)`, - ); } }; // Union all tables within each group - console.log("\n[Union-Find] Processing union operations..."); for (const group of groups) { if (group.tables.length < 2) continue; const firstIndex: number = getOrCreateIndex( @@ -277,15 +191,6 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { } } - // Log parent array state - console.log("\n[Union-Find] Final parent array:"); - tableKeys.forEach((key, i) => { - const root = find(i); - console.log( - `[Union-Find] ${key} (idx=${i}) → root=${tableKeys[root]} (idx=${root})`, - ); - }); - // Group tables by their root → clusters const clusterMap = new Map< number, @@ -303,16 +208,6 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { } const result = [...clusterMap.values()]; - - // Log final clusters - console.log("\n[Union-Find] Final clusters:"); 
- result.forEach((cluster, i) => { - console.log( - `[Union-Find] Cluster[${i}]: ${cluster.map((t) => `${t.namespace}::${t.name}`).join(", ")}`, - ); - }); - console.log("========== [Union-Find] mergeGroups END ==========\n"); - return result; }; @@ -333,15 +228,6 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { components: AutoBeDatabaseComponent[], clusters: AutoBeDatabaseDeduplicationGroup.ITable[][], ): AutoBeDatabaseComponent[] => { - console.log("\n========== [Dedup] removeDuplicates START =========="); - console.log(`[Dedup] Input components: ${components.length}`); - components.forEach((c) => { - console.log( - `[Dedup] ${c.namespace}: [${c.tables.map((t) => t.name).join(", ")}] (${c.tables.length} tables)`, - ); - }); - console.log(`[Dedup] Input clusters: ${clusters.length}`); - // Build tableKey → clusterId mapping const tableToCluster: Map = new Map(); clusters.forEach((cluster, clusterId) => { @@ -350,11 +236,6 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { } }); - console.log("\n[Dedup] Table to Cluster mapping:"); - for (const [key, clusterId] of tableToCluster) { - console.log(`[Dedup] ${key} → Cluster[${clusterId}]`); - } - // Track which clusters already have a kept table const clusterSet: Set = new Set(); const keptTables: Map = new Map(); // For logging @@ -364,15 +245,7 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { .map((c, i) => new Pair(c, i)) .sort((a, b) => a.first.tables.length - b.first.tables.length); - console.log("\n[Dedup] Processing order (sorted by table count):"); - sorted.forEach((p, i) => { - console.log( - `[Dedup] ${i + 1}. 
${p.first.namespace} (${p.first.tables.length} tables)`, - ); - }); - // Filter tables: keep first encountered per cluster - console.log("\n[Dedup] Processing tables..."); const processed: Pair[] = sorted.map( (p) => new Pair( @@ -383,25 +256,16 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { const clusterId: number | undefined = tableToCluster.get(key); // Not in any cluster → keep - if (clusterId === undefined) { - console.log(`[Dedup] KEEP ${key}: Not in any cluster`); - return true; - } + if (clusterId === undefined) return true; // First in cluster → keep and mark if (!clusterSet.has(clusterId)) { clusterSet.add(clusterId); keptTables.set(clusterId, key); - console.log( - `[Dedup] KEEP ${key}: First in Cluster[${clusterId}]`, - ); return true; } // Already have one from this cluster → remove - console.log( - `[Dedup] REMOVE ${key}: Cluster[${clusterId}] already has ${keptTables.get(clusterId)}`, - ); return false; }), }, @@ -410,19 +274,11 @@ export namespace AutoBeDatabaseDeduplicationProgrammer { ); // Restore original order and filter empty components - const result = processed + const result: AutoBeDatabaseComponent[] = processed .sort((a, b) => a.second - b.second) .map((p) => p.first) .filter((c) => c.tables.length > 0); - console.log("\n[Dedup] Final result:"); - result.forEach((c) => { - console.log( - `[Dedup] ${c.namespace}: [${c.tables.map((t) => t.name).join(", ")}] (${c.tables.length} tables)`, - ); - }); - console.log("========== [Dedup] removeDuplicates END ==========\n"); - return result; }; }