Skip to content

Commit cfc07bc

Browse files
onlycastle and claude authored
fix(plugin): add explicit Agent tool warning against Bash CLI dispatch (#189)
* fix(plugin): add explicit Agent tool warning against Bash CLI dispatch

  Skills now clearly instruct the host to use the built-in Agent tool instead of shelling out to `claude` CLI for agent dispatch. Adds test assertion for the new contract language.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore(plugin): rebuild dist artifacts

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f30f40f commit cfc07bc

10 files changed

Lines changed: 202 additions & 33 deletions

File tree

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
File renamed without changes.
Lines changed: 179 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -27560,6 +27560,9 @@ var ParsedMessageSchema = external_exports2.object({
2756027560
role: external_exports2.enum(["user", "assistant"]),
2756127561
timestamp: external_exports2.string(),
2756227562
content: external_exports2.string(),
27563+
isMeta: external_exports2.boolean().optional(),
27564+
sourceToolUseID: external_exports2.string().optional(),
27565+
toolUseResult: external_exports2.unknown().optional(),
2756327566
toolCalls: external_exports2.array(ToolCallSchema).optional(),
2756427567
tokenUsage: external_exports2.object({
2756527568
input: external_exports2.number().int().min(0),
@@ -27686,12 +27689,18 @@ var Phase1OutputSchema = external_exports2.object({
2768627689

2768727690
// ../shared/dist/schemas/deterministic-scores.js
2768827691
var DeterministicScoresSchema = external_exports2.object({
27689-
contextEfficiency: external_exports2.number().min(0).max(100),
27690-
sessionOutcome: external_exports2.number().min(0).max(100),
27691-
thinkingQuality: external_exports2.number().min(0).max(100),
27692-
learningBehavior: external_exports2.number().min(0).max(100),
27693-
communicationPatterns: external_exports2.number().min(0).max(100),
27694-
controlScore: external_exports2.number().min(0).max(100)
27692+
aiPartnership: external_exports2.number().min(0).max(100),
27693+
sessionCraft: external_exports2.number().min(0).max(100),
27694+
toolMastery: external_exports2.number().min(0).max(100),
27695+
skillResilience: external_exports2.number().min(0).max(100),
27696+
sessionMastery: external_exports2.number().min(0).max(100),
27697+
controlScore: external_exports2.number().min(0).max(100),
27698+
// Legacy fields — present on old runs, not computed for new
27699+
contextEfficiency: external_exports2.number().min(0).max(100).optional(),
27700+
sessionOutcome: external_exports2.number().min(0).max(100).optional(),
27701+
thinkingQuality: external_exports2.number().min(0).max(100).optional(),
27702+
learningBehavior: external_exports2.number().min(0).max(100).optional(),
27703+
communicationPatterns: external_exports2.number().min(0).max(100).optional()
2769527704
});
2769627705
var CodingStyleTypeSchema = external_exports2.enum([
2769727706
"architect",
@@ -27740,6 +27749,12 @@ var DomainGrowthAreaSchema = external_exports2.object({
2774027749
});
2774127750
var DomainResultSchema = external_exports2.object({
2774227751
domain: external_exports2.enum([
27752+
"aiPartnership",
27753+
"sessionCraft",
27754+
"toolMastery",
27755+
"skillResilience",
27756+
"sessionMastery",
27757+
// Legacy domains accepted for backward compat with old runs
2774327758
"thinkingQuality",
2774427759
"communicationPatterns",
2774527760
"learningBehavior",
@@ -27885,12 +27900,17 @@ var STAGE_NAMES = [
2788527900
"evidenceVerification",
2788627901
"contentWriter",
2788727902
"translator",
27903+
// 5-dimension extractors (v2)
27904+
"extractAiPartnership",
27905+
"extractSessionCraft",
27906+
"extractToolMastery",
27907+
"extractSkillResilience",
27908+
"extractSessionMastery",
27909+
// Legacy extractors (old runs only)
2788827910
"extractAiCollaboration",
2788927911
"extractContextEngineering",
27890-
"extractToolMastery",
2789127912
"extractBurnoutRisk",
27892-
"extractAiControl",
27893-
"extractSkillResilience"
27913+
"extractAiControl"
2789427914
];
2789527915
var STAGE_SCHEMAS = {
2789627916
sessionSummaries: SessionSummaryBatchSchema,
@@ -27900,12 +27920,17 @@ var STAGE_SCHEMAS = {
2790027920
evidenceVerification: EvidenceVerificationOutputSchema,
2790127921
contentWriter: ContentWriterOutputSchema,
2790227922
translator: TranslatorOutputSchema,
27923+
// 5-dimension extractors (v2)
27924+
extractAiPartnership: DimensionExtractionSchema,
27925+
extractSessionCraft: DimensionExtractionSchema,
27926+
extractToolMastery: DimensionExtractionSchema,
27927+
extractSkillResilience: DimensionExtractionSchema,
27928+
extractSessionMastery: DimensionExtractionSchema,
27929+
// Legacy extractors (old runs)
2790327930
extractAiCollaboration: DimensionExtractionSchema,
2790427931
extractContextEngineering: DimensionExtractionSchema,
27905-
extractToolMastery: DimensionExtractionSchema,
2790627932
extractBurnoutRisk: DimensionExtractionSchema,
27907-
extractAiControl: DimensionExtractionSchema,
27908-
extractSkillResilience: DimensionExtractionSchema
27933+
extractAiControl: DimensionExtractionSchema
2790927934
};
2791027935

2791127936
// ../shared/dist/schemas/analysis-run.js
@@ -28336,6 +28361,96 @@ var SessionOutcomeOutputSchema = external_exports2.object({
2833628361
growthAreas: external_exports2.array(WorkerGrowthSchema).optional(),
2833728362
referencedInsights: external_exports2.array(ReferencedInsightSchema).optional()
2833828363
});
28364+
var AiPartnershipOutputSchema = external_exports2.object({
28365+
// Planning Dimension (from ThinkingQuality)
28366+
planningHabits: external_exports2.array(PlanningHabitSchema),
28367+
planQualityScore: external_exports2.number().min(0).max(100),
28368+
multitaskingPattern: MultitaskingPatternSchema.optional(),
28369+
// Critical Thinking Dimension (from ThinkingQuality)
28370+
verificationBehavior: VerificationBehaviorSchema,
28371+
criticalThinkingMoments: external_exports2.array(CriticalThinkingMomentSchema),
28372+
verificationAntiPatterns: external_exports2.array(DetectedAntiPatternSchema),
28373+
// AI Control Dimension (from SessionOutcome)
28374+
sessionAnalyses: external_exports2.array(SessionAnalysisSchema),
28375+
overallSuccessRate: external_exports2.number().min(0).max(100),
28376+
goalDistribution: external_exports2.array(GoalDistributionItemSchema),
28377+
frictionSummary: external_exports2.array(FrictionSummaryItemSchema),
28378+
// Overall Scores
28379+
overallAiPartnershipScore: external_exports2.number().min(0).max(100),
28380+
confidenceScore: external_exports2.number().min(0).max(1),
28381+
summary: external_exports2.string().optional(),
28382+
// Domain-specific Strengths & Growth Areas
28383+
strengths: external_exports2.array(WorkerStrengthSchema).optional(),
28384+
growthAreas: external_exports2.array(WorkerGrowthSchema).optional(),
28385+
referencedInsights: external_exports2.array(ReferencedInsightSchema).optional()
28386+
});
28387+
var SessionCraftOutputSchema = external_exports2.object({
28388+
// Context Efficiency Dimension
28389+
contextUsagePatterns: external_exports2.array(ContextUsagePatternSchema),
28390+
inefficiencyPatterns: external_exports2.array(InefficiencySchema),
28391+
promptLengthTrends: external_exports2.array(PromptLengthTrendSchema),
28392+
avgContextFillPercent: external_exports2.number().min(0).max(100),
28393+
// Burnout / Learning Dimension
28394+
repeatedMistakePatterns: external_exports2.array(RepeatedMistakePatternSchema),
28395+
knowledgeGaps: external_exports2.array(KnowledgeGapItemSchema),
28396+
// Insights
28397+
topInsights: external_exports2.array(external_exports2.string()).max(3),
28398+
// Overall Scores
28399+
overallSessionCraftScore: external_exports2.number().min(0).max(100),
28400+
confidenceScore: external_exports2.number().min(0).max(1),
28401+
summary: external_exports2.string().optional(),
28402+
// Domain-specific Strengths & Growth Areas
28403+
strengths: external_exports2.array(WorkerStrengthSchema).optional(),
28404+
growthAreas: external_exports2.array(WorkerGrowthSchema).optional(),
28405+
referencedInsights: external_exports2.array(ReferencedInsightSchema).optional()
28406+
});
28407+
var AbsenceIndicatorSchema = external_exports2.object({
28408+
/** Name of the anti-pattern being checked */
28409+
pattern: external_exports2.string(),
28410+
/** Whether the anti-pattern was observed */
28411+
present: external_exports2.boolean(),
28412+
/** Number of occurrences (0 = absent) */
28413+
occurrenceCount: external_exports2.number().int().min(0),
28414+
/** Total sessions checked */
28415+
sessionsChecked: external_exports2.number().int().min(1),
28416+
/** Score contribution: 0 (always present) to 100 (always absent) */
28417+
absenceScore: external_exports2.number().min(0).max(100),
28418+
/** Evidence of presence (empty if absent — which is good) */
28419+
evidence: external_exports2.array(EvidenceItemSchema),
28420+
/**
28421+
* Expert interpretation:
28422+
* - 'internalized': absence indicates mastery (skill no longer needs scaffolding)
28423+
* - 'not_applicable': pattern not relevant for this developer's workflow
28424+
* - 'concerning': absence may indicate gap (e.g., never verifies output)
28425+
*/
28426+
interpretation: external_exports2.enum(["internalized", "not_applicable", "concerning"])
28427+
});
28428+
var SessionCleanlinessSchema = external_exports2.object({
28429+
sessionId: external_exports2.string(),
28430+
cleanlinessScore: external_exports2.number().min(0).max(100),
28431+
antiPatternCount: external_exports2.number().int().min(0),
28432+
/** Indicates expert-level session control */
28433+
isCleanSession: external_exports2.boolean()
28434+
});
28435+
var SessionMasteryOutputSchema = external_exports2.object({
28436+
// Absence Indicators
28437+
absenceIndicators: external_exports2.array(AbsenceIndicatorSchema),
28438+
// Session Cleanliness
28439+
sessionCleanliness: external_exports2.array(SessionCleanlinessSchema),
28440+
cleanSessionPercentage: external_exports2.number().min(0).max(100),
28441+
// Expert Differentiation
28442+
scaffoldingDependencyScore: external_exports2.number().min(0).max(100),
28443+
internalizedSkillSignals: external_exports2.array(external_exports2.string()),
28444+
expertBehaviorIndicators: external_exports2.array(external_exports2.string()),
28445+
// Overall Scores
28446+
overallSessionMasteryScore: external_exports2.number().min(0).max(100),
28447+
confidenceScore: external_exports2.number().min(0).max(1),
28448+
summary: external_exports2.string().optional(),
28449+
// Domain-specific Strengths & Growth Areas
28450+
strengths: external_exports2.array(WorkerStrengthSchema).optional(),
28451+
growthAreas: external_exports2.array(WorkerGrowthSchema).optional(),
28452+
referencedInsights: external_exports2.array(ReferencedInsightSchema).optional()
28453+
});
2833928454

2834028455
// ../shared/dist/evaluation/canonical-analysis.js
2834128456
var DOMAIN_TO_EVALUATION_DIMENSION = {
@@ -29026,14 +29141,56 @@ function scoreControl(metrics) {
2902629141
const commandSignal = Math.min(uniqueCommands * 10 + totalCommands * 2, 100);
2902729142
return clampScore(rejectionSignal * 0.25 + questionSignal * 0.25 + lengthSignal * 0.25 + commandSignal * 0.25);
2902829143
}
29144+
function scoreAiPartnership(metrics) {
29145+
const thinking = scoreThinkingQuality(metrics);
29146+
const outcome = scoreSessionOutcome(metrics);
29147+
const control = scoreControl(metrics);
29148+
return clampScore(thinking * 0.4 + outcome * 0.35 + control * 0.25);
29149+
}
29150+
function scoreSessionCraft(metrics) {
29151+
const efficiency = scoreContextEfficiency(metrics);
29152+
const learning = scoreLearningBehavior(metrics);
29153+
return clampScore(efficiency * 0.55 + learning * 0.45);
29154+
}
29155+
function scoreSkillResilience(metrics) {
29156+
const totalSessions = Math.max(metrics.totalSessions, 1);
29157+
const totalUtterances = Math.max(metrics.totalDeveloperUtterances, 1);
29158+
const shortSessions = metrics.sessionHints?.shortSessions ?? 0;
29159+
const shortRatio = shortSessions / totalSessions;
29160+
const coldStartScore = bellCurveScore(shortRatio * 100, 20, 50, 3e-3);
29161+
const bareRetryRate = (metrics.frictionSignals?.bareRetryAfterErrorCount ?? 0) / totalUtterances;
29162+
const recoveryScore = invertedScale(bareRetryRate * 200);
29163+
const slashCmds = metrics.slashCommandCounts ?? {};
29164+
const uniqueCommands = Object.keys(slashCmds).length;
29165+
const diversityScore = Math.min(uniqueCommands * 12 + 20, 100);
29166+
return clampScore(coldStartScore * 0.3 + recoveryScore * 0.4 + diversityScore * 0.3);
29167+
}
29168+
function scoreSessionMastery(metrics) {
29169+
const totalSessions = Math.max(metrics.totalSessions, 1);
29170+
const totalUtterances = Math.max(metrics.totalDeveloperUtterances, 1);
29171+
const friction = metrics.frictionSignals;
29172+
const excessiveIterationRate = (friction?.excessiveIterationSessions ?? 0) / totalSessions;
29173+
const noExcessiveScore = invertedScale(excessiveIterationRate * 150);
29174+
const overflowRate = (metrics.contextFillExceeded90Count ?? 0) / totalSessions;
29175+
const noOverflowScore = invertedScale(overflowRate * 120);
29176+
const bareRetryRate = (friction?.bareRetryAfterErrorCount ?? 0) / totalUtterances;
29177+
const noRetryScore = invertedScale(bareRetryRate * 300);
29178+
const frustrationRate = (friction?.frustrationExpressionCount ?? 0) / totalUtterances;
29179+
const noFrustrationScore = invertedScale(frustrationRate * 400);
29180+
const toolFailureRate = (friction?.toolFailureCount ?? 0) / Math.max(metrics.totalMessages, 1);
29181+
const noToolFailureScore = invertedScale(toolFailureRate * 200);
29182+
const mediumSessions = metrics.sessionHints?.mediumSessions ?? 0;
29183+
const focusBonus = mediumSessions / totalSessions * 15;
29184+
return clampScore(noExcessiveScore * 0.25 + noOverflowScore * 0.2 + noRetryScore * 0.2 + noFrustrationScore * 0.15 + noToolFailureScore * 0.1 + focusBonus + 10);
29185+
}
2902929186
function computeDeterministicScores(phase1Output) {
2903029187
const metrics = phase1Output.sessionMetrics;
2903129188
return {
29032-
contextEfficiency: scoreContextEfficiency(metrics),
29033-
sessionOutcome: scoreSessionOutcome(metrics),
29034-
thinkingQuality: scoreThinkingQuality(metrics),
29035-
learningBehavior: scoreLearningBehavior(metrics),
29036-
communicationPatterns: scoreCommunicationPatterns(metrics, phase1Output),
29189+
aiPartnership: scoreAiPartnership(metrics),
29190+
sessionCraft: scoreSessionCraft(metrics),
29191+
toolMastery: scoreCommunicationPatterns(metrics, phase1Output),
29192+
skillResilience: scoreSkillResilience(metrics),
29193+
sessionMastery: scoreSessionMastery(metrics),
2903729194
controlScore: scoreControl(metrics)
2903829195
};
2903929196
}
@@ -29120,8 +29277,8 @@ function computeAffinities(scores, metrics, trendDensity) {
2912029277
const slashCmds = metrics.slashCommandCounts ?? {};
2912129278
const planCount = (slashCmds["plan"] ?? 0) + (slashCmds["review"] ?? 0);
2912229279
const planBonus = planCount > 0 ? Math.min(planCount * 8, 30) : 0;
29123-
const architectAffinity = scores.thinkingQuality * 0.5 + scores.controlScore * 0.3 + planBonus;
29124-
const analystAffinity = scores.thinkingQuality * 0.3 + scores.learningBehavior * 0.4 + scores.sessionOutcome * 0.2 + (metrics.questionRatio > 0.2 ? 10 : 0);
29280+
const architectAffinity = scores.aiPartnership * 0.5 + scores.controlScore * 0.3 + planBonus;
29281+
const analystAffinity = scores.aiPartnership * 0.3 + scores.sessionCraft * 0.4 + scores.sessionMastery * 0.2 + (metrics.questionRatio > 0.2 ? 10 : 0);
2912529282
const uniqueCommands = Object.keys(slashCmds).length;
2912629283
const totalCommands = Object.values(slashCmds).reduce((sum, c) => sum + c, 0);
2912729284
const orchestrationCmds = (slashCmds["sisyphus"] ?? 0) + (slashCmds["orchestrator"] ?? 0) + (slashCmds["ultrawork"] ?? 0) + (slashCmds["ralph-loop"] ?? 0);
@@ -29131,9 +29288,9 @@ function computeAffinities(scores, metrics, trendDensity) {
2913129288
const conductorAffinity = commandDiversityScore + commandVolumeScore + orchestrationBonus;
2913229289
const avgLen = metrics.avgDeveloperMessageLength;
2913329290
const concisenessScore = avgLen < 200 ? 40 : avgLen < 400 ? 25 : 10;
29134-
const speedrunnerAffinity = scores.contextEfficiency * 0.5 + concisenessScore + (scores.sessionOutcome > 70 ? 15 : 0);
29291+
const speedrunnerAffinity = scores.sessionCraft * 0.5 + concisenessScore + (scores.sessionMastery > 70 ? 15 : 0);
2913529292
const trendKeywordScore = trendDensity > 3 ? Math.min(trendDensity * 15, 60) : trendDensity * 5;
29136-
const learningCuriosityBonus = scores.learningBehavior > 70 ? 15 : 0;
29293+
const learningCuriosityBonus = scores.sessionCraft > 70 ? 15 : 0;
2913729294
const trendsetterAffinity = trendKeywordScore + learningCuriosityBonus;
2913829295
return {
2913929296
architect: architectAffinity,
@@ -29581,4 +29738,4 @@ export {
2958129738
clearAnalysisPending,
2958229739
shouldResumeRunningAnalysis
2958329740
};
29584-
//# sourceMappingURL=chunk-72GWNTBD.js.map
29741+
//# sourceMappingURL=chunk-SUEN2LKX.js.map

packages/plugin/dist/chunk-72GWNTBD.js.map renamed to packages/plugin/dist/chunk-SUEN2LKX.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/plugin/dist/hooks/post-session-handler.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/plugin/dist/hooks/session-start-handler.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/plugin/dist/mcp/server.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import {
33
readPrefs,
44
writePrefs
5-
} from "../chunk-NH4BKYT6.js";
5+
} from "../chunk-QFCYET5Y.js";
66
import {
77
$ZodObject,
88
$ZodType,
@@ -58,7 +58,7 @@ import {
5858
touchAnalysisHeartbeat,
5959
union,
6060
unknown
61-
} from "../chunk-72GWNTBD.js";
61+
} from "../chunk-SUEN2LKX.js";
6262
import {
6363
__commonJS,
6464
__require,

packages/plugin/skills/bp-analyze/SKILL.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ Each analysis skill runs as an **isolated Agent** rather than inline in this ses
3535

3636
## Agent Dispatch
3737

38-
When `get_run_progress` returns a `nextStep` with a non-null `skill`, dispatch it as an Agent using the Agent tool:
38+
**CRITICAL**: Use the built-in `Agent` tool from your tool list to dispatch skills. Do NOT use Bash to run `claude` CLI commands (e.g., `claude --model haiku -p "..."`). The Agent tool is a first-class tool like `Read` or `Bash` -- invoke it directly with the parameters listed below.
39+
40+
When `get_run_progress` returns a `nextStep` with a non-null `skill`, dispatch it as an Agent using the built-in `Agent` tool:
3941

4042
- **prompt**: `"You are executing a BetterPrompt analysis stage. Read the skill instructions at [PLUGIN_PATH]/skills/[SKILL_NAME]/SKILL.md and follow them exactly. You have access to BetterPrompt MCP tools (get_prompt_context, get_stage_output, save_stage_output, save_domain_results, etc.). Execute the complete skill workflow. When finished, report whether the stage completed successfully or failed (include the error message if failed). Do not delegate to other agents or tasks."`
4143
- **model**: Use the model from the Model Tiering table above

packages/plugin/skills/bp-setup/SKILL.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,12 @@ This is a one-time ask, tracked via `starAsked` in `~/.betterprompt/prefs.json`.
136136
> It helps others discover it: https://github.com/onlycastle/BetterPrompt"
137137
>
138138
> **"Sure, I'll star it"** / **"Skip"**
139-
3. Regardless of choice, write `starAsked: true` to prefs.
139+
3. If the user chose **"Sure, I'll star it"**, open the repo page in their browser:
140+
```bash
141+
open "https://github.com/onlycastle/BetterPrompt"
142+
```
143+
(Use `open` on macOS, `xdg-open` on Linux, `start` on Windows.)
144+
4. Regardless of choice, write `starAsked: true` to prefs.
140145
Prefer `save_user_prefs` instead of writing JSON manually.
141146

142147
### Step 6: Quick Reference + First Action
@@ -162,7 +167,7 @@ Then ask the user what they would like to do next.
162167
If the user already said to continue working or to run analysis now, follow that instruction directly and do not call `AskUserQuestion`.
163168

164169
Use `AskUserQuestion` with these options:
165-
- **"Run bp analyze now"** (Recommended) — dispatch `bp-analyze` as an **Agent** (not as an inline skill) so it starts with a clean context. Use the Agent tool with:
170+
- **"Run bp analyze now"** (Recommended) — dispatch `bp-analyze` as an **Agent** (not as an inline skill) so it starts with a clean context. Use Claude Code's built-in `Agent` tool (do NOT use Bash to run `claude` CLI):
166171
```
167172
model: sonnet
168173
description: "bp: analyze"

tests/unit/plugin/bp-analyze-agent-dispatch.test.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ describe('bp-analyze agent dispatch contract', () => {
3939
expect(content).toContain('must NOT internally spawn additional Agents or Tasks');
4040
});
4141

42+
it('explicitly warns against using Bash for agent dispatch', () => {
43+
expect(content).toContain('Do NOT use Bash');
44+
expect(content).toContain('built-in `Agent` tool');
45+
});
46+
4247
it('does not contain old single-session constraint language', () => {
4348
expect(content).not.toContain('single-session and deterministic');
4449
expect(content).not.toContain('Do NOT switch to `Agent`, `Task`, or any delegation mechanism');

0 commit comments

Comments
 (0)