From af0d17463e9c3872787396edfeea9708b3a8796a Mon Sep 17 00:00:00 2001
From: Jon Klein <kleintech@gmail.com>
Date: Wed, 13 May 2026 22:00:40 -0400
Subject: [PATCH] feat(content): add get_content_summary tool for audit
 workflows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Returns a minimal, fixed-shape summary (id, title, slug, status, link,
excerpt, modified date, taxonomy IDs, featured media, word count, Yoast
SEO fields) for audit and lookup workflows where the full WP REST
response is overkill — recipe posts can exceed 50KB because of rendered
Recipe Maker card HTML.

Look up by `id` (with optional `content_type`, defaulting to "post") or
by `url`. Bypasses the response trim added in PR #16 via the documented
`rawResponse: true` escape hatch so it can read yoast_head_json for the
SEO fields and Yoast's wordCount.

Also extracts the URL→post resolution from `find_content_by_url` into
a reusable `findContentByUrl` helper, dropping the duplicated update/
return logic along the way.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 README.md                    |  55 ++++++++++-
 src/tools/content-summary.ts | 174 +++++++++++++++++++++++++++++++++++
 src/tools/index.ts           |   9 +-
 src/tools/unified-content.ts | 162 +++++++++++---------------------
 4 files changed, 286 insertions(+), 114 deletions(-)
 create mode 100644 src/tools/content-summary.ts

diff --git a/README.md b/README.md
index e9ae91d..072eee4 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Manage multiple WordPress sites from a single MCP server:
 
 All content and taxonomy tools support an optional `site_id` parameter to target specific sites.
 
-### **Unified Content Management** (8 tools)
+### **Unified Content Management** (9 tools)
 
 Handles ALL content types (posts, pages, custom post types) with a single set of intelligent tools:
 
@@ -40,6 +40,7 @@ Handles ALL content types (posts, pages, custom post types) with a single set of
 - `discover_content_types`: Find all available content types on your site
 - `find_content_by_url`: Smart URL resolver that can find and optionally update content from any WordPress URL
 - `get_content_by_slug`: Search by slug across all content types
+- `get_content_summary`: Return a minimal summary (id, title, slug, status, excerpt, taxonomies, word count, Yoast SEO fields) for audit and lookup workflows. Look up by `id` or `url`.
 
 ### **Unified Taxonomy Management** (8 tools)
 
@@ -131,6 +132,58 @@ The `find_content_by_url` tool can:
 - Optionally update the content in a single operation
 - Works with posts, pages, and any custom post types
 
+#### Audit & Lookup Summaries
+
+The `get_content_summary` tool returns a minimal, fixed-shape representation of a single piece of content. It is designed for audit and lookup workflows where the full WP REST response — which can exceed 50KB on recipe posts because of the rendered Recipe Maker card HTML — is overkill.
+
+**Look up by ID** (with optional `content_type`, defaulting to `post`):
+
+```json
+{
+  "id": 4274,
+  "content_type": "post"
+}
+```
+
+**Look up by URL** (content type is detected from the URL):
+
+```json
+{
+  "url": "https://example.com/blog/easy-smoked-asparagus/"
+}
+```
+
+`id` and `url` are mutually exclusive — provide exactly one.
+
+The response shape is fixed:
+
+```json
+{
+  "id": 4274,
+  "title": "Easy Smoked Asparagus & Hot Honey",
+  "slug": "easy-smoked-asparagus",
+  "status": "publish",
+  "link": "https://example.com/blog/easy-smoked-asparagus/",
+  "excerpt": "Smoky asparagus with hot honey.",
+  "date_modified": "2026-04-30T10:14:00",
+  "categories": [12, 7],
+  "tags": [33],
+  "featured_media": 9012,
+  "word_count": 875,
+  "yoast_focus_keyword": "smoked asparagus",
+  "yoast_meta_title": "Easy Smoked Asparagus | Example",
+  "yoast_meta_description": "Smoky charred asparagus finished with chili-lime hot honey."
+}
+```
+
+Field notes:
+
+- `title` and `excerpt` are stripped to plain text (HTML tags removed, basic entities decoded).
+- `word_count` prefers `yoast_head_json.schema.@graph[].wordCount` when Yoast SEO is active; otherwise it is computed from the rendered post content with HTML stripped.
+- `yoast_meta_title` and `yoast_meta_description` are read from `yoast_head_json` on the post. They are `null` when Yoast SEO is not active.
+- `yoast_focus_keyword` is read from `meta._yoast_wpseo_focuskw`. WordPress core only exposes meta keys that are registered with `show_in_rest`, and Yoast SEO does not register this key by default — so this field will typically be `null` unless a companion plugin registers it (see PR #17 for context on the broader meta-key REST exposure issue).
+- This tool internally bypasses the response trimming added in PR #16 so it can read `yoast_head_json`. The trim still applies to all other tools.
+
 #### Universal Content Operations
 
 All content operations use a single `content_type` parameter:
diff --git a/src/tools/content-summary.ts b/src/tools/content-summary.ts
new file mode 100644
index 0000000..1e9c4fa
--- /dev/null
+++ b/src/tools/content-summary.ts
@@ -0,0 +1,174 @@
+// src/tools/content-summary.ts
+import { Tool } from '@modelcontextprotocol/sdk/types.js';
+import { z } from 'zod';
+import { makeWordPressRequest } from '../wordpress.js';
+import { findContentByUrl, getContentEndpoint } from './unified-content.js';
+
+const getContentSummarySchema = z.object({ // Input parameters accepted by the get_content_summary tool.
+  id: z.coerce.number().optional().describe( // Optional here: the "exactly one of id/url" rule is enforced in the handler, not by this schema.
+    "Content ID. Mutually exclusive with `url` — provide exactly one."
+  ),
+  url: z.string().optional().describe(
+    "Public URL of the content (e.g. https://site.com/blog/my-post/). Mutually exclusive with `id`."
+  ),
+  content_type: z.string().optional().default('post').describe( // Replaced by the detected type when the handler resolves a URL.
+    "Content type slug. Used only when looking up by `id`; when looking up by `url` the type is detected from the URL. Defaults to 'post'."
+  ),
+  site_id: z.string().optional().describe("Site ID (for multi-site setups)") // Passed through to the URL resolver, endpoint lookup, and REST request.
+});
+
+type GetContentSummaryParams = z.infer<typeof getContentSummarySchema>; // Static type of the handler's `params` argument.
+
+const HTML_TAG_REGEX = /<[^>]*>/g;
+const WHITESPACE_RUN_REGEX = /\s+/g;
+// Named basics plus decimal (&#8217;, &#038;) and hex (&#x2019;) references, which WordPress emits in rendered fields.
+const HTML_ENTITY_REGEX = /&(amp|lt|gt|quot|apos|nbsp|#\d{1,7}|#[xX][0-9a-fA-F]{1,6});/g;
+const NAMED_ENTITIES: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ' };
+
+/** Decodes the entities above in a single pass (so `&amp;lt;` becomes `&lt;`, not `<`); unknown or invalid references are left untouched. */
+function decodeBasicEntities(s: string): string {
+  return s.replace(HTML_ENTITY_REGEX, (match: string, entity: string) => {
+    if (!entity.startsWith('#')) return NAMED_ENTITIES[entity] ?? match;
+    const isHex = entity[1] === 'x' || entity[1] === 'X';
+    const codePoint = parseInt(entity.slice(isHex ? 2 : 1), isHex ? 16 : 10);
+    // Reject NUL, lone surrogates, and values above U+10FFFF (String.fromCodePoint throws on those).
+    const valid = codePoint > 0 && codePoint <= 0x10ffff && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
+    return valid ? String.fromCodePoint(codePoint) : match;
+  });
+}
+
+/** HTML to plain text: drops comments and <script>/<style> bodies (not prose; they would inflate word counts), strips tags, decodes basic entities, collapses whitespace. Returns '' for non-string or empty input. */
+export function htmlToPlainText(input: unknown): string {
+  if (typeof input !== 'string' || input.length === 0) return '';
+  const visible = input.replace(/<!--[\s\S]*?-->|<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ');
+  return decodeBasicEntities(visible.replace(HTML_TAG_REGEX, ' ')).replace(WHITESPACE_RUN_REGEX, ' ').trim();
+}
+
+/** Counts whitespace-delimited tokens in already-plain text; empty or all-whitespace input yields 0. */
+export function countWords(plainText: string): number {
+  return (plainText.match(/\S+/g) ?? []).length;
+}
+
+/**
+ * Finds Yoast's word count by scanning every node of `schema['@graph']` in
+ * `yoast_head_json` instead of assuming which node type carries it.
+ * Returns the first numeric `wordCount`, or null if the graph has none.
+ */
+export function extractYoastWordCount(yoastJson: any): number | null {
+  const nodes = yoastJson?.schema?.['@graph'];
+  if (!Array.isArray(nodes)) return null;
+  const carrier = nodes.find((entry) => entry && typeof entry.wordCount === 'number');
+  return carrier ? carrier.wordCount : null;
+}
+
+export interface ContentSummary { // Fixed-shape summary produced by buildContentSummary.
+  id: number; // 0 when the source post has no numeric id
+  title: string; // title.rendered converted to plain text
+  slug: string; // '' when not a string
+  status: string; // '' when not a string
+  link: string; // '' when not a string
+  excerpt: string; // excerpt.rendered converted to plain text
+  date_modified: string; // the post's `modified` value; '' when not a string
+  categories: number[]; // [] when the post has no categories array
+  tags: number[]; // [] when the post has no tags array
+  featured_media: number; // 0 when not a number
+  word_count: number; // Yoast's wordCount when available, else counted from content.rendered
+  yoast_focus_keyword: string | null; // meta._yoast_wpseo_focuskw; null when absent or empty
+  yoast_meta_title: string | null; // yoast_head_json.title; null when absent
+  yoast_meta_description: string | null; // yoast_head_json.description; null when absent
+}
+
+/** Maps a raw WP REST post object onto the fixed ContentSummary shape; missing or wrongly-typed fields become ''/0/[]/null. */
+export function buildContentSummary(post: any): ContentSummary {
+  const yoast = post?.yoast_head_json;
+  const focusKw = post?.meta?._yoast_wpseo_focuskw;
+  const str = (v: unknown): string => (typeof v === 'string' ? v : '');
+  const num = (v: unknown): number => (typeof v === 'number' ? v : 0);
+  return {
+    id: num(post?.id),
+    title: htmlToPlainText(post?.title?.rendered),
+    slug: str(post?.slug),
+    status: str(post?.status),
+    link: str(post?.link),
+    excerpt: htmlToPlainText(post?.excerpt?.rendered),
+    date_modified: str(post?.modified),
+    categories: Array.isArray(post?.categories) ? post.categories : [],
+    tags: Array.isArray(post?.tags) ? post.tags : [],
+    featured_media: num(post?.featured_media),
+    word_count: extractYoastWordCount(yoast) ?? countWords(htmlToPlainText(post?.content?.rendered)), // `??` short-circuits: the rendered body is only stripped when Yoast gives no count
+    yoast_focus_keyword: typeof focusKw === 'string' && focusKw.length > 0 ? focusKw : null,
+    yoast_meta_title: typeof yoast?.title === 'string' ? yoast.title : null,
+    yoast_meta_description: typeof yoast?.description === 'string' ? yoast.description : null,
+  };
+}
+
+export const contentSummaryTools: Tool[] = [ // Tool definitions; src/tools/index.ts spreads these into allTools.
+  {
+    name: "get_content_summary", // Same name as the handler key in contentSummaryHandlers.
+    description:
+      "Returns a minimal summary of a single piece of content — id, title, slug, status, link, excerpt, modified date, taxonomy IDs, featured media, word count, and Yoast SEO fields. Designed for audit and lookup workflows where the full WP REST response (which can exceed 50KB on recipe posts) is overkill. Look up by `id` (with optional `content_type`, defaulting to 'post') or by `url`.",
+    inputSchema: { type: "object", properties: getContentSummarySchema.shape } // properties are taken directly from the zod schema's shape
+  }
+];
+
+export const contentSummaryHandlers = {
+  /**
+   * Handles `get_content_summary`: resolves the target from exactly one of `id` / `url`, fetches it
+   * untrimmed, and returns the JSON summary as a text result. Failures are returned as `isError: true` results.
+   */
+  get_content_summary: async (params: GetContentSummaryParams) => {
+    try {
+      const url = typeof params.url === 'string' && params.url.length > 0 ? params.url : undefined;
+      const hasId = params.id !== undefined && params.id !== null;
+
+      if (hasId && url !== undefined) {
+        throw new Error("Provide exactly one of `id` or `url`, not both.");
+      }
+      if (!hasId && url === undefined) {
+        throw new Error("Provide one of `id` or `url`.");
+      }
+
+      let contentType = params.content_type ?? 'post';
+      let id: number;
+
+      if (url !== undefined) {
+        const ref = await findContentByUrl(url, params.site_id);
+        if (!ref) {
+          throw new Error(`No content found with URL: ${url}`);
+        }
+        contentType = ref.contentType;
+        id = ref.content.id;
+      } else {
+        // `id` is interpolated into the request path below, so accept only positive integers (numeric strings included).
+        id = Number(params.id);
+        if (!Number.isInteger(id) || id <= 0) {
+          throw new Error("`id` must be a positive integer.");
+        }
+      }
+
+      const endpoint = await getContentEndpoint(contentType, params.site_id);
+      // rawResponse bypasses the default response trimming, which would otherwise
+      // strip yoast_head_json before buildContentSummary can read it.
+      const raw = await makeWordPressRequest('GET', `${endpoint}/${id}`, undefined, {
+        siteId: params.site_id,
+        rawResponse: true
+      });
+
+      return {
+        toolResult: {
+          content: [{ type: 'text', text: JSON.stringify(buildContentSummary(raw.data), null, 2) }],
+          isError: false
+        }
+      };
+    } catch (error: unknown) {
+      // Non-Error throwables have no `.message`; stringify them rather than reporting "undefined".
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        toolResult: {
+          content: [{ type: 'text', text: `Error getting content summary: ${message}` }],
+          isError: true
+        }
+      };
+    }
+  }
+};
diff --git a/src/tools/index.ts b/src/tools/index.ts
index 1d5edbf..0e92195 100644
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -9,8 +9,9 @@ import { pluginRepositoryTools, pluginRepositoryHandlers } from './plugin-reposi
 import { commentTools, commentHandlers } from './comments.js';
 import { sqlQueryTools, sqlQueryHandlers } from './sql-query.js';
 import { siteManagementTools, siteManagementHandlers } from './site-management.js';
+import { contentSummaryTools, contentSummaryHandlers } from './content-summary.js';
 
-// Combine all tools - significantly reduced from ~65 to ~42 tools
+// Combine all tools
 export const allTools: Tool[] = [
   ...unifiedContentTools,        // 8 tools (replaces posts, pages, custom-post-types)
   ...unifiedTaxonomyTools,       // 8 tools (replaces categories, custom-taxonomies)
@@ -20,7 +21,8 @@ export const allTools: Tool[] = [
   ...pluginRepositoryTools,     // ~2 tools
   ...commentTools,              // ~5 tools
   ...sqlQueryTools,             // 1 tool (database queries)
-  ...siteManagementTools        // 3 tools (multi-site support)
+  ...siteManagementTools,       // 3 tools (multi-site support)
+  ...contentSummaryTools        // 1 tool (audit/lookup summary)
 ];
 
 // Combine all handlers
@@ -33,5 +35,6 @@ export const toolHandlers = {
   ...pluginRepositoryHandlers,
   ...commentHandlers,
   ...sqlQueryHandlers,
-  ...siteManagementHandlers
+  ...siteManagementHandlers,
+  ...contentSummaryHandlers
 };
\ No newline at end of file
diff --git a/src/tools/unified-content.ts b/src/tools/unified-content.ts
index 3bb4302..f909319 100644
--- a/src/tools/unified-content.ts
+++ b/src/tools/unified-content.ts
@@ -93,7 +93,7 @@ async function getPostTypes(forceRefresh = false, siteId?: string) {
 }
 
 // Helper function to get the correct endpoint for a content type
-async function getContentEndpoint(contentType: string, siteId?: string): Promise<string> {
+export async function getContentEndpoint(contentType: string, siteId?: string): Promise<string> {
   // Quick return for standard types
   const standardMap: Record<string, string> = {
     'post': 'posts',
@@ -208,6 +208,51 @@ async function findContentAcrossTypes(slug: string, contentTypes?: string[], sit
   return null;
 }
 
+// Maps a lower-cased URL path hint to the content type slugs findContentByUrl searches first, ahead of its 'post' and 'page' defaults.
+const URL_PATH_TYPE_HINTS: Record<string, string[]> = {
+  'documentation': ['documentation', 'docs', 'doc'],
+  'docs': ['documentation', 'docs', 'doc'],
+  'products': ['product'],
+  'portfolio': ['portfolio', 'project'],
+  'services': ['service'],
+  'testimonials': ['testimonial'],
+  'team': ['team_member', 'staff'],
+  'events': ['event'],
+  'courses': ['course', 'lesson']
+};
+
+/**
+ * Resolves a public WordPress URL to its content item. The slug and path
+ * hints come from `parseUrl`; content types mapped from the hints are tried
+ * first (followed by 'post' and 'page'), then all content types.
+ *
+ * @returns the match with its content type, or null when nothing matches.
+ * @throws Error when no slug can be extracted from the URL.
+ */
+export async function findContentByUrl(
+  url: string,
+  siteId?: string
+): Promise<{ content: any; contentType: string } | null> {
+  const { slug, pathHints } = parseUrl(url);
+  if (!slug) {
+    throw new Error('Could not extract slug from URL');
+  }
+
+  const priorityTypes: string[] = [];
+  for (const hint of pathHints) {
+    const key = hint.toLowerCase();
+    // Own-property check: a hint such as "constructor" or "__proto__" would otherwise
+    // resolve to an Object.prototype member, and spreading that throws a TypeError.
+    if (Object.prototype.hasOwnProperty.call(URL_PATH_TYPE_HINTS, key)) {
+      priorityTypes.push(...(URL_PATH_TYPE_HINTS[key] ?? []));
+    }
+  }
+  const typesToSearch = [...new Set([...priorityTypes, 'post', 'page'])];
+  const result = await findContentAcrossTypes(slug, typesToSearch, siteId);
+  if (result) return result;
+  return findContentAcrossTypes(slug, undefined, siteId);
+}
+
 // Content format types
 type ContentFormat = 'auto' | 'markdown' | 'html' | 'blocks';
 type DetectedFormat = 'blocks' | 'html' | 'markdown' | 'text';
@@ -834,123 +879,20 @@ export const unifiedContentHandlers = {
 
   find_content_by_url: async (params: FindContentByUrlParams) => {
     try {
-      const { slug, pathHints } = parseUrl(params.url);
-      
-      if (!slug) {
-        throw new Error('Could not extract slug from URL');
-      }
-      
-      logToFile(`Searching for content with slug: ${slug}, path hints: ${pathHints.join('/')}`);
-      
-      // Try to guess content types based on URL structure
-      const priorityTypes: string[] = [];
-      
-      // Common URL patterns to content type mappings
-      const pathMappings: Record<string, string[]> = {
-        'documentation': ['documentation', 'docs', 'doc'],
-        'docs': ['documentation', 'docs', 'doc'],
-        'products': ['product'],
-        'portfolio': ['portfolio', 'project'],
-        'services': ['service'],
-        'testimonials': ['testimonial'],
-        'team': ['team_member', 'staff'],
-        'events': ['event'],
-        'courses': ['course', 'lesson']
-      };
-      
-      // Check path hints for potential content types
-      for (const hint of pathHints) {
-        const mappedTypes = pathMappings[hint.toLowerCase()];
-        if (mappedTypes) {
-          priorityTypes.push(...mappedTypes);
-        }
-      }
-      
-      // Always check standard content types as fallback
-      priorityTypes.push('post', 'page');
-      
-      // Remove duplicates
-      const typesToSearch = [...new Set(priorityTypes)];
-      
-      // Find the content
-      const result = await findContentAcrossTypes(slug, typesToSearch, params.site_id);
-      
-      if (!result) {
-        // If not found in priority types, search all types
-        const allResult = await findContentAcrossTypes(slug, undefined, params.site_id);
-        if (!allResult) {
-          throw new Error(`No content found with URL: ${params.url}`);
-        }
-        
-        const { content, contentType } = allResult;
-        
-        // Update if requested
-        if (params.update_fields) {
-          const endpoint = await getContentEndpoint(contentType, params.site_id);
-
-          const updateData: any = {};
-          if (params.update_fields.title !== undefined) updateData.title = params.update_fields.title;
-          if (params.update_fields.content !== undefined) {
-            // Process content format (markdown -> HTML, optional block conversion)
-            updateData.content = await processContent(
-              params.update_fields.content,
-              params.update_fields.content_format || 'auto',
-              params.update_fields.convert_to_blocks || false
-            );
-          }
-          if (params.update_fields.status !== undefined) updateData.status = params.update_fields.status;
-          if (params.update_fields.meta !== undefined) updateData.meta = params.update_fields.meta;
-          if (params.update_fields.custom_fields !== undefined) {
-            Object.assign(updateData, params.update_fields.custom_fields);
-          }
+      const result = await findContentByUrl(params.url, params.site_id);
 
-          const updatedContent = await makeWordPressRequest('POST', `${endpoint}/${content.id}`, updateData, { siteId: params.site_id });
-
-          return {
-            toolResult: {
-              content: [{
-                type: 'text',
-                text: JSON.stringify({
-                  found: true,
-                  content_type: contentType,
-                  content_id: content.id,
-                  original_url: params.url,
-                  updated: true,
-                  content: updatedContent
-                }, null, 2)
-              }],
-              isError: false
-            }
-          };
-        }
-
-        return {
-          toolResult: {
-            content: [{
-              type: 'text',
-              text: JSON.stringify({
-                found: true,
-                content_type: contentType,
-                content_id: content.id,
-                original_url: params.url,
-                content: content
-              }, null, 2)
-            }],
-            isError: false
-          }
-        };
+      if (!result) {
+        throw new Error(`No content found with URL: ${params.url}`);
       }
 
       const { content, contentType } = result;
 
-      // Update if requested
       if (params.update_fields) {
         const endpoint = await getContentEndpoint(contentType, params.site_id);
 
         const updateData: any = {};
         if (params.update_fields.title !== undefined) updateData.title = params.update_fields.title;
         if (params.update_fields.content !== undefined) {
-          // Process content format (markdown -> HTML, optional block conversion)
           updateData.content = await processContent(
             params.update_fields.content,
             params.update_fields.content_format || 'auto',
@@ -1001,9 +943,9 @@ export const unifiedContentHandlers = {
     } catch (error: any) {
       return {
         toolResult: {
-          content: [{ 
-            type: 'text', 
-            text: `Error finding content by URL: ${error.message}` 
+          content: [{
+            type: 'text',
+            text: `Error finding content by URL: ${error.message}`
           }],
           isError: true
         }