/**
 * LLM Span Example
 *
 * Demonstrates how to create LLM (Large Language Model) spans with the AMP SDK.
 * LLM spans track AI model interactions including chat completions, embeddings,
 * and other generative AI operations.
 *
 * Usage:
 *   AMP_API_KEY=your-api-key npx ts-node examples/llm-span.ts
 *   AMP_API_KEY=your-api-key AMP_BASE_URL=https://your-server.com npx ts-node examples/llm-span.ts
 */
import { AMP } from '@koreaiinc/amp-sdk';

// =============================================================================
// CONFIGURATION
// =============================================================================
// API key is required - get from environment variable
const API_KEY = process.env.AMP_API_KEY;
if (!API_KEY) {
  console.error('Error: AMP_API_KEY environment variable is required');
  console.error('Usage: AMP_API_KEY=your-api-key npx ts-node examples/llm-span.ts');
  process.exit(1);
}

// Base URL is optional - defaults to https://amp.kore.ai
const BASE_URL = process.env.AMP_BASE_URL;

// Helper to simulate async operations (for realistic timestamps)
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// =============================================================================
// MAIN EXAMPLE
// =============================================================================
async function main() {
  console.log('LLM Span Example\n');

  // Initialize AMP SDK
  const amp = new AMP({
    apiKey: API_KEY,
    ...(BASE_URL && { baseURL: BASE_URL }),
    debug: true, // Set to false in production
  });

  // Session ID groups related traces together (e.g., a conversation, user session)
  // You can use your own session ID or let AMP generate one
  const sessionId = process.env.SESSION_ID || `llm-example-${Date.now()}`;

  // ---------------------------------------------------------------------------
  // Create a trace with an LLM span
  // ---------------------------------------------------------------------------
  // trace() creates a new trace - the top-level container for spans
  // Parameters:
  //   name: string - Descriptive name for this trace
  //   options.sessionId - Groups traces into a session (optional but recommended)
  const trace = amp.trace('chat-completion', { sessionId });

  // startLLMSpan() creates an LLM span with provider and model pre-set
  // Parameters:
  //   name: string - Display name for this LLM call
  //   provider: string - AI provider (openai, anthropic, google, azure, etc.)
  //   model: string - Model identifier (gpt-4, claude-3, gemini-pro, etc.)
  const llmSpan = trace.startLLMSpan('Chat Completion', 'openai', 'gpt-4-turbo');

  // ---------------------------------------------------------------------------
  // Set token usage
  // ---------------------------------------------------------------------------
  // setTokens() records input (prompt) and output (completion) token counts
  // Parameters:
  //   inputTokens: number - Tokens in the prompt/input
  //   outputTokens: number - Tokens in the completion/output
  // The SDK automatically calculates total_tokens
  llmSpan.setTokens(256, 128);

  // ---------------------------------------------------------------------------
  // Set LLM request parameters
  // ---------------------------------------------------------------------------
  // setLLMParams() sets common model parameters
  // All parameters are optional - only set what you use
  llmSpan.setLLMParams({
    temperature: 0.7,       // Randomness (0.0 = deterministic, 2.0 = very random)
    topP: 0.9,              // Nucleus sampling threshold
    maxTokens: 1000,        // Maximum tokens to generate
    frequencyPenalty: 0.0,  // Penalize repeated tokens (-2.0 to 2.0)
    presencePenalty: 0.0,   // Penalize tokens already present (-2.0 to 2.0)
    stopSequences: ['END'], // Stop generation at these sequences
  });

  // ---------------------------------------------------------------------------
  // Set operation type
  // ---------------------------------------------------------------------------
  // setOperation() specifies the type of LLM operation
  // Common values:
  //   'chat' - Chat completion (most common)
  //   'text_completion' - Text completion
  //   'embeddings' - Text embeddings
  llmSpan.setOperation('chat');

  // ---------------------------------------------------------------------------
  // Set input and output messages
  // ---------------------------------------------------------------------------
  // setMessages() records the conversation messages
  // Parameters:
  //   inputMessages: array - Messages sent to the model
  //   outputMessages: array - Messages received from the model
  //
  // Message format: { role: string, content: string, name?: string }
  // Roles: 'system', 'user', 'assistant', 'tool'
  llmSpan.setMessages(
    // Input messages - what was sent to the model
    [
      { role: 'system', content: 'You are a helpful AI assistant specialized in explaining technical concepts.' },
      { role: 'user', content: 'Explain quantum computing in simple terms' }
    ],
    // Output messages - what the model returned
    [
      { role: 'assistant', content: 'Quantum computing uses quantum bits (qubits) that can exist in multiple states simultaneously, unlike classical bits that are either 0 or 1. This allows quantum computers to process many possibilities at once, making them potentially much faster for certain types of problems like cryptography and simulation.' }
    ]
  );
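
  // Alternative: setSystemPrompt() records the system prompt separately from
  // the message arrays (see the method reference at the bottom of this file).
  // A minimal sketch - uncomment to use this style instead of a 'system' message:
  // llmSpan.setSystemPrompt('You are a helpful AI assistant specialized in explaining technical concepts.');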

  // ---------------------------------------------------------------------------
  // Set response metadata
  // ---------------------------------------------------------------------------
  // setLLMResponse() records completion metadata
  // Parameters:
  //   finishReason: string - Why generation stopped
  //     'stop' - Natural completion
  //     'length' - Hit max_tokens limit
  //     'content_filter' - Blocked by content filter
  //     'tool_calls' - Model wants to call a tool
  //   responseId?: string - Provider's response ID (optional)
  llmSpan.setLLMResponse('stop', 'chatcmpl-abc123xyz');

  // ---------------------------------------------------------------------------
  // Optional: Set cost and additional metadata
  // ---------------------------------------------------------------------------
  // setCost() records the cost of this LLM call in USD
  // Useful for tracking spend across models
  llmSpan.setCost(0.0082);

  // setConversationId() links this span to a conversation thread
  // Use when you need to track multi-turn conversations
  llmSpan.setConversationId('conv-12345');

  // setAttribute() for any custom attributes
  llmSpan.setAttribute('framework', 'langchain');
  llmSpan.setAttribute('service.name', 'my-chatbot');

  // Performance metric - latency in milliseconds
  llmSpan.setAttribute('latency_ms', 1250);

  // ---------------------------------------------------------------------------
  // Simulate LLM call time (for realistic timestamps)
  // In real usage, this would be your actual LLM API call
  // ---------------------------------------------------------------------------
  await delay(1250); // Simulates ~1.25s LLM response time
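
  // For reference, with a real provider the delay() above would be replaced by
  // the actual API call. An illustrative sketch using the official `openai`
  // npm package (not a dependency of this example; the package and response
  // fields are assumptions about OpenAI's Node SDK, not part of the AMP SDK):
  //
  //   // at the top of the file: import OpenAI from 'openai';
  //   const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment
  //   const completion = await openai.chat.completions.create({
  //     model: 'gpt-4-turbo',
  //     messages: [{ role: 'user', content: 'Explain quantum computing in simple terms' }],
  //   });
  //   llmSpan.setTokens(completion.usage?.prompt_tokens ?? 0, completion.usage?.completion_tokens ?? 0);
  //   llmSpan.setLLMResponse(completion.choices[0].finish_reason, completion.id);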

  // ---------------------------------------------------------------------------
  // End the span and trace
  // Timestamps are captured automatically:
  //   - start_time: when span/trace was created
  //   - end_time: when end() is called
  // ---------------------------------------------------------------------------
  // Always end spans when the operation completes
  llmSpan.end();

  // End the trace when all spans are complete
  trace.end();

  // ---------------------------------------------------------------------------
  // Flush and shutdown
  // ---------------------------------------------------------------------------
  // flush() sends all queued traces to the server
  await amp.flush();

  console.log(`\nSession ID: ${sessionId}`);
  console.log('Check this session in the AMP UI to see the LLM span');

  // shutdown() cleanly closes the SDK (flushes remaining data)
  await amp.shutdown();
}

// =============================================================================
// AVAILABLE LLM SPAN METHODS REFERENCE
// =============================================================================
/*
startLLMSpan(name, provider, model)
  - Create an LLM span with provider/model set

setLLM(provider, model, responseModel?)
  - Set/change LLM provider and model
  - responseModel is the actual model used (if different)

setTokens(inputTokens, outputTokens)
  - Set token usage counts
  - Automatically calculates total_tokens

setLLMParams({...}) - Set request parameters
  temperature, topP, maxTokens, frequencyPenalty,
  presencePenalty, stopSequences

setOperation(operation) - Set operation type
  'chat', 'text_completion', 'embeddings'

setMessages(input[], output[]) - Set input/output messages (RECOMMENDED)
  Message: { role, content, name? }
  Sets BOTH standard formats automatically:
    - OTEL GenAI: gen_ai.input.messages, gen_ai.output.messages
    - OpenInference: llm.input_messages, llm.output_messages

setSystemPrompt(prompt) - Set system prompt separately
  Sets BOTH standard formats:
    - OTEL GenAI: gen_ai.system_instructions
    - OpenInference: llm.system_instructions

setLLMResponse(finishReason, responseId?)
  - Set completion metadata
  - finishReason: 'stop' | 'length' | 'content_filter' | 'tool_calls'

setCost(costUsd) - Set cost in USD
setConversationId(id) - Link to conversation thread
setAttribute(key, value) - Set custom attribute

recordPrompt(content) - Record prompt as OTEL event (alternative to setMessages)
  Event name: gen_ai.content.prompt
recordCompletion(content) - Record completion as OTEL event
  Event name: gen_ai.content.completion

end() - Mark span as complete
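
Example (a minimal sketch based only on the signatures above; the provider
and model IDs are illustrative):

  const span = trace.startLLMSpan('Completion', 'anthropic', 'claude-3');
  span.setLLM('anthropic', 'claude-3', 'claude-3-sonnet');
  span.recordPrompt('Explain quantum computing in simple terms');
  span.recordCompletion('Quantum computing uses qubits...');
  span.end();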
-------------------------------------------------------------------------
SUPPORTED ATTRIBUTES (OTEL GenAI Standard):
-------------------------------------------------------------------------
gen_ai.provider.name - Provider (openai, anthropic, etc.)
gen_ai.system - Provider (legacy, use provider.name)
gen_ai.request.model - Requested model
gen_ai.response.model - Actual model used
gen_ai.usage.input_tokens - Input token count
gen_ai.usage.output_tokens - Output token count
gen_ai.usage.total_tokens - Total token count
gen_ai.request.temperature - Temperature setting
gen_ai.request.top_p - Top-p sampling
gen_ai.request.max_tokens - Max tokens limit
gen_ai.operation.name - Operation type (chat, embeddings, etc.)
gen_ai.response.finish_reason - Why generation stopped
gen_ai.response.id - Provider's response ID
gen_ai.conversation.id - Conversation thread ID
gen_ai.input.messages - Input messages (OTEL GenAI standard)
gen_ai.output.messages - Output messages (OTEL GenAI standard)
gen_ai.system_instructions - System prompt (OTEL GenAI standard)
-------------------------------------------------------------------------
SUPPORTED ATTRIBUTES (OpenInference Standard - Arize/Phoenix):
-------------------------------------------------------------------------
llm.input_messages - Input messages (OpenInference)
llm.output_messages - Output messages (OpenInference)
llm.system_instructions - System prompt (OpenInference)
-------------------------------------------------------------------------
SUPPORTED ATTRIBUTES (AMP Custom):
-------------------------------------------------------------------------
span_cost_usd - Cost in USD
latency_ms - Execution time in ms
framework - Framework (langchain, etc.)
service.name - Service name
service.version - Service version
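
Any of the attributes above can be set directly with setAttribute().
A short sketch (the values are illustrative):

  llmSpan.setAttribute('gen_ai.response.model', 'gpt-4-turbo-2024-04-09');
  llmSpan.setAttribute('service.version', '1.2.0');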
-------------------------------------------------------------------------
AUTOMATIC TIMESTAMPS:
-------------------------------------------------------------------------
start_time - Captured when span is created
end_time - Captured when end() is called
duration_ms - Calculated by backend from timestamps
*/

main().catch(console.error);