
Commit d0ef922

concurrency implementation for llama.cpp (#14)

1 parent 4b0463b, commit d0ef922
5 files changed: 75 additions & 20 deletions

File tree

- README.md
- caption.yaml
- caption_openai.py
- ui/src/App.js
- ui/src/components/ConfigForm.js

README.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -182,3 +182,5 @@ Mostly self explanatory. Paste in the path to the directory you want processed
 For instance, if you are writing webscrapers, make sure to collect metadata from the webpage as you go rather than blindly downloading each image. Perhaps you might include the website address or full URI of the webpage, the `<title>` tag from the webpage, or the `alt-text` field. Save this information with each image, or in a database. Then feed it into the VLM with a `hint_source`. New hint sources are very easy for an amateur Python programmer to write, or you can have an LLM write one for you.
 
 See [HINTSOURCES.md](HINTSOURCES.md) for more information.
+
+- **Batch concurrency**: If you use a VLM host that supports batch concurrency, such as llama.cpp (via the `-np n` argument), you can potentially increase speed. This is not supported by LM Studio. For example, `llama-server -np 4 -c 32768 --mmproj "mmproj-Qwen3-VL-32B-Instruct-F16.gguf" --model "Qwen3-VL-32B-Instruct-Q4_K_M.gguf" -dev cuda0 --top-k 30 --top-p 0.95 --min-p 0.05 --temp 0.5` launches Qwen3-VL 32B with four concurrent slots (`-np 4`), each with 8192 tokens of context (32768 / 4). This requires additional processing power and a larger total context size (`-np 4 -c 32768` instead of `-np 1 -c 8192`, for example), but may increase total token generation speed through batch processing. _This feature does not use the OpenAI JSONL batch API that commercial APIs offer for cost savings, but it should still speed up throughput._
```
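To make the new README guidance concrete, here is a minimal sketch (not part of this commit) of sending several caption requests concurrently to a llama.cpp server started with `-np 4`; the endpoint URL, model name, prompt, and image paths are placeholders:

```python
# Minimal sketch: fire several vision requests concurrently against a llama.cpp
# server launched with e.g. `llama-server -np 4 ...`. The base_url, model name,
# and image paths are placeholders, not values taken from this repository.
import asyncio
import base64

import openai


async def caption_one(client: openai.AsyncOpenAI, image_path: str) -> str:
    # Encode the image as a data URL, as expected by the OpenAI-compatible chat API.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    response = await client.chat.completions.create(
        model="qwen3-vl",  # placeholder model name
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image factually."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }],
        max_tokens=256,
    )
    return response.choices[0].message.content or ""


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://127.0.0.1:8080/v1", api_key="none")
    image_paths = ["a.jpg", "b.jpg", "c.jpg", "d.jpg"]  # placeholder paths
    # With -np 4 on the server, these four requests can be decoded in parallel slots.
    captions = await asyncio.gather(*(caption_one(client, p) for p in image_paths))
    for path, caption in zip(image_paths, captions):
        print(path, "->", caption)


if __name__ == "__main__":
    asyncio.run(main())
```

With a single slot (`-np 1`) the same four requests would simply queue on the server, so any speedup depends on the host actually being launched with batch concurrency enabled.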

caption.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -7,6 +7,7 @@ api_key_env_vars:
 model: llama-4-scout-17b-16e-instruct
 api_key: ''
 max_tokens: 16384
+concurrent_batch_size: 1
 system_prompt: You are to analyze an image
   and provide information based on what is visible in the image. Do not embellish,
   and avoid language like 'showcases' or 'features,' preferring to focus on factual
```

caption_openai.py

Lines changed: 44 additions & 16 deletions
```diff
@@ -142,18 +142,47 @@ async def process_image(client: openai.AsyncOpenAI, image_path, conf) -> Tuple[s
     messages = remove_base64_image(messages)
     return final_summary_response, json.dumps(messages, indent=2), prompt_tokens_usage, completion_tokens_usage
 
+async def process_batch(client: openai.AsyncOpenAI, image_paths: list, conf) -> Tuple[int, int]:
+    """Process a batch of images concurrently and return aggregated token usage."""
+    tasks = [process_image(client, image_path, conf) for image_path in image_paths]
+
+    # Process all images in the batch concurrently
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    batch_prompt_tokens = 0
+    batch_completion_tokens = 0
+
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            print(filter_ascii(f"Error processing {image_paths[i]}: {result}"))
+        else:
+            caption_text, chat_history, prompt_token_usage, completion_token_usage = result  # type: ignore
+
+            await save_caption(file_path=image_paths[i], caption_text=caption_text, debug_info=chat_history)
+
+            batch_prompt_tokens += prompt_token_usage
+            batch_completion_tokens += completion_token_usage
+
+            print(filter_ascii(f" --> Processed {image_paths[i]}"))
+            print(f" --> prompt_token_usage: {prompt_token_usage}, completion_token_usage: {completion_token_usage}")
+
+    return batch_prompt_tokens, batch_completion_tokens
+
 async def main():
     import hints.registration as registration
     registration._validate_hint_sources()
 
     conf = OmegaConf.load("caption.yaml")
+
+    concurrent_batch_size = conf.concurrent_batch_size
 
     if conf.get("global_metadata_file"):  # type: ignore
         async with aiofiles.open(conf.global_metadata_file) as f:
             global_metadata = await f.read()
             conf.system_prompt = f"{global_metadata}\n{conf.system_prompt}"
 
     print(filter_ascii(f" -> SYSTEM PROMPT:\n{conf.system_prompt}\n"))
+    print(filter_ascii(f" -> CONCURRENT BATCH SIZE: {concurrent_batch_size}\n"))
 
     api_key = resolve_api_key(conf)
 
@@ -162,32 +191,31 @@ async def main():
     aggregated_prompt_token_usage = 0
     aggregated_completion_token_usage = 0
 
+    # Collect images in batches for concurrent processing
+    batch = []
     async for image_path in image_walk(conf.base_directory, recursive=conf.recursive, skip_if_txt_exists=conf.skip_if_txt_exists):
         current_task = asyncio.current_task()
         if current_task is not None and current_task.cancelled():
             print("Captioning task was cancelled by user")
             return
 
-        print(filter_ascii(f"\nProcessing {image_path}"))
-        try:
-            start_time = time.perf_counter()
-            caption_text, chat_history, prompt_token_usage, completion_token_usage = await process_image(client, image_path, conf)
-            total_time = (time.perf_counter() - start_time)
-        except openai.APIConnectionError as e:
-            print(f"{e}\nAPI Error. Check that your service is running and caption.yaml has the correct base_url")
-        except asyncio.CancelledError:
-            print("Captioning task was cancelled during image processing")
-            raise
+        batch.append(image_path)
 
-        aggregated_prompt_token_usage += prompt_token_usage
-        aggregated_completion_token_usage += aggregated_completion_token_usage
+        if len(batch) >= concurrent_batch_size:
+            print(filter_ascii(f"\nProcessing batch of {len(batch)} images:"))
+            start_time = time.perf_counter()
+
+            batch_prompt_tokens, batch_completion_tokens = await process_batch(client, batch, conf)
+
+            batch_time = (time.perf_counter() - start_time)
+            print(filter_ascii(f" --> Batch completed in {batch_time:.2f}s, {batch_time/concurrent_batch_size:.2f}s per image"))
+
+            aggregated_prompt_token_usage += batch_prompt_tokens
+            aggregated_completion_token_usage += batch_completion_tokens
+            batch = []
 
-        print(filter_ascii(f" --> Took {total_time:.2f}s, Final caption:\n{caption_text}"))
-        print(f" --> prompt_token_usage: {prompt_token_usage}, completion_token_usage: {completion_token_usage}")
-        await save_caption(file_path=image_path, caption_text=caption_text, debug_info=chat_history)
 
     print(F" -> JOB COMPLETE.")
-    # not working?
     print(f"aggregated_prompt_token_usage: {aggregated_prompt_token_usage}, aggregated_completion_token_usage: {aggregated_completion_token_usage}")
 
 if __name__ == "__main__":
```
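The new `process_batch` helper relies on `asyncio.gather(*tasks, return_exceptions=True)`, which waits for every task and returns exceptions as items in the results list instead of raising them, so one failed image does not abort the rest of the batch. A minimal standalone sketch of that behaviour (the coroutine and failure case below are illustrative, not from the repository):

```python
import asyncio


async def fake_caption(image_path: str) -> str:
    # Illustrative stand-in for process_image(); fails for one path on purpose.
    if image_path == "broken.jpg":
        raise RuntimeError("could not decode image")
    await asyncio.sleep(0.1)  # pretend to call the VLM
    return f"caption for {image_path}"


async def main() -> None:
    paths = ["a.jpg", "broken.jpg", "c.jpg"]
    # return_exceptions=True: failures come back as values alongside the successes.
    results = await asyncio.gather(*(fake_caption(p) for p in paths), return_exceptions=True)
    for path, result in zip(paths, results):
        if isinstance(result, Exception):
            print(f"Error processing {path}: {result}")  # mirrors the batch error handling
        else:
            print(result)


asyncio.run(main())
```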

ui/src/App.js

Lines changed: 6 additions & 3 deletions
```diff
@@ -18,7 +18,8 @@ function App() {
     recursive: false,
     hint_sources: [],
     global_metadata_file: '',
-    skip_if_txt_exists: false
+    skip_if_txt_exists: false,
+    concurrent_batch_size: 1
   });
   const [configLoading, setConfigLoading] = useState(false);
   const [configError, setConfigError] = useState('');
@@ -106,7 +107,8 @@ function App() {
         recursive: data.config.recursive || false,
         hint_sources: data.config.hint_sources || [],
         global_metadata_file: data.config.global_metadata_file || '',
-        skip_if_txt_exists: data.config.skip_if_txt_exists || false
+        skip_if_txt_exists: data.config.skip_if_txt_exists || false,
+        concurrent_batch_size: data.config.concurrent_batch_size || 1
       };
       setConfig(newConfig);
       if (newConfig.base_url) {
@@ -147,7 +149,8 @@ function App() {
           recursive: config.recursive,
           hint_sources: config.hint_sources,
           global_metadata_file: config.global_metadata_file,
-          skip_if_txt_exists: config.skip_if_txt_exists
+          skip_if_txt_exists: config.skip_if_txt_exists,
+          concurrent_batch_size: config.concurrent_batch_size
         }
       }),
     });
```

ui/src/components/ConfigForm.js

Lines changed: 22 additions & 1 deletion
```diff
@@ -105,6 +105,13 @@ const ConfigForm = ({
     onConfigChange('retry_rules', newRetryRules);
   };
 
+  const handleConcurrentBatchSizeChange = (e) => {
+    const value = parseInt(e.target.value);
+    if (value >= 1 && value <= 16) {
+      onConfigChange('concurrent_batch_size', value);
+    }
+  };
+
   if (configLoading) return <p>Loading configuration...</p>;
 
   return (
@@ -127,7 +134,7 @@ const ConfigForm = ({
           onChange={(e) => onConfigChange('base_url', e.target.value)}
           placeholder="e.g., http://localhost:1234/v1"
         />
-        <span className="description-text">Copy from LM Studio developer tab.</span>
+        <span className="description-text">Copy from LM Studio developer tab or llama.cpp console output. Make sure the /v1 suffix is present, e.g. http://127.0.0.1:8080/v1</span>
       </div>
 
       <div>
@@ -161,6 +168,20 @@ const ConfigForm = ({
         </div>
       </div>
 
+      <div className="form-group">
+        <label htmlFor="concurrent_batch_size">Concurrent Batch Size</label>
+        <input
+          type="number"
+          id="concurrent_batch_size"
+          min="1"
+          max="16"
+          value={config.concurrent_batch_size || 4}
+          onChange={handleConcurrentBatchSizeChange}
+          style={{ width: '100px' }}
+        />
+        <span className="description-text">Batch concurrency if your API host supports it (e.g. "llama-server -np n"); otherwise leave at 1</span>
+      </div>
+
       <div className="form-group side-by-side api-key-directory">
         <div>
           <label htmlFor="api_key">API Key</label>
```
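As the updated `base_url` hint says, the URL must keep the `/v1` suffix. A quick way to sanity-check it before a long captioning run is to list the models the server exposes; a minimal sketch, assuming llama-server is listening on 127.0.0.1:8080 (the address and api_key are placeholders):

```python
import openai

# Placeholder endpoint; llama-server prints its host and port on startup.
client = openai.OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="none")

# Listing models is a cheap way to confirm the URL (including /v1) is reachable.
for model in client.models.list():
    print(model.id)
```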
