mirror of
https://github.com/clawdbot/clawdbot.git
synced 2026-01-31 19:37:45 +01:00
refactor: unify media provider options
This commit is contained in:
@@ -80,7 +80,7 @@ read_when:
|
|||||||
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
||||||
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
|
- Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
|
||||||
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
|
- Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
|
||||||
- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`.
|
- Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`.
|
||||||
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
||||||
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
||||||
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
||||||
|
|||||||
@@ -32,8 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori
|
|||||||
- `tools.media.models`: shared model list (use `capabilities` to gate).
|
- `tools.media.models`: shared model list (use `capabilities` to gate).
|
||||||
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
||||||
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
|
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
|
||||||
- provider overrides (`baseUrl`, `headers`)
|
- provider overrides (`baseUrl`, `headers`, `providerOptions`)
|
||||||
- Deepgram audio options (`deepgram` in `tools.media.audio`)
|
- Deepgram audio options via `tools.media.audio.providerOptions.deepgram`
|
||||||
- optional **per‑capability `models` list** (preferred before shared models)
|
- optional **per‑capability `models` list** (preferred before shared models)
|
||||||
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
|
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
|
||||||
- `scope` (optional gating by channel/chatType/session key)
|
- `scope` (optional gating by channel/chatType/session key)
|
||||||
|
|||||||
@@ -41,9 +41,9 @@ DEEPGRAM_API_KEY=dg_...
|
|||||||
|
|
||||||
- `model`: Deepgram model id (default: `nova-3`)
|
- `model`: Deepgram model id (default: `nova-3`)
|
||||||
- `language`: language hint (optional)
|
- `language`: language hint (optional)
|
||||||
- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional)
|
- `tools.media.audio.providerOptions.deepgram.detect_language`: enable language detection (optional)
|
||||||
- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional)
|
- `tools.media.audio.providerOptions.deepgram.punctuate`: enable punctuation (optional)
|
||||||
- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional)
|
- `tools.media.audio.providerOptions.deepgram.smart_format`: enable smart formatting (optional)
|
||||||
|
|
||||||
Example with language:
|
Example with language:
|
||||||
```json5
|
```json5
|
||||||
@@ -68,10 +68,12 @@ Example with Deepgram options:
|
|||||||
media: {
|
media: {
|
||||||
audio: {
|
audio: {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
deepgram: {
|
providerOptions: {
|
||||||
detectLanguage: true,
|
deepgram: {
|
||||||
punctuate: true,
|
detect_language: true,
|
||||||
smartFormat: true
|
punctuate: true,
|
||||||
|
smart_format: true
|
||||||
|
}
|
||||||
},
|
},
|
||||||
models: [{ provider: "deepgram", model: "nova-3" }]
|
models: [{ provider: "deepgram", model: "nova-3" }]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,7 +51,9 @@ export type MediaUnderstandingModelConfig = {
|
|||||||
timeoutSeconds?: number;
|
timeoutSeconds?: number;
|
||||||
/** Optional language hint for audio transcription. */
|
/** Optional language hint for audio transcription. */
|
||||||
language?: string;
|
language?: string;
|
||||||
/** Optional Deepgram transcription options (audio only). */
|
/** Optional provider-specific query params (merged into requests). */
|
||||||
|
providerOptions?: Record<string, Record<string, string | number | boolean>>;
|
||||||
|
/** @deprecated Use providerOptions.deepgram instead. */
|
||||||
deepgram?: {
|
deepgram?: {
|
||||||
detectLanguage?: boolean;
|
detectLanguage?: boolean;
|
||||||
punctuate?: boolean;
|
punctuate?: boolean;
|
||||||
@@ -82,7 +84,9 @@ export type MediaUnderstandingConfig = {
|
|||||||
timeoutSeconds?: number;
|
timeoutSeconds?: number;
|
||||||
/** Default language hint (audio). */
|
/** Default language hint (audio). */
|
||||||
language?: string;
|
language?: string;
|
||||||
/** Optional Deepgram transcription options (audio only). */
|
/** Optional provider-specific query params (merged into requests). */
|
||||||
|
providerOptions?: Record<string, Record<string, string | number | boolean>>;
|
||||||
|
/** @deprecated Use providerOptions.deepgram instead. */
|
||||||
deepgram?: {
|
deepgram?: {
|
||||||
detectLanguage?: boolean;
|
detectLanguage?: boolean;
|
||||||
punctuate?: boolean;
|
punctuate?: boolean;
|
||||||
|
|||||||
@@ -284,6 +284,11 @@ const DeepgramAudioSchema = z
|
|||||||
})
|
})
|
||||||
.optional();
|
.optional();
|
||||||
|
|
||||||
|
const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]);
|
||||||
|
const ProviderOptionsSchema = z
|
||||||
|
.record(z.string(), z.record(z.string(), ProviderOptionValueSchema))
|
||||||
|
.optional();
|
||||||
|
|
||||||
export const MediaUnderstandingModelSchema = z
|
export const MediaUnderstandingModelSchema = z
|
||||||
.object({
|
.object({
|
||||||
provider: z.string().optional(),
|
provider: z.string().optional(),
|
||||||
@@ -297,6 +302,7 @@ export const MediaUnderstandingModelSchema = z
|
|||||||
maxBytes: z.number().int().positive().optional(),
|
maxBytes: z.number().int().positive().optional(),
|
||||||
timeoutSeconds: z.number().int().positive().optional(),
|
timeoutSeconds: z.number().int().positive().optional(),
|
||||||
language: z.string().optional(),
|
language: z.string().optional(),
|
||||||
|
providerOptions: ProviderOptionsSchema,
|
||||||
deepgram: DeepgramAudioSchema,
|
deepgram: DeepgramAudioSchema,
|
||||||
baseUrl: z.string().optional(),
|
baseUrl: z.string().optional(),
|
||||||
headers: z.record(z.string(), z.string()).optional(),
|
headers: z.record(z.string(), z.string()).optional(),
|
||||||
@@ -314,6 +320,7 @@ export const ToolsMediaUnderstandingSchema = z
|
|||||||
prompt: z.string().optional(),
|
prompt: z.string().optional(),
|
||||||
timeoutSeconds: z.number().int().positive().optional(),
|
timeoutSeconds: z.number().int().positive().optional(),
|
||||||
language: z.string().optional(),
|
language: z.string().optional(),
|
||||||
|
providerOptions: ProviderOptionsSchema,
|
||||||
deepgram: DeepgramAudioSchema,
|
deepgram: DeepgramAudioSchema,
|
||||||
baseUrl: z.string().optional(),
|
baseUrl: z.string().optional(),
|
||||||
headers: z.record(z.string(), z.string()).optional(),
|
headers: z.record(z.string(), z.string()).optional(),
|
||||||
|
|||||||
112
src/media-understanding/runner.deepgram.test.ts
Normal file
112
src/media-understanding/runner.deepgram.test.ts
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
|
||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
|
import type { ClawdbotConfig } from "../config/config.js";
|
||||||
|
import type { MsgContext } from "../auto-reply/templating.js";
|
||||||
|
import {
|
||||||
|
buildProviderRegistry,
|
||||||
|
createMediaAttachmentCache,
|
||||||
|
normalizeMediaAttachments,
|
||||||
|
runCapability,
|
||||||
|
} from "./runner.js";
|
||||||
|
|
||||||
|
describe("runCapability deepgram provider options", () => {
  /**
   * Runs the audio capability against a stub Deepgram provider and checks that
   * the request it receives reflects the layered configuration:
   * - `baseUrl` comes from the model entry,
   * - headers from provider config, capability config, and entry are all present,
   * - query options use snake_case keys with entry values overriding config values.
   */
  it("merges provider options, headers, and baseUrl overrides", async () => {
    // A private temp directory (instead of a Date.now()-based file name in the
    // shared tmp dir) guarantees that parallel runs never collide on the path.
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-deepgram-"));
    const tmpPath = path.join(tmpDir, "sample.wav");

    try {
      await fs.writeFile(tmpPath, Buffer.from("RIFF"));
      const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
      const media = normalizeMediaAttachments(ctx);
      const cache = createMediaAttachmentCache(media);

      // Captured from the stub provider so the assertions can inspect the request.
      let seenQuery: Record<string, string | number | boolean> | undefined;
      let seenBaseUrl: string | undefined;
      let seenHeaders: Record<string, string> | undefined;

      const providerRegistry = buildProviderRegistry({
        deepgram: {
          id: "deepgram",
          capabilities: ["audio"],
          transcribeAudio: async (req) => {
            seenQuery = req.query;
            seenBaseUrl = req.baseUrl;
            seenHeaders = req.headers;
            return { text: "ok", model: req.model };
          },
        },
      });

      const cfg = {
        models: {
          providers: {
            deepgram: {
              baseUrl: "https://provider.example",
              apiKey: "test-key",
              headers: { "X-Provider": "1" },
              models: [],
            },
          },
        },
        tools: {
          media: {
            audio: {
              enabled: true,
              baseUrl: "https://config.example",
              headers: { "X-Config": "2" },
              providerOptions: {
                deepgram: {
                  detect_language: true,
                  punctuate: true,
                },
              },
              // Deprecated option shape; still supplied to exercise the compat path.
              deepgram: { smartFormat: true },
              models: [
                {
                  provider: "deepgram",
                  model: "nova-3",
                  baseUrl: "https://entry.example",
                  headers: { "X-Entry": "3" },
                  providerOptions: {
                    deepgram: {
                      // camelCase on purpose: must be rewritten to detect_language.
                      detectLanguage: false,
                      punctuate: false,
                      smart_format: true,
                    },
                  },
                },
              ],
            },
          },
        },
      } as unknown as ClawdbotConfig;

      try {
        const result = await runCapability({
          capability: "audio",
          cfg,
          ctx,
          attachments: cache,
          media,
          providerRegistry,
        });
        expect(result.outputs[0]?.text).toBe("ok");
        expect(seenBaseUrl).toBe("https://entry.example");
        expect(seenHeaders).toMatchObject({
          "X-Provider": "1",
          "X-Config": "2",
          "X-Entry": "3",
        });
        expect(seenQuery).toMatchObject({
          detect_language: false,
          punctuate: false,
          smart_format: true,
        });
        expect((seenQuery as Record<string, unknown>)["detectLanguage"]).toBeUndefined();
      } finally {
        await cache.cleanup();
      }
    } finally {
      // `force` makes removal a no-op if the directory is already gone, so the
      // temp data is removed even when the cache cleanup above throws.
      await fs.rm(tmpDir, { recursive: true, force: true });
    }
  });
});
|
||||||
@@ -71,21 +71,73 @@ function trimOutput(text: string, maxChars?: number): string {
|
|||||||
return trimmed.slice(0, maxChars).trim();
|
return trimmed.slice(0, maxChars).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildDeepgramQuery(options?: {
|
type ProviderQuery = Record<string, string | number | boolean>;
|
||||||
|
|
||||||
|
/**
 * Copies a provider option map into a fresh query object, leaving out any
 * entry whose value is `undefined`.
 *
 * @param options - Raw option map; may be omitted.
 * @returns The filtered copy, or `undefined` when `options` is missing or no
 *   entry survives the filter.
 */
function normalizeProviderQuery(
  options?: Record<string, string | number | boolean>,
): ProviderQuery | undefined {
  if (!options) {
    return undefined;
  }

  const result: ProviderQuery = {};
  for (const name of Object.keys(options)) {
    const setting = options[name];
    if (setting !== undefined) {
      result[name] = setting;
    }
  }

  // An empty bag is reported as "no query" rather than as `{}`.
  return Object.keys(result).length === 0 ? undefined : result;
}
|
||||||
|
|
||||||
|
/**
 * Translates the camelCase `deepgram` option object into snake_case query
 * parameters (`detect_language`, `punctuate`, `smart_format`).
 *
 * Only options whose value is an actual boolean are emitted.
 *
 * @param options - Deepgram flags in camelCase form; may be omitted.
 * @returns The query parameters, or `undefined` when `options` is missing or
 *   none of the flags is a boolean.
 */
function buildDeepgramCompatQuery(options?: {
  detectLanguage?: boolean;
  punctuate?: boolean;
  smartFormat?: boolean;
}): ProviderQuery | undefined {
  if (!options) {
    return undefined;
  }

  // Pairs of [query parameter name, configured flag], in emission order.
  const candidates: Array<[string, unknown]> = [
    ["detect_language", options.detectLanguage],
    ["punctuate", options.punctuate],
    ["smart_format", options.smartFormat],
  ];

  const query: ProviderQuery = {};
  for (const [param, flag] of candidates) {
    if (typeof flag === "boolean") {
      query[param] = flag;
    }
  }

  return Object.keys(query).length > 0 ? query : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Shallow-merges two provider query objects; keys in `incoming` win over the
 * same keys in `base`.
 *
 * @param base - Lower-priority query, if any.
 * @param incoming - Higher-priority query, if any.
 * @returns A new merged object, or `undefined` when both inputs are missing.
 */
function mergeProviderQuery(
  base: ProviderQuery | undefined,
  incoming: ProviderQuery | undefined,
): ProviderQuery | undefined {
  if (base || incoming) {
    // Spreading a missing side contributes no keys.
    return { ...base, ...incoming };
  }
  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Returns a copy of `query` in which the camelCase keys `detectLanguage` and
 * `smartFormat` are renamed to `detect_language` and `smart_format`.
 *
 * When both spellings are present, the camelCase value overwrites the
 * snake_case one. The input object is not modified.
 *
 * @param query - Query parameters that may contain camelCase keys.
 * @returns A new object containing only the snake_case spellings of those keys.
 */
function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery {
  const renames: ReadonlyArray<readonly [string, string]> = [
    ["detectLanguage", "detect_language"],
    ["smartFormat", "smart_format"],
  ];

  const result: ProviderQuery = { ...query };
  for (const [camelKey, snakeKey] of renames) {
    if (!(camelKey in result)) {
      continue;
    }
    result[snakeKey] = result[camelKey] as boolean;
    delete result[camelKey];
  }
  return result;
}
|
||||||
|
|
||||||
|
/**
 * Builds the query parameters passed to a provider for one model entry.
 *
 * Options are layered, lowest priority first:
 *   1. the deprecated `deepgram` objects (config, then entry) — Deepgram only,
 *      used solely for keys that the layers below did not set;
 *   2. `config.providerOptions[providerId]`;
 *   3. `entry.providerOptions[providerId]`.
 *
 * For Deepgram, camelCase aliases (`detectLanguage`, `smartFormat`) are
 * rewritten to their snake_case names.
 *
 * @param params.providerId - Provider whose options are being resolved.
 * @param params.config - Capability-level configuration, if any.
 * @param params.entry - The model entry being run.
 * @returns The resolved query, or `undefined` when no option is set.
 */
function resolveProviderQuery(params: {
  providerId: string;
  config?: MediaUnderstandingConfig;
  entry: MediaUnderstandingModelConfig;
}): ProviderQuery | undefined {
  const { providerId, config, entry } = params;
  const configOptions = config?.providerOptions?.[providerId] ?? {};
  const entryOptions = entry.providerOptions?.[providerId] ?? {};

  if (providerId !== "deepgram") {
    return normalizeProviderQuery({ ...configOptions, ...entryOptions });
  }

  // Canonicalise each layer BEFORE merging. Merging first and renaming
  // afterwards would let a config-level camelCase key (e.g. `detectLanguage`)
  // overwrite an entry-level snake_case key (`detect_language`), inverting the
  // entry-over-config precedence.
  const query: ProviderQuery =
    normalizeProviderQuery({
      ...normalizeDeepgramQueryKeys(configOptions),
      ...normalizeDeepgramQueryKeys(entryOptions),
    }) ?? {};

  // The deprecated `deepgram` objects only fill gaps; they never override
  // anything set through `providerOptions`.
  const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram });
  for (const [key, value] of Object.entries(compat ?? {})) {
    if (query[key] === undefined) {
      query[key] = value;
    }
  }

  return Object.keys(query).length > 0 ? query : undefined;
}
|
||||||
@@ -246,13 +298,11 @@ async function runProviderEntry(params: {
|
|||||||
...(entry.headers ?? {}),
|
...(entry.headers ?? {}),
|
||||||
};
|
};
|
||||||
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
|
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
|
||||||
const deepgramQuery =
|
const providerQuery = resolveProviderQuery({
|
||||||
providerId === "deepgram"
|
providerId,
|
||||||
? buildDeepgramQuery({
|
config: params.config,
|
||||||
...params.config?.deepgram,
|
entry,
|
||||||
...entry.deepgram,
|
});
|
||||||
})
|
|
||||||
: undefined;
|
|
||||||
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
|
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
|
||||||
const result = await provider.transcribeAudio({
|
const result = await provider.transcribeAudio({
|
||||||
buffer: media.buffer,
|
buffer: media.buffer,
|
||||||
@@ -264,7 +314,7 @@ async function runProviderEntry(params: {
|
|||||||
model,
|
model,
|
||||||
language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
|
language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
|
||||||
prompt,
|
prompt,
|
||||||
query: deepgramQuery,
|
query: providerQuery,
|
||||||
timeoutMs,
|
timeoutMs,
|
||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user