refactor: unify media provider options

2026-01-31 19:37:45 +01:00 · 2026-01-17 09:12:19 +00:00
parent 89f85ddeab
commit d66bc65ca6
7 changed files with 204 additions and 29 deletions
--- a/docs/nodes/audio.md
+++ b/docs/nodes/audio.md
@@ -80,7 +80,7 @@ read_when:
 - Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
 - Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used.
 - Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram).
- Audio providers can override `baseUrl`/`headers` via `tools.media.audio`.
+- Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`.
 - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
 - Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
 - Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -32,8 +32,8 @@ If understanding fails or is disabled, **the reply flow continues** with the ori
 - `tools.media.models`: shared model list (use `capabilities` to gate).
 - `tools.media.image` / `tools.media.audio` / `tools.media.video`:
  - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
-  - provider overrides (`baseUrl`, `headers`)
+  - provider overrides (`baseUrl`, `headers`, `providerOptions`)
-  - Deepgram audio options (`deepgram` in `tools.media.audio`)
+  - Deepgram audio options via `tools.media.audio.providerOptions.deepgram`
  - optional **per‑capability `models` list** (preferred before shared models)
  - `attachments` policy (`mode`, `maxAttachments`, `prefer`)
  - `scope` (optional gating by channel/chatType/session key)
--- a/docs/providers/deepgram.md
+++ b/docs/providers/deepgram.md
@@ -41,9 +41,9 @@ DEEPGRAM_API_KEY=dg_...
 - `model`: Deepgram model id (default: `nova-3`)
 - `language`: language hint (optional)
- `tools.media.audio.deepgram.detectLanguage`: enable language detection (optional)
+- `tools.media.audio.providerOptions.deepgram.detect_language`: enable language detection (optional)
- `tools.media.audio.deepgram.punctuate`: enable punctuation (optional)
+- `tools.media.audio.providerOptions.deepgram.punctuate`: enable punctuation (optional)
- `tools.media.audio.deepgram.smartFormat`: enable smart formatting (optional)
+- `tools.media.audio.providerOptions.deepgram.smart_format`: enable smart formatting (optional)
 Example with language:
 ```json5
@@ -68,10 +68,12 @@ Example with Deepgram options:
    media: {
      audio: {
        enabled: true,
-        deepgram: {
+        providerOptions: {
-          detectLanguage: true,
+          deepgram: {
-          punctuate: true,
+            detect_language: true,
-          smartFormat: true
+            punctuate: true,
            smart_format: true
          }
        },
        models: [{ provider: "deepgram", model: "nova-3" }]
      }
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -51,7 +51,9 @@ export type MediaUnderstandingModelConfig = {
  timeoutSeconds?: number;
  /** Optional language hint for audio transcription. */
  language?: string;
-  /** Optional Deepgram transcription options (audio only). */
+  /** Optional provider-specific query params (merged into requests). */
  providerOptions?: Record<string, Record<string, string | number | boolean>>;
  /** @deprecated Use providerOptions.deepgram instead. */
  deepgram?: {
    detectLanguage?: boolean;
    punctuate?: boolean;
@@ -82,7 +84,9 @@ export type MediaUnderstandingConfig = {
  timeoutSeconds?: number;
  /** Default language hint (audio). */
  language?: string;
-  /** Optional Deepgram transcription options (audio only). */
+  /** Optional provider-specific query params (merged into requests). */
  providerOptions?: Record<string, Record<string, string | number | boolean>>;
  /** @deprecated Use providerOptions.deepgram instead. */
  deepgram?: {
    detectLanguage?: boolean;
    punctuate?: boolean;
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@@ -284,6 +284,11 @@ const DeepgramAudioSchema = z
  })
  .optional();
 /** A single provider option value: a string, a number, or a boolean. */
 const ProviderOptionValueSchema = z.union([z.string(), z.number(), z.boolean()]);
 /**
  * Optional two-level map used for the `providerOptions` config field:
  * outer string key -> (option name -> primitive option value).
  * NOTE(review): the outer key appears to be the provider id (the runner indexes
  * `providerOptions` by provider id) — confirm against callers.
  */
 const ProviderOptionsSchema = z
  .record(z.string(), z.record(z.string(), ProviderOptionValueSchema))
  .optional();
 export const MediaUnderstandingModelSchema = z
  .object({
    provider: z.string().optional(),
@@ -297,6 +302,7 @@ export const MediaUnderstandingModelSchema = z
    maxBytes: z.number().int().positive().optional(),
    timeoutSeconds: z.number().int().positive().optional(),
    language: z.string().optional(),
    providerOptions: ProviderOptionsSchema,
    deepgram: DeepgramAudioSchema,
    baseUrl: z.string().optional(),
    headers: z.record(z.string(), z.string()).optional(),
@@ -314,6 +320,7 @@ export const ToolsMediaUnderstandingSchema = z
    prompt: z.string().optional(),
    timeoutSeconds: z.number().int().positive().optional(),
    language: z.string().optional(),
    providerOptions: ProviderOptionsSchema,
    deepgram: DeepgramAudioSchema,
    baseUrl: z.string().optional(),
    headers: z.record(z.string(), z.string()).optional(),
--- a/src/media-understanding/runner.deepgram.test.ts
+++ b/src/media-understanding/runner.deepgram.test.ts
@@ -0,0 +1,112 @@
 import fs from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";
 import { describe, expect, it } from "vitest";
 import type { ClawdbotConfig } from "../config/config.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import {
  buildProviderRegistry,
  createMediaAttachmentCache,
  normalizeMediaAttachments,
  runCapability,
 } from "./runner.js";
 // Exercises runCapability("audio") against a stub Deepgram provider and checks how
 // baseUrl, headers, and provider query options from the different config levels
 // end up on the transcription request.
 describe("runCapability deepgram provider options", () => {
  it("merges provider options, headers, and baseUrl overrides", async () => {
    // Minimal on-disk audio attachment; removed again in the `finally` below.
    const tmpPath = path.join(os.tmpdir(), `clawdbot-deepgram-${Date.now()}.wav`);
    await fs.writeFile(tmpPath, Buffer.from("RIFF"));
    const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
    const media = normalizeMediaAttachments(ctx);
    const cache = createMediaAttachmentCache(media);
    // Captured from the request the stub provider receives.
    let seenQuery: Record<string, string | number | boolean> | undefined;
    let seenBaseUrl: string | undefined;
    let seenHeaders: Record<string, string> | undefined;
    // Stub provider: records the request fields under test and returns a fixed transcript.
    const providerRegistry = buildProviderRegistry({
      deepgram: {
        id: "deepgram",
        capabilities: ["audio"],
        transcribeAudio: async (req) => {
          seenQuery = req.query;
          seenBaseUrl = req.baseUrl;
          seenHeaders = req.headers;
          return { text: "ok", model: req.model };
        },
      },
    });
    // Three levels of settings: models.providers.deepgram, tools.media.audio,
    // and the individual model entry under tools.media.audio.models.
    const cfg = {
      models: {
        providers: {
          deepgram: {
            baseUrl: "https://provider.example",
            apiKey: "test-key",
            headers: { "X-Provider": "1" },
            models: [],
          },
        },
      },
      tools: {
        media: {
          audio: {
            enabled: true,
            baseUrl: "https://config.example",
            headers: { "X-Config": "2" },
            providerOptions: {
              deepgram: {
                detect_language: true,
                punctuate: true,
              },
            },
            // Legacy (deprecated) option block, still accepted alongside providerOptions.
            deepgram: { smartFormat: true },
            models: [
              {
                provider: "deepgram",
                model: "nova-3",
                baseUrl: "https://entry.example",
                headers: { "X-Entry": "3" },
                providerOptions: {
                  deepgram: {
                    // camelCase alias on purpose: expected to arrive as `detect_language`.
                    detectLanguage: false,
                    punctuate: false,
                    smart_format: true,
                  },
                },
              },
            ],
          },
        },
      },
    } as unknown as ClawdbotConfig;
    try {
      const result = await runCapability({
        capability: "audio",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry,
      });
      expect(result.outputs[0]?.text).toBe("ok");
      // The entry-level baseUrl is the one that reaches the provider.
      expect(seenBaseUrl).toBe("https://entry.example");
      // Headers from all three levels are present on the request.
      expect(seenHeaders).toMatchObject({
        "X-Provider": "1",
        "X-Config": "2",
        "X-Entry": "3",
      });
      // Entry-level option values override the config-level ones.
      expect(seenQuery).toMatchObject({
        detect_language: false,
        punctuate: false,
        smart_format: true,
      });
      // The camelCase alias itself must not be forwarded.
      expect((seenQuery as Record<string, unknown>)["detectLanguage"]).toBeUndefined();
    } finally {
      await cache.cleanup();
      await fs.unlink(tmpPath).catch(() => {});
    }
  });
 });
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -71,21 +71,73 @@ function trimOutput(text: string, maxChars?: number): string {
  return trimmed.slice(0, maxChars).trim();
 }
-function buildDeepgramQuery(options?: {
+type ProviderQuery = Record<string, string | number | boolean>;
 /**
  * Copies provider options into a fresh query object, leaving out any key whose
  * value is `undefined`.
  *
  * @param options - Raw option map for a single provider; may be omitted.
  * @returns The cleaned query, or `undefined` when `options` is missing or no
  *   key survives the cleanup.
  */
 function normalizeProviderQuery(
  options?: Record<string, string | number | boolean>,
 ): ProviderQuery | undefined {
  if (!options) {
    return undefined;
  }
  const cleaned: ProviderQuery = {};
  Object.entries(options)
    .filter(([, optionValue]) => optionValue !== undefined)
    .forEach(([optionName, optionValue]) => {
      cleaned[optionName] = optionValue;
    });
  const hasEntries = Object.keys(cleaned).length > 0;
  return hasEntries ? cleaned : undefined;
 }
 /**
  * Translates the legacy camelCase `deepgram` config options into the snake_case
  * query keys `detect_language`, `punctuate`, and `smart_format`.
  * Only options whose value is a boolean are copied.
  *
  * @returns The query object, or `undefined` when `options` is missing or none of
  *   the three options is set to a boolean.
  */
 function buildDeepgramCompatQuery(options?: {
  detectLanguage?: boolean;
  punctuate?: boolean;
  smartFormat?: boolean;
-}): Record<string, string | number | boolean> | undefined {
+}): ProviderQuery | undefined {
  if (!options) return undefined;
-  const query: Record<string, string | number | boolean> = {};
+  const query: ProviderQuery = {};
-  if (typeof options.detectLanguage === "boolean") {
+  if (typeof options.detectLanguage === "boolean") query.detect_language = options.detectLanguage;
-    query.detect_language = options.detectLanguage;
+  if (typeof options.punctuate === "boolean") query.punctuate = options.punctuate;
  if (typeof options.smartFormat === "boolean") query.smart_format = options.smartFormat;
  return Object.keys(query).length > 0 ? query : undefined;
 }
 /**
  * Shallow-merges two provider queries into a new object; keys from `incoming`
  * replace same-named keys from `base`.
  *
  * @param base - Lower-priority query (may be absent).
  * @param incoming - Higher-priority query (may be absent).
  * @returns The merged query, or `undefined` when neither input is provided.
  */
 function mergeProviderQuery(
  base: ProviderQuery | undefined,
  incoming: ProviderQuery | undefined,
 ): ProviderQuery | undefined {
  const hasAnySource = Boolean(base) || Boolean(incoming);
  if (!hasAnySource) {
    return undefined;
  }
  // Spreading an absent source contributes no keys.
  const merged: ProviderQuery = { ...base, ...incoming };
  return merged;
 }
 /**
  * Returns a shallow copy of `query` in which the camelCase aliases
  * `detectLanguage` and `smartFormat` are renamed to `detect_language` and
  * `smart_format`. The input object is not modified.
  *
  * When both spellings of an option are present, the camelCase value overwrites
  * the snake_case one before the alias key is deleted.
  */
 function normalizeDeepgramQueryKeys(query: ProviderQuery): ProviderQuery {
  const normalized = { ...query };
  if ("detectLanguage" in normalized) {
    normalized.detect_language = normalized.detectLanguage as boolean;
    delete normalized.detectLanguage;
  }
-  if (typeof options.punctuate === "boolean") {
+  if ("smartFormat" in normalized) {
-    query.punctuate = options.punctuate;
+    normalized.smart_format = normalized.smartFormat as boolean;
    delete normalized.smartFormat;
  }
-  if (typeof options.smartFormat === "boolean") {
+  return normalized;
-    query.smart_format = options.smartFormat;
+}
 /**
  * Resolves the provider-specific query parameters for one model entry.
  *
  * Precedence (highest first):
  *   1. `entry.providerOptions[providerId]`
  *   2. `config.providerOptions[providerId]`
  *   3. (Deepgram only) the deprecated `deepgram` option blocks, which only fill
  *      keys that the two sources above left unset.
  *
  * For Deepgram, the camelCase aliases (`detectLanguage`, `smartFormat`) are
  * rewritten to their snake_case form on each level *before* the levels are
  * merged. Merging first would let a config-level `detectLanguage` override an
  * entry-level `detect_language`, inverting the entry-over-config precedence.
  *
  * @param params.providerId - Provider whose options should be resolved.
  * @param params.config - Capability-level config (optional).
  * @param params.entry - The model entry being run.
  * @returns The resolved query, or `undefined` when no option is set.
  */
 function resolveProviderQuery(params: {
  providerId: string;
  config?: MediaUnderstandingConfig;
  entry: MediaUnderstandingModelConfig;
 }): ProviderQuery | undefined {
  const { providerId, config, entry } = params;
  const configOptions = config?.providerOptions?.[providerId] ?? {};
  const entryOptions = entry.providerOptions?.[providerId] ?? {};
  if (providerId !== "deepgram") {
    return normalizeProviderQuery({ ...configOptions, ...entryOptions });
  }
  // Canonicalize alias keys per level so that the spread below compares like with
  // like and the entry-level value always wins, whichever spelling each level used.
  const query: ProviderQuery =
    normalizeProviderQuery({
      ...normalizeDeepgramQueryKeys(configOptions),
      ...normalizeDeepgramQueryKeys(entryOptions),
    }) ?? {};
  // Legacy `deepgram` blocks act as a fallback only: never override providerOptions.
  const compat = buildDeepgramCompatQuery({ ...config?.deepgram, ...entry.deepgram });
  for (const [key, value] of Object.entries(compat ?? {})) {
    if (query[key] === undefined) {
      query[key] = value;
    }
  }
  return Object.keys(query).length > 0 ? query : undefined;
 }
@@ -246,13 +298,11 @@ async function runProviderEntry(params: {
      ...(entry.headers ?? {}),
    };
    const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
-    const deepgramQuery =
+    const providerQuery = resolveProviderQuery({
-      providerId === "deepgram"
+      providerId,
-        ? buildDeepgramQuery({
+      config: params.config,
-            ...params.config?.deepgram,
+      entry,
-            ...entry.deepgram,
+    });
          })
        : undefined;
    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const result = await provider.transcribeAudio({
      buffer: media.buffer,
@@ -264,7 +314,7 @@ async function runProviderEntry(params: {
      model,
      language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
      prompt,
-      query: deepgramQuery,
+      query: providerQuery,
      timeoutMs,
    });
    return {