feat: Add URL document parser for knowledge base (#3622)

* feat: add document upload from URL, with progress callbacks and error handling

* feat: add the frontend for uploading documents from a URL

* chore: add a warning for the URL upload feature to ensure correct user configuration

* feat: add content cleaning, with cleaning settings and service-provider selection for URL document uploads

* feat: update the content-cleaning system prompt to strengthen extraction rules; add a Beta badge to the URL upload feature

* style: format code

* perf: improve upload settings, strengthening the disable logic and cleaning-provider validation for URL uploads

* refactor: use the built-in chunking module

* refactor: extract the prompt into a separate file

* feat: add a Tavily API Key configuration dialog to improve the web search configuration experience

* fix: update URL hint and warning messages for clarity in knowledge base upload settings

* fix: fix the hot-reload issue when setting tavily_key

---------

Co-authored-by: Soulter <905617992@qq.com>
Author: RC-CHN
Date: 2025-11-17 19:05:14 +08:00
Committed by: GitHub
Parent: c7a58252fe
Commit: 270c89c12f
9 changed files with 1086 additions and 76 deletions

View File

@@ -1,4 +1,7 @@
import asyncio
import json
import re
import time
import uuid
from pathlib import Path
@@ -8,12 +11,98 @@ from astrbot.core import logger
from astrbot.core.db.vec_db.base import BaseVecDB
from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB
from astrbot.core.provider.manager import ProviderManager
from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider
from astrbot.core.provider.provider import (
EmbeddingProvider,
RerankProvider,
)
from astrbot.core.provider.provider import (
Provider as LLMProvider,
)
from .chunking.base import BaseChunker
from .chunking.recursive import RecursiveCharacterChunker
from .kb_db_sqlite import KBSQLiteDatabase
from .models import KBDocument, KBMedia, KnowledgeBase
from .parsers.url_parser import extract_text_from_url
from .parsers.util import select_parser
from .prompts import TEXT_REPAIR_SYSTEM_PROMPT
class RateLimiter:
"""A simple rate limiter."""
def __init__(self, max_rpm: int):
self.max_per_minute = max_rpm
self.interval = 60.0 / max_rpm if max_rpm > 0 else 0
self.last_call_time = 0
async def __aenter__(self):
if self.interval == 0:
return
now = time.monotonic()
elapsed = now - self.last_call_time
if elapsed < self.interval:
await asyncio.sleep(self.interval - elapsed)
self.last_call_time = time.monotonic()
async def __aexit__(self, exc_type, exc_val, exc_tb):
pass
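
For orientation, a minimal usage sketch of RateLimiter, not part of the diff (the max_rpm value below is arbitrary): each `async with` entry sleeps just long enough to keep successive calls under the configured RPM.

import asyncio

async def demo() -> None:
    limiter = RateLimiter(max_rpm=120)  # arbitrary cap: at most ~2 calls per second
    for i in range(5):
        # Entering the context blocks until 60 / max_rpm seconds have
        # elapsed since the previous entry, spacing calls evenly.
        async with limiter:
            print(f"call {i} dispatched")

asyncio.run(demo())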
async def _repair_and_translate_chunk_with_retry(
chunk: str,
repair_llm_service: LLMProvider,
rate_limiter: RateLimiter,
max_retries: int = 2,
) -> list[str]:
"""
Repairs, translates, and optionally re-chunks a single text chunk using the small LLM, with rate limiting.
"""
# To guard against LLM context pollution, restate an explicit instruction in the user prompt as well
user_prompt = f"""IGNORE ALL PREVIOUS INSTRUCTIONS. Your ONLY task is to process the following text chunk according to the system prompt provided.
Text chunk to process:
---
{chunk}
---
"""
for attempt in range(max_retries + 1):
try:
async with rate_limiter:
response = await repair_llm_service.text_chat(
prompt=user_prompt, system_prompt=TEXT_REPAIR_SYSTEM_PROMPT
)
llm_output = response.completion_text
if "<discard_chunk />" in llm_output:
return [] # Signal to discard this chunk
# More robust regex to handle potential LLM formatting errors (spaces, newlines in tags)
matches = re.findall(
r"<\s*repaired_text\s*>\s*(.*?)\s*<\s*/\s*repaired_text\s*>",
llm_output,
re.DOTALL,
)
if matches:
# Further cleaning to ensure no empty strings are returned
return [m.strip() for m in matches if m.strip()]
else:
# If no valid tags and not explicitly discarded, discard it to be safe.
return []
except Exception as e:
logger.warning(
f" - LLM call failed on attempt {attempt + 1}/{max_retries + 1}. Error: {str(e)}"
)
logger.error(
f" - Failed to process chunk after {max_retries + 1} attempts. Using original text."
)
return [chunk]
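
As a self-contained illustration of the tag-extraction step above (the sample reply text is invented), the regex tolerates stray whitespace inside the tags and captures multi-line bodies:

import re

TAG_RE = r"<\s*repaired_text\s*>\s*(.*?)\s*<\s*/\s*repaired_text\s*>"

sample_reply = """
<repaired_text>
The Llama is a domesticated South American camelid.
</repaired_text>
< repaired_text >Llamas live in herds.</ repaired_text >
"""

# re.DOTALL lets '.' span newlines, so multi-line bodies are captured;
# empty matches are filtered out, mirroring the function above.
chunks = [m.strip() for m in re.findall(TAG_RE, sample_reply, re.DOTALL) if m.strip()]
print(chunks)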
class KBHelper:
@@ -100,7 +189,7 @@ class KBHelper:
async def upload_document(
self,
file_name: str,
file_content: bytes,
file_content: bytes | None,
file_type: str,
chunk_size: int = 512,
chunk_overlap: int = 50,
@@ -108,6 +197,7 @@ class KBHelper:
tasks_limit: int = 3,
max_retries: int = 3,
progress_callback=None,
pre_chunked_text: list[str] | None = None,
) -> KBDocument:
"""Upload and process a document (with atomicity guarantees and failure cleanup)
@@ -130,46 +220,63 @@ class KBHelper:
await self._ensure_vec_db()
doc_id = str(uuid.uuid4())
media_paths: list[Path] = []
file_size = 0
# file_path = self.kb_files_dir / f"{doc_id}.{file_type}"
# async with aiofiles.open(file_path, "wb") as f:
# await f.write(file_content)
try:
# Stage 1: parse the document
if progress_callback:
await progress_callback("parsing", 0, 100)
parser = await select_parser(f".{file_type}")
parse_result = await parser.parse(file_content, file_name)
text_content = parse_result.text
media_items = parse_result.media
if progress_callback:
await progress_callback("parsing", 100, 100)
# Save media files
chunks_text = []
saved_media = []
for media_item in media_items:
media = await self._save_media(
doc_id=doc_id,
media_type=media_item.media_type,
file_name=media_item.file_name,
content=media_item.content,
mime_type=media_item.mime_type,
if pre_chunked_text is not None:
# If pre-chunked text was provided, use it directly
chunks_text = pre_chunked_text
file_size = sum(len(chunk) for chunk in chunks_text)
logger.info(f"Uploading with pre-chunked text: {len(chunks_text)} chunks.")
else:
# Otherwise, run the standard parse-and-chunk pipeline
if file_content is None:
raise ValueError(
"file_content must not be None when pre_chunked_text is not provided."
)
file_size = len(file_content)
# Stage 1: parse the document
if progress_callback:
await progress_callback("parsing", 0, 100)
parser = await select_parser(f".{file_type}")
parse_result = await parser.parse(file_content, file_name)
text_content = parse_result.text
media_items = parse_result.media
if progress_callback:
await progress_callback("parsing", 100, 100)
# Save media files
for media_item in media_items:
media = await self._save_media(
doc_id=doc_id,
media_type=media_item.media_type,
file_name=media_item.file_name,
content=media_item.content,
mime_type=media_item.mime_type,
)
saved_media.append(media)
media_paths.append(Path(media.file_path))
# Stage 2: chunking
if progress_callback:
await progress_callback("chunking", 0, 100)
chunks_text = await self.chunker.chunk(
text_content,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
saved_media.append(media)
media_paths.append(Path(media.file_path))
# Stage 2: chunking
if progress_callback:
await progress_callback("chunking", 0, 100)
chunks_text = await self.chunker.chunk(
text_content,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
contents = []
metadatas = []
for idx, chunk_text in enumerate(chunks_text):
@@ -205,7 +312,7 @@ class KBHelper:
kb_id=self.kb.kb_id,
doc_name=file_name,
file_type=file_type,
file_size=len(file_content),
file_size=file_size,
# file_path=str(file_path),
file_path="",
chunk_count=len(chunks_text),
@@ -359,3 +466,177 @@ class KBHelper:
)
return media
async def upload_from_url(
self,
url: str,
chunk_size: int = 512,
chunk_overlap: int = 50,
batch_size: int = 32,
tasks_limit: int = 3,
max_retries: int = 3,
progress_callback=None,
enable_cleaning: bool = False,
cleaning_provider_id: str | None = None,
) -> KBDocument:
"""Upload and process a document from a URL (with atomicity guarantees and failure cleanup)
Args:
url: URL of the web page to extract content from
chunk_size: text chunk size
chunk_overlap: overlap between adjacent chunks
batch_size: batch size
tasks_limit: concurrent task limit
max_retries: maximum number of retries
progress_callback: progress callback taking (stage, current, total)
- stage: current stage ('extracting', 'cleaning', 'parsing', 'chunking', 'embedding')
- current: current progress
- total: total amount
Returns:
KBDocument: the uploaded document object
Raises:
ValueError: if the URL is empty or no content could be extracted
IOError: if the network request fails
"""
# Fetch the Tavily API keys
config = self.prov_mgr.acm.default_conf
tavily_keys = config.get("provider_settings", {}).get(
"websearch_tavily_key", []
)
if not tavily_keys:
raise ValueError(
"Error: Tavily API key is not configured in provider_settings."
)
# Stage 1: extract content from the URL
if progress_callback:
await progress_callback("extracting", 0, 100)
try:
text_content = await extract_text_from_url(url, tavily_keys)
except Exception as e:
logger.error(f"Failed to extract content from URL {url}: {e}")
raise OSError(f"Failed to extract content from URL {url}: {e}") from e
if not text_content:
raise ValueError(f"No content extracted from URL: {url}")
if progress_callback:
await progress_callback("extracting", 100, 100)
# Stage 2: optionally clean the content, then chunk it
final_chunks = await self._clean_and_rechunk_content(
content=text_content,
url=url,
progress_callback=progress_callback,
enable_cleaning=enable_cleaning,
cleaning_provider_id=cleaning_provider_id,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
if enable_cleaning and not final_chunks:
raise ValueError(
"No valid text remained after content cleaning. Try disabling content cleaning or switching to a more capable LLM model, then retry."
)
# Build a synthetic file name
file_name = url.split("/")[-1] or f"document_from_{url}"
if not Path(file_name).suffix:
file_name += ".url"
# Reuse the existing upload_document method, passing the pre-chunked text
return await self.upload_document(
file_name=file_name,
file_content=None,
file_type="url",  # use 'url' as a special file type
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size,
tasks_limit=tasks_limit,
max_retries=max_retries,
progress_callback=progress_callback,
pre_chunked_text=final_chunks,
)
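
To make the callback contract concrete, a hedged sketch of calling upload_from_url from inside a coroutine; kb_helper stands for an already-initialized KBHelper, and the URL and provider ID are placeholders:

async def upload_example(kb_helper: "KBHelper") -> None:
    async def log_progress(stage: str, current: int, total: int) -> None:
        # Stages arrive as 'extracting', then optionally 'cleaning',
        # then 'parsing', 'chunking', and 'embedding' (per the docstring).
        print(f"[{stage}] {current}/{total}")

    doc = await kb_helper.upload_from_url(
        url="https://example.com/article",  # placeholder URL
        chunk_size=512,
        chunk_overlap=50,
        progress_callback=log_progress,
        enable_cleaning=True,
        cleaning_provider_id="my-cleaning-llm",  # hypothetical provider ID
    )
    print(f"Uploaded {doc.doc_name} with {doc.chunk_count} chunks")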
async def _clean_and_rechunk_content(
self,
content: str,
url: str,
progress_callback=None,
enable_cleaning: bool = False,
cleaning_provider_id: str | None = None,
repair_max_rpm: int = 60,
chunk_size: int = 512,
chunk_overlap: int = 50,
) -> list[str]:
"""
Clean, repair, translate, and re-chunk content fetched from a URL.
"""
if not enable_cleaning:
# Cleaning disabled: chunk using the parameters passed from the frontend
logger.info(
f"Content cleaning disabled; chunking with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}"
)
return await self.chunker.chunk(
content, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
if not cleaning_provider_id:
logger.warning(
"Content cleaning is enabled but no cleaning_provider_id was provided; skipping cleaning and using default chunking."
)
return await self.chunker.chunk(content)
if progress_callback:
await progress_callback("cleaning", 0, 100)
try:
# Look up the specified LLM provider
llm_provider = await self.prov_mgr.get_provider_by_id(cleaning_provider_id)
if not llm_provider or not isinstance(llm_provider, LLMProvider):
raise ValueError(
f"No LLM provider with ID {cleaning_provider_id} was found, or it is not the right type"
)
# Initial chunking
# Prefer paragraph-level separators to produce higher-quality chunks
text_splitter = RecursiveCharacterChunker(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=["\n\n", "\n", " "],  # paragraph separators first
)
initial_chunks = await text_splitter.chunk(content)
logger.info(f"Initial chunking produced {len(initial_chunks)} chunks for repair.")
# Process all chunks concurrently
rate_limiter = RateLimiter(repair_max_rpm)
tasks = [
_repair_and_translate_chunk_with_retry(
chunk, llm_provider, rate_limiter
)
for chunk in initial_chunks
]
repaired_results = await asyncio.gather(*tasks, return_exceptions=True)
final_chunks = []
for i, result in enumerate(repaired_results):
if isinstance(result, Exception):
logger.warning(f"Chunk {i} raised an exception: {str(result)}. Falling back to the original chunk.")
final_chunks.append(initial_chunks[i])
elif isinstance(result, list):
final_chunks.extend(result)
logger.info(
f"Text repair complete: {len(initial_chunks)} original chunks -> {len(final_chunks)} final chunks."
)
if progress_callback:
await progress_callback("cleaning", 100, 100)
return final_chunks
except Exception as e:
logger.error(f"Content cleaning with provider '{cleaning_provider_id}' failed: {e}")
# Cleaning failed; return the default chunking result so the pipeline is not interrupted
return await self.chunker.chunk(content)

View File

@@ -8,7 +8,7 @@ from astrbot.core.provider.manager import ProviderManager
from .chunking.recursive import RecursiveCharacterChunker
from .kb_db_sqlite import KBSQLiteDatabase
from .kb_helper import KBHelper
from .models import KnowledgeBase
from .models import KBDocument, KnowledgeBase
from .retrieval.manager import RetrievalManager, RetrievalResult
from .retrieval.rank_fusion import RankFusion
from .retrieval.sparse_retriever import SparseRetriever
@@ -284,3 +284,47 @@ class KnowledgeBaseManager:
await self.kb_db.close()
except Exception as e:
logger.error(f"Failed to close the knowledge base metadata database: {e}")
async def upload_from_url(
self,
kb_id: str,
url: str,
chunk_size: int = 512,
chunk_overlap: int = 50,
batch_size: int = 32,
tasks_limit: int = 3,
max_retries: int = 3,
progress_callback=None,
) -> KBDocument:
"""Upload a document from a URL into the specified knowledge base
Args:
kb_id: knowledge base ID
url: URL of the web page to extract content from
chunk_size: text chunk size
chunk_overlap: overlap between adjacent chunks
batch_size: batch size
tasks_limit: concurrent task limit
max_retries: maximum number of retries
progress_callback: progress callback function
Returns:
KBDocument: the uploaded document object
Raises:
ValueError: if the knowledge base does not exist or the URL is empty
IOError: if the network request fails
"""
kb_helper = await self.get_kb(kb_id)
if not kb_helper:
raise ValueError(f"Knowledge base with id {kb_id} not found.")
return await kb_helper.upload_from_url(
url=url,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size,
tasks_limit=tasks_limit,
max_retries=max_retries,
progress_callback=progress_callback,
)

View File

@@ -0,0 +1,103 @@
import asyncio
import aiohttp
class URLExtractor:
"""URL content extractor wrapping Tavily API calls and key management"""
def __init__(self, tavily_keys: list[str]):
"""
Initialize the URL extractor
Args:
tavily_keys: list of Tavily API keys
"""
if not tavily_keys:
raise ValueError("Error: Tavily API keys are not configured.")
self.tavily_keys = tavily_keys
self.tavily_key_index = 0
self.tavily_key_lock = asyncio.Lock()
async def _get_tavily_key(self) -> str:
"""Fetch and rotate Tavily API keys from the list in a concurrency-safe way."""
async with self.tavily_key_lock:
key = self.tavily_keys[self.tavily_key_index]
self.tavily_key_index = (self.tavily_key_index + 1) % len(self.tavily_keys)
return key
async def extract_text_from_url(self, url: str) -> str:
"""
Extract the main text content from a URL using the Tavily API.
This is a simplified version of the tavily_extract_web_page method from the web_searcher plugin, built for the knowledge base module with no dependency on AstrMessageEvent.
Args:
url: URL of the web page to extract content from
Returns:
The extracted text content
Raises:
ValueError: if the URL is empty or the API key is not configured
IOError: if the request fails or returns an error
"""
if not url:
raise ValueError("Error: url must be a non-empty string.")
tavily_key = await self._get_tavily_key()
api_url = "https://api.tavily.com/extract"
headers = {
"Authorization": f"Bearer {tavily_key}",
"Content-Type": "application/json",
}
payload = {
"urls": [url],
"extract_depth": "basic",  # use basic extraction depth
}
try:
async with aiohttp.ClientSession(trust_env=True) as session:
async with session.post(
api_url,
json=payload,
headers=headers,
timeout=30.0,  # generous timeout, since content extraction can take longer
) as response:
if response.status != 200:
reason = await response.text()
raise OSError(
f"Tavily web extraction failed: {reason}, status: {response.status}"
)
data = await response.json()
results = data.get("results", [])
if not results:
raise ValueError(f"No content extracted from URL: {url}")
# Return the content of the first result
return results[0].get("raw_content", "")
except aiohttp.ClientError as e:
raise OSError(f"Failed to fetch URL {url}: {e}") from e
except Exception as e:
raise OSError(f"Failed to extract content from URL {url}: {e}") from e
# A plain function interface, provided for backward compatibility
async def extract_text_from_url(url: str, tavily_keys: list[str]) -> str:
"""
Simple function interface for extracting text content from a URL
Args:
url: URL of the web page to extract content from
tavily_keys: list of Tavily API keys
Returns:
The extracted text content
"""
extractor = URLExtractor(tavily_keys)
return await extractor.extract_text_from_url(url)
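
A quick usage sketch of this function interface; the key below is a placeholder, and real keys come from provider_settings.websearch_tavily_key:

import asyncio

async def demo() -> None:
    # Placeholder key list; URLExtractor round-robins across entries.
    keys = ["tvly-xxxxxxxxxxxx"]
    text = await extract_text_from_url("https://example.com", keys)
    print(text[:200])

asyncio.run(demo())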

View File

@@ -0,0 +1,65 @@
TEXT_REPAIR_SYSTEM_PROMPT = """You are a meticulous digital archivist. Your mission is to reconstruct a clean, readable article from raw, noisy text chunks.
**Core Task:**
1. **Analyze:** Examine the text chunk to separate "signal" (substantive information) from "noise" (UI elements, ads, navigation, footers).
2. **Process:** Clean and repair the signal. **Do not translate it.** Keep the original language.
**Crucial Rules:**
- **NEVER discard a chunk if it contains ANY valuable information.** Your primary duty is to salvage content.
- **If a chunk contains multiple distinct topics, split them.** Enclose each topic in its own `<repaired_text>` tag.
- Your output MUST be ONLY `<repaired_text>...</repaired_text>` tags or a single `<discard_chunk />` tag.
---
**Example 1: Chunk with Noise and Signal**
*Input Chunk:*
"Home | About | Products | **The Llama is a domesticated South American camelid.** | © 2025 ACME Corp."
*Your Thought Process:*
1. "Home | About | Products..." and "© 2025 ACME Corp." are noise.
2. "The Llama is a domesticated..." is the signal.
3. I must extract the signal and wrap it.
*Your Output:*
<repaired_text>
The Llama is a domesticated South American camelid.
</repaired_text>
---
**Example 2: Chunk with ONLY Noise**
*Input Chunk:*
"Next Page > | Subscribe to our newsletter | Follow us on X"
*Your Thought Process:*
1. This entire chunk is noise. There is no signal.
2. I must discard this.
*Your Output:*
<discard_chunk />
---
**Example 3: Chunk with Multiple Topics (Requires Splitting)**
*Input Chunk:*
"## Chapter 1: The Sun
The Sun is the star at the center of the Solar System.
## Chapter 2: The Moon
The Moon is Earth's only natural satellite."
*Your Thought Process:*
1. This chunk contains two distinct topics.
2. I must process them separately to maintain semantic integrity.
3. I will create two `<repaired_text>` blocks.
*Your Output:*
<repaired_text>
## Chapter 1: The Sun
The Sun is the star at the center of the Solar System.
</repaired_text>
<repaired_text>
## Chapter 2: The Moon
The Moon is Earth's only natural satellite.
</repaired_text>
"""

View File

@@ -48,6 +48,7 @@ class KnowledgeBaseRoute(Route):
# Document management
"/kb/document/list": ("GET", self.list_documents),
"/kb/document/upload": ("POST", self.upload_document),
"/kb/document/upload/url": ("POST", self.upload_document_from_url),
"/kb/document/upload/progress": ("GET", self.get_upload_progress),
"/kb/document/get": ("GET", self.get_document),
"/kb/document/delete": ("POST", self.delete_document),
@@ -1070,3 +1071,174 @@ class KnowledgeBaseRoute(Route):
logger.error(f"Failed to delete session knowledge base configuration: {e}")
logger.error(traceback.format_exc())
return Response().error(f"Failed to delete session knowledge base configuration: {e!s}").__dict__
async def upload_document_from_url(self):
"""Upload a document from a URL
Body:
- kb_id: knowledge base ID (required)
- url: URL of the web page to extract content from (required)
- chunk_size: chunk size (optional, default 512)
- chunk_overlap: chunk overlap (optional, default 50)
- batch_size: batch size (optional, default 32)
- tasks_limit: concurrent task limit (optional, default 3)
- max_retries: maximum number of retries (optional, default 3)
Returns:
- task_id: task ID used to poll upload progress and results
"""
try:
kb_manager = self._get_kb_manager()
data = await request.json
kb_id = data.get("kb_id")
if not kb_id:
return Response().error("Missing parameter kb_id").__dict__
url = data.get("url")
if not url:
return Response().error("Missing parameter url").__dict__
chunk_size = data.get("chunk_size", 512)
chunk_overlap = data.get("chunk_overlap", 50)
batch_size = data.get("batch_size", 32)
tasks_limit = data.get("tasks_limit", 3)
max_retries = data.get("max_retries", 3)
enable_cleaning = data.get("enable_cleaning", False)
cleaning_provider_id = data.get("cleaning_provider_id")
# Look up the knowledge base
kb_helper = await kb_manager.get_kb(kb_id)
if not kb_helper:
return Response().error("Knowledge base not found").__dict__
# Generate a task ID
task_id = str(uuid.uuid4())
# Initialize the task state
self.upload_tasks[task_id] = {
"status": "pending",
"result": None,
"error": None,
}
# Start the background task
asyncio.create_task(
self._background_upload_from_url_task(
task_id=task_id,
kb_helper=kb_helper,
url=url,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size,
tasks_limit=tasks_limit,
max_retries=max_retries,
enable_cleaning=enable_cleaning,
cleaning_provider_id=cleaning_provider_id,
),
)
return (
Response()
.ok(
{
"task_id": task_id,
"url": url,
"message": "URL upload task created, processing in background",
},
)
.__dict__
)
except ValueError as e:
return Response().error(str(e)).__dict__
except Exception as e:
logger.error(f"Failed to upload document from URL: {e}")
logger.error(traceback.format_exc())
return Response().error(f"Failed to upload document from URL: {e!s}").__dict__
async def _background_upload_from_url_task(
self,
task_id: str,
kb_helper,
url: str,
chunk_size: int,
chunk_overlap: int,
batch_size: int,
tasks_limit: int,
max_retries: int,
enable_cleaning: bool,
cleaning_provider_id: str | None,
):
"""Background task for uploading from a URL"""
try:
# Initialize the task state
self.upload_tasks[task_id] = {
"status": "processing",
"result": None,
"error": None,
}
self.upload_progress[task_id] = {
"status": "processing",
"file_index": 0,
"file_total": 1,
"file_name": f"URL: {url}",
"stage": "extracting",
"current": 0,
"total": 100,
}
# Create the progress callback
async def progress_callback(stage, current, total):
if task_id in self.upload_progress:
self.upload_progress[task_id].update(
{
"status": "processing",
"file_index": 0,
"file_name": f"URL: {url}",
"stage": stage,
"current": current,
"total": total,
},
)
# Upload the document
doc = await kb_helper.upload_from_url(
url=url,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size,
tasks_limit=tasks_limit,
max_retries=max_retries,
progress_callback=progress_callback,
enable_cleaning=enable_cleaning,
cleaning_provider_id=cleaning_provider_id,
)
# Mark the task as completed
result = {
"task_id": task_id,
"uploaded": [doc.model_dump()],
"failed": [],
"total": 1,
"success_count": 1,
"failed_count": 0,
}
self.upload_tasks[task_id] = {
"status": "completed",
"result": result,
"error": None,
}
self.upload_progress[task_id]["status"] = "completed"
except Exception as e:
logger.error(f"Background URL upload task {task_id} failed: {e}")
logger.error(traceback.format_exc())
self.upload_tasks[task_id] = {
"status": "failed",
"result": None,
"error": str(e),
}
if task_id in self.upload_progress:
self.upload_progress[task_id]["status"] = "failed"
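
End to end, a client might drive this route as sketched below. A minimal sketch: the base URL and kb_id are placeholders, and passing task_id as a query parameter to the progress endpoint is an assumption; check get_upload_progress for the exact contract.

import asyncio
import aiohttp

async def upload_url_and_wait(base: str, kb_id: str, url: str) -> dict:
    async with aiohttp.ClientSession() as session:
        # Kick off the background task; the route returns a task_id immediately.
        async with session.post(
            f"{base}/api/kb/document/upload/url",
            json={"kb_id": kb_id, "url": url},
        ) as resp:
            task_id = (await resp.json())["data"]["task_id"]
        # Poll until the background task reports a terminal status.
        while True:
            async with session.get(
                f"{base}/api/kb/document/upload/progress",
                params={"task_id": task_id},  # assumed parameter name
            ) as resp:
                progress = (await resp.json()).get("data", {})
            if progress.get("status") in ("completed", "failed"):
                return progress
            await asyncio.sleep(1)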

View File

@@ -4,6 +4,7 @@
"tabs": {
"overview": "Overview",
"documents": "Documents",
"retrieval": "Retrieval",
"sessions": "Sessions",
"settings": "Settings"
},
@@ -49,6 +50,10 @@
"maxSize": "Max file size: 128MB",
"chunkSettings": "Chunk Settings",
"batchSettings": "Batch Settings",
"cleaningSettings": "Cleaning Settings",
"enableCleaning": "Enable Content Cleaning",
"cleaningProvider": "Cleaning Service Provider",
"cleaningProviderHint": "Select an LLM provider to clean and summarize the extracted web page content",
"chunkSize": "Chunk Size",
"chunkSizeHint": "Number of characters per chunk (default: 512)",
"chunkOverlap": "Chunk Overlap",
@@ -61,7 +66,13 @@
"maxRetriesHint": "Number of times to retry a failed upload task (default: 3)",
"cancel": "Cancel",
"submit": "Upload",
"fileRequired": "Please select a file to upload"
"fileRequired": "Please select a file to upload",
"fileUpload": "File Upload",
"fromUrl": "From URL",
"urlPlaceholder": "Enter the URL of the web page to extract content from",
"urlRequired": "Please enter a URL",
"urlHint": "The main content will be automatically extracted from the target URL as a document. Currently supports {supported} pages. Before use, please ensure that the target web page allows crawler access.",
"beta": "Beta"
},
"settings": {
"title": "Knowledge Base Settings",

View File

@@ -50,6 +50,10 @@
"maxSize": "最大文件大小: 128MB",
"chunkSettings": "分块设置",
"batchSettings": "批处理设置",
"cleaningSettings": "清洗设置",
"enableCleaning": "启用内容清洗",
"cleaningProvider": "清洗服务提供商",
"cleaningProviderHint": "选择一个 LLM 服务商来对提取的网页内容进行清洗和总结",
"chunkSize": "分块大小",
"chunkSizeHint": "每个文本块的字符数 (默认: 512)",
"chunkOverlap": "分块重叠",
@@ -62,7 +66,13 @@
"maxRetriesHint": "上传失败任务的重试次数 (默认: 3)",
"cancel": "取消",
"submit": "上传",
"fileRequired": "请选择要上传的文件"
"fileRequired": "请选择要上传的文件",
"fileUpload": "文件上传",
"fromUrl": "从 URL",
"urlPlaceholder": "请输入要提取内容的网页 URL",
"urlRequired": "请输入 URL",
"urlHint": "将自动从目标 URL 提取主要内容作为文档。目前支持 {supported} 页面,请确保目标网页允许爬虫访问。",
"beta": "测试版"
},
"retrieval": {
"title": "知识库检索",

View File

@@ -57,7 +57,7 @@
</v-card>
<!-- Upload dialog -->
<v-dialog v-model="showUploadDialog" max-width="600px" persistent @after-enter="initUploadSettings">
<v-dialog v-model="showUploadDialog" max-width="650px" persistent @after-enter="initUploadSettings">
<v-card>
<v-card-title class="pa-4 d-flex align-center">
<span class="text-h5">{{ t('upload.title') }}</span>
@@ -67,40 +67,91 @@
<v-divider />
<v-card-text class="pa-6">
<!-- File selection -->
<div class="upload-dropzone" :class="{ 'dragover': isDragging }" @drop.prevent="handleDrop"
@dragover.prevent="isDragging = true" @dragleave="isDragging = false" @click="fileInput?.click()">
<v-icon size="64" color="primary">mdi-cloud-upload</v-icon>
<p class="mt-4 text-h6">{{ t('upload.dropzone') }}</p>
<p class="text-caption text-medium-emphasis mt-2">{{ t('upload.supportedFormats') }}.txt, .md, .pdf, .docx,
.xls, .xlsx</p>
<p class="text-caption text-medium-emphasis">{{ t('upload.maxSize') }}</p>
<p class="text-caption text-medium-emphasis">Up to 10 files can be uploaded</p>
<input ref="fileInput" type="file" multiple hidden accept=".txt,.md,.pdf,.docx,.xls,.xlsx"
@change="handleFileSelect" />
</div>
<v-tabs v-model="uploadMode" grow class="mb-4">
<v-tab value="file">{{ t('upload.fileUpload') }}</v-tab>
<v-tab value="url">
{{ t('upload.fromUrl') }}
<v-badge color="warning" :content="t('upload.beta')" inline class="ml-2" />
</v-tab>
</v-tabs>
<div v-if="selectedFiles.length > 0" class="mt-4">
<div class="d-flex align-center justify-space-between mb-2">
<span class="text-subtitle-2">{{ selectedFiles.length }} file(s) selected</span>
<v-btn variant="text" size="small" @click="selectedFiles = []">Clear</v-btn>
</div>
<div class="files-list">
<div v-for="(file, index) in selectedFiles" :key="index"
class="file-item pa-3 mb-2 rounded bg-surface-variant">
<div class="d-flex align-center justify-space-between">
<div class="d-flex align-center gap-2">
<v-icon>{{ getFileIcon(file.name) }}</v-icon>
<div>
<div class="font-weight-medium">{{ file.name }}</div>
<div class="text-caption">{{ formatFileSize(file.size) }}</div>
<v-card-text class="pa-6 pt-2">
<v-window v-model="uploadMode">
<!-- File upload -->
<v-window-item value="file">
<!-- File selection -->
<div class="upload-dropzone" :class="{ 'dragover': isDragging }" @drop.prevent="handleDrop"
@dragover.prevent="isDragging = true" @dragleave="isDragging = false" @click="fileInput?.click()">
<v-icon size="64" color="primary">mdi-cloud-upload</v-icon>
<p class="mt-4 text-h6">{{ t('upload.dropzone') }}</p>
<p class="text-caption text-medium-emphasis mt-2">{{ t('upload.supportedFormats') }}.txt, .md, .pdf,
.docx,
.xls, .xlsx</p>
<p class="text-caption text-medium-emphasis">{{ t('upload.maxSize') }}</p>
<p class="text-caption text-medium-emphasis">Up to 10 files can be uploaded</p>
<input ref="fileInput" type="file" multiple hidden accept=".txt,.md,.pdf,.docx,.xls,.xlsx"
@change="handleFileSelect" />
</div>
<div v-if="selectedFiles.length > 0" class="mt-4">
<div class="d-flex align-center justify-space-between mb-2">
<span class="text-subtitle-2">{{ selectedFiles.length }} file(s) selected</span>
<v-btn variant="text" size="small" @click="selectedFiles = []">Clear</v-btn>
</div>
<div class="files-list">
<div v-for="(file, index) in selectedFiles" :key="index"
class="file-item pa-3 mb-2 rounded bg-surface-variant">
<div class="d-flex align-center justify-space-between">
<div class="d-flex align-center gap-2">
<v-icon>{{ getFileIcon(file.name) }}</v-icon>
<div>
<div class="font-weight-medium">{{ file.name }}</div>
<div class="text-caption">{{ formatFileSize(file.size) }}</div>
</div>
</div>
<v-btn icon="mdi-close" variant="text" size="small" @click="removeFile(index)" />
</div>
</div>
<v-btn icon="mdi-close" variant="text" size="small" @click="removeFile(index)" />
</div>
</div>
</v-window-item>
<!-- URL upload -->
<v-window-item value="url" class="pt-2">
<!-- Quick Tavily Key setup -->
<div v-if="tavilyConfigStatus === 'not_configured' || tavilyConfigStatus === 'error'" class="mb-4">
<v-alert :type="tavilyConfigStatus === 'error' ? 'error' : 'info'" variant="tonal" density="compact">
<div class="d-flex align-center justify-space-between">
<span>
{{ tavilyConfigStatus === 'error' ? 'Failed to check the web search configuration' : 'A Tavily Key must be configured to use this feature' }}
</span>
<v-btn size="small" variant="flat" @click="showTavilyDialog = true">
Configure
</v-btn>
</div>
</v-alert>
</div>
<v-text-field v-model="uploadUrl" :label="t('upload.urlPlaceholder')" variant="outlined" clearable :disabled="tavilyConfigStatus === 'not_configured'"
autofocus :hint="t('upload.urlHint', { supported: 'HTML' })" persistent-hint />
</v-window-item>
</v-window>
<!-- Cleaning settings (shown only in URL mode) -->
<div v-if="uploadMode === 'url'" class="mt-6">
<div class="d-flex align-center mb-4">
<h3 class="text-h6">{{ t('upload.cleaningSettings') }}</h3>
</div>
<v-row>
<v-col cols="12" sm="4">
<v-switch v-model="uploadSettings.enable_cleaning" :label="t('upload.enableCleaning')" color="primary" />
</v-col>
<v-col cols="12" sm="8">
<v-select v-model="uploadSettings.cleaning_provider_id" :items="llmProviders" item-title="id"
item-value="id" :label="t('upload.cleaningProvider')" :hint="t('upload.cleaningProviderHint')"
persistent-hint variant="outlined" density="compact" :disabled="!uploadSettings.enable_cleaning" />
</v-col>
</v-row>
</div>
<!-- Chunk settings -->
@@ -151,8 +202,8 @@
<v-btn variant="text" @click="closeUploadDialog" :disabled="uploading">
{{ t('upload.cancel') }}
</v-btn>
<v-btn color="primary" variant="elevated" @click="uploadDocument" :loading="uploading"
:disabled="selectedFiles.length === 0">
<v-btn color="primary" variant="elevated" @click="startUpload" :loading="uploading"
:disabled="isUploadDisabled">
{{ t('upload.submit') }}
</v-btn>
</v-card-actions>
@@ -185,11 +236,15 @@
<v-snackbar v-model="snackbar.show" :color="snackbar.color">
{{ snackbar.text }}
</v-snackbar>
<!-- Tavily Key configuration dialog -->
<TavilyKeyDialog v-model="showTavilyDialog" @success="onTavilyKeySet" />
</div>
</template>
<script setup lang="ts">
import { ref, onMounted, onUnmounted } from 'vue'
import TavilyKeyDialog from './TavilyKeyDialog.vue'
import { ref, onMounted, onUnmounted, computed } from 'vue'
import { useRouter } from 'vue-router'
import axios from 'axios'
import { useModuleI18n } from '@/i18n/composables'
@@ -216,10 +271,13 @@ const selectedFiles = ref<File[]>([])
const deleteTarget = ref<any>(null)
const isDragging = ref(false)
const fileInput = ref<HTMLInputElement | null>(null)
// Upload progress - used to poll multiple tasks
const uploadMode = ref('file') // 'file' or 'url'
const uploadUrl = ref('')
const llmProviders = ref<any[]>([])
const uploadingTasks = ref<Map<string, any>>(new Map())
const progressPollingInterval = ref<number | null>(null)
const tavilyConfigStatus = ref('loading') // 'loading', 'configured', 'not_configured', 'error'
const showTavilyDialog = ref(false)
const snackbar = ref({
show: false,
@@ -239,7 +297,9 @@ const uploadSettings = ref({
chunk_overlap: null as number | null,
batch_size: 32,
tasks_limit: 3,
max_retries: 3
max_retries: 3,
enable_cleaning: false,
cleaning_provider_id: null as string | null
})
// Initialize upload settings
@@ -249,10 +309,31 @@ const initUploadSettings = () => {
chunk_overlap: props.kb?.chunk_overlap || null,
batch_size: 32,
tasks_limit: 3,
max_retries: 3
max_retries: 3,
enable_cleaning: false,
cleaning_provider_id: null
}
}
const isUploadDisabled = computed(() => {
if (uploading.value) {
return true
}
if (uploadMode.value === 'file') {
return selectedFiles.value.length === 0
}
if (uploadMode.value === 'url') {
if (!uploadUrl.value) {
return true
}
if (uploadSettings.value.enable_cleaning && !uploadSettings.value.cleaning_provider_id) {
return true
}
return false
}
return true
})
// Table columns
const headers = [
{ title: t('documents.name'), key: 'doc_name', sortable: true },
@@ -314,8 +395,17 @@ const handleDrop = (event: DragEvent) => {
}
}
// Upload documents
const uploadDocument = async () => {
// Upload dispatcher
const startUpload = async () => {
if (uploadMode.value === 'file') {
await uploadFiles()
} else if (uploadMode.value === 'url') {
await uploadFromUrl()
}
}
// Upload files
const uploadFiles = async () => {
if (selectedFiles.value.length === 0) {
showSnackbar(t('upload.fileRequired'), 'warning')
return
@@ -390,6 +480,80 @@ const uploadDocument = async () => {
}
}
// Upload from a URL
const uploadFromUrl = async () => {
if (!uploadUrl.value) {
showSnackbar(t('upload.urlRequired'), 'warning')
return
}
uploading.value = true
try {
const payload: any = {
kb_id: props.kbId,
url: uploadUrl.value,
batch_size: uploadSettings.value.batch_size,
tasks_limit: uploadSettings.value.tasks_limit,
max_retries: uploadSettings.value.max_retries
}
if (uploadSettings.value.chunk_size) {
payload.chunk_size = uploadSettings.value.chunk_size
}
if (uploadSettings.value.chunk_overlap) {
payload.chunk_overlap = uploadSettings.value.chunk_overlap
}
if (uploadSettings.value.enable_cleaning) {
payload.enable_cleaning = true
if (uploadSettings.value.cleaning_provider_id) {
payload.cleaning_provider_id = uploadSettings.value.cleaning_provider_id
}
}
const response = await axios.post('/api/kb/document/upload/url', payload)
if (response.data.status === 'ok') {
const result = response.data.data
const taskId = result.task_id
showSnackbar('Extracting content from the URL in the background...', 'info')
// Add a placeholder entry
const uploadingDoc = {
doc_id: `uploading_${taskId}_0`,
doc_name: result.url,
file_type: 'url',
file_size: 0, // URL has no size
chunk_count: 0,
created_at: new Date().toISOString(),
uploading: true,
taskId: taskId,
uploadProgress: {
stage: 'waiting',
current: 0,
total: 100
}
}
documents.value = [uploadingDoc, ...documents.value]
closeUploadDialog()
if (taskId) {
startProgressPolling(taskId)
}
} else {
showSnackbar(response.data.message || t('documents.uploadFailed'), 'error')
}
} catch (error: any) {
console.error('Failed to upload from URL:', error)
const message = error.response?.data?.message || t('documents.uploadFailed')
showSnackbar(message, 'error')
} finally {
uploading.value = false
}
}
// Start polling progress
const startProgressPolling = (taskId: string) => {
// If polling is already running, stop it first
@@ -490,6 +654,8 @@ const getUploadPercentage = (item: any) => {
const getStageText = (stage: string) => {
const stageMap: Record<string, string> = {
'waiting': 'Waiting...',
'extracting': 'Extracting content...',
'cleaning': 'Cleaning content...',
'parsing': 'Parsing document...',
'chunking': 'Chunking text...',
'embedding': 'Generating embeddings...'
@@ -501,6 +667,8 @@ const getStageText = (stage: string) => {
const closeUploadDialog = () => {
showUploadDialog.value = false
selectedFiles.value = []
uploadUrl.value = ''
uploadMode.value = 'file'
initUploadSettings()
}
@@ -551,6 +719,7 @@ const getFileIcon = (fileType: string) => {
if (type.includes('pdf')) return 'mdi-file-pdf-box'
if (type.includes('md') || type.includes('markdown')) return 'mdi-language-markdown'
if (type.includes('txt')) return 'mdi-file-document-outline'
if (type.includes('url')) return 'mdi-link-variant'
return 'mdi-file'
}
@@ -559,6 +728,7 @@ const getFileColor = (fileType: string) => {
if (type.includes('pdf')) return 'error'
if (type.includes('md')) return 'info'
if (type.includes('txt')) return 'success'
if (type.includes('url')) return 'primary'
return 'grey'
}
@@ -585,8 +755,53 @@ const formatDate = (dateStr: string) => {
})
}
// Load LLM providers
const loadLlmProviders = async () => {
try {
const response = await axios.get('/api/config/provider/list', {
params: { provider_type: 'chat_completion' }
})
if (response.data.status === 'ok') {
llmProviders.value = response.data.data
}
} catch (error) {
console.error('Failed to load LLM providers:', error)
}
}
// Check the Tavily Key configuration
const checkTavilyConfig = async () => {
tavilyConfigStatus.value = 'loading'
try {
const response = await axios.get('/api/config/abconf', {
params: { id: 'default' }
})
if (response.data.status === 'ok') {
const config = response.data.data.config
const tavilyKeys = config?.provider_settings?.websearch_tavily_key
if (Array.isArray(tavilyKeys) && tavilyKeys.length > 0 && tavilyKeys.some(key => key.trim() !== '')) {
tavilyConfigStatus.value = 'configured'
} else {
tavilyConfigStatus.value = 'not_configured'
}
} else {
tavilyConfigStatus.value = 'error'
}
} catch (error) {
console.warn('Failed to check Tavily key config:', error)
tavilyConfigStatus.value = 'error'
}
}
const onTavilyKeySet = () => {
showSnackbar('Tavily API Key configured successfully', 'success')
checkTavilyConfig()
}
onMounted(() => {
loadDocuments()
loadLlmProviders()
checkTavilyConfig()
})
onUnmounted(() => {

View File

@@ -0,0 +1,109 @@
<template>
<v-dialog v-model="dialog" max-width="500px" persistent>
<v-card>
<v-card-title class="text-h5">
Configure Tavily API Key
</v-card-title>
<v-card-text>
<p class="mb-4 text-body-2 text-medium-emphasis">
A Tavily API Key is required to use the web-based knowledge base features. You can obtain one from the <a href="https://tavily.com/" target="_blank">Tavily website</a>.
</p>
<v-text-field
v-model="apiKey"
label="Tavily API Key"
variant="outlined"
:loading="saving"
:error-messages="errorMessage"
autofocus
clearable
placeholder="tvly-..."
/>
</v-card-text>
<v-card-actions>
<v-spacer />
<v-btn variant="text" @click="closeDialog" :disabled="saving">
Cancel
</v-btn>
<v-btn color="primary" variant="elevated" @click="saveKey" :loading="saving">
Save
</v-btn>
</v-card-actions>
</v-card>
</v-dialog>
</template>
<script setup lang="ts">
import { ref, watch } from 'vue'
import axios from 'axios'
const props = defineProps<{
modelValue: boolean
}>()
const emit = defineEmits(['update:modelValue', 'success'])
const dialog = ref(props.modelValue)
const apiKey = ref('')
const saving = ref(false)
const errorMessage = ref('')
watch(() => props.modelValue, (val) => {
dialog.value = val
if (val) {
// Reset state when dialog opens
apiKey.value = ''
errorMessage.value = ''
saving.value = false
}
})
const closeDialog = () => {
emit('update:modelValue', false)
}
const saveKey = async () => {
if (!apiKey.value.trim()) {
errorMessage.value = 'API Key must not be empty'
return
}
errorMessage.value = ''
saving.value = true
try {
// 1. Fetch the current configuration
const configResponse = await axios.get('/api/config/abconf', {
params: { id: 'default' }
})
if (configResponse.data.status !== 'ok') {
throw new Error('Failed to fetch the current configuration')
}
const currentConfig = configResponse.data.data.config
// 2. Update the configuration
if (!currentConfig.provider_settings) {
currentConfig.provider_settings = {}
}
currentConfig.provider_settings.websearch_tavily_key = [apiKey.value.trim()]
// Also set the search provider to tavily
currentConfig.provider_settings.websearch_provider = 'tavily'
// 3. Save the whole configuration
const saveResponse = await axios.post('/api/config/astrbot/update', {
conf_id: 'default',
config: currentConfig
})
if (saveResponse.data.status === 'ok') {
emit('success')
closeDialog()
} else {
errorMessage.value = saveResponse.data.message || 'Save failed; please check that the key is correct'
}
} catch (error: any) {
errorMessage.value = error.response?.data?.message || 'Save failed due to an unknown error'
} finally {
saving.value = false
}
}
</script>
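
For completeness, the same read-modify-write flow the dialog performs, sketched in Python against the two endpoints it calls (a minimal sketch; the base URL and key are placeholders and the component's error handling is omitted):

import asyncio
import aiohttp

async def set_tavily_key(base: str, api_key: str) -> None:
    async with aiohttp.ClientSession() as session:
        # 1. Fetch the current default configuration.
        async with session.get(
            f"{base}/api/config/abconf", params={"id": "default"}
        ) as resp:
            config = (await resp.json())["data"]["config"]
        # 2. Update provider_settings in place, as the dialog does.
        settings = config.setdefault("provider_settings", {})
        settings["websearch_tavily_key"] = [api_key.strip()]
        settings["websearch_provider"] = "tavily"
        # 3. Write the whole configuration back.
        await session.post(
            f"{base}/api/config/astrbot/update",
            json={"conf_id": "default", "config": config},
        )

asyncio.run(set_tavily_key("http://127.0.0.1:6185", "tvly-..."))  # placeholder values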