feat: Add URL document parser for knowledge base (#3622)

* feat: add document upload from URL, with progress callbacks and error handling
* feat: add the frontend for uploading documents from a URL
* chore: add a warning for the URL upload feature to ensure users configure it correctly
* feat: add content cleaning for URL uploads, with cleaning settings and service-provider selection
* feat: update the content-cleaning system prompt with stronger extraction rules; mark URL upload as beta
* style: format code
* perf: improve the upload settings, tightening the disabled logic and cleaning-provider validation for URL uploads
* refactor: use the built-in chunking module
* refactor: extract prompts into a separate file
* feat: add a Tavily API Key configuration dialog for a smoother web-search setup experience
* fix: update URL hint and warning messages for clarity in knowledge base upload settings
* fix: fix hot reloading of the tavily_key setting

Co-authored-by: Soulter <905617992@qq.com>
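Taken together, the backend changes below add a fire-and-forget ingestion API. A minimal client sketch of the flow — POST to create the task, then poll for progress — looks like this. The endpoint paths, body fields, and response wrapper come from the route changes in this commit; the base URL, the kb_id/provider values, and the task_id query-parameter name for the progress route are illustrative assumptions.

import time

import requests

BASE = "http://localhost:6185/api"  # assumed dashboard address

# Create a background URL-ingestion task.
resp = requests.post(f"{BASE}/kb/document/upload/url", json={
    "kb_id": "my-kb-id",                        # required
    "url": "https://example.com/article",       # required
    "chunk_size": 512,                          # optional (default 512)
    "enable_cleaning": True,                    # optional LLM-based cleaning
    "cleaning_provider_id": "my-llm-provider",  # hypothetical provider id
}).json()
task_id = resp["data"]["task_id"]

# Poll the progress route until the task settles.
while True:
    progress = requests.get(
        f"{BASE}/kb/document/upload/progress",
        params={"task_id": task_id},  # assumed parameter name
    ).json()["data"]
    print(progress.get("stage"), progress.get("current"), progress.get("total"))
    if progress.get("status") in ("completed", "failed"):
        break
    time.sleep(1)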
@@ -1,4 +1,7 @@
import asyncio
import json
import re
import time
import uuid
from pathlib import Path

@@ -8,12 +11,98 @@ from astrbot.core import logger
from astrbot.core.db.vec_db.base import BaseVecDB
from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB
from astrbot.core.provider.manager import ProviderManager
from astrbot.core.provider.provider import (
    EmbeddingProvider,
    RerankProvider,
)
from astrbot.core.provider.provider import (
    Provider as LLMProvider,
)

from .chunking.base import BaseChunker
from .chunking.recursive import RecursiveCharacterChunker
from .kb_db_sqlite import KBSQLiteDatabase
from .models import KBDocument, KBMedia, KnowledgeBase
from .parsers.url_parser import extract_text_from_url
from .parsers.util import select_parser
from .prompts import TEXT_REPAIR_SYSTEM_PROMPT


class RateLimiter:
    """A simple rate limiter."""

    def __init__(self, max_rpm: int):
        self.max_per_minute = max_rpm
        self.interval = 60.0 / max_rpm if max_rpm > 0 else 0
        self.last_call_time = 0

    async def __aenter__(self):
        if self.interval == 0:
            return

        now = time.monotonic()
        elapsed = now - self.last_call_time

        if elapsed < self.interval:
            await asyncio.sleep(self.interval - elapsed)

        self.last_call_time = time.monotonic()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        pass
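The limiter spaces calls at least 60 / max_rpm seconds apart: entering the context sleeps until the interval since the previous call has elapsed, and __aexit__ is a no-op. A minimal usage sketch (the provider object is a stand-in):

limiter = RateLimiter(max_rpm=60)  # at most roughly one call per second

async def paced_call(prompt: str):
    async with limiter:  # sleeps if the previous call was under 1s ago
        return await provider.text_chat(prompt=prompt)  # hypothetical provider

Note the pacing is approximate under heavy concurrency: several tasks can read last_call_time before any of them updates it.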
async def _repair_and_translate_chunk_with_retry(
    chunk: str,
    repair_llm_service: LLMProvider,
    rate_limiter: RateLimiter,
    max_retries: int = 2,
) -> list[str]:
    """
    Repairs, translates, and optionally re-chunks a single text chunk using the small LLM, with rate limiting.
    """
    # To guard against LLM context contamination, repeat an explicit instruction in the user prompt as well.
    user_prompt = f"""IGNORE ALL PREVIOUS INSTRUCTIONS. Your ONLY task is to process the following text chunk according to the system prompt provided.

Text chunk to process:
---
{chunk}
---
"""
    for attempt in range(max_retries + 1):
        try:
            async with rate_limiter:
                response = await repair_llm_service.text_chat(
                    prompt=user_prompt, system_prompt=TEXT_REPAIR_SYSTEM_PROMPT
                )

            llm_output = response.completion_text

            if "<discard_chunk />" in llm_output:
                return []  # Signal to discard this chunk

            # More robust regex to handle potential LLM formatting errors (spaces, newlines in tags)
            matches = re.findall(
                r"<\s*repaired_text\s*>\s*(.*?)\s*<\s*/\s*repaired_text\s*>",
                llm_output,
                re.DOTALL,
            )

            if matches:
                # Further cleaning to ensure no empty strings are returned
                return [m.strip() for m in matches if m.strip()]
            else:
                # If no valid tags and not explicitly discarded, discard it to be safe.
                return []
        except Exception as e:
            logger.warning(
                f" - LLM call failed on attempt {attempt + 1}/{max_retries + 1}. Error: {str(e)}"
            )

    logger.error(
        f" - Failed to process chunk after {max_retries + 1} attempts. Using original text."
    )
    return [chunk]
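To illustrate the tag contract this function parses, here is the regex applied to a reply that splits one chunk into two topics (the sample reply mirrors Example 3 of the system prompt defined in prompts.py below):

sample_reply = """<repaired_text>
## Chapter 1: The Sun
The Sun is the star at the center of the Solar System.
</repaired_text>
<repaired_text>
## Chapter 2: The Moon
The Moon is Earth's only natural satellite.
</repaired_text>"""

pattern = r"<\s*repaired_text\s*>\s*(.*?)\s*<\s*/\s*repaired_text\s*>"
parts = re.findall(pattern, sample_reply, re.DOTALL)
assert len(parts) == 2  # each topic becomes its own chunk downstream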
class KBHelper:
@@ -100,7 +189,7 @@ class KBHelper:
    async def upload_document(
        self,
        file_name: str,
        file_content: bytes | None,
        file_type: str,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
@@ -108,6 +197,7 @@ class KBHelper:
        tasks_limit: int = 3,
        max_retries: int = 3,
        progress_callback=None,
        pre_chunked_text: list[str] | None = None,
    ) -> KBDocument:
        """Upload and process a document (with atomicity guarantees and failure cleanup)
@@ -130,46 +220,63 @@ class KBHelper:
        await self._ensure_vec_db()
        doc_id = str(uuid.uuid4())
        media_paths: list[Path] = []
        file_size = 0

        # file_path = self.kb_files_dir / f"{doc_id}.{file_type}"
        # async with aiofiles.open(file_path, "wb") as f:
        #     await f.write(file_content)

        try:
            chunks_text = []
            saved_media = []

            if pre_chunked_text is not None:
                # If pre-chunked text was provided, use it directly
                chunks_text = pre_chunked_text
                file_size = sum(len(chunk) for chunk in chunks_text)
                logger.info(f"Uploading with pre-chunked text: {len(chunks_text)} chunks.")
            else:
                # Otherwise, run the standard parse-and-chunk flow
                if file_content is None:
                    raise ValueError(
                        "file_content must not be None when pre_chunked_text is not provided."
                    )

                file_size = len(file_content)

                # Stage 1: parse the document
                if progress_callback:
                    await progress_callback("parsing", 0, 100)

                parser = await select_parser(f".{file_type}")
                parse_result = await parser.parse(file_content, file_name)
                text_content = parse_result.text
                media_items = parse_result.media

                if progress_callback:
                    await progress_callback("parsing", 100, 100)

                # Save media files
                for media_item in media_items:
                    media = await self._save_media(
                        doc_id=doc_id,
                        media_type=media_item.media_type,
                        file_name=media_item.file_name,
                        content=media_item.content,
                        mime_type=media_item.mime_type,
                    )
                    saved_media.append(media)
                    media_paths.append(Path(media.file_path))

                # Stage 2: chunking
                if progress_callback:
                    await progress_callback("chunking", 0, 100)

                chunks_text = await self.chunker.chunk(
                    text_content,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                )

            contents = []
            metadatas = []
            for idx, chunk_text in enumerate(chunks_text):
@@ -205,7 +312,7 @@ class KBHelper:
                kb_id=self.kb.kb_id,
                doc_name=file_name,
                file_type=file_type,
                file_size=file_size,
                # file_path=str(file_path),
                file_path="",
                chunk_count=len(chunks_text),
@@ -359,3 +466,177 @@ class KBHelper:
            )

        return media

    async def upload_from_url(
        self,
        url: str,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        batch_size: int = 32,
        tasks_limit: int = 3,
        max_retries: int = 3,
        progress_callback=None,
        enable_cleaning: bool = False,
        cleaning_provider_id: str | None = None,
    ) -> KBDocument:
        """Upload and process a document from a URL (with atomicity guarantees and failure cleanup)

        Args:
            url: URL of the web page to extract content from
            chunk_size: text chunk size
            chunk_overlap: text chunk overlap
            batch_size: batch size
            tasks_limit: concurrency limit
            max_retries: maximum number of retries
            progress_callback: progress callback taking (stage, current, total)
                - stage: current stage ('extracting', 'cleaning', 'parsing', 'chunking', 'embedding')
                - current: current progress
                - total: total
        Returns:
            KBDocument: the uploaded document object
        Raises:
            ValueError: if the URL is empty or no content could be extracted
            IOError: if the network request fails
        """
        # Fetch the Tavily API keys
        config = self.prov_mgr.acm.default_conf
        tavily_keys = config.get("provider_settings", {}).get(
            "websearch_tavily_key", []
        )
        if not tavily_keys:
            raise ValueError(
                "Error: Tavily API key is not configured in provider_settings."
            )

        # Stage 1: extract content from the URL
        if progress_callback:
            await progress_callback("extracting", 0, 100)

        try:
            text_content = await extract_text_from_url(url, tavily_keys)
        except Exception as e:
            logger.error(f"Failed to extract content from URL {url}: {e}")
            raise OSError(f"Failed to extract content from URL {url}: {e}") from e

        if not text_content:
            raise ValueError(f"No content extracted from URL: {url}")

        if progress_callback:
            await progress_callback("extracting", 100, 100)

        # Stage 2: (optionally) clean the content and chunk it
        final_chunks = await self._clean_and_rechunk_content(
            content=text_content,
            url=url,
            progress_callback=progress_callback,
            enable_cleaning=enable_cleaning,
            cleaning_provider_id=cleaning_provider_id,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        if enable_cleaning and not final_chunks:
            raise ValueError(
                "Content cleaning produced no usable text. Try disabling content "
                "cleaning, or retry with a more capable LLM model."
            )

        # Derive a synthetic file name
        file_name = url.split("/")[-1] or f"document_from_{url}"
        if not Path(file_name).suffix:
            file_name += ".url"

        # Reuse the existing upload_document method, passing the pre-chunked text
        return await self.upload_document(
            file_name=file_name,
            file_content=None,
            file_type="url",  # use 'url' as a special file type
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            batch_size=batch_size,
            tasks_limit=tasks_limit,
            max_retries=max_retries,
            progress_callback=progress_callback,
            pre_chunked_text=final_chunks,
        )
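The progress_callback accepted above is any awaitable taking (stage, current, total); a minimal sketch of one that just logs (the kb_helper call is commented out because it needs a live helper instance):

async def log_progress(stage: str, current: int, total: int) -> None:
    # stages seen in this flow include "extracting", "cleaning", and "embedding"
    print(f"[{stage}] {current}/{total}")

# doc = await kb_helper.upload_from_url(
#     url="https://example.com/article",
#     enable_cleaning=False,
#     progress_callback=log_progress,
# )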

    async def _clean_and_rechunk_content(
        self,
        content: str,
        url: str,
        progress_callback=None,
        enable_cleaning: bool = False,
        cleaning_provider_id: str | None = None,
        repair_max_rpm: int = 60,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
    ) -> list[str]:
        """
        Clean, repair, translate, and re-chunk content fetched from a URL.
        """
        if not enable_cleaning:
            # Cleaning disabled: chunk with the parameters passed from the frontend
            logger.info(
                f"Content cleaning disabled; chunking with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}"
            )
            return await self.chunker.chunk(
                content, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )

        if not cleaning_provider_id:
            logger.warning(
                "Content cleaning is enabled but no cleaning_provider_id was given; skipping cleaning and using default chunking."
            )
            return await self.chunker.chunk(content)

        if progress_callback:
            await progress_callback("cleaning", 0, 100)

        try:
            # Look up the requested LLM provider
            llm_provider = await self.prov_mgr.get_provider_by_id(cleaning_provider_id)
            if not llm_provider or not isinstance(llm_provider, LLMProvider):
                raise ValueError(
                    f"No LLM provider with id {cleaning_provider_id} was found, or it has the wrong type"
                )

            # Initial chunking.
            # Prefer paragraph separators to get higher-quality chunks.
            text_splitter = RecursiveCharacterChunker(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                separators=["\n\n", "\n", " "],  # paragraph separators first
            )
            initial_chunks = await text_splitter.chunk(content)
            logger.info(f"Initial chunking produced {len(initial_chunks)} chunks for repair.")

            # Process all chunks concurrently
            rate_limiter = RateLimiter(repair_max_rpm)
            tasks = [
                _repair_and_translate_chunk_with_retry(
                    chunk, llm_provider, rate_limiter
                )
                for chunk in initial_chunks
            ]

            repaired_results = await asyncio.gather(*tasks, return_exceptions=True)

            final_chunks = []
            for i, result in enumerate(repaired_results):
                if isinstance(result, Exception):
                    logger.warning(f"Chunk {i} raised: {str(result)}. Falling back to the original chunk.")
                    final_chunks.append(initial_chunks[i])
                elif isinstance(result, list):
                    final_chunks.extend(result)

            logger.info(
                f"Text repair finished: {len(initial_chunks)} original chunks -> {len(final_chunks)} final chunks."
            )

            if progress_callback:
                await progress_callback("cleaning", 100, 100)

            return final_chunks

        except Exception as e:
            logger.error(f"Content cleaning with provider '{cleaning_provider_id}' failed: {e}")
            # On cleaning failure, return the default chunking result so the flow is not interrupted
            return await self.chunker.chunk(content)
@@ -8,7 +8,7 @@ from astrbot.core.provider.manager import ProviderManager
from .chunking.recursive import RecursiveCharacterChunker
from .kb_db_sqlite import KBSQLiteDatabase
from .kb_helper import KBHelper
from .models import KBDocument, KnowledgeBase
from .retrieval.manager import RetrievalManager, RetrievalResult
from .retrieval.rank_fusion import RankFusion
from .retrieval.sparse_retriever import SparseRetriever
@@ -284,3 +284,47 @@ class KnowledgeBaseManager:
                await self.kb_db.close()
            except Exception as e:
                logger.error(f"Failed to close the knowledge base metadata database: {e}")

    async def upload_from_url(
        self,
        kb_id: str,
        url: str,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        batch_size: int = 32,
        tasks_limit: int = 3,
        max_retries: int = 3,
        progress_callback=None,
    ) -> KBDocument:
        """Upload a document from a URL into the given knowledge base

        Args:
            kb_id: knowledge base ID
            url: URL of the web page to extract content from
            chunk_size: text chunk size
            chunk_overlap: text chunk overlap
            batch_size: batch size
            tasks_limit: concurrency limit
            max_retries: maximum number of retries
            progress_callback: progress callback

        Returns:
            KBDocument: the uploaded document object

        Raises:
            ValueError: if the knowledge base does not exist or the URL is empty
            IOError: if the network request fails
        """
        kb_helper = await self.get_kb(kb_id)
        if not kb_helper:
            raise ValueError(f"Knowledge base with id {kb_id} not found.")

        return await kb_helper.upload_from_url(
            url=url,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            batch_size=batch_size,
            tasks_limit=tasks_limit,
            max_retries=max_retries,
            progress_callback=progress_callback,
        )
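From code that already holds a KnowledgeBaseManager, the whole pipeline reduces to one awaited call (a sketch; the manager instance and IDs are placeholders):

doc = await kb_manager.upload_from_url(
    kb_id="my-kb-id",
    url="https://example.com/article",
    chunk_size=512,
)
print(doc.chunk_count)  # KBDocument fields come from the models module

Note this wrapper does not expose enable_cleaning or cleaning_provider_id; the dashboard route below calls KBHelper.upload_from_url directly when cleaning is requested.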
astrbot/core/knowledge_base/parsers/url_parser.py (new file, +103)
@@ -0,0 +1,103 @@
import asyncio

import aiohttp


class URLExtractor:
    """URL content extractor wrapping the Tavily API calls and key management."""

    def __init__(self, tavily_keys: list[str]):
        """
        Initialize the URL extractor

        Args:
            tavily_keys: list of Tavily API keys
        """
        if not tavily_keys:
            raise ValueError("Error: Tavily API keys are not configured.")

        self.tavily_keys = tavily_keys
        self.tavily_key_index = 0
        self.tavily_key_lock = asyncio.Lock()

    async def _get_tavily_key(self) -> str:
        """Fetch and rotate a Tavily API key from the list, safely under concurrency."""
        async with self.tavily_key_lock:
            key = self.tavily_keys[self.tavily_key_index]
            self.tavily_key_index = (self.tavily_key_index + 1) % len(self.tavily_keys)
            return key

    async def extract_text_from_url(self, url: str) -> str:
        """
        Extract the main text content from a URL using the Tavily API.
        This is a simplified version of the tavily_extract_web_page method in the
        web_searcher plugin, built specifically for the knowledge base module and
        independent of AstrMessageEvent.

        Args:
            url: URL of the web page to extract content from

        Returns:
            The extracted text content

        Raises:
            ValueError: if the URL is empty or no API key is configured
            IOError: if the request fails or returns an error
        """
        if not url:
            raise ValueError("Error: url must be a non-empty string.")

        tavily_key = await self._get_tavily_key()
        api_url = "https://api.tavily.com/extract"
        headers = {
            "Authorization": f"Bearer {tavily_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "urls": [url],
            "extract_depth": "basic",  # use the basic extraction depth
        }

        try:
            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.post(
                    api_url,
                    json=payload,
                    headers=headers,
                    timeout=30.0,  # longer timeout, since content extraction can take a while
                ) as response:
                    if response.status != 200:
                        reason = await response.text()
                        raise OSError(
                            f"Tavily web extraction failed: {reason}, status: {response.status}"
                        )

                    data = await response.json()
                    results = data.get("results", [])

                    if not results:
                        raise ValueError(f"No content extracted from URL: {url}")

                    # Return the content of the first result
                    return results[0].get("raw_content", "")

        except aiohttp.ClientError as e:
            raise OSError(f"Failed to fetch URL {url}: {e}") from e
        except Exception as e:
            raise OSError(f"Failed to extract content from URL {url}: {e}") from e


# For backward compatibility, provide a simple function interface
async def extract_text_from_url(url: str, tavily_keys: list[str]) -> str:
    """
    Simple function interface for extracting text content from a URL

    Args:
        url: URL of the web page to extract content from
        tavily_keys: list of Tavily API keys

    Returns:
        The extracted text content
    """
    extractor = URLExtractor(tavily_keys)
    return await extractor.extract_text_from_url(url)
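The module-level helper hides the key rotation, so a one-off extraction is a single await (a sketch; the URL and key are placeholders):

text = await extract_text_from_url(
    "https://example.com/article",
    tavily_keys=["tvly-..."],  # rotation only matters when several keys are configured
)

Per the response handling above, the extractor expects Tavily's /extract endpoint to return a JSON body with a results list whose first entry carries the page text under raw_content.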
astrbot/core/knowledge_base/prompts.py (new file, +65)
@@ -0,0 +1,65 @@
TEXT_REPAIR_SYSTEM_PROMPT = """You are a meticulous digital archivist. Your mission is to reconstruct a clean, readable article from raw, noisy text chunks.

**Core Task:**
1. **Analyze:** Examine the text chunk to separate "signal" (substantive information) from "noise" (UI elements, ads, navigation, footers).
2. **Process:** Clean and repair the signal. **Do not translate it.** Keep the original language.

**Crucial Rules:**
- **NEVER discard a chunk if it contains ANY valuable information.** Your primary duty is to salvage content.
- **If a chunk contains multiple distinct topics, split them.** Enclose each topic in its own `<repaired_text>` tag.
- Your output MUST be ONLY `<repaired_text>...</repaired_text>` tags or a single `<discard_chunk />` tag.

---
**Example 1: Chunk with Noise and Signal**

*Input Chunk:*
"Home | About | Products | **The Llama is a domesticated South American camelid.** | © 2025 ACME Corp."

*Your Thought Process:*
1. "Home | About | Products..." and "© 2025 ACME Corp." are noise.
2. "The Llama is a domesticated..." is the signal.
3. I must extract the signal and wrap it.

*Your Output:*
<repaired_text>
The Llama is a domesticated South American camelid.
</repaired_text>

---
**Example 2: Chunk with ONLY Noise**

*Input Chunk:*
"Next Page > | Subscribe to our newsletter | Follow us on X"

*Your Thought Process:*
1. This entire chunk is noise. There is no signal.
2. I must discard this.

*Your Output:*
<discard_chunk />

---
**Example 3: Chunk with Multiple Topics (Requires Splitting)**

*Input Chunk:*
"## Chapter 1: The Sun
The Sun is the star at the center of the Solar System.

## Chapter 2: The Moon
The Moon is Earth's only natural satellite."

*Your Thought Process:*
1. This chunk contains two distinct topics.
2. I must process them separately to maintain semantic integrity.
3. I will create two `<repaired_text>` blocks.

*Your Output:*
<repaired_text>
## Chapter 1: The Sun
The Sun is the star at the center of the Solar System.
</repaired_text>
<repaired_text>
## Chapter 2: The Moon
The Moon is Earth's only natural satellite.
</repaired_text>
"""
@@ -48,6 +48,7 @@ class KnowledgeBaseRoute(Route):
            # Document management
            "/kb/document/list": ("GET", self.list_documents),
            "/kb/document/upload": ("POST", self.upload_document),
            "/kb/document/upload/url": ("POST", self.upload_document_from_url),
            "/kb/document/upload/progress": ("GET", self.get_upload_progress),
            "/kb/document/get": ("GET", self.get_document),
            "/kb/document/delete": ("POST", self.delete_document),
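Given how the background task below populates self.upload_progress, polling the progress route during a URL task should yield entries shaped roughly like this (a sketch of the dict the handler maintains, not a verbatim API trace):

progress_entry = {
    "status": "processing",   # later "completed" or "failed"
    "file_index": 0,
    "file_total": 1,          # a URL task always counts as a single "file"
    "file_name": "URL: https://example.com/article",
    "stage": "extracting",    # e.g. "extracting", "cleaning", "embedding"
    "current": 0,
    "total": 100,
}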
@@ -1070,3 +1071,174 @@ class KnowledgeBaseRoute(Route):
            logger.error(f"Failed to delete the session knowledge-base config: {e}")
            logger.error(traceback.format_exc())
            return Response().error(f"Failed to delete the session knowledge-base config: {e!s}").__dict__

    async def upload_document_from_url(self):
        """Upload a document from a URL

        Body:
            - kb_id: knowledge base ID (required)
            - url: URL of the web page to extract content from (required)
            - chunk_size: chunk size (optional, default 512)
            - chunk_overlap: chunk overlap (optional, default 50)
            - batch_size: batch size (optional, default 32)
            - tasks_limit: concurrency limit (optional, default 3)
            - max_retries: maximum number of retries (optional, default 3)

        Returns:
            - task_id: task ID for querying upload progress and the result
        """
        try:
            kb_manager = self._get_kb_manager()
            data = await request.json

            kb_id = data.get("kb_id")
            if not kb_id:
                return Response().error("Missing parameter kb_id").__dict__

            url = data.get("url")
            if not url:
                return Response().error("Missing parameter url").__dict__

            chunk_size = data.get("chunk_size", 512)
            chunk_overlap = data.get("chunk_overlap", 50)
            batch_size = data.get("batch_size", 32)
            tasks_limit = data.get("tasks_limit", 3)
            max_retries = data.get("max_retries", 3)
            enable_cleaning = data.get("enable_cleaning", False)
            cleaning_provider_id = data.get("cleaning_provider_id")

            # Look up the knowledge base
            kb_helper = await kb_manager.get_kb(kb_id)
            if not kb_helper:
                return Response().error("Knowledge base not found").__dict__

            # Generate a task ID
            task_id = str(uuid.uuid4())

            # Initialize the task state
            self.upload_tasks[task_id] = {
                "status": "pending",
                "result": None,
                "error": None,
            }

            # Start the background task
            asyncio.create_task(
                self._background_upload_from_url_task(
                    task_id=task_id,
                    kb_helper=kb_helper,
                    url=url,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                    batch_size=batch_size,
                    tasks_limit=tasks_limit,
                    max_retries=max_retries,
                    enable_cleaning=enable_cleaning,
                    cleaning_provider_id=cleaning_provider_id,
                ),
            )

            return (
                Response()
                .ok(
                    {
                        "task_id": task_id,
                        "url": url,
                        "message": "URL upload task created, processing in background",
                    },
                )
                .__dict__
            )

        except ValueError as e:
            return Response().error(str(e)).__dict__
        except Exception as e:
            logger.error(f"Failed to upload document from URL: {e}")
            logger.error(traceback.format_exc())
            return Response().error(f"Failed to upload document from URL: {e!s}").__dict__

    async def _background_upload_from_url_task(
        self,
        task_id: str,
        kb_helper,
        url: str,
        chunk_size: int,
        chunk_overlap: int,
        batch_size: int,
        tasks_limit: int,
        max_retries: int,
        enable_cleaning: bool,
        cleaning_provider_id: str | None,
    ):
        """Background task for uploading from a URL"""
        try:
            # Initialize the task state
            self.upload_tasks[task_id] = {
                "status": "processing",
                "result": None,
                "error": None,
            }
            self.upload_progress[task_id] = {
                "status": "processing",
                "file_index": 0,
                "file_total": 1,
                "file_name": f"URL: {url}",
                "stage": "extracting",
                "current": 0,
                "total": 100,
            }

            # Progress callback
            async def progress_callback(stage, current, total):
                if task_id in self.upload_progress:
                    self.upload_progress[task_id].update(
                        {
                            "status": "processing",
                            "file_index": 0,
                            "file_name": f"URL: {url}",
                            "stage": stage,
                            "current": current,
                            "total": total,
                        },
                    )

            # Upload the document
            doc = await kb_helper.upload_from_url(
                url=url,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                batch_size=batch_size,
                tasks_limit=tasks_limit,
                max_retries=max_retries,
                progress_callback=progress_callback,
                enable_cleaning=enable_cleaning,
                cleaning_provider_id=cleaning_provider_id,
            )

            # Mark the task as completed
            result = {
                "task_id": task_id,
                "uploaded": [doc.model_dump()],
                "failed": [],
                "total": 1,
                "success_count": 1,
                "failed_count": 0,
            }

            self.upload_tasks[task_id] = {
                "status": "completed",
                "result": result,
                "error": None,
            }
            self.upload_progress[task_id]["status"] = "completed"

        except Exception as e:
            logger.error(f"Background URL upload task {task_id} failed: {e}")
            logger.error(traceback.format_exc())
            self.upload_tasks[task_id] = {
                "status": "failed",
                "result": None,
                "error": str(e),
            }
            if task_id in self.upload_progress:
                self.upload_progress[task_id]["status"] = "failed"
@@ -4,6 +4,7 @@
    "tabs": {
      "overview": "Overview",
      "documents": "Documents",
      "retrieval": "Retrieval",
      "sessions": "Sessions",
      "settings": "Settings"
    },
@@ -49,6 +50,10 @@
    "maxSize": "Max file size: 128MB",
    "chunkSettings": "Chunk Settings",
    "batchSettings": "Batch Settings",
    "cleaningSettings": "Cleaning Settings",
    "enableCleaning": "Enable Content Cleaning",
    "cleaningProvider": "Cleaning Service Provider",
    "cleaningProviderHint": "Select an LLM provider to clean and summarize the extracted web page content",
    "chunkSize": "Chunk Size",
    "chunkSizeHint": "Number of characters per chunk (default: 512)",
    "chunkOverlap": "Chunk Overlap",
@@ -61,7 +66,13 @@
    "maxRetriesHint": "Number of times to retry a failed upload task (default: 3)",
    "cancel": "Cancel",
    "submit": "Upload",
    "fileRequired": "Please select a file to upload",
    "fileUpload": "File Upload",
    "fromUrl": "From URL",
    "urlPlaceholder": "Enter the URL of the web page to extract content from",
    "urlRequired": "Please enter a URL",
    "urlHint": "The main content will be automatically extracted from the target URL as a document. Currently supports {supported} pages. Before use, please ensure that the target web page allows crawler access.",
    "beta": "Beta"
  },
  "settings": {
    "title": "Knowledge Base Settings",
@@ -50,6 +50,10 @@
    "maxSize": "最大文件大小: 128MB",
    "chunkSettings": "分块设置",
    "batchSettings": "批处理设置",
    "cleaningSettings": "清洗设置",
    "enableCleaning": "启用内容清洗",
    "cleaningProvider": "清洗服务提供商",
    "cleaningProviderHint": "选择一个 LLM 服务商来对提取的网页内容进行清洗和总结",
    "chunkSize": "分块大小",
    "chunkSizeHint": "每个文本块的字符数 (默认: 512)",
    "chunkOverlap": "分块重叠",
@@ -62,7 +66,13 @@
    "maxRetriesHint": "上传失败任务的重试次数 (默认: 3)",
    "cancel": "取消",
    "submit": "上传",
    "fileRequired": "请选择要上传的文件",
    "fileUpload": "文件上传",
    "fromUrl": "从 URL",
    "urlPlaceholder": "请输入要提取内容的网页 URL",
    "urlRequired": "请输入 URL",
    "urlHint": "将自动从目标 URL 提取主要内容作为文档。目前支持 {supported} 页面,请确保目标网页允许爬虫访问。",
    "beta": "测试版"
  },
  "retrieval": {
    "title": "知识库检索",
@@ -57,7 +57,7 @@
      </v-card>

      <!-- Upload dialog -->
      <v-dialog v-model="showUploadDialog" max-width="650px" persistent @after-enter="initUploadSettings">
        <v-card>
          <v-card-title class="pa-4 d-flex align-center">
            <span class="text-h5">{{ t('upload.title') }}</span>
@@ -67,40 +67,91 @@

          <v-divider />

          <v-card-text class="pa-6 pt-2">
            <v-tabs v-model="uploadMode" grow class="mb-4">
              <v-tab value="file">{{ t('upload.fileUpload') }}</v-tab>
              <v-tab value="url">
                {{ t('upload.fromUrl') }}
                <v-badge color="warning" :content="t('upload.beta')" inline class="ml-2" />
              </v-tab>
            </v-tabs>

            <v-window v-model="uploadMode">
              <!-- File upload -->
              <v-window-item value="file">
                <!-- File picker -->
                <div class="upload-dropzone" :class="{ 'dragover': isDragging }" @drop.prevent="handleDrop"
                  @dragover.prevent="isDragging = true" @dragleave="isDragging = false" @click="fileInput?.click()">
                  <v-icon size="64" color="primary">mdi-cloud-upload</v-icon>
                  <p class="mt-4 text-h6">{{ t('upload.dropzone') }}</p>
                  <p class="text-caption text-medium-emphasis mt-2">{{ t('upload.supportedFormats') }}.txt, .md, .pdf,
                    .docx,
                    .xls, .xlsx</p>
                  <p class="text-caption text-medium-emphasis">{{ t('upload.maxSize') }}</p>
                  <p class="text-caption text-medium-emphasis">Up to 10 files can be uploaded</p>
                  <input ref="fileInput" type="file" multiple hidden accept=".txt,.md,.pdf,.docx,.xls,.xlsx"
                    @change="handleFileSelect" />
                </div>

                <div v-if="selectedFiles.length > 0" class="mt-4">
                  <div class="d-flex align-center justify-space-between mb-2">
                    <span class="text-subtitle-2">{{ selectedFiles.length }} file(s) selected</span>
                    <v-btn variant="text" size="small" @click="selectedFiles = []">Clear</v-btn>
                  </div>
                  <div class="files-list">
                    <div v-for="(file, index) in selectedFiles" :key="index"
                      class="file-item pa-3 mb-2 rounded bg-surface-variant">
                      <div class="d-flex align-center justify-space-between">
                        <div class="d-flex align-center gap-2">
                          <v-icon>{{ getFileIcon(file.name) }}</v-icon>
                          <div>
                            <div class="font-weight-medium">{{ file.name }}</div>
                            <div class="text-caption">{{ formatFileSize(file.size) }}</div>
                          </div>
                        </div>
                        <v-btn icon="mdi-close" variant="text" size="small" @click="removeFile(index)" />
                      </div>
                    </div>
                  </div>
                </div>
              </v-window-item>

              <!-- URL upload -->
              <v-window-item value="url" class="pt-2">
                <!-- Tavily key quick setup -->
                <div v-if="tavilyConfigStatus === 'not_configured' || tavilyConfigStatus === 'error'" class="mb-4">
                  <v-alert :type="tavilyConfigStatus === 'error' ? 'error' : 'info'" variant="tonal" density="compact">
                    <div class="d-flex align-center justify-space-between">
                      <span>
                        {{ tavilyConfigStatus === 'error' ? 'Failed to check the web-search configuration' : 'A Tavily Key must be configured to use this feature' }}
                      </span>
                      <v-btn size="small" variant="flat" @click="showTavilyDialog = true">
                        Configure
                      </v-btn>
                    </div>
                  </v-alert>
                </div>

                <v-text-field v-model="uploadUrl" :label="t('upload.urlPlaceholder')" variant="outlined" clearable
                  :disabled="tavilyConfigStatus === 'not_configured'"
                  autofocus :hint="t('upload.urlHint', { supported: 'HTML' })" persistent-hint />
              </v-window-item>
            </v-window>

            <!-- Cleaning settings (shown in URL mode only) -->
            <div v-if="uploadMode === 'url'" class="mt-6">
              <div class="d-flex align-center mb-4">
                <h3 class="text-h6">{{ t('upload.cleaningSettings') }}</h3>
              </div>
              <v-row>
                <v-col cols="12" sm="4">
                  <v-switch v-model="uploadSettings.enable_cleaning" :label="t('upload.enableCleaning')" color="primary" />
                </v-col>
                <v-col cols="12" sm="8">
                  <v-select v-model="uploadSettings.cleaning_provider_id" :items="llmProviders" item-title="id"
                    item-value="id" :label="t('upload.cleaningProvider')" :hint="t('upload.cleaningProviderHint')"
                    persistent-hint variant="outlined" density="compact" :disabled="!uploadSettings.enable_cleaning" />
                </v-col>
              </v-row>
            </div>

            <!-- Chunk settings -->
@@ -151,8 +202,8 @@
            <v-btn variant="text" @click="closeUploadDialog" :disabled="uploading">
              {{ t('upload.cancel') }}
            </v-btn>
            <v-btn color="primary" variant="elevated" @click="startUpload" :loading="uploading"
              :disabled="isUploadDisabled">
              {{ t('upload.submit') }}
            </v-btn>
          </v-card-actions>
@@ -185,11 +236,15 @@
    <v-snackbar v-model="snackbar.show" :color="snackbar.color">
      {{ snackbar.text }}
    </v-snackbar>

    <!-- Tavily key configuration dialog -->
    <TavilyKeyDialog v-model="showTavilyDialog" @success="onTavilyKeySet" />
  </div>
</template>

<script setup lang="ts">
import TavilyKeyDialog from './TavilyKeyDialog.vue'
import { ref, onMounted, onUnmounted, computed } from 'vue'
import { useRouter } from 'vue-router'
import axios from 'axios'
import { useModuleI18n } from '@/i18n/composables'
@@ -216,10 +271,13 @@ const selectedFiles = ref<File[]>([])
const deleteTarget = ref<any>(null)
const isDragging = ref(false)
const fileInput = ref<HTMLInputElement | null>(null)

// Upload progress - used to poll multiple tasks
const uploadMode = ref('file') // 'file' or 'url'
const uploadUrl = ref('')
const llmProviders = ref<any[]>([])
const uploadingTasks = ref<Map<string, any>>(new Map())
const progressPollingInterval = ref<number | null>(null)
const tavilyConfigStatus = ref('loading') // 'loading', 'configured', 'not_configured', 'error'
const showTavilyDialog = ref(false)

const snackbar = ref({
  show: false,
@@ -239,7 +297,9 @@ const uploadSettings = ref({
  chunk_overlap: null as number | null,
  batch_size: 32,
  tasks_limit: 3,
  max_retries: 3,
  enable_cleaning: false,
  cleaning_provider_id: null as string | null
})

// Initialize the upload settings
@@ -249,10 +309,31 @@ const initUploadSettings = () => {
    chunk_overlap: props.kb?.chunk_overlap || null,
    batch_size: 32,
    tasks_limit: 3,
    max_retries: 3,
    enable_cleaning: false,
    cleaning_provider_id: null
  }
}

const isUploadDisabled = computed(() => {
  if (uploading.value) {
    return true
  }
  if (uploadMode.value === 'file') {
    return selectedFiles.value.length === 0
  }
  if (uploadMode.value === 'url') {
    if (!uploadUrl.value) {
      return true
    }
    if (uploadSettings.value.enable_cleaning && !uploadSettings.value.cleaning_provider_id) {
      return true
    }
    return false
  }
  return true
})

// Table columns
const headers = [
  { title: t('documents.name'), key: 'doc_name', sortable: true },
@@ -314,8 +395,17 @@ const handleDrop = (event: DragEvent) => {
  }
}

// Upload dispatcher
const startUpload = async () => {
  if (uploadMode.value === 'file') {
    await uploadFiles()
  } else if (uploadMode.value === 'url') {
    await uploadFromUrl()
  }
}

// Upload files
const uploadFiles = async () => {
  if (selectedFiles.value.length === 0) {
    showSnackbar(t('upload.fileRequired'), 'warning')
    return
@@ -390,6 +480,80 @@ const uploadDocument = async () => {
  }
}

// Upload from URL
const uploadFromUrl = async () => {
  if (!uploadUrl.value) {
    showSnackbar(t('upload.urlRequired'), 'warning')
    return
  }

  uploading.value = true

  try {
    const payload: any = {
      kb_id: props.kbId,
      url: uploadUrl.value,
      batch_size: uploadSettings.value.batch_size,
      tasks_limit: uploadSettings.value.tasks_limit,
      max_retries: uploadSettings.value.max_retries
    }
    if (uploadSettings.value.chunk_size) {
      payload.chunk_size = uploadSettings.value.chunk_size
    }
    if (uploadSettings.value.chunk_overlap) {
      payload.chunk_overlap = uploadSettings.value.chunk_overlap
    }
    if (uploadSettings.value.enable_cleaning) {
      payload.enable_cleaning = true
      if (uploadSettings.value.cleaning_provider_id) {
        payload.cleaning_provider_id = uploadSettings.value.cleaning_provider_id
      }
    }

    const response = await axios.post('/api/kb/document/upload/url', payload)

    if (response.data.status === 'ok') {
      const result = response.data.data
      const taskId = result.task_id

      showSnackbar('Extracting content from the URL in the background...', 'info')

      // Add a placeholder entry
      const uploadingDoc = {
        doc_id: `uploading_${taskId}_0`,
        doc_name: result.url,
        file_type: 'url',
        file_size: 0, // URL has no size
        chunk_count: 0,
        created_at: new Date().toISOString(),
        uploading: true,
        taskId: taskId,
        uploadProgress: {
          stage: 'waiting',
          current: 0,
          total: 100
        }
      }

      documents.value = [uploadingDoc, ...documents.value]
      closeUploadDialog()

      if (taskId) {
        startProgressPolling(taskId)
      }
    } else {
      showSnackbar(response.data.message || t('documents.uploadFailed'), 'error')
    }
  } catch (error: any) {
    console.error('Failed to upload from URL:', error)
    const message = error.response?.data?.message || t('documents.uploadFailed')
    showSnackbar(message, 'error')
  } finally {
    uploading.value = false
  }
}

// Start polling for progress
const startProgressPolling = (taskId: string) => {
  // If polling is already running, stop it first
@@ -490,6 +654,8 @@ const getUploadPercentage = (item: any) => {
const getStageText = (stage: string) => {
  const stageMap: Record<string, string> = {
    'waiting': 'Waiting...',
    'extracting': 'Extracting content...',
    'cleaning': 'Cleaning content...',
    'parsing': 'Parsing document...',
    'chunking': 'Chunking text...',
    'embedding': 'Generating embeddings...'
@@ -501,6 +667,8 @@ const getStageText = (stage: string) => {
const closeUploadDialog = () => {
  showUploadDialog.value = false
  selectedFiles.value = []
  uploadUrl.value = ''
  uploadMode.value = 'file'
  initUploadSettings()
}
@@ -551,6 +719,7 @@ const getFileIcon = (fileType: string) => {
  if (type.includes('pdf')) return 'mdi-file-pdf-box'
  if (type.includes('md') || type.includes('markdown')) return 'mdi-language-markdown'
  if (type.includes('txt')) return 'mdi-file-document-outline'
  if (type.includes('url')) return 'mdi-link-variant'
  return 'mdi-file'
}

@@ -559,6 +728,7 @@ const getFileColor = (fileType: string) => {
  if (type.includes('pdf')) return 'error'
  if (type.includes('md')) return 'info'
  if (type.includes('txt')) return 'success'
  if (type.includes('url')) return 'primary'
  return 'grey'
}

@@ -585,8 +755,53 @@ const formatDate = (dateStr: string) => {
  })
}

// Load the LLM providers
const loadLlmProviders = async () => {
  try {
    const response = await axios.get('/api/config/provider/list', {
      params: { provider_type: 'chat_completion' }
    })
    if (response.data.status === 'ok') {
      llmProviders.value = response.data.data
    }
  } catch (error) {
    console.error('Failed to load LLM providers:', error)
  }
}

// Check the Tavily key configuration
const checkTavilyConfig = async () => {
  tavilyConfigStatus.value = 'loading'
  try {
    const response = await axios.get('/api/config/abconf', {
      params: { id: 'default' }
    })
    if (response.data.status === 'ok') {
      const config = response.data.data.config
      const tavilyKeys = config?.provider_settings?.websearch_tavily_key
      if (Array.isArray(tavilyKeys) && tavilyKeys.length > 0 && tavilyKeys.some(key => key.trim() !== '')) {
        tavilyConfigStatus.value = 'configured'
      } else {
        tavilyConfigStatus.value = 'not_configured'
      }
    } else {
      tavilyConfigStatus.value = 'error'
    }
  } catch (error) {
    console.warn('Failed to check Tavily key config:', error)
    tavilyConfigStatus.value = 'error'
  }
}

const onTavilyKeySet = () => {
  showSnackbar('Tavily API Key saved successfully', 'success')
  checkTavilyConfig()
}

onMounted(() => {
  loadDocuments()
  loadLlmProviders()
  checkTavilyConfig()
})

onUnmounted(() => {
TavilyKeyDialog.vue (new file, +109)
@@ -0,0 +1,109 @@
<template>
  <v-dialog v-model="dialog" max-width="500px" persistent>
    <v-card>
      <v-card-title class="text-h5">
        Configure Tavily API Key
      </v-card-title>
      <v-card-text>
        <p class="mb-4 text-body-2 text-medium-emphasis">
          A Tavily API Key is required for the web-based knowledge base features. You can obtain one from the <a href="https://tavily.com/" target="_blank">Tavily website</a>.
        </p>
        <v-text-field
          v-model="apiKey"
          label="Tavily API Key"
          variant="outlined"
          :loading="saving"
          :error-messages="errorMessage"
          autofocus
          clearable
          placeholder="tvly-..."
        />
      </v-card-text>
      <v-card-actions>
        <v-spacer />
        <v-btn variant="text" @click="closeDialog" :disabled="saving">
          Cancel
        </v-btn>
        <v-btn color="primary" variant="elevated" @click="saveKey" :loading="saving">
          Save
        </v-btn>
      </v-card-actions>
    </v-card>
  </v-dialog>
</template>

<script setup lang="ts">
import { ref, watch } from 'vue'
import axios from 'axios'

const props = defineProps<{
  modelValue: boolean
}>()

const emit = defineEmits(['update:modelValue', 'success'])

const dialog = ref(props.modelValue)
const apiKey = ref('')
const saving = ref(false)
const errorMessage = ref('')

watch(() => props.modelValue, (val) => {
  dialog.value = val
  if (val) {
    // Reset state when dialog opens
    apiKey.value = ''
    errorMessage.value = ''
    saving.value = false
  }
})

const closeDialog = () => {
  emit('update:modelValue', false)
}

const saveKey = async () => {
  if (!apiKey.value.trim()) {
    errorMessage.value = 'API Key must not be empty'
    return
  }
  errorMessage.value = ''
  saving.value = true
  try {
    // 1. Fetch the current configuration
    const configResponse = await axios.get('/api/config/abconf', {
      params: { id: 'default' }
    })

    if (configResponse.data.status !== 'ok') {
      throw new Error('Failed to fetch the current configuration')
    }

    const currentConfig = configResponse.data.data.config

    // 2. Update the configuration
    if (!currentConfig.provider_settings) {
      currentConfig.provider_settings = {}
    }
    currentConfig.provider_settings.websearch_tavily_key = [apiKey.value.trim()]
    // Also set the search provider to tavily
    currentConfig.provider_settings.websearch_provider = 'tavily'

    // 3. Save the whole configuration
    const saveResponse = await axios.post('/api/config/astrbot/update', {
      conf_id: 'default',
      config: currentConfig
    })

    if (saveResponse.data.status === 'ok') {
      emit('success')
      closeDialog()
    } else {
      errorMessage.value = saveResponse.data.message || 'Save failed; please check that the key is correct'
    }
  } catch (error: any) {
    errorMessage.value = error.response?.data?.message || 'Save failed due to an unknown error'
  } finally {
    saving.value = false
  }
}
</script>
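The dialog's save handler is a read-modify-write against the dashboard config API. Scripted from Python, the same sequence looks roughly like this (endpoints and body fields are taken from the component above; the base URL, auth handling, and key value are assumptions):

import requests

BASE = "http://localhost:6185/api"  # assumed dashboard address

conf = requests.get(f"{BASE}/config/abconf", params={"id": "default"}).json()["data"]["config"]
conf.setdefault("provider_settings", {})
conf["provider_settings"]["websearch_tavily_key"] = ["tvly-..."]  # placeholder key
conf["provider_settings"]["websearch_provider"] = "tavily"        # mirror the dialog's behavior
requests.post(f"{BASE}/config/astrbot/update", json={"conf_id": "default", "config": conf})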