Files
cherry-studio/src/main/knowledge/preprocess/Doc2xPreprocessProvider.ts
T
beyondkmp 4a62bb6ad7 refactor: replace axios and node fetch with electron's net module (#9212)
* refactor: replace axios and node fetch with electron's net module for network requests in preprocess providers

- Updated Doc2xPreprocessProvider and MineruPreprocessProvider to use net.fetch instead of axios for making HTTP requests.
- Improved error handling for network responses across various methods.
- Removed unnecessary AxiosRequestConfig and related code to streamline the implementation.

* lint

* refactor(Doc2xPreprocessProvider): enhance file validation and upload process

- Added file size validation to prevent loading files larger than 300MB into memory.
- Implemented file size check before reading the PDF to ensure efficient memory usage.
- Updated the file upload method to use a stream, setting the 'Content-Length' header for better handling of large files.

* refactor(brave-search): update net.fetch calls to use url.toString()

- Modified all instances of net.fetch to use url.toString() for better URL handling.
- Ensured consistency in how URLs are passed to the fetch method across various functions.

* refactor(MCPService): improve URL handling in net.fetch calls

- Updated net.fetch to use url.toString() for better type handling of URLs.
- Ensured consistent URL processing across the MCPService class.

* feat(ProxyManager): integrate axios with fetch proxy support

- Added axios as a dependency to enable fetch proxy usage.
- Implemented logic to set axios's adapter to 'fetch' for proxy handling.
- Preserved original axios adapter for restoration when disabling the proxy.
2025-08-15 22:48:22 +08:00

381 lines
11 KiB
TypeScript

import fs from 'node:fs'
import path from 'node:path'
import { loggerService } from '@logger'
import { fileStorage } from '@main/services/FileStorage'
import { FileMetadata, PreprocessProvider } from '@types'
import AdmZip from 'adm-zip'
import { net } from 'electron'
import BasePreprocessProvider from './BasePreprocessProvider'
// Module-level logger obtained from loggerService with the 'Doc2xPreprocessProvider' context string.
const logger = loggerService.withContext('Doc2xPreprocessProvider')
/**
 * Envelope shared by the JSON responses this file parses.
 * Callers in this file treat `code === 'success'` as the success marker.
 */
interface ApiResponse<T> {
  /** Result code reported by the API. */
  code: string
  /** Endpoint-specific payload. */
  data: T
  /** Optional message; used in error text when the call did not succeed. */
  message?: string
}

/** Payload of the preupload call. */
interface PreuploadResponse {
  /** Identifier used by the later status / convert / result calls. */
  uid: string
  /** URL the file is uploaded to with an HTTP PUT. */
  url: string
}

/** Payload of the parse-status call. */
interface StatusResponse {
  /** Job state; this file reacts to 'success' and 'failed'. */
  status: string
  /** Progress value, logged with a `%` suffix and forwarded as preprocess progress. */
  progress: number
}

/** Payload of the convert-result call. */
interface ParsedFileResponse {
  /** Export state; this file reacts to 'success' and 'failed'. */
  status: string
  /** URL the exported archive is downloaded from once the export succeeded. */
  url: string
}
/**
 * Preprocess provider that turns a PDF into Markdown through the Doc2x HTTP API.
 *
 * The flow implemented by {@link parseFile} is: validate the local file, request an
 * upload slot (preupload), PUT the file, poll until parsing finishes, request a
 * Markdown export, poll until the export is ready, then download and unzip the result.
 *
 * `this.provider`, `this.storageDir`, `this.readPdf`, `this.delay` and
 * `this.sendPreprocessProgress` come from BasePreprocessProvider, which is not
 * visible in this file.
 */
export default class Doc2xPreprocessProvider extends BasePreprocessProvider {
  constructor(provider: PreprocessProvider) {
    super(provider)
  }

  /**
   * Checks the local PDF against the size and page limits enforced here.
   *
   * @param filePath Path of the PDF on disk.
   * @throws Error when the file is 300 MiB or larger, or has 1000 pages or more.
   */
  private async validateFile(filePath: string): Promise<void> {
    // Check the size first so that oversized files are never read into memory.
    const stats = await fs.promises.stat(filePath)
    const fileSizeBytes = stats.size

    // File size must be below 300 MiB.
    if (fileSizeBytes >= 300 * 1024 * 1024) {
      const fileSizeMB = Math.round(fileSizeBytes / (1024 * 1024))
      throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 300MB`)
    }

    // Only now load the content to count pages.
    const pdfBuffer = await fs.promises.readFile(filePath)
    const doc = await this.readPdf(pdfBuffer)

    // Page count must be below 1000.
    if (doc.numPages >= 1000) {
      throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 1000 pages`)
    }
  }

  /**
   * Runs the whole preprocessing pipeline for one file.
   *
   * @param sourceId Identifier forwarded to the progress notifications.
   * @param file Metadata of the file to process.
   * @returns Metadata describing the generated Markdown file.
   * @throws Re-throws any error raised by a pipeline step after logging it.
   */
  public async parseFile(sourceId: string, file: FileMetadata): Promise<{ processedFile: FileMetadata }> {
    try {
      const filePath = fileStorage.getFilePathById(file)
      logger.info(`Preprocess processing started: ${filePath}`)

      // Validate locally before talking to the API, so a file that is too large
      // or has too many pages does not cost a preupload request.
      await this.validateFile(filePath)

      // Step 1: request an upload slot.
      const { uid, url } = await this.preupload()
      logger.info(`Preprocess preupload completed: uid=${uid}`)

      // Step 2: upload the file.
      await this.putFile(filePath, url)

      // Step 3: wait until parsing is done.
      await this.waitForProcessing(sourceId, uid)
      logger.info(`Preprocess parsing completed successfully for: ${filePath}`)

      // Step 4: export and download the result.
      const { path: outputPath } = await this.exportFile(file, uid)

      // Step 5: describe the processed file.
      return {
        processedFile: this.createProcessedFileInfo(file, outputPath)
      }
    } catch (error) {
      logger.error(`Preprocess processing failed for ${file.name}:`, error as Error)
      throw error
    }
  }

  /**
   * Builds the metadata of the Markdown file expected inside `outputPath`.
   *
   * The Markdown file name is the stem of `file.name` plus `.md`; the same value is
   * used for both `name` and `path` so the two can never disagree.
   *
   * @param file Metadata of the original file.
   * @param outputPath Directory the export was extracted to.
   * @returns A copy of `file` pointing at the Markdown output.
   * @throws If the expected Markdown file does not exist (from `fs.statSync`).
   */
  private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata {
    const markdownName = `${path.parse(file.name).name}.md`
    const outputFilePath = path.join(outputPath, markdownName)
    return {
      ...file,
      name: markdownName,
      path: outputFilePath,
      ext: '.md',
      size: fs.statSync(outputFilePath).size
    }
  }

  /**
   * Exports a parsed document as Markdown and downloads it.
   *
   * @param file Metadata of the original file.
   * @param uid The uid returned by the preupload call.
   * @returns The directory the exported archive was extracted to.
   */
  public async exportFile(file: FileMetadata, uid: string): Promise<{ path: string }> {
    const filePath = fileStorage.getFilePathById(file)
    logger.info(`Exporting file: ${filePath}`)

    // Step 1: request the conversion.
    await this.convertFile(uid, filePath)
    logger.info(`File conversion completed for: ${filePath}`)

    // Step 2: wait for the export and obtain its URL.
    const exportUrl = await this.waitForExport(uid)

    // Step 3: download and unzip.
    return this.downloadFile(exportUrl, file)
  }

  /**
   * Polls the parse status until it is 'success' or 'failed', forwarding progress
   * on every iteration.
   *
   * NOTE(review): the loop has no upper bound; it only ends on a terminal status or
   * when a status request throws.
   *
   * @param sourceId Identifier forwarded to the progress notifications.
   * @param uid The uid returned by the preupload call.
   * @throws Error when the reported status is 'failed'.
   */
  private async waitForProcessing(sourceId: string, uid: string): Promise<void> {
    while (true) {
      await this.delay(1000)
      const { status, progress } = await this.getStatus(uid)
      await this.sendPreprocessProgress(sourceId, progress)
      logger.info(`Preprocess processing status: ${status}, progress: ${progress}%`)

      if (status === 'success') {
        return
      } else if (status === 'failed') {
        throw new Error('Preprocess processing failed')
      }
    }
  }

  /**
   * Polls the export result until it is 'success' with a URL, or 'failed'.
   *
   * NOTE(review): the loop has no upper bound; it only ends on a terminal status or
   * when a result request throws.
   *
   * @param uid The uid returned by the preupload call.
   * @returns The URL of the exported archive.
   * @throws Error when the reported status is 'failed'.
   */
  private async waitForExport(uid: string): Promise<string> {
    while (true) {
      await this.delay(1000)
      const { status, url } = await this.getParsedFile(uid)
      logger.info(`Export status: ${status}`)

      if (status === 'success' && url) {
        return url
      } else if (status === 'failed') {
        throw new Error('Export failed')
      }
    }
  }

  /**
   * Requests an upload slot.
   *
   * @returns The upload URL and the uid identifying the job.
   * @throws Error('Failed to get preupload URL') on any failure; the cause is logged.
   */
  private async preupload(): Promise<PreuploadResponse> {
    const endpoint = `${this.provider.apiHost}/api/v2/parse/preupload`

    try {
      const response = await net.fetch(endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${this.provider.apiKey}`
        },
        body: null
      })

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`)
      }

      const data = (await response.json()) as ApiResponse<PreuploadResponse>

      if (data.code === 'success' && data.data) {
        return data.data
      } else {
        throw new Error(`API returned error: ${data.message || JSON.stringify(data)}`)
      }
    } catch (error) {
      logger.error(`Failed to get preupload URL: ${error instanceof Error ? error.message : String(error)}`)
      throw new Error('Failed to get preupload URL')
    }
  }

  /**
   * Uploads the file with a streaming HTTP PUT.
   *
   * @param filePath Path of the file to upload.
   * @param url Upload URL returned by the preupload call.
   * @throws Error('Failed to upload file') on any failure; the cause is logged.
   */
  private async putFile(filePath: string, url: string): Promise<void> {
    try {
      // The size is needed for the Content-Length header.
      const stats = await fs.promises.stat(filePath)
      const fileSize = stats.size

      const fileStream = fs.createReadStream(filePath)
      try {
        // The Fetch standard requires `duplex: 'half'` whenever the request body is a
        // stream. The intersection type keeps this compiling even when the ambient
        // RequestInit does not declare `duplex`.
        const init: RequestInit & { duplex: 'half' } = {
          method: 'PUT',
          body: fileStream as any, // a Node stream is not part of the BodyInit typings
          headers: {
            'Content-Length': fileSize.toString()
          },
          duplex: 'half'
        }
        const response = await net.fetch(url, init)

        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`)
        }
      } finally {
        // Release the file descriptor even when the request failed before the
        // stream was fully consumed. A no-op once the stream has already closed.
        fileStream.destroy()
      }
    } catch (error) {
      logger.error(`Failed to upload file ${filePath}: ${error instanceof Error ? error.message : String(error)}`)
      throw new Error('Failed to upload file')
    }
  }

  /**
   * Fetches the current parse status of a job.
   *
   * @param uid The uid returned by the preupload call.
   * @returns Status and progress reported by the API.
   * @throws Error('Failed to get processing status') on any failure; the cause is logged.
   */
  private async getStatus(uid: string): Promise<StatusResponse> {
    // Encode the uid so it cannot alter the query string.
    const endpoint = `${this.provider.apiHost}/api/v2/parse/status?uid=${encodeURIComponent(uid)}`

    try {
      const response = await net.fetch(endpoint, {
        method: 'GET',
        headers: {
          Authorization: `Bearer ${this.provider.apiKey}`
        }
      })

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`)
      }

      const data = (await response.json()) as ApiResponse<StatusResponse>

      if (data.code === 'success' && data.data) {
        return data.data
      } else {
        throw new Error(`API returned error: ${data.message || JSON.stringify(data)}`)
      }
    } catch (error) {
      logger.error(`Failed to get status for uid ${uid}: ${error instanceof Error ? error.message : String(error)}`)
      throw new Error('Failed to get processing status')
    }
  }

  /**
   * Requests conversion of a parsed document to Markdown.
   *
   * @param uid The uid returned by the preupload call.
   * @param filePath Path of the source file; its base name (without extension) is
   *   sent as the `filename` field of the request.
   * @throws Error('Failed to convert file') on any failure; the cause is logged.
   */
  private async convertFile(uid: string, filePath: string): Promise<void> {
    const fileName = path.parse(filePath).name
    const payload = {
      uid,
      to: 'md',
      formula_mode: 'normal',
      filename: fileName
    }

    const endpoint = `${this.provider.apiHost}/api/v2/convert/parse`

    try {
      const response = await net.fetch(endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${this.provider.apiKey}`
        },
        body: JSON.stringify(payload)
      })

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`)
      }

      const data = (await response.json()) as ApiResponse<any>

      if (data.code !== 'success') {
        throw new Error(`API returned error: ${data.message || JSON.stringify(data)}`)
      }
    } catch (error) {
      logger.error(`Failed to convert file ${filePath}: ${error instanceof Error ? error.message : String(error)}`)
      throw new Error('Failed to convert file')
    }
  }

  /**
   * Fetches the state of a Markdown export.
   *
   * Unlike the other calls, only the presence of `data` is checked here, not `code`.
   *
   * @param uid The uid returned by the preupload call.
   * @returns Export status and, once available, the download URL.
   * @throws Error('Failed to get parsed file information') on any failure; the cause is logged.
   */
  private async getParsedFile(uid: string): Promise<ParsedFileResponse> {
    // Encode the uid so it cannot alter the query string.
    const endpoint = `${this.provider.apiHost}/api/v2/convert/parse/result?uid=${encodeURIComponent(uid)}`

    try {
      const response = await net.fetch(endpoint, {
        method: 'GET',
        headers: {
          Authorization: `Bearer ${this.provider.apiKey}`
        }
      })

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`)
      }

      const data = (await response.json()) as ApiResponse<ParsedFileResponse>

      if (data.data) {
        return data.data
      } else {
        throw new Error(`No data in response`)
      }
    } catch (error) {
      logger.error(
        `Failed to get parsed file for uid ${uid}: ${error instanceof Error ? error.message : String(error)}`
      )
      throw new Error('Failed to get parsed file information')
    }
  }

  /**
   * Downloads the exported zip archive and extracts it.
   *
   * The archive is written to `<storageDir>/<file.id>.zip`, extracted into
   * `<storageDir>/<file.id>/` (overwriting existing entries), and the temporary
   * archive is removed afterwards whether or not the operation succeeded.
   *
   * @param url URL of the exported archive.
   * @param file Metadata of the original file; `file.id` names the output directory.
   * @returns The directory the archive was extracted to.
   * @throws Error('Failed to download and extract file') when download or extraction
   *   fails; the cause is logged.
   */
  private async downloadFile(url: string, file: FileMetadata): Promise<{ path: string }> {
    const dirPath = this.storageDir
    // Unified storage layout: one directory per file id under the storage dir.
    const extractPath = path.join(dirPath, file.id)
    const zipPath = path.join(dirPath, `${file.id}.zip`)

    // A recursive mkdir of the extraction directory also creates the storage dir.
    fs.mkdirSync(extractPath, { recursive: true })

    logger.info(`Downloading to export path: ${zipPath}`)

    try {
      // Download the archive.
      const response = await net.fetch(url, { method: 'GET' })

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`)
      }

      const arrayBuffer = await response.arrayBuffer()
      await fs.promises.writeFile(zipPath, Buffer.from(arrayBuffer))

      // Extract the archive.
      const zip = new AdmZip(zipPath)
      zip.extractAllTo(extractPath, true)
      logger.info(`Extracted files to: ${extractPath}`)

      return { path: extractPath }
    } catch (error) {
      logger.error(`Failed to download and extract file: ${error instanceof Error ? error.message : String(error)}`)
      throw new Error('Failed to download and extract file')
    } finally {
      // Best-effort removal of the temporary archive on success and on failure;
      // a cleanup problem must not replace the real outcome.
      try {
        await fs.promises.rm(zipPath, { force: true })
      } catch (cleanupError) {
        logger.error(
          `Failed to remove temporary archive ${zipPath}: ${
            cleanupError instanceof Error ? cleanupError.message : String(cleanupError)
          }`
        )
      }
    }
  }

  /**
   * Quota lookup is not supported by this provider.
   *
   * @throws Error('Method not implemented.') synchronously, on every call.
   */
  public checkQuota(): Promise<number> {
    throw new Error('Method not implemented.')
  }
}