4a62bb6ad7
* refactor: replace axios and node fetch with electron's net module for network requests in preprocess providers - Updated Doc2xPreprocessProvider and MineruPreprocessProvider to use net.fetch instead of axios for making HTTP requests. - Improved error handling for network responses across various methods. - Removed unnecessary AxiosRequestConfig and related code to streamline the implementation. * lint * refactor(Doc2xPreprocessProvider): enhance file validation and upload process - Added file size validation to prevent loading files larger than 300MB into memory. - Implemented file size check before reading the PDF to ensure efficient memory usage. - Updated the file upload method to use a stream, setting the 'Content-Length' header for better handling of large files. * refactor(brave-search): update net.fetch calls to use url.toString() - Modified all instances of net.fetch to use url.toString() for better URL handling. - Ensured consistency in how URLs are passed to the fetch method across various functions. * refactor(MCPService): improve URL handling in net.fetch calls - Updated net.fetch to use url.toString() for better type handling of URLs. - Ensured consistent URL processing across the MCPService class. * feat(ProxyManager): integrate axios with fetch proxy support - Added axios as a dependency to enable fetch proxy usage. - Implemented logic to set axios's adapter to 'fetch' for proxy handling. - Preserved original axios adapter for restoration when disabling the proxy.
381 lines
11 KiB
TypeScript
381 lines
11 KiB
TypeScript
import fs from 'node:fs'
|
|
import path from 'node:path'
|
|
|
|
import { loggerService } from '@logger'
|
|
import { fileStorage } from '@main/services/FileStorage'
|
|
import { FileMetadata, PreprocessProvider } from '@types'
|
|
import AdmZip from 'adm-zip'
|
|
import { net } from 'electron'
|
|
|
|
import BasePreprocessProvider from './BasePreprocessProvider'
|
|
|
|
const logger = loggerService.withContext('Doc2xPreprocessProvider')
|
|
|
|
/**
 * Generic envelope for the JSON bodies returned by the provider API endpoints called in this file.
 */
type ApiResponse<T> = {
  /** Result code; the callers in this file treat the literal 'success' as a successful call. */
  code: string
  /** Endpoint-specific payload. */
  data: T
  /** Optional message; used to build the error text when a call is rejected. */
  message?: string
}
|
|
|
|
/**
 * Payload of the preupload endpoint.
 */
type PreuploadResponse = {
  /** Identifier used by the later status, convert and result requests. */
  uid: string
  /** URL the file is uploaded to with a PUT request. */
  url: string
}
|
|
|
|
/**
 * Payload of the parse status endpoint.
 */
type StatusResponse = {
  /** Processing state; this file reacts to 'success' and 'failed' and keeps polling otherwise. */
  status: string
  /** Progress value forwarded to the progress notification and logged as a percentage. */
  progress: number
}
|
|
|
|
/**
 * Payload of the export result endpoint.
 */
type ParsedFileResponse = {
  /** Export state; this file reacts to 'success' and 'failed' and keeps polling otherwise. */
  status: string
  /** Download URL of the exported archive; only used once the status is 'success'. */
  url: string
}
|
|
|
|
export default class Doc2xPreprocessProvider extends BasePreprocessProvider {
|
|
/**
 * Creates the provider; the configuration is handed unchanged to the base class.
 *
 * @param provider Preprocess provider configuration.
 */
constructor(provider: PreprocessProvider) {
  super(provider)
}
|
|
|
|
/**
 * Checks that a PDF is within the size and page-count limits.
 *
 * The size is taken from the file metadata first, so an oversized file is
 * rejected without its content being read into memory.
 *
 * @param filePath Path of the PDF on disk.
 * @throws Error when the file is 300MB or larger, or has 1000 pages or more.
 */
private async validateFile(filePath: string): Promise<void> {
  const bytesPerMB = 1024 * 1024
  const { size } = await fs.promises.stat(filePath)

  // The file must be smaller than 300MB
  if (size >= 300 * bytesPerMB) {
    const sizeInMB = Math.round(size / bytesPerMB)
    throw new Error(`PDF file size (${sizeInMB}MB) exceeds the limit of 300MB`)
  }

  // Counting pages needs the content, so it is only read once the size is acceptable
  const content = await fs.promises.readFile(filePath)
  const { numPages } = await this.readPdf(content)

  // The file must have fewer than 1000 pages
  if (numPages >= 1000) {
    throw new Error(`PDF page count (${numPages}) exceeds the limit of 1000 pages`)
  }
}
|
|
|
|
/**
 * Runs the whole preprocessing pipeline for one file: validate it, request an
 * upload slot, upload, wait for parsing, export the result and build the
 * metadata of the generated Markdown file.
 *
 * @param sourceId Identifier forwarded to the progress notifications.
 * @param file Metadata of the file to process.
 * @returns The metadata describing the processed (Markdown) file.
 * @throws Rethrows the error of whichever step failed, after logging it.
 */
public async parseFile(sourceId: string, file: FileMetadata): Promise<{ processedFile: FileMetadata }> {
  try {
    const filePath = fileStorage.getFilePathById(file)
    logger.info(`Preprocess processing started: ${filePath}`)

    // Validate locally before anything else, so a file that is too large or
    // too long never costs a remote preupload request.
    await this.validateFile(filePath)

    // Step 1: request an upload slot
    const { uid, url } = await this.preupload()
    logger.info(`Preprocess preupload completed: uid=${uid}`)

    // Step 2: upload the file
    await this.putFile(filePath, url)

    // Step 3: wait until the remote parsing is finished
    await this.waitForProcessing(sourceId, uid)
    logger.info(`Preprocess parsing completed successfully for: ${filePath}`)

    // Step 4: export the parsed result
    const { path: outputPath } = await this.exportFile(file, uid)

    // Step 5: describe the processed file
    return {
      processedFile: this.createProcessedFileInfo(file, outputPath)
    }
  } catch (error) {
    // Name the file in the message so failures can be told apart in the log
    logger.error(`Preprocess processing failed for ${file.name}:`, error as Error)
    throw error
  }
}
|
|
|
|
/**
 * Builds the metadata of the Markdown file produced for `file`.
 *
 * The Markdown name is the original name with its last extension replaced by
 * `.md`; the same name is used for both the `name` and `path` fields so the
 * two can never disagree (e.g. for an upper-case `.PDF` extension).
 *
 * @param file Metadata of the source file.
 * @param outputPath Directory that contains the exported Markdown file.
 * @returns A copy of `file` pointing at the Markdown output.
 * @throws If the Markdown file does not exist (its size is read from disk).
 */
private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata {
  // path.parse strips only the final extension, whatever its case
  const markdownName = `${path.parse(file.name).name}.md`
  const outputFilePath = path.join(outputPath, markdownName)

  return {
    ...file,
    name: markdownName,
    path: outputFilePath,
    ext: '.md',
    size: fs.statSync(outputFilePath).size
  }
}
|
|
|
|
/**
 * Exports a parsed file: starts the conversion, waits for the export to be
 * ready, then downloads and extracts it.
 *
 * @param file Metadata of the source file.
 * @param uid The uid returned by the preupload request.
 * @returns The path returned by the download step.
 */
public async exportFile(file: FileMetadata, uid: string): Promise<{ path: string }> {
  const sourcePath = fileStorage.getFilePathById(file)
  logger.info(`Exporting file: ${sourcePath}`)

  // Step 1: start the conversion
  await this.convertFile(uid, sourcePath)
  logger.info(`File conversion completed for: ${sourcePath}`)

  // Step 2: wait for the export and obtain its URL
  const downloadUrl = await this.waitForExport(uid)

  // Step 3: download and extract the archive
  const result = await this.downloadFile(downloadUrl, file)
  return result
}
|
|
|
|
/**
 * Polls the processing status once per second until it reports success or failure,
 * forwarding the progress of every poll.
 *
 * NOTE(review): there is no upper bound on the number of polls; a status that
 * never becomes 'success' or 'failed' keeps this loop running.
 *
 * @param sourceId Identifier forwarded to the progress notification.
 * @param uid The uid returned by the preupload request.
 * @throws Error when the reported status is 'failed'.
 */
private async waitForProcessing(sourceId: string, uid: string): Promise<void> {
  for (;;) {
    await this.delay(1000)

    const current = await this.getStatus(uid)
    await this.sendPreprocessProgress(sourceId, current.progress)
    logger.info(`Preprocess processing status: ${current.status}, progress: ${current.progress}%`)

    if (current.status === 'failed') {
      throw new Error('Preprocess processing failed')
    }
    if (current.status === 'success') {
      return
    }
  }
}
|
|
|
|
/**
 * Polls the export result once per second until a download URL is available
 * or the export is reported as failed.
 *
 * NOTE(review): there is no upper bound on the number of polls; a 'success'
 * status without a URL, or a status that never settles, keeps this loop running.
 *
 * @param uid The uid returned by the preupload request.
 * @returns The URL of the exported file.
 * @throws Error when the reported status is 'failed'.
 */
private async waitForExport(uid: string): Promise<string> {
  for (;;) {
    await this.delay(1000)

    const parsed = await this.getParsedFile(uid)
    logger.info(`Export status: ${parsed.status}`)

    if (parsed.status === 'failed') {
      throw new Error('Export failed')
    }
    if (parsed.status === 'success' && parsed.url) {
      return parsed.url
    }
  }
}
|
|
|
|
/**
 * Requests an upload slot from the preupload endpoint.
 *
 * @returns The uid and upload URL returned by the API.
 * @throws Error with a fixed message when the request fails or is rejected;
 *   the underlying reason is written to the log.
 */
private async preupload(): Promise<PreuploadResponse> {
  const endpoint = `${this.provider.apiHost}/api/v2/parse/preupload`

  try {
    const requestHeaders = {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${this.provider.apiKey}`
    }
    const response = await net.fetch(endpoint, { method: 'POST', headers: requestHeaders, body: null })

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`)
    }

    const payload = (await response.json()) as ApiResponse<PreuploadResponse>

    // Anything but an explicit success with a payload is treated as an API error
    if (payload.code !== 'success' || !payload.data) {
      throw new Error(`API returned error: ${payload.message || JSON.stringify(payload)}`)
    }
    return payload.data
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    logger.error(`Failed to get preupload URL: ${reason}`)
    throw new Error('Failed to get preupload URL')
  }
}
|
|
|
|
/**
 * Uploads the file to the preupload URL with a PUT request, streaming it from disk.
 *
 * @param filePath Path of the file to upload.
 * @param url Upload URL returned by the preupload request.
 * @throws Error with a fixed message when the upload fails; the underlying
 *   reason is written to the log.
 */
private async putFile(filePath: string, url: string): Promise<void> {
  let fileStream: fs.ReadStream | undefined

  try {
    // The file size is needed for the Content-Length header
    const { size } = await fs.promises.stat(filePath)

    fileStream = fs.createReadStream(filePath)

    // NOTE(review): the body is a Node.js stream cast to `any`; confirm that
    // net.fetch accepts it as-is (streaming request bodies usually also need
    // the `duplex: 'half'` option).
    const response = await net.fetch(url, {
      method: 'PUT',
      body: fileStream as any,
      headers: {
        'Content-Length': size.toString()
      }
    })

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`)
    }
  } catch (error) {
    logger.error(`Failed to upload file ${filePath}: ${error instanceof Error ? error.message : String(error)}`)
    throw new Error('Failed to upload file')
  } finally {
    // Release the file descriptor even when the request fails before the
    // stream has been fully consumed; destroying a finished stream is a no-op.
    fileStream?.destroy()
  }
}
|
|
|
|
/**
 * Fetches the current processing status of an uploaded file.
 *
 * @param uid The uid returned by the preupload request.
 * @returns The status payload returned by the API.
 * @throws Error with a fixed message when the request fails or is rejected;
 *   the underlying reason is written to the log.
 */
private async getStatus(uid: string): Promise<StatusResponse> {
  // Encode the uid so reserved characters cannot alter the query string
  const endpoint = `${this.provider.apiHost}/api/v2/parse/status?uid=${encodeURIComponent(uid)}`

  try {
    const response = await net.fetch(endpoint, {
      method: 'GET',
      headers: {
        Authorization: `Bearer ${this.provider.apiKey}`
      }
    })

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`)
    }

    const data = (await response.json()) as ApiResponse<StatusResponse>
    if (data.code === 'success' && data.data) {
      return data.data
    }
    throw new Error(`API returned error: ${data.message || JSON.stringify(data)}`)
  } catch (error) {
    logger.error(`Failed to get status for uid ${uid}: ${error instanceof Error ? error.message : String(error)}`)
    throw new Error('Failed to get processing status')
  }
}
|
|
|
|
/**
 * Asks the API to convert the parsed file to Markdown.
 *
 * @param uid The uid returned by the preupload request.
 * @param filePath Path of the source file; its base name without extension is sent as `filename`.
 * @throws Error with a fixed message when the request fails or is rejected;
 *   the underlying reason is written to the log.
 */
private async convertFile(uid: string, filePath: string): Promise<void> {
  const endpoint = `${this.provider.apiHost}/api/v2/convert/parse`
  const requestBody = JSON.stringify({
    uid,
    to: 'md',
    formula_mode: 'normal',
    filename: path.parse(filePath).name
  })

  try {
    const response = await net.fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${this.provider.apiKey}`
      },
      body: requestBody
    })

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`)
    }

    // Only the result code matters here; the payload itself is not read
    const result = (await response.json()) as ApiResponse<unknown>
    if (result.code !== 'success') {
      throw new Error(`API returned error: ${result.message || JSON.stringify(result)}`)
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    logger.error(`Failed to convert file ${filePath}: ${reason}`)
    throw new Error('Failed to convert file')
  }
}
|
|
|
|
/**
 * Fetches the export result of a converted file.
 *
 * @param uid The uid returned by the preupload request.
 * @returns The export payload (status and, once ready, the download URL).
 * @throws Error with a fixed message when the request fails or the response
 *   carries no payload; the underlying reason is written to the log.
 */
private async getParsedFile(uid: string): Promise<ParsedFileResponse> {
  // Encode the uid so reserved characters cannot alter the query string
  const endpoint = `${this.provider.apiHost}/api/v2/convert/parse/result?uid=${encodeURIComponent(uid)}`

  try {
    const response = await net.fetch(endpoint, {
      method: 'GET',
      headers: {
        Authorization: `Bearer ${this.provider.apiKey}`
      }
    })

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`)
    }

    const data = (await response.json()) as ApiResponse<ParsedFileResponse>
    if (data.data) {
      return data.data
    }
    // Report what the API said, as the other requests in this class do,
    // instead of a bare "no data" message.
    throw new Error(`API returned error: ${data.message || JSON.stringify(data)}`)
  } catch (error) {
    logger.error(
      `Failed to get parsed file for uid ${uid}: ${error instanceof Error ? error.message : String(error)}`
    )
    throw new Error('Failed to get parsed file information')
  }
}
|
|
|
|
/**
 * Downloads the exported ZIP archive and extracts it into the storage
 * directory of the file (`<storageDir>/<file.id>/`).
 *
 * The archive is written to a temporary `<file.id>.zip` next to that
 * directory and is removed again whether or not extraction succeeds.
 *
 * @param url URL of the exported archive.
 * @param file Metadata of the source file; its id names the output directory.
 * @returns The directory the archive was extracted into.
 * @throws Error with a fixed message when the download or extraction fails;
 *   the underlying reason is written to the log.
 */
private async downloadFile(url: string, file: FileMetadata): Promise<{ path: string }> {
  const dirPath = this.storageDir
  const extractPath = path.join(dirPath, file.id)
  const zipPath = path.join(dirPath, `${file.id}.zip`)

  // Make sure both the storage directory and the extraction directory exist
  fs.mkdirSync(dirPath, { recursive: true })
  fs.mkdirSync(extractPath, { recursive: true })

  logger.info(`Downloading to export path: ${zipPath}`)

  try {
    const response = await net.fetch(url, { method: 'GET' })
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`)
    }

    // Written asynchronously so a large archive does not block the process
    const archive = Buffer.from(await response.arrayBuffer())
    await fs.promises.writeFile(zipPath, archive)

    const zip = new AdmZip(zipPath)
    zip.extractAllTo(extractPath, true)
    logger.info(`Extracted files to: ${extractPath}`)

    return { path: extractPath }
  } catch (error) {
    logger.error(`Failed to download and extract file: ${error instanceof Error ? error.message : String(error)}`)
    throw new Error('Failed to download and extract file')
  } finally {
    // Remove the temporary archive on success and on failure alike; a leftover
    // archive is not worth failing an otherwise successful extraction for.
    try {
      fs.rmSync(zipPath, { force: true })
    } catch (cleanupError) {
      logger.error(
        `Failed to remove temporary archive ${zipPath}: ${
          cleanupError instanceof Error ? cleanupError.message : String(cleanupError)
        }`
      )
    }
  }
}
|
|
|
|
/**
 * Quota lookup is not implemented for this provider.
 *
 * @returns A promise that always rejects with a "Method not implemented." error.
 */
public checkQuota(): Promise<number> {
  // Reject rather than throw synchronously, so the failure reaches callers
  // through the promise they were promised (`await` and `.catch()` both see it).
  return Promise.reject(new Error('Method not implemented.'))
}
|
|
}
|