Files
cherry-studio/src/main/knowledge/ocr/BaseOcrProvider.ts
T
2025-07-21 18:11:27 +08:00

123 lines
4.0 KiB
TypeScript

import fs from 'node:fs'
import path from 'node:path'
import { windowService } from '@main/services/WindowService'
import { getFileExt } from '@main/utils/file'
import { FileMetadata, OcrProvider } from '@types'
import { app } from 'electron'
import pdfjs from 'pdfjs-dist'
import { TypedArray } from 'pdfjs-dist/types/src/display/api'
export default abstract class BaseOcrProvider {
protected provider: OcrProvider
public storageDir = path.join(app.getPath('userData'), 'Data', 'Files')
constructor(provider: OcrProvider) {
if (!provider) {
throw new Error('OCR provider is not set')
}
this.provider = provider
}
abstract parseFile(sourceId: string, file: FileMetadata): Promise<{ processedFile: FileMetadata; quota?: number }>
/**
* 检查文件是否已经被预处理过
* 统一检测方法:如果 Data/Files/{file.id} 是目录,说明已被预处理
* @param file 文件信息
* @returns 如果已处理返回处理后的文件信息,否则返回null
*/
public async checkIfAlreadyProcessed(file: FileMetadata): Promise<FileMetadata | null> {
try {
// 检查 Data/Files/{file.id} 是否是目录
const preprocessDirPath = path.join(this.storageDir, file.id)
if (fs.existsSync(preprocessDirPath)) {
const stats = await fs.promises.stat(preprocessDirPath)
// 如果是目录,说明已经被预处理过
if (stats.isDirectory()) {
// 查找目录中的处理结果文件
const files = await fs.promises.readdir(preprocessDirPath)
// 查找主要的处理结果文件(.md 或 .txt)
const processedFile = files.find((fileName) => fileName.endsWith('.md') || fileName.endsWith('.txt'))
if (processedFile) {
const processedFilePath = path.join(preprocessDirPath, processedFile)
const processedStats = await fs.promises.stat(processedFilePath)
const ext = getFileExt(processedFile)
return {
...file,
name: file.name.replace(file.ext, ext),
path: processedFilePath,
ext: ext,
size: processedStats.size,
created_at: processedStats.birthtime.toISOString()
}
}
}
}
return null
} catch (error) {
// 如果检查过程中出现错误,返回null表示未处理
return null
}
}
/**
* 辅助方法:延迟执行
*/
public delay = (ms: number): Promise<void> => {
return new Promise((resolve) => setTimeout(resolve, ms))
}
public async readPdf(
source: string | URL | TypedArray,
passwordCallback?: (fn: (password: string) => void, reason: string) => string
) {
const documentLoadingTask = pdfjs.getDocument(source)
if (passwordCallback) {
documentLoadingTask.onPassword = passwordCallback
}
const document = await documentLoadingTask.promise
return document
}
public async sendOcrProgress(sourceId: string, progress: number): Promise<void> {
const mainWindow = windowService.getMainWindow()
mainWindow?.webContents.send('file-ocr-progress', {
itemId: sourceId,
progress: progress
})
}
/**
* 将文件移动到附件目录
* @param fileId 文件id
* @param filePaths 需要移动的文件路径数组
* @returns 移动后的文件路径数组
*/
public moveToAttachmentsDir(fileId: string, filePaths: string[]): string[] {
const attachmentsPath = path.join(this.storageDir, fileId)
if (!fs.existsSync(attachmentsPath)) {
fs.mkdirSync(attachmentsPath, { recursive: true })
}
const movedPaths: string[] = []
for (const filePath of filePaths) {
if (fs.existsSync(filePath)) {
const fileName = path.basename(filePath)
const destPath = path.join(attachmentsPath, fileName)
fs.copyFileSync(filePath, destPath)
fs.unlinkSync(filePath) // 删除原文件,实现"移动"
movedPaths.push(destPath)
}
}
return movedPaths
}
}