refactor: streamline OCR processing in MacSysOcrProvider and update type definitions in MineruOcrProvider

- Removed batch processing in MacSysOcrProvider for direct page handling, improving efficiency.
- Updated type definitions from FileType to FileMetadata in MineruOcrProvider for consistency across the codebase.
This commit is contained in:
suyao
2025-06-03 17:48:22 +08:00
parent de58296136
commit 791bd37adf
2 changed files with 24 additions and 48 deletions
+19 -43
View File
@@ -2,15 +2,12 @@ import { isMac } from '@main/constant'
import { FileMetadata, OcrProvider } from '@types'
import Logger from 'electron-log'
import * as fs from 'fs'
import PQueue from 'p-queue'
import * as path from 'path'
import { TextItem } from 'pdfjs-dist/types/src/display/api'
import BaseOcrProvider from './BaseOcrProvider'
export default class MacSysOcrProvider extends BaseOcrProvider {
private readonly BATCH_SIZE = 4
private readonly CONCURRENCY = 2
private readonly MIN_TEXT_LENGTH = 1000
private MacOCR: any
@@ -45,45 +42,27 @@ export default class MacSysOcrProvider extends BaseOcrProvider {
sourceId: string,
writeStream: fs.WriteStream
): Promise<void> {
const queue = new PQueue({ concurrency: this.CONCURRENCY })
const batches: Promise<void>[] = []
await this.initMacOCR()
for (let i = 0; i < totalPages; i++) {
// Convert pages to buffers
const pageNum = i + 1
const pageBuffer = await results.getPage(pageNum)
const croppedPageBuffer = await this.cropImage(pageBuffer)
// Create ordered batches
for (let startPage = 0; startPage < totalPages; startPage += this.BATCH_SIZE) {
const endPage = Math.min(startPage + this.BATCH_SIZE, totalPages)
const batchPromise = queue.add(async () => {
// Convert pages to buffers
const pageBuffers: Buffer[] = []
for (let i = startPage; i < endPage; i++) {
const pageNum = i + 1
const pageBuffer = await results.getPage(pageNum)
// const croppedPageBuffer = await this.cropImage(pageBuffer)
pageBuffers.push(pageBuffer)
// Process batch
const ocrResult = await this.MacOCR.recognizeFromBuffer(croppedPageBuffer, {
ocrOptions: {
recognitionLevel: this.getRecognitionLevel(this.provider.options?.recognitionLevel),
minConfidence: this.provider.options?.minConfidence || 0.5
}
// Process batch
const ocrResults = await this.MacOCR.recognizeBatchFromBuffer(pageBuffers, {
ocrOptions: {
recognitionLevel: this.getRecognitionLevel(this.provider.options?.recognitionLevel),
minConfidence: this.provider.options?.minConfidence || 0.5,
languages: this.provider.options?.language || 'zh-Hans'
}
})
// Write results in order
for (const result of ocrResults) {
writeStream.write(result.text + '\n')
}
// Update progress
await this.sendOcrProgress(sourceId, (endPage / totalPages) * 100)
})
batches.push(batchPromise)
}
// Wait for all batches to complete in order
await Promise.all(batches)
// Write results in order
writeStream.write(ocrResult.text + '\n')
// Update progress
await this.sendOcrProgress(sourceId, (pageNum / totalPages) * 100)
}
}
public async isScanPdf(buffer: Buffer): Promise<boolean> {
@@ -109,12 +88,9 @@ export default class MacSysOcrProvider extends BaseOcrProvider {
try {
const { pdf } = await import('pdf-to-img')
const pdfBuffer = await fs.promises.readFile(file.path)
const isScanPdf = await this.isScanPdf(pdfBuffer)
if (!isScanPdf) {
Logger.info('[OCR] PDF is not a scan version, skipping OCR')
return { processedFile: file }
}
const results = await pdf(pdfBuffer)
const results = await pdf(pdfBuffer, {
scale: 2
})
const totalPages = results.length
const baseDir = path.dirname(file.path)
+5 -5
View File
@@ -1,7 +1,7 @@
import fs from 'node:fs'
import path from 'node:path'
import { FileType, OcrProvider } from '@types'
import { FileMetadata, OcrProvider } from '@types'
import AdmZip from 'adm-zip'
import axios from 'axios'
import Logger from 'electron-log'
@@ -44,7 +44,7 @@ export default class MineruOcrProvider extends BaseOcrProvider {
super(provider)
}
public async parseFile(sourceId: string, file: FileType): Promise<{ processedFile: FileType }> {
public async parseFile(sourceId: string, file: FileMetadata): Promise<{ processedFile: FileMetadata }> {
try {
Logger.info(`MinerU OCR processing started: ${file.path}`)
await this.validateFile(file.path)
@@ -86,7 +86,7 @@ export default class MineruOcrProvider extends BaseOcrProvider {
}
}
private createProcessedFileInfo(file: FileType, outputPath: string): FileType {
private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata {
// 查找解压后的主要文件
let finalPath = ''
let finalName = file.origin_name.replace('.pdf', '.md')
@@ -159,7 +159,7 @@ export default class MineruOcrProvider extends BaseOcrProvider {
}
}
private async uploadFile(file: FileType): Promise<string> {
private async uploadFile(file: FileMetadata): Promise<string> {
try {
// 步骤1: 获取上传URL
const { batchId, fileUrls } = await this.getBatchUploadUrls(file)
@@ -177,7 +177,7 @@ export default class MineruOcrProvider extends BaseOcrProvider {
}
}
private async getBatchUploadUrls(file: FileType): Promise<{ batchId: string; fileUrls: string[] }> {
private async getBatchUploadUrls(file: FileMetadata): Promise<{ batchId: string; fileUrls: string[] }> {
const endpoint = `${this.provider.apiHost}/api/v4/file-urls/batch`
const payload = {