diff --git a/src/main/ocr/MacSysOcrProvider.ts b/src/main/ocr/MacSysOcrProvider.ts index 1f80f52e8..a565ebf95 100644 --- a/src/main/ocr/MacSysOcrProvider.ts +++ b/src/main/ocr/MacSysOcrProvider.ts @@ -2,15 +2,12 @@ import { isMac } from '@main/constant' import { FileMetadata, OcrProvider } from '@types' import Logger from 'electron-log' import * as fs from 'fs' -import PQueue from 'p-queue' import * as path from 'path' import { TextItem } from 'pdfjs-dist/types/src/display/api' import BaseOcrProvider from './BaseOcrProvider' export default class MacSysOcrProvider extends BaseOcrProvider { - private readonly BATCH_SIZE = 4 - private readonly CONCURRENCY = 2 private readonly MIN_TEXT_LENGTH = 1000 private MacOCR: any @@ -45,45 +42,27 @@ export default class MacSysOcrProvider extends BaseOcrProvider { sourceId: string, writeStream: fs.WriteStream ): Promise { - const queue = new PQueue({ concurrency: this.CONCURRENCY }) - const batches: Promise[] = [] await this.initMacOCR() + for (let i = 0; i < totalPages; i++) { + // Convert pages to buffers + const pageNum = i + 1 + const pageBuffer = await results.getPage(pageNum) + const croppedPageBuffer = await this.cropImage(pageBuffer) - // Create ordered batches - for (let startPage = 0; startPage < totalPages; startPage += this.BATCH_SIZE) { - const endPage = Math.min(startPage + this.BATCH_SIZE, totalPages) - const batchPromise = queue.add(async () => { - // Convert pages to buffers - const pageBuffers: Buffer[] = [] - for (let i = startPage; i < endPage; i++) { - const pageNum = i + 1 - const pageBuffer = await results.getPage(pageNum) - // const croppedPageBuffer = await this.cropImage(pageBuffer) - pageBuffers.push(pageBuffer) + // Process batch + const ocrResult = await this.MacOCR.recognizeFromBuffer(croppedPageBuffer, { + ocrOptions: { + recognitionLevel: this.getRecognitionLevel(this.provider.options?.recognitionLevel), + minConfidence: this.provider.options?.minConfidence || 0.5 } - - // Process batch - const ocrResults = await this.MacOCR.recognizeBatchFromBuffer(pageBuffers, { - ocrOptions: { - recognitionLevel: this.getRecognitionLevel(this.provider.options?.recognitionLevel), - minConfidence: this.provider.options?.minConfidence || 0.5, - languages: this.provider.options?.language || 'zh-Hans' - } - }) - - // Write results in order - for (const result of ocrResults) { - writeStream.write(result.text + '\n') - } - - // Update progress - await this.sendOcrProgress(sourceId, (endPage / totalPages) * 100) }) - batches.push(batchPromise) - } - // Wait for all batches to complete in order - await Promise.all(batches) + // Write results in order + writeStream.write(ocrResult.text + '\n') + + // Update progress + await this.sendOcrProgress(sourceId, (pageNum / totalPages) * 100) + } } public async isScanPdf(buffer: Buffer): Promise { @@ -109,12 +88,9 @@ export default class MacSysOcrProvider extends BaseOcrProvider { try { const { pdf } = await import('pdf-to-img') const pdfBuffer = await fs.promises.readFile(file.path) - const isScanPdf = await this.isScanPdf(pdfBuffer) - if (!isScanPdf) { - Logger.info('[OCR] PDF is not a scan version, skipping OCR') - return { processedFile: file } - } - const results = await pdf(pdfBuffer) + const results = await pdf(pdfBuffer, { + scale: 2 + }) const totalPages = results.length const baseDir = path.dirname(file.path) diff --git a/src/main/ocr/MineruOcrProvider.ts b/src/main/ocr/MineruOcrProvider.ts index 49f8a5b0c..1acfb0350 100644 --- a/src/main/ocr/MineruOcrProvider.ts +++ b/src/main/ocr/MineruOcrProvider.ts @@ -1,7 +1,7 @@ import fs from 'node:fs' import path from 'node:path' -import { FileType, OcrProvider } from '@types' +import { FileMetadata, OcrProvider } from '@types' import AdmZip from 'adm-zip' import axios from 'axios' import Logger from 'electron-log' @@ -44,7 +44,7 @@ export default class MineruOcrProvider extends BaseOcrProvider { super(provider) } - public async parseFile(sourceId: string, file: FileType): Promise<{ processedFile: FileType }> { + public async parseFile(sourceId: string, file: FileMetadata): Promise<{ processedFile: FileMetadata }> { try { Logger.info(`MinerU OCR processing started: ${file.path}`) await this.validateFile(file.path) @@ -86,7 +86,7 @@ export default class MineruOcrProvider extends BaseOcrProvider { } } - private createProcessedFileInfo(file: FileType, outputPath: string): FileType { + private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata { // 查找解压后的主要文件 let finalPath = '' let finalName = file.origin_name.replace('.pdf', '.md') @@ -159,7 +159,7 @@ export default class MineruOcrProvider extends BaseOcrProvider { } } - private async uploadFile(file: FileType): Promise { + private async uploadFile(file: FileMetadata): Promise { try { // 步骤1: 获取上传URL const { batchId, fileUrls } = await this.getBatchUploadUrls(file) @@ -177,7 +177,7 @@ export default class MineruOcrProvider extends BaseOcrProvider { } } - private async getBatchUploadUrls(file: FileType): Promise<{ batchId: string; fileUrls: string[] }> { + private async getBatchUploadUrls(file: FileMetadata): Promise<{ batchId: string; fileUrls: string[] }> { const endpoint = `${this.provider.apiHost}/api/v4/file-urls/batch` const payload = {