5771d0c9e8
* refactor(FileManager): streamline file path handling in FilesPage and ImageBlock components * refactor(file): implement getSafeFilePath utility for consistent file path handling across loaders and preprocessors * refactor(FileStorage): replace getSafeFilePath with fileStorage.getFilePathById for consistent file path retrieval across services * refactor(file): unify file path retrieval across loaders and preprocessors for improved consistency * refactor(Inputbar, MessageEditor): replace getFileExtension with file.ext for improved file type handling * refactor(FileStorage): simplify getFilePathById method by removing redundant checks for file path retrieval * fix(FileStorage): update getFilePathById to ensure file.path is consistent with generated filePath * refactor(FileStorage): simplify getFilePathById method by removing unnecessary file path consistency checks * fix(FileStorage): update duplicate file check to use file.path for accurate detection * fix(FileStorage): correct file path usage in uploadFile method for accurate duplicate detection * fix(loader): update file path retrieval to use file.path for consistency across loaders
192 lines
6.6 KiB
TypeScript
192 lines
6.6 KiB
TypeScript
import fs from 'node:fs'
|
|
|
|
import { loggerService } from '@logger'
|
|
import { fileStorage } from '@main/services/FileStorage'
|
|
import { MistralClientManager } from '@main/services/MistralClientManager'
|
|
import { MistralService } from '@main/services/remotefile/MistralService'
|
|
import { Mistral } from '@mistralai/mistralai'
|
|
import { DocumentURLChunk } from '@mistralai/mistralai/models/components/documenturlchunk'
|
|
import { ImageURLChunk } from '@mistralai/mistralai/models/components/imageurlchunk'
|
|
import { OCRResponse } from '@mistralai/mistralai/models/components/ocrresponse'
|
|
import { FileMetadata, FileTypes, PreprocessProvider, Provider } from '@types'
|
|
import path from 'path'
|
|
|
|
import BasePreprocessProvider from './BasePreprocessProvider'
|
|
|
|
/**
 * Document reference handed to the OCR request: either a hosted document URL
 * or an image URL chunk.
 */
type PreuploadResponse = DocumentURLChunk | ImageURLChunk

/** Logger scoped to this provider's context name. */
const logger = loggerService.withContext('MistralPreprocessProvider')
|
|
|
|
export default class MistralPreprocessProvider extends BasePreprocessProvider {
|
|
private sdk: Mistral
|
|
private fileService: MistralService
|
|
|
|
/**
 * Creates the provider and wires up the Mistral SDK client and file service.
 *
 * The preprocess provider settings are mapped onto a `Provider` record of type
 * `'mistral'`; that record initialises the client held by `MistralClientManager`
 * and is also passed to the `MistralService` used for uploads.
 *
 * @param provider - Preprocess provider configuration. `apiKey` and `apiHost`
 *   are forwarded with non-null assertions and are not validated here.
 */
constructor(provider: PreprocessProvider) {
  super(provider)

  const { id, name, apiKey, apiHost } = provider
  const mistralProvider: Provider = {
    id,
    type: 'mistral',
    name,
    apiKey: apiKey!,
    apiHost: apiHost!,
    models: []
  }

  const manager = MistralClientManager.getInstance()
  manager.initializeClient(mistralProvider)
  this.sdk = manager.getClient()
  this.fileService = new MistralService(mistralProvider)
}
|
|
|
|
/**
 * Turns a stored file into the document reference expected by the OCR request.
 *
 * - `.pdf` files are uploaded through the file service and referenced by a
 *   signed URL; progress 15 and 20 is reported against `file.id` on the way.
 * - Any other file is treated as an image and inlined as a base64 data URL
 *   whose MIME type is derived from the file extension.
 *
 * @param file - Metadata of the file to prepare.
 * @returns A `document_url` chunk for PDFs, an `image_url` chunk otherwise.
 * @throws Error when the upload reports a `failed` status. File-system and SDK
 *   errors propagate unchanged.
 */
private async preupload(file: FileMetadata): Promise<PreuploadResponse> {
  const filePath = fileStorage.getFilePathById(file)
  logger.info(`preprocess preupload started for local file: ${filePath}`)

  const ext = file.ext.toLowerCase()

  if (ext === '.pdf') {
    const uploadResponse = await this.fileService.uploadFile(file)

    if (uploadResponse.status === 'failed') {
      logger.error('File upload failed:', uploadResponse)
      throw new Error('Failed to upload file: ' + uploadResponse.displayName)
    }
    await this.sendPreprocessProgress(file.id, 15)

    const fileUrl = await this.sdk.files.getSignedUrl({
      fileId: uploadResponse.fileId
    })
    // NOTE(review): a signed URL grants access to the uploaded file; consider
    // whether it should be written to the log at info level.
    logger.info('Got signed URL:', fileUrl)
    await this.sendPreprocessProgress(file.id, 20)

    return {
      type: 'document_url',
      documentUrl: fileUrl.url
    }
  }

  // Label the payload with the real image type instead of always claiming PNG.
  // Unknown extensions keep the previous `image/png` label.
  const imageMimeTypes: Record<string, string> = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
    '.bmp': 'image/bmp'
  }
  const mimeType = imageMimeTypes[ext] ?? 'image/png'

  // readFile already yields a Buffer, so no extra copy is needed; the async
  // variant avoids blocking the event loop while the file is read.
  const imageBytes = await fs.promises.readFile(filePath)

  return {
    type: 'image_url',
    imageUrl: `data:${mimeType};base64,${imageBytes.toString('base64')}`
  }
}
|
|
|
|
/**
 * Runs OCR on a file and converts the response into a Markdown file on disk.
 *
 * The file is first prepared with `preupload`, then sent to `sdk.ocr.process`
 * with embedded image data requested. On a non-empty response, progress 100 is
 * reported against `sourceId` and the response is written out by `convertFile`.
 *
 * @param sourceId - Identifier used when reporting completion progress.
 * @param file - Metadata of the file to process.
 * @returns The metadata of the generated Markdown file.
 * @throws Error with a message starting with `preprocess processing failed: `
 *   for any failure (missing model, upload/OCR error, empty response, or a
 *   failure while writing the output).
 */
public async parseFile(sourceId: string, file: FileMetadata): Promise<{ processedFile: FileMetadata }> {
  try {
    // Fail with a clear message instead of sending `undefined` as the model.
    const model = this.provider.model
    if (!model) {
      throw new Error('OCR model is not configured')
    }

    const document = await this.preupload(file)
    const result = await this.sdk.ocr.process({
      model,
      document,
      includeImageBase64: true
    })

    if (!result) {
      // The prefix is added once by the catch below.
      throw new Error('OCR response is empty')
    }

    await this.sendPreprocessProgress(sourceId, 100)
    return { processedFile: this.convertFile(result, file) }
  } catch (error) {
    // The rethrown error carries only a message, so log the original here to
    // keep its name and stack available for diagnosis.
    logger.error('preprocess processing failed:', error as Error)
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error('preprocess processing failed: ' + reason)
  }
}
|
|
|
|
/**
 * Writes an OCR response to disk as one Markdown file plus its extracted images.
 *
 * Output goes to `<storageDir>/<file.id>/`. Each image that carries base64 data
 * is saved as `img-<n>.<format>`; if the page Markdown embeds that image's
 * base64 payload, the surrounding Markdown image is replaced by a relative
 * reference to the saved file. Page Markdown is then joined with blank lines
 * and written as `<original base name>.md`.
 *
 * A failure to save an individual image is logged and does not abort the
 * conversion.
 *
 * @param result - OCR response whose pages and images are written out.
 * @param file - Metadata of the source file; supplies the id and base name.
 * @returns Metadata describing the generated Markdown file.
 */
private convertFile(result: OCRResponse, file: FileMetadata): FileMetadata {
  // Unified storage location, described by the original comment as
  // Data/Files/{file.id}/
  const conversionId = file.id
  const outputPath = path.join(this.storageDir, file.id)
  const filePath = fileStorage.getFilePathById(file)
  const outputFileName = path.basename(filePath, path.extname(filePath))
  fs.mkdirSync(outputPath, { recursive: true })

  const markdownParts: string[] = []
  // Numbers saved images across all pages; only advanced after a successful write.
  let counter = 0

  // Process each page
  for (const page of result.pages) {
    let pageMarkdown = page.markdown

    // Process images from this page
    for (const image of page.images) {
      if (!image.imageBase64) {
        continue
      }

      let imageFormat = 'jpeg' // default format when no data-URL prefix is present
      let imageBase64Data = image.imageBase64

      // Split an optional `data:image/<format>;base64,` prefix from the payload.
      const prefixEnd = image.imageBase64.indexOf(';base64,')
      if (prefixEnd > 0) {
        const prefix = image.imageBase64.substring(0, prefixEnd)
        const formatIndex = prefix.indexOf('image/')
        if (formatIndex >= 0) {
          // 6 === 'image/'.length
          imageFormat = prefix.substring(formatIndex + 6)
        }
        // 8 === ';base64,'.length
        imageBase64Data = image.imageBase64.substring(prefixEnd + 8)
      }

      const imageFileName = `img-${counter}.${imageFormat}`
      const imagePath = path.join(outputPath, imageFileName)

      // Save image file
      try {
        fs.writeFileSync(imagePath, Buffer.from(imageBase64Data, 'base64'))

        // Use a relative path for better portability
        const relativeImagePath = `./${imageFileName}`

        // Locate the inline base64 payload inside the page Markdown, if present
        const imgStart = pageMarkdown.indexOf(image.imageBase64)
        if (imgStart >= 0) {
          // Find the Markdown image syntax `![...](...)` around this payload
          const mdStart = pageMarkdown.lastIndexOf('![', imgStart)
          const mdEnd = pageMarkdown.indexOf(')', imgStart)

          if (mdStart >= 0 && mdEnd >= 0) {
            // Replace just this specific image reference
            pageMarkdown =
              pageMarkdown.substring(0, mdStart) +
              `![Image ${counter}](${relativeImagePath})` +
              pageMarkdown.substring(mdEnd + 1)
          }
        }

        counter++
      } catch (error) {
        // Best effort: keep converting even if one image cannot be written.
        logger.error(`Failed to save image ${imageFileName}:`, error as Error)
      }
    }

    markdownParts.push(pageMarkdown)
  }

  // Combine all markdown content with double newlines for readability
  const combinedMarkdown = markdownParts.join('\n\n')

  // Write the markdown content to a file
  const mdFileName = `${outputFileName}.md`
  const mdFilePath = path.join(outputPath, mdFileName)
  fs.writeFileSync(mdFilePath, combinedMarkdown)

  return {
    id: conversionId,
    name: file.name.replace(/\.[^/.]+$/, '.md'),
    origin_name: file.origin_name,
    path: mdFilePath,
    created_at: new Date().toISOString(),
    type: FileTypes.DOCUMENT,
    ext: '.md',
    size: fs.statSync(mdFilePath).size,
    count: 1
  } as FileMetadata
}
|
|
|
|
/**
 * Quota lookup is not implemented for this provider.
 *
 * @returns A promise that always rejects with `Error('Method not implemented.')`.
 */
public checkQuota(): Promise<number> {
  // Reject rather than throw synchronously, so callers that chain `.catch()`
  // on the returned promise see the failure just like callers that `await` it.
  return Promise.reject(new Error('Method not implemented.'))
}
|
|
}
|