feat: Add EPUB file support to document loader

This commit is contained in:
bC2y5tal
2025-02-14 23:54:19 +08:00
committed by 亢奋猫
parent 6a7ec6d8ba
commit 7a7f51d5f3
5 changed files with 530 additions and 16 deletions
+228
View File
@@ -0,0 +1,228 @@
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'
import { BaseLoader } from '@llm-tools/embedjs-interfaces'
import { cleanString } from '@llm-tools/embedjs-utils'
import Logger from 'electron-log'
import EPub from 'epub'
import * as fs from 'fs'
/**
* epub 加载器的配置选项
*/
interface EpubLoaderOptions {
/** epub 文件路径 */
filePath: string
/** 文本分块大小 */
chunkSize: number
/** 分块重叠大小 */
chunkOverlap: number
}
/**
* epub 文件的元数据信息
*/
interface EpubMetadata {
/** 作者显示名称(例如:"Lewis Carroll" */
creator?: string
/** 作者规范化名称,用于排序和索引(例如:"Carroll, Lewis" */
creatorFileAs?: string
/** 书籍标题(例如:"Alice's Adventures in Wonderland" */
title?: string
/** 语言代码(例如:"en" 或 "zh-CN" */
language?: string
/** 主题或分类(例如:"Fantasy"、"Fiction" */
subject?: string
/** 创建日期(例如:"2024-02-14" */
date?: string
/** 书籍描述或简介 */
description?: string
}
/**
* epub 章节信息
*/
interface EpubChapter {
/** 章节 ID */
id: string
/** 章节标题 */
title?: string
/** 章节顺序 */
order?: number
}
/**
* epub 文件加载器
* 用于解析 epub 电子书文件,提取文本内容和元数据
*/
export class EpubLoader extends BaseLoader<Record<string, string | number | boolean>, Record<string, unknown>> {
protected filePath: string
protected chunkSize: number
protected chunkOverlap: number
private extractedText: string
private metadata: EpubMetadata | null
/**
* 创建 epub 加载器实例
* @param options 加载器配置选项
*/
constructor(options: EpubLoaderOptions) {
super(options.filePath, {
chunkSize: options.chunkSize,
chunkOverlap: options.chunkOverlap
})
this.filePath = options.filePath
this.chunkSize = options.chunkSize
this.chunkOverlap = options.chunkOverlap
this.extractedText = ''
this.metadata = null
}
/**
* 等待 epub 文件初始化完成
* epub 库使用事件机制,需要等待 'end' 事件触发后才能访问文件内容
* @param epub epub 实例
* @returns 元数据和章节信息
*/
private waitForEpubInit(epub: any): Promise<{ metadata: EpubMetadata; chapters: EpubChapter[] }> {
return new Promise((resolve, reject) => {
epub.on('end', () => {
// 提取元数据
const metadata: EpubMetadata = {
creator: epub.metadata.creator,
creatorFileAs: epub.metadata.creatorFileAs,
title: epub.metadata.title,
language: epub.metadata.language,
subject: epub.metadata.subject,
date: epub.metadata.date,
description: epub.metadata.description
}
// 提取章节信息
const chapters: EpubChapter[] = epub.flow.map((chapter: any, index: number) => ({
id: chapter.id,
title: chapter.title || `Chapter ${index + 1}`,
order: index + 1
}))
resolve({ metadata, chapters })
})
epub.on('error', (error: Error) => {
reject(error)
})
epub.parse()
})
}
/**
* 获取章节内容
* @param epub epub 实例
* @param chapterId 章节 ID
* @returns 章节文本内容
*/
private getChapter(epub: any, chapterId: string): Promise<string> {
return new Promise((resolve, reject) => {
epub.getChapter(chapterId, (error: Error | null, text: string) => {
if (error) {
reject(error)
} else {
resolve(text)
}
})
})
}
/**
* 从 epub 文件中提取文本内容
* 1. 检查文件是否存在
* 2. 初始化 epub 并获取元数据
* 3. 遍历所有章节并提取文本
* 4. 清理 HTML 标签
* 5. 合并所有章节文本
*/
private async extractTextFromEpub() {
try {
// 检查文件是否存在
if (!fs.existsSync(this.filePath)) {
throw new Error(`File not found: ${this.filePath}`)
}
const epub = new EPub(this.filePath)
// 等待 epub 初始化完成并获取元数据
const { metadata, chapters } = await this.waitForEpubInit(epub)
this.metadata = metadata
if (!epub.flow || epub.flow.length === 0) {
throw new Error('No content found in epub file')
}
const chapterTexts: string[] = []
// 遍历所有章节
for (const chapter of chapters) {
try {
const content = await this.getChapter(epub, chapter.id)
if (!content) {
continue
}
// 移除 HTML 标签并清理文本
const text = content
.replace(/<[^>]*>/g, ' ') // 移除所有 HTML 标签
.replace(/\s+/g, ' ') // 将多个空白字符替换为单个空格
.trim() // 移除首尾空白
if (text) {
chapterTexts.push(text)
}
} catch (error) {
Logger.error(`[EpubLoader] Error processing chapter ${chapter.id}:`, error)
}
}
// 使用双换行符连接所有章节文本
this.extractedText = chapterTexts.join('\n\n')
} catch (error) {
Logger.error('[EpubLoader] Error in extractTextFromEpub:', error)
throw error
}
}
/**
* 生成文本块
* 重写 BaseLoader 的方法,将提取的文本分割成适当大小的块
* 每个块都包含源文件和元数据信息
*/
override async *getUnfilteredChunks() {
// 如果还没有提取文本,先提取
if (!this.extractedText) {
await this.extractTextFromEpub()
}
Logger.info('[EpubLoader] 书名:', this.metadata?.title || '未知书名', ' 文本大小:', this.extractedText.length)
// 创建文本分块器
const chunker = new RecursiveCharacterTextSplitter({
chunkSize: this.chunkSize,
chunkOverlap: this.chunkOverlap
})
// 清理并分割文本
const chunks = await chunker.splitText(cleanString(this.extractedText))
// 为每个文本块添加元数据
for (const chunk of chunks) {
yield {
pageContent: chunk,
metadata: {
source: this.filePath,
title: this.metadata?.title || '',
creator: this.metadata?.creator || '',
language: this.metadata?.language || ''
}
}
}
}
}
+19
View File
@@ -7,6 +7,7 @@ import { LoaderReturn } from '@shared/config/types'
import { FileType, KnowledgeBaseParams } from '@types'
import Logger from 'electron-log'
import { EpubLoader } from './epubLoader'
import { OdLoader, OdType } from './odLoader'
// embedjs内置loader类型
@@ -70,6 +71,24 @@ export async function addFileLoader(
} as LoaderReturn
}
// epub 文件处理
if (file.ext === '.epub') {
const loaderReturn = await ragApplication.addLoader(
new EpubLoader({
filePath: file.path,
chunkSize: base.chunkSize ?? 1000,
chunkOverlap: base.chunkOverlap ?? 200
}) as any,
forceReload
)
return {
entriesAdded: loaderReturn.entriesAdded,
uniqueId: loaderReturn.uniqueId,
uniqueIds: [loaderReturn.uniqueId],
loaderType: loaderReturn.loaderType
} as LoaderReturn
}
const fileContent = fs.readFileSync(file.path, 'utf-8')
// HTML类型
if (['.html', '.htm'].includes(file.ext)) {