4d1d3e316f
* build: add eslint-plugin-oxlint dependency Add new eslint plugin to enhance linting capabilities with oxlint rules * build(eslint): add oxlint plugin to eslint config Add oxlint plugin as recommended in the documentation to enhance linting capabilities * build: add oxlint v1.15.0 as a dependency * build: add oxlint to linting commands Add oxlint alongside eslint in test:lint and lint scripts for enhanced static analysis * build: add oxlint configuration file Configure oxlint with a comprehensive set of rules for JavaScript/TypeScript code quality checks * chore: update oxlint configuration and related settings - Add oxc to editor code actions on save - Update oxlint configs to use eslint, typescript, and unicorn presets - Extend ignore patterns in oxlint configuration - Simplify oxlint command in package.json scripts - Add oxlint-tsgolint dependency * fix: lint warning * chore: update oxlintrc from eslint.recommended * refactor(lint): update eslint and oxlint configurations - Add src/preload to eslint ignore patterns - Update oxlint env to es2022 and add environment overrides - Adjust several lint rule severities and configurations * fix: lint error * fix(file): replace eslint-disable with oxlint-disable in sanitizeFilename The linter was changed from ESLint to oxlint, so the directive needs to be updated accordingly. * fix: enforce stricter linting by failing on warnings in test:lint script * feat: add recommended ts-eslint rules into exlint * docs: remove outdated comment in oxlint config file * style: disable typescript/no-require-imports rule in oxlint config * docs(utils): fix comment typo from NODE to NOTE * fix(MessageErrorBoundary): correct error description display condition The error description was incorrectly showing in production and hiding in development. Fix the logic to show detailed errors only in development mode * chore: add oxc-vscode extension to recommended list * ci(workflows): reorder format check step in pr-ci.yml * chore: update yarn.lock
166 lines
4.7 KiB
TypeScript
166 lines
4.7 KiB
TypeScript
import { JsonLoader, LocalPathLoader, RAGApplication, TextLoader } from '@cherrystudio/embedjs'
|
|
import type { AddLoaderReturn } from '@cherrystudio/embedjs-interfaces'
|
|
import { WebLoader } from '@cherrystudio/embedjs-loader-web'
|
|
import { loggerService } from '@logger'
|
|
import { readTextFileWithAutoEncoding } from '@main/utils/file'
|
|
import { LoaderReturn } from '@shared/config/types'
|
|
import { FileMetadata, KnowledgeBaseParams } from '@types'
|
|
|
|
import { DraftsExportLoader } from './draftsExportLoader'
|
|
import { EpubLoader } from './epubLoader'
|
|
import { OdLoader, OdType } from './odLoader'
|
|
|
|
// Logger for this module, obtained from loggerService with the 'KnowledgeLoader' context
const logger = loggerService.withContext('KnowledgeLoader')
|
|
|
|
/**
 * Maps a lower-case file extension (leading dot included) to the loader type
 * used for it. Extensions that are not listed have no entry; the consumer is
 * expected to treat those files as plain text.
 *
 * The table is declared grouped by loader type and flattened into an
 * extension -> loader type lookup.
 */
const FILE_LOADER_MAP: Record<string, string> = Object.fromEntries(
  Object.entries({
    // Built-in types
    common: ['.pdf', '.csv', '.doc', '.docx', '.pptx', '.xlsx', '.md'],
    // OpenDocument types
    od: ['.odt', '.ods', '.odp'],
    // EPUB type
    epub: ['.epub'],
    // Drafts export type
    drafts: ['.draftsexport'],
    // HTML types
    html: ['.html', '.htm'],
    // JSON type
    json: ['.json']
  }).flatMap(([loaderType, extensions]) => extensions.map((ext): [string, string] => [ext, loaderType]))
)
|
|
|
|
/**
 * Adds an OpenDocument file (.odt / .ods / .odp) to the RAG application through OdLoader.
 *
 * The extension is matched case-insensitively. addFileLoader lower-cases the
 * extension before routing a file here, so an upper-case extension such as
 * `.ODT` must resolve to the same loader type instead of being rejected.
 *
 * @param ragApplication - RAG application the loader is added to.
 * @param file - File metadata; `ext` selects the OD loader type, `path` is passed to the loader.
 * @param base - Knowledge base parameters supplying `chunkSize` and `chunkOverlap`.
 * @param forceReload - Passed through unchanged to `ragApplication.addLoader`.
 * @returns Whatever `ragApplication.addLoader` resolves to.
 * @throws Error with message 'Unknown odType' when the extension is not .odt, .ods or .odp.
 */
export async function addOdLoader(
  ragApplication: RAGApplication,
  file: FileMetadata,
  base: KnowledgeBaseParams,
  forceReload: boolean
): Promise<AddLoaderReturn> {
  const loaderMap: Record<string, OdType> = {
    '.odt': OdType.OdtLoader,
    '.ods': OdType.OdsLoader,
    '.odp': OdType.OdpLoader
  }
  // Normalise case so the lookup agrees with the lower-cased routing done by addFileLoader
  const odType = loaderMap[file.ext.toLowerCase()]
  if (!odType) {
    throw new Error('Unknown odType')
  }
  return ragApplication.addLoader(
    new OdLoader({
      odType,
      filePath: file.path,
      chunkSize: base.chunkSize,
      chunkOverlap: base.chunkOverlap
      // NOTE(review): cast kept from the original; presumably OdLoader's type does not
      // line up with addLoader's parameter type — confirm before removing
    }) as any,
    forceReload
  )
}
|
|
|
|
/**
 * Adds a file to the RAG application using the loader that matches its extension.
 *
 * The lower-cased extension is looked up in FILE_LOADER_MAP; anything not listed
 * there is handled as plain text. Loader selection:
 * - 'common' -> LocalPathLoader
 * - 'od'     -> addOdLoader
 * - 'epub'   -> EpubLoader (chunkSize / chunkOverlap default to 1000 / 200 when unset)
 * - 'drafts' -> DraftsExportLoader
 * - 'html'   -> WebLoader, fed with the file's text content
 * - 'json'   -> JsonLoader; if the file cannot be parsed as JSON it is handled as text instead
 * - default  -> TextLoader
 *
 * Exactly one loader is added per call.
 *
 * @param ragApplication - RAG application the loader is added to.
 * @param file - File metadata; `ext` selects the loader and `path` locates the file.
 * @param base - Knowledge base parameters supplying `chunkSize` and `chunkOverlap`.
 * @param forceReload - Passed through unchanged to `ragApplication.addLoader`.
 * @returns `entriesAdded`, `uniqueId` and `loaderType` of the loader result, plus `uniqueIds`
 *          holding that single `uniqueId`.
 */
export async function addFileLoader(
  ragApplication: RAGApplication,
  file: FileMetadata,
  base: KnowledgeBaseParams,
  forceReload: boolean
): Promise<LoaderReturn> {
  // Resolve the loader type; extensions without a mapping default to text
  const loaderType = FILE_LOADER_MAP[file.ext.toLowerCase()] || 'text'
  let loaderReturn: AddLoaderReturn
  // Use the file's actual path
  const filePath = file.path

  // State used by the JSON branch
  let jsonObject = {}
  let jsonParsed = true
  logger.info(`[KnowledgeBase] processing file ${filePath} as ${loaderType} type`)
  switch (loaderType) {
    case 'common':
      // Built-in types
      loaderReturn = await ragApplication.addLoader(
        new LocalPathLoader({
          path: filePath,
          chunkSize: base.chunkSize,
          chunkOverlap: base.chunkOverlap
        }) as any,
        forceReload
      )
      break

    case 'od':
      // OpenDocument types
      loaderReturn = await addOdLoader(ragApplication, file, base, forceReload)
      break
    case 'epub':
      // EPUB type
      loaderReturn = await ragApplication.addLoader(
        new EpubLoader({
          filePath: filePath,
          chunkSize: base.chunkSize ?? 1000,
          chunkOverlap: base.chunkOverlap ?? 200
        }) as any,
        forceReload
      )
      break

    case 'drafts':
      // Drafts export type
      loaderReturn = await ragApplication.addLoader(new DraftsExportLoader(filePath), forceReload)
      break

    case 'html':
      // HTML types: the file content (not its path) is handed to WebLoader
      loaderReturn = await ragApplication.addLoader(
        new WebLoader({
          urlOrContent: await readTextFileWithAutoEncoding(filePath),
          chunkSize: base.chunkSize,
          chunkOverlap: base.chunkOverlap
        }) as any,
        forceReload
      )
      break

    case 'json':
      try {
        jsonObject = JSON.parse(await readTextFileWithAutoEncoding(filePath))
      } catch (error) {
        jsonParsed = false
        logger.warn(
          `[KnowledgeBase] failed parsing json file, falling back to text processing: ${filePath}`,
          error as Error
        )
      }

      if (jsonParsed) {
        loaderReturn = await ragApplication.addLoader(new JsonLoader({ object: jsonObject }), forceReload)
        // Stop here. Without this break a successfully parsed file would run into `default`,
        // be ingested a second time as plain text, and have its JSON loader result overwritten.
        break
      }
    // fallthrough - reached only when JSON parsing failed; the file is then handled as text.
    // The missing break on this path is deliberate.
    // oxlint-disable-next-line no-fallthrough
    default:
      // Text handling (default): read the file content and add it as plain text
      loaderReturn = await ragApplication.addLoader(
        new TextLoader({
          text: await readTextFileWithAutoEncoding(filePath),
          chunkSize: base.chunkSize,
          chunkOverlap: base.chunkOverlap
        }) as any,
        forceReload
      )
      break
  }

  return {
    entriesAdded: loaderReturn.entriesAdded,
    uniqueId: loaderReturn.uniqueId,
    uniqueIds: [loaderReturn.uniqueId],
    loaderType: loaderReturn.loaderType
  } as LoaderReturn
}
|