Files
cherry-studio/src/main/knowledge/embedjs/loader/index.ts
T
Phantom 4d1d3e316f feat: use oxlint to speed up lint (#10168)
* build: add eslint-plugin-oxlint dependency

Add new eslint plugin to enhance linting capabilities with oxlint rules

* build(eslint): add oxlint plugin to eslint config

Add oxlint plugin as recommended in the documentation to enhance linting capabilities

* build: add oxlint v1.15.0 as a dependency

* build: add oxlint to linting commands

Add oxlint alongside eslint in test:lint and lint scripts for enhanced static analysis

* build: add oxlint configuration file

Configure oxlint with a comprehensive set of rules for JavaScript/TypeScript code quality checks

* chore: update oxlint configuration and related settings

- Add oxc to editor code actions on save
- Update oxlint configs to use eslint, typescript, and unicorn presets
- Extend ignore patterns in oxlint configuration
- Simplify oxlint command in package.json scripts
- Add oxlint-tsgolint dependency

* fix: lint warning

* chore: update oxlintrc from eslint.recommended

* refactor(lint): update eslint and oxlint configurations

- Add src/preload to eslint ignore patterns
- Update oxlint env to es2022 and add environment overrides
- Adjust several lint rule severities and configurations

* fix: lint error

* fix(file): replace eslint-disable with oxlint-disable in sanitizeFilename

The linter was changed from ESLint to oxlint, so the directive needs to be updated accordingly.

* fix: enforce stricter linting by failing on warnings in test:lint script

* feat: add recommended ts-eslint rules into exlint

* docs: remove outdated comment in oxlint config file

* style: disable typescript/no-require-imports rule in oxlint config

* docs(utils): fix comment typo from NODE to NOTE

* fix(MessageErrorBoundary): correct error description display condition

The error description was incorrectly showing in production and hiding in development. Fix the logic to show detailed errors only in development mode

* chore: add oxc-vscode extension to recommended list

* ci(workflows): reorder format check step in pr-ci.yml

* chore: update yarn.lock
2025-09-15 19:42:13 +08:00

166 lines
4.7 KiB
TypeScript

import { JsonLoader, LocalPathLoader, RAGApplication, TextLoader } from '@cherrystudio/embedjs'
import type { AddLoaderReturn } from '@cherrystudio/embedjs-interfaces'
import { WebLoader } from '@cherrystudio/embedjs-loader-web'
import { loggerService } from '@logger'
import { readTextFileWithAutoEncoding } from '@main/utils/file'
import { LoaderReturn } from '@shared/config/types'
import { FileMetadata, KnowledgeBaseParams } from '@types'
import { DraftsExportLoader } from './draftsExportLoader'
import { EpubLoader } from './epubLoader'
import { OdLoader, OdType } from './odLoader'
const logger = loggerService.withContext('KnowledgeLoader')
// 文件扩展名到加载器类型的映射
const FILE_LOADER_MAP: Record<string, string> = {
// 内置类型
'.pdf': 'common',
'.csv': 'common',
'.doc': 'common',
'.docx': 'common',
'.pptx': 'common',
'.xlsx': 'common',
'.md': 'common',
// OD类型
'.odt': 'od',
'.ods': 'od',
'.odp': 'od',
// epub类型
'.epub': 'epub',
// Drafts类型
'.draftsexport': 'drafts',
// HTML类型
'.html': 'html',
'.htm': 'html',
// JSON类型
'.json': 'json'
// 其他类型默认为文本类型
}
export async function addOdLoader(
ragApplication: RAGApplication,
file: FileMetadata,
base: KnowledgeBaseParams,
forceReload: boolean
): Promise<AddLoaderReturn> {
const loaderMap: Record<string, OdType> = {
'.odt': OdType.OdtLoader,
'.ods': OdType.OdsLoader,
'.odp': OdType.OdpLoader
}
const odType = loaderMap[file.ext]
if (!odType) {
throw new Error('Unknown odType')
}
return ragApplication.addLoader(
new OdLoader({
odType,
filePath: file.path,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
}
export async function addFileLoader(
ragApplication: RAGApplication,
file: FileMetadata,
base: KnowledgeBaseParams,
forceReload: boolean
): Promise<LoaderReturn> {
// 获取文件类型,如果没有匹配则默认为文本类型
const loaderType = FILE_LOADER_MAP[file.ext.toLowerCase()] || 'text'
let loaderReturn: AddLoaderReturn
// 使用文件的实际路径
const filePath = file.path
// JSON类型处理
let jsonObject = {}
let jsonParsed = true
logger.info(`[KnowledgeBase] processing file ${filePath} as ${loaderType} type`)
switch (loaderType) {
case 'common':
// 内置类型处理
loaderReturn = await ragApplication.addLoader(
new LocalPathLoader({
path: filePath,
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
break
case 'od':
// OD类型处理
loaderReturn = await addOdLoader(ragApplication, file, base, forceReload)
break
case 'epub':
// epub类型处理
loaderReturn = await ragApplication.addLoader(
new EpubLoader({
filePath: filePath,
chunkSize: base.chunkSize ?? 1000,
chunkOverlap: base.chunkOverlap ?? 200
}) as any,
forceReload
)
break
case 'drafts':
// Drafts类型处理
loaderReturn = await ragApplication.addLoader(new DraftsExportLoader(filePath), forceReload)
break
case 'html':
// HTML类型处理
loaderReturn = await ragApplication.addLoader(
new WebLoader({
urlOrContent: await readTextFileWithAutoEncoding(filePath),
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
break
case 'json':
try {
jsonObject = JSON.parse(await readTextFileWithAutoEncoding(filePath))
} catch (error) {
jsonParsed = false
logger.warn(
`[KnowledgeBase] failed parsing json file, falling back to text processing: ${filePath}`,
error as Error
)
}
if (jsonParsed) {
loaderReturn = await ragApplication.addLoader(new JsonLoader({ object: jsonObject }), forceReload)
}
// fallthrough - JSON 解析失败时作为文本处理
// oxlint-disable-next-line no-fallthrough 利用switch特性,刻意不break
default:
// 文本类型处理(默认)
// 如果是其他文本类型且尚未读取文件,则读取文件
loaderReturn = await ragApplication.addLoader(
new TextLoader({
text: await readTextFileWithAutoEncoding(filePath),
chunkSize: base.chunkSize,
chunkOverlap: base.chunkOverlap
}) as any,
forceReload
)
break
}
return {
entriesAdded: loaderReturn.entriesAdded,
uniqueId: loaderReturn.uniqueId,
uniqueIds: [loaderReturn.uniqueId],
loaderType: loaderReturn.loaderType
} as LoaderReturn
}