Files
cherry-studio/src/main/knowledge/embedjs/loader/odLoader.ts
T
Phantom 4d1d3e316f feat: use oxlint to speed up lint (#10168)
* build: add eslint-plugin-oxlint dependency

Add new eslint plugin to enhance linting capabilities with oxlint rules

* build(eslint): add oxlint plugin to eslint config

Add oxlint plugin as recommended in the documentation to enhance linting capabilities

* build: add oxlint v1.15.0 as a dependency

* build: add oxlint to linting commands

Add oxlint alongside eslint in test:lint and lint scripts for enhanced static analysis

* build: add oxlint configuration file

Configure oxlint with a comprehensive set of rules for JavaScript/TypeScript code quality checks

* chore: update oxlint configuration and related settings

- Add oxc to editor code actions on save
- Update oxlint configs to use eslint, typescript, and unicorn presets
- Extend ignore patterns in oxlint configuration
- Simplify oxlint command in package.json scripts
- Add oxlint-tsgolint dependency

* fix: lint warning

* chore: update oxlintrc from eslint.recommended

* refactor(lint): update eslint and oxlint configurations

- Add src/preload to eslint ignore patterns
- Update oxlint env to es2022 and add environment overrides
- Adjust several lint rule severities and configurations

* fix: lint error

* fix(file): replace eslint-disable with oxlint-disable in sanitizeFilename

The linter was changed from ESLint to oxlint, so the directive needs to be updated accordingly.

* fix: enforce stricter linting by failing on warnings in test:lint script

* feat: add recommended ts-eslint rules into exlint

* docs: remove outdated comment in oxlint config file

* style: disable typescript/no-require-imports rule in oxlint config

* docs(utils): fix comment typo from NODE to NOTE

* fix(MessageErrorBoundary): correct error description display condition

The error description was incorrectly showing in production and hiding in development. Fix the logic to show detailed errors only in development mode

* chore: add oxc-vscode extension to recommended list

* ci(workflows): reorder format check step in pr-ci.yml

* chore: update yarn.lock
2025-09-15 19:42:13 +08:00

75 lines
1.9 KiB
TypeScript

import { BaseLoader } from '@cherrystudio/embedjs-interfaces'
import { cleanString } from '@cherrystudio/embedjs-utils'
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'
import { loggerService } from '@logger'
import md5 from 'md5'
import { OfficeParserConfig, parseOfficeAsync } from 'officeparser'
const logger = loggerService.withContext('OdLoader')
export enum OdType {
OdtLoader = 'OdtLoader',
OdsLoader = 'OdsLoader',
OdpLoader = 'OdpLoader',
Undefined = 'undefined'
}
export class OdLoader<OdType> extends BaseLoader<{ type: string }> {
private readonly odType: OdType
private readonly filePath: string
private extractedText: string
private config: OfficeParserConfig
constructor({
odType,
filePath,
chunkSize,
chunkOverlap
}: {
odType: OdType
filePath: string
chunkSize?: number
chunkOverlap?: number
}) {
super(`${odType}_${md5(filePath)}`, { filePath }, chunkSize ?? 1000, chunkOverlap ?? 0)
this.odType = odType
this.filePath = filePath
this.extractedText = ''
this.config = {
newlineDelimiter: ' ',
ignoreNotes: false
}
}
private async extractTextFromOdt() {
try {
this.extractedText = await parseOfficeAsync(this.filePath, this.config)
} catch (err) {
logger.error('odLoader error', err as Error)
throw err
}
}
override async *getUnfilteredChunks() {
if (!this.extractedText) {
await this.extractTextFromOdt()
}
const chunker = new RecursiveCharacterTextSplitter({
chunkSize: this.chunkSize,
chunkOverlap: this.chunkOverlap
})
const chunks = await chunker.splitText(cleanString(this.extractedText))
for (const chunk of chunks) {
yield {
pageContent: chunk,
metadata: {
type: this.odType as string,
source: this.filePath
}
}
}
}
}