4d1d3e316f
* build: add eslint-plugin-oxlint dependency Add new eslint plugin to enhance linting capabilities with oxlint rules * build(eslint): add oxlint plugin to eslint config Add oxlint plugin as recommended in the documentation to enhance linting capabilities * build: add oxlint v1.15.0 as a dependency * build: add oxlint to linting commands Add oxlint alongside eslint in test:lint and lint scripts for enhanced static analysis * build: add oxlint configuration file Configure oxlint with a comprehensive set of rules for JavaScript/TypeScript code quality checks * chore: update oxlint configuration and related settings - Add oxc to editor code actions on save - Update oxlint configs to use eslint, typescript, and unicorn presets - Extend ignore patterns in oxlint configuration - Simplify oxlint command in package.json scripts - Add oxlint-tsgolint dependency * fix: lint warning * chore: update oxlintrc from eslint.recommended * refactor(lint): update eslint and oxlint configurations - Add src/preload to eslint ignore patterns - Update oxlint env to es2022 and add environment overrides - Adjust several lint rule severities and configurations * fix: lint error * fix(file): replace eslint-disable with oxlint-disable in sanitizeFilename The linter was changed from ESLint to oxlint, so the directive needs to be updated accordingly. * fix: enforce stricter linting by failing on warnings in test:lint script * feat: add recommended ts-eslint rules into exlint * docs: remove outdated comment in oxlint config file * style: disable typescript/no-require-imports rule in oxlint config * docs(utils): fix comment typo from NODE to NOTE * fix(MessageErrorBoundary): correct error description display condition The error description was incorrectly showing in production and hiding in development. Fix the logic to show detailed errors only in development mode * chore: add oxc-vscode extension to recommended list * ci(workflows): reorder format check step in pr-ci.yml * chore: update yarn.lock
75 lines
1.9 KiB
TypeScript
75 lines
1.9 KiB
TypeScript
import { BaseLoader } from '@cherrystudio/embedjs-interfaces'
|
|
import { cleanString } from '@cherrystudio/embedjs-utils'
|
|
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'
|
|
import { loggerService } from '@logger'
|
|
import md5 from 'md5'
|
|
import { OfficeParserConfig, parseOfficeAsync } from 'officeparser'
|
|
|
|
const logger = loggerService.withContext('OdLoader')
|
|
|
|
/**
 * Identifies which kind of OpenDocument loader is being used.
 *
 * Each member's value is the string that `OdLoader` embeds in its loader id
 * and reports as the `type` field of every chunk's metadata.
 */
export enum OdType {
  /** Tag for `.odt`-style text documents. */
  OdtLoader = 'OdtLoader',

  /** Tag for `.ods`-style spreadsheet documents. */
  OdsLoader = 'OdsLoader',

  /** Tag for `.odp`-style presentation documents. */
  OdpLoader = 'OdpLoader',

  /** Sentinel for "no recognised OpenDocument type"; note the value is the literal string 'undefined'. */
  Undefined = 'undefined'
}
|
|
|
|
/**
 * Loader that extracts the text of an office document with `officeparser`
 * and yields it as chunks produced by a `RecursiveCharacterTextSplitter`.
 *
 * The type parameter describes the `odType` tag. It was previously named
 * `OdType`, which shadowed the exported `OdType` enum inside the class and
 * left the tag completely unconstrained; it is now a distinct, string-bounded
 * parameter. Members of the `OdType` enum (a string enum) satisfy the bound,
 * so `new OdLoader({ odType: OdType.OdtLoader, ... })` and `OdLoader<OdType>`
 * keep working.
 *
 * @typeParam TOdType - String type of the loader tag; normally a member of the `OdType` enum.
 */
export class OdLoader<TOdType extends string = string> extends BaseLoader<{ type: string }> {
  /** Tag reported as `metadata.type` on every emitted chunk. */
  private readonly odType: TOdType

  /** Path of the document handed to `parseOfficeAsync`. */
  private readonly filePath: string

  /** Text returned by the parser; empty until the first successful extraction. */
  private extractedText = ''

  /** Options forwarded to `parseOfficeAsync`. */
  private readonly config: OfficeParserConfig

  /**
   * @param options.odType - Tag for the document kind; also used as the prefix of the loader id.
   * @param options.filePath - Path of the document to parse.
   * @param options.chunkSize - Chunk size forwarded to the base loader; defaults to 1000.
   * @param options.chunkOverlap - Chunk overlap forwarded to the base loader; defaults to 0.
   */
  constructor({
    odType,
    filePath,
    chunkSize,
    chunkOverlap
  }: {
    odType: TOdType
    filePath: string
    chunkSize?: number
    chunkOverlap?: number
  }) {
    // First argument combines the tag with an md5 of the path
    // (presumably BaseLoader's unique loader id — confirm against embedjs-interfaces).
    super(`${odType}_${md5(filePath)}`, { filePath }, chunkSize ?? 1000, chunkOverlap ?? 0)
    this.odType = odType
    this.filePath = filePath
    this.config = {
      newlineDelimiter: ' ',
      ignoreNotes: false
    }
  }

  /**
   * Parses the file at `filePath` and stores the resulting text in `extractedText`.
   *
   * @throws Re-throws whatever `parseOfficeAsync` rejects with, after logging it.
   */
  private async extractText(): Promise<void> {
    try {
      this.extractedText = await parseOfficeAsync(this.filePath, this.config)
    } catch (err) {
      logger.error('odLoader error', err as Error)
      throw err
    }
  }

  /**
   * Yields one chunk per piece of the cleaned, split document text.
   *
   * The document is parsed on demand: extraction runs whenever no text has
   * been stored yet (including when a previous parse produced an empty string).
   */
  override async *getUnfilteredChunks() {
    if (!this.extractedText) {
      await this.extractText()
    }

    const chunker = new RecursiveCharacterTextSplitter({
      chunkSize: this.chunkSize,
      chunkOverlap: this.chunkOverlap
    })

    const chunks = await chunker.splitText(cleanString(this.extractedText))

    for (const chunk of chunks) {
      yield {
        pageContent: chunk,
        metadata: {
          // No cast needed: TOdType is bounded by string.
          type: this.odType,
          source: this.filePath
        }
      }
    }
  }
}
|