From a3735a80ddd87000341a4342f82b709955d2212b Mon Sep 17 00:00:00 2001 From: Alex Rock Date: Tue, 24 Sep 2024 01:25:14 -0600 Subject: [PATCH 1/2] chore: add /validators folder --- package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 85b5c4551..120618cd7 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,8 @@ "flatfilers/*", "plugins/*", "support/*", - "utils/*" + "utils/*", + "validators/*" ], "scripts": { "clean": "find ./ '(' -name 'node_modules' -o -name 'dist' -o -name '.turbo' -o -name '.parcel-cache' ')' -type d -exec rm -rf {} +", From f83e1377ce91bcd1c5fdbaff54f7a72ed08df2e0 Mon Sep 17 00:00:00 2001 From: "Alex Rock (Koala)" Date: Tue, 24 Sep 2024 01:37:46 -0600 Subject: [PATCH 2/2] koala: initial commit --- validators/SpellChecker/README.MD | 65 +++++++++ validators/SpellChecker/metadata.json | 84 ++++++++++++ validators/SpellChecker/package.json | 68 ++++++++++ validators/SpellChecker/rollup.config.mjs | 64 +++++++++ validators/SpellChecker/src/index.ts | 155 ++++++++++++++++++++++ 5 files changed, 436 insertions(+) create mode 100644 validators/SpellChecker/README.MD create mode 100644 validators/SpellChecker/metadata.json create mode 100644 validators/SpellChecker/package.json create mode 100644 validators/SpellChecker/rollup.config.mjs create mode 100644 validators/SpellChecker/src/index.ts diff --git a/validators/SpellChecker/README.MD b/validators/SpellChecker/README.MD new file mode 100644 index 000000000..faea87d0d --- /dev/null +++ b/validators/SpellChecker/README.MD @@ -0,0 +1,65 @@ +# Flatfile Spell Check Plugin + +A powerful spell-checking plugin for Flatfile that enhances data quality by identifying and optionally correcting misspellings in specified fields. 
+ +## Features + +- Customizable field selection for spell-checking +- Support for multiple languages +- Custom dictionary and affix file support +- Automatic spell correction option +- Highlighted misspellings in original text +- Spell-check suggestions for misspelled words +- Separate fields for highlighted text, suggestions, and corrected text + +## Installation + +To install the Flatfile Spell Check Plugin, use npm: + +```bash +npm install @flatfile/plugin-spell-check +``` + +## Example Usage + +```javascript +import { FlatfileListener } from "@flatfile/listener"; +import spellCheckPlugin from "@flatfile/plugin-spell-check"; + +const listener = new FlatfileListener(); + +spellCheckPlugin(listener, { + fields: ["name", "description"], + language: "en_US", + sheetSlug: "contacts", + autoCorrect: false +}); + +listener.use(/* ... other plugins ... */); +``` + +## Configuration + +The spell check plugin accepts the following configuration options: + +- `fields`: Array of field names to spell-check (default: `["name", "description", "comments"]`) +- `dictionaryPath`: Custom path to dictionary file +- `affixPath`: Custom path to affix file +- `language`: Language code for built-in dictionaries (default: `"en_US"`) +- `sheetSlug`: Sheet slug to apply the spell check to (default: `"mySheetSlug"`) +- `autoCorrect`: Whether to automatically apply corrections (default: `false`) + +## Behavior + +1. The plugin processes each record in the specified sheet. +2. For each configured field, it splits the text into words and checks for misspellings. +3. If misspellings are found: + - A warning is added to the field + - A new field is created with highlighted misspellings + - Another field is created with spell-check suggestions + - A field with the list of misspelled words is added + - If `autoCorrect` is true, the original field is updated with corrections + - If `autoCorrect` is false, a new field with suggested corrections is created +4. 
The plugin uses a concurrency of 5 and includes debug information. + +This plugin helps improve data quality by identifying potential spelling errors and providing suggestions or automatic corrections, depending on the configuration. \ No newline at end of file diff --git a/validators/SpellChecker/metadata.json b/validators/SpellChecker/metadata.json new file mode 100644 index 000000000..1c130a35a --- /dev/null +++ b/validators/SpellChecker/metadata.json @@ -0,0 +1,84 @@ +{ + "timestamp": "2024-09-24T07-05-11-529Z", + "task": "Create a spell-checking Flatfile Listener plugin:\n - Implement a RecordHook to check spelling in specified text fields\n - Use a spell-checking library (e.g., 'node-spellchecker' or 'nspell')\n - Allow configuration of which fields to check and custom dictionaries\n - Highlight misspelled words and suggest corrections\n - Add a new field with corrected text or a list of misspellings\n - Handle multiple languages if possible\n - Give the user reasonable config options to specify the Sheet Slug, the Field(s) that are the text(s), whether the spell-checking should be done automatically", + "summary": "This code implements a spell-checking plugin for Flatfile using the nspell library. It includes configuration options for fields to check, custom dictionaries, language support, sheet slug, and automatic correction. 
The plugin processes records, identifies misspellings, provides suggestions, and can automatically correct text if configured to do so.", + "steps": [ + [ + "Gather information about Flatfile Listeners and RecordHook plugin.\n", + "#E1", + "PineconeAssistant", + "Provide information about Flatfile Listeners and RecordHook plugin, including their structure and usage", + "Plan: Gather information about Flatfile Listeners and RecordHook plugin.\n#E1 = PineconeAssistant[Provide information about Flatfile Listeners and RecordHook plugin, including their structure and usage]" + ], + [ + "Research spell-checking libraries compatible with Node.js.\n", + "#E2", + "Google", + "node.js spell-checking libraries nspell vs node-spellchecker", + "Plan: Research spell-checking libraries compatible with Node.js.\n#E2 = Google[node.js spell-checking libraries nspell vs node-spellchecker]" + ], + [ + "Create the basic structure of the Flatfile Listener plugin.\n", + "#E3", + "LLM", + "Create a basic structure for a Flatfile Listener plugin using RecordHook, incorporating the information from #E1", + "Plan: Create the basic structure of the Flatfile Listener plugin.\n#E3 = LLM[Create a basic structure for a Flatfile Listener plugin using RecordHook, incorporating the information from #E1]" + ], + [ + "Implement the spell-checking functionality using the chosen library.\n", + "#E4", + "LLM", + "Implement spell-checking functionality using nspell library, based on the information from #E2 and the structure from #E3", + "Plan: Implement the spell-checking functionality using the chosen library.\n#E4 = LLM[Implement spell-checking functionality using nspell library, based on the information from #E2 and the structure from #E3]" + ], + [ + "Add configuration options for field selection, custom dictionaries, and language support.\n", + "#E5", + "LLM", + "Add configuration options to the plugin from #E4 for specifying fields to check, custom dictionaries, and language support", + "Plan: Add 
configuration options for field selection, custom dictionaries, and language support.\n#E5 = LLM[Add configuration options to the plugin from #E4 for specifying fields to check, custom dictionaries, and language support]" + ], + [ + "Implement highlighting of misspelled words and suggestion of corrections.\n", + "#E6", + "LLM", + "Add functionality to highlight misspelled words and suggest corrections to the plugin from #E5", + "Plan: Implement highlighting of misspelled words and suggestion of corrections.\n#E6 = LLM[Add functionality to highlight misspelled words and suggest corrections to the plugin from #E5]" + ], + [ + "Add a new field with corrected text or a list of misspellings.\n", + "#E7", + "LLM", + "Extend the plugin from #E6 to add a new field with corrected text or a list of misspellings", + "Plan: Add a new field with corrected text or a list of misspellings.\n#E7 = LLM[Extend the plugin from #E6 to add a new field with corrected text or a list of misspellings]" + ], + [ + "Implement automatic spell-checking option and Sheet Slug configuration.\n", + "#E8", + "LLM", + "Add configuration options for automatic spell-checking and Sheet Slug to the plugin from #E7", + "Plan: Implement automatic spell-checking option and Sheet Slug configuration.\n#E8 = LLM[Add configuration options for automatic spell-checking and Sheet Slug to the plugin from #E7]" + ], + [ + "Validate the complete plugin code, ensure all imports are used, and verify that the Listener subscribes to valid Event Topics.\n", + "#E9", + "PineconeAssistant", + "Validate the following Flatfile Listener plugin code, check for unused imports, and verify that it subscribes to valid Event Topics: #E8", + "Plan: Validate the complete plugin code, ensure all imports are used, and verify that the Listener subscribes to valid Event Topics.\n#E9 = PineconeAssistant[Validate the following Flatfile Listener plugin code, check for unused imports, and verify that it subscribes to valid Event Topics: #E8]" 
+ ], + [ + "Finalize the plugin code based on the validation results.\n", + "#E10", + "LLM", + "Finalize the Flatfile Listener plugin code based on the validation results from #E9, ensuring all issues are addressed and the code is complete and functional", + "Plan: Finalize the plugin code based on the validation results.\n#E10 = LLM[Finalize the Flatfile Listener plugin code based on the validation results from #E9, ensuring all issues are addressed and the code is complete and functional]" + ] + ], + "metrics": { + "tokens": { + "plan": 4134, + "state": 5246, + "total": 9380 + } + } +} \ No newline at end of file diff --git a/validators/SpellChecker/package.json b/validators/SpellChecker/package.json new file mode 100644 index 000000000..2ace1aad6 --- /dev/null +++ b/validators/SpellChecker/package.json @@ -0,0 +1,68 @@ +{ + "name": "@flatfile/plugin-spell-check", + "version": "1.0.0", + "description": "A Flatfile plugin for spell-checking and auto-correction", + "main": "./dist/index.js", + "module": "./dist/index.mjs", + "types": "./dist/index.d.ts", + "browser": { + "./dist/index.js": "./dist/index.browser.js", + "./dist/index.mjs": "./dist/index.browser.mjs" + }, + "exports": { + "types": "./dist/index.d.ts", + "node": { + "import": "./dist/index.mjs", + "require": "./dist/index.js" + }, + "browser": { + "require": "./dist/index.browser.js", + "import": "./dist/index.browser.mjs" + }, + "default": "./dist/index.mjs" + }, + "source": "./src/index.ts", + "files": [ + "dist/**" + ], + "scripts": { + "build": "rollup -c", + "build:watch": "rollup -c --watch", + "build:prod": "NODE_ENV=production rollup -c", + "check": "tsc ./**/*.ts --noEmit --esModuleInterop", + "test": "jest ./**/*.spec.ts --config=../../jest.config.js --runInBand" + }, + "keywords": [ + "flatfile", + "plugin", + "spell-check", + "auto-correct", + "flatfile-plugins", + "category-transform" + ], + "author": "Your Name", + "license": "MIT", + "dependencies": { + "@flatfile/plugin-record-hook": 
"^1.6.1", + "nspell": "^2.1.5" + }, + "peerDependencies": { + "@flatfile/listener": "^1.0.5" + }, + "devDependencies": { + "@flatfile/hooks": "^1.5.0", + "@flatfile/rollup-config": "^0.1.1", + "@types/node": "^22.6.1", + "typescript": "^5.6.2" + }, + "repository": { + "type": "git", + "url": "https://github.com/FlatFilers/flatfile-plugins.git", + "directory": "plugins/spell-check" + }, + "browserslist": [ + "> 0.5%", + "last 2 versions", + "not dead" + ] +} \ No newline at end of file diff --git a/validators/SpellChecker/rollup.config.mjs b/validators/SpellChecker/rollup.config.mjs new file mode 100644 index 000000000..751d9852a --- /dev/null +++ b/validators/SpellChecker/rollup.config.mjs @@ -0,0 +1,64 @@ +import { buildConfig } from '@flatfile/rollup-config'; +import typescript from '@rollup/plugin-typescript'; +import resolve from '@rollup/plugin-node-resolve'; +import commonjs from '@rollup/plugin-commonjs'; +import json from '@rollup/plugin-json'; + +const umdExternals = [ + '@flatfile/api', + '@flatfile/hooks', + '@flatfile/listener', + '@flatfile/util-common', + '@flatfile/plugin-record-hook', + 'nspell', + 'fs', + 'path' +]; + +const config = buildConfig({ + input: 'src/index.ts', // Adjust this if your entry point is different + includeUmd: true, + umdConfig: { name: 'FlatfileSpellCheckPlugin', external: umdExternals }, + plugins: [ + typescript({ + tsconfig: './tsconfig.json', + declaration: true, + declarationDir: 'dist/types', + }), + resolve({ + preferBuiltins: true, + browser: true, + }), + commonjs(), + json(), + ], +}); + +// Add custom configuration for handling dictionary files +config.forEach(conf => { + if (!conf.output) return; + + conf.output.forEach(output => { + if (output.format === 'es' || output.format === 'cjs') { + output.assetFileNames = 'assets/[name][extname]'; + } + }); + + conf.plugins.push({ + name: 'copy-dictionary-files', + generateBundle() { + this.emitFile({ + type: 'asset', + fileName: 'assets/dictionaries/en_US.dic', + 
source: 'src/dictionaries/en_US.dic'
+      });
+      this.emitFile({
+        type: 'asset',
+        fileName: 'assets/dictionaries/en_US.aff',
+        source: 'src/dictionaries/en_US.aff'
+      });
+    }
+  });
+});
+
+export default config;
\ No newline at end of file
diff --git a/validators/SpellChecker/src/index.ts b/validators/SpellChecker/src/index.ts
new file mode 100644
index 000000000..e8dd0d1f2
--- /dev/null
+++ b/validators/SpellChecker/src/index.ts
@@ -0,0 +1,282 @@
+import { recordHook } from '@flatfile/plugin-record-hook'
+import {
+  FlatfileListener,
+  FlatfileRecord,
+  FlatfileEvent,
+} from '@flatfile/listener'
+import nspell from 'nspell'
+import fs from 'fs'
+import path from 'path'
+
+/** Options accepted by `spellCheckPlugin`; every option may be omitted. */
+interface SpellCheckConfig {
+  /** Names of the record fields whose text is spell-checked. */
+  fields?: string[]
+  /** Path to a `.dic` file; takes precedence over the `language` lookup. */
+  dictionaryPath?: string
+  /** Path to an `.aff` file; takes precedence over the `language` lookup. */
+  affixPath?: string
+  /** Base name of the bundled dictionary files, e.g. `en_US`. */
+  language?: string
+  /** Slug of the sheet the record hook is registered for. */
+  sheetSlug?: string
+  /** When true, the checked field is overwritten with the corrected text. */
+  autoCorrect?: boolean
+}
+
+/** A misspelled word and its position in `text.split(/\s+/)`. */
+interface MisspelledWord {
+  word: string
+  index: number
+}
+
+/** A misspelled word together with the replacement candidates for it. */
+interface WordSuggestions {
+  word: string
+  suggestions: string[]
+}
+
+/** The two spell-checker methods this plugin calls. */
+interface SpellCheckerLike {
+  correct(word: string): boolean
+  suggest(word: string): string[]
+}
+
+/** Values used for every option the caller leaves unset. */
+const defaultConfig = {
+  fields: ['name', 'description', 'comments'],
+  language: 'en_US',
+  sheetSlug: 'mySheetSlug',
+  autoCorrect: false,
+}
+
+// Markers wrapped around each misspelled word in `<field>_highlighted`.
+// NOTE(review): `<mark>` is an assumption; confirm which markup the
+// consuming UI expects.
+const HIGHLIGHT_OPEN = '<mark>'
+const HIGHLIGHT_CLOSE = '</mark>'
+
+// Source of a character class meaning "part of a word". Used to build
+// Unicode-aware word boundaries, because `\b` only knows ASCII word characters.
+const WORD_CHAR = '[\\p{L}\\p{N}_]'
+
+// Leading/trailing run of characters that are neither letters nor digits,
+// e.g. the comma in "word," or the quotes in "'word'".
+const EDGE_PUNCTUATION = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu
+
+// Matches tokens containing at least one letter; digit-only tokens are skipped.
+const HAS_LETTER = /\p{L}/u
+
+/** Escapes regular-expression metacharacters so `value` matches literally. */
+function escapeRegExp(value: string): string {
+  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
+}
+
+/**
+ * Registers a record hook on `listener` that spell-checks the configured
+ * fields of every record in the configured sheet.
+ *
+ * The dictionary and affix files are read synchronously while the plugin is
+ * being installed, so an unreadable file throws here, not per record.
+ *
+ * @param listener - Listener the record hook is attached to.
+ * @param config - Overrides for the values in `defaultConfig`.
+ * @throws If the dictionary or affix file cannot be read.
+ */
+export default function spellCheckPlugin(
+  listener: FlatfileListener,
+  config: SpellCheckConfig = {}
+) {
+  // `??` rather than object spread: an option passed explicitly as
+  // `undefined` must not erase its default.
+  const fields = config.fields ?? defaultConfig.fields
+  const language = config.language ?? defaultConfig.language
+  const sheetSlug = config.sheetSlug ?? defaultConfig.sheetSlug
+  const autoCorrect = config.autoCorrect ?? defaultConfig.autoCorrect
+
+  // NOTE(review): the rollup config emits the dictionaries under
+  // `assets/dictionaries/`, not `dictionaries/`; confirm which layout the
+  // published package is meant to have.
+  const dicPath =
+    config.dictionaryPath ||
+    path.join(__dirname, `dictionaries/${language}.dic`)
+  const affPath =
+    config.affixPath || path.join(__dirname, `dictionaries/${language}.aff`)
+
+  const dictionary = fs.readFileSync(dicPath, 'utf-8')
+  const affixFile = fs.readFileSync(affPath, 'utf-8')
+  const spellChecker = nspell(affixFile, dictionary)
+
+  listener.use(
+    recordHook(
+      sheetSlug,
+      async (record: FlatfileRecord, event: FlatfileEvent) => {
+        try {
+          await processRecord(record, fields, spellChecker, autoCorrect)
+        } catch (error) {
+          console.error('Error processing record:', error)
+          record.addError(
+            'general',
+            'An error occurred while processing this record.'
+          )
+        }
+        return record
+      },
+      {
+        concurrency: 5,
+        debug: true,
+      }
+    )
+  )
+}
+
+/**
+ * Spell-checks each listed field of `record` and writes the results back.
+ *
+ * For a field with misspellings this adds a warning and sets
+ * `<field>_highlighted`, `<field>_spell_suggestions` (JSON) and
+ * `<field>_misspellings` (JSON). The corrected text goes into the field
+ * itself when `autoCorrect` is true, otherwise into `<field>_corrected`.
+ * Fields that are blank or do not hold a string are skipped.
+ */
+async function processRecord(
+  record: FlatfileRecord,
+  fieldsToCheck: string[],
+  spellChecker: SpellCheckerLike,
+  autoCorrect: boolean
+): Promise<void> {
+  for (const field of fieldsToCheck) {
+    const value = record.get(field)
+    // Only strings can be split into words; anything else is left untouched.
+    if (typeof value !== 'string' || value.trim() === '') continue
+
+    const misspelledWords = findMisspelledWords(value, spellChecker)
+    if (misspelledWords.length === 0) continue
+
+    // Ask for suggestions once per distinct word, even if it repeats.
+    const cache = new Map<string, string[]>()
+    const suggestions: WordSuggestions[] = misspelledWords.map(({ word }) => {
+      let candidates = cache.get(word)
+      if (candidates === undefined) {
+        candidates = spellChecker.suggest(word).slice(0, 3)
+        cache.set(word, candidates)
+      }
+      return { word, suggestions: candidates }
+    })
+
+    const highlightedText = highlightMisspelledWords(value, misspelledWords)
+    const correctedText = correctMisspelledWords(value, suggestions)
+
+    record.addWarning(
+      field,
+      `Possible misspellings found. See highlighted text, suggestions, and corrected text.`
+    )
+    record.set(`${field}_highlighted`, highlightedText)
+    record.set(`${field}_spell_suggestions`, JSON.stringify(suggestions))
+    record.set(
+      `${field}_misspellings`,
+      JSON.stringify(misspelledWords.map((mw) => mw.word))
+    )
+
+    if (autoCorrect) {
+      record.set(field, correctedText)
+      record.addInfo(field, 'Text has been automatically corrected.')
+    } else {
+      record.set(`${field}_corrected`, correctedText)
+    }
+  }
+}
+
+/**
+ * Returns the words of `text` that the spell checker rejects.
+ *
+ * Tokens are the whitespace-separated pieces of `text`; punctuation at either
+ * edge of a token is removed before the check so that "word," is tested as
+ * "word". Tokens without any letter are ignored.
+ */
+function findMisspelledWords(
+  text: string,
+  spellChecker: SpellCheckerLike
+): MisspelledWord[] {
+  const misspelled: MisspelledWord[] = []
+  text.split(/\s+/).forEach((token, index) => {
+    const word = token.replace(EDGE_PUNCTUATION, '')
+    if (HAS_LETTER.test(word) && !spellChecker.correct(word)) {
+      misspelled.push({ word, index })
+    }
+  })
+  return misspelled
+}
+
+/**
+ * Wraps every misspelled word of `text` in the highlight markers.
+ *
+ * @param text - The original field value.
+ * @param misspelledWords - Words to mark; `index` is the word's position in
+ *   `text.split(/\s+/)`.
+ * @returns `text` with the markers inserted and all whitespace preserved.
+ */
+function highlightMisspelledWords(
+  text: string,
+  misspelledWords: { word: string; index: number }[]
+): string {
+  // Splitting on a capturing group keeps the separators, so the word with
+  // index i sits at pieces[2 * i]. This pins each highlight to the exact
+  // occurrence instead of searching the text for the first lookalike.
+  const pieces = text.split(/(\s+)/)
+
+  for (const { word, index } of misspelledWords) {
+    const slot = index * 2
+    const piece = pieces[slot]
+    if (word === '' || piece === undefined) continue
+
+    const at = piece.indexOf(word)
+    if (at === -1) continue
+
+    pieces[slot] =
+      piece.slice(0, at) +
+      HIGHLIGHT_OPEN +
+      word +
+      HIGHLIGHT_CLOSE +
+      piece.slice(at + word.length)
+  }
+
+  return pieces.join('')
+}
+
+/**
+ * Replaces each misspelled word in `text` with its first suggestion.
+ *
+ * @param text - The original field value.
+ * @param suggestions - Misspelled words with their candidates; entries with no
+ *   candidates are left as they are in the text.
+ * @returns The text after all replacements.
+ */
+function correctMisspelledWords(
+  text: string,
+  suggestions: { word: string; suggestions: string[] }[]
+): string {
+  const replacements = new Map<string, string>()
+  for (const { word, suggestions: candidates } of suggestions) {
+    const best = candidates[0]
+    if (word !== '' && best !== undefined && !replacements.has(word)) {
+      replacements.set(word, best)
+    }
+  }
+  if (replacements.size === 0) return text
+
+  // Longest first so that a word is never shadowed by one of its prefixes.
+  // Words are escaped because they are data, not regex syntax.
+  const alternation = Array.from(replacements.keys())
+    .sort((a, b) => b.length - a.length)
+    .map(escapeRegExp)
+    .join('|')
+  const pattern = new RegExp(
+    `(?<!${WORD_CHAR})(?:${alternation})(?!${WORD_CHAR})`,
+    'gu'
+  )
+
+  // One pass with a replacer function: corrected text is never re-scanned and
+  // `$` sequences inside a suggestion are inserted literally.
+  return text.replace(pattern, (match) => replacements.get(match) ?? match)
+}