From b3a1dad64ae2f0d6310fffd6ff325c057d681510 Mon Sep 17 00:00:00 2001 From: Raphaela Heil Date: Mon, 25 Nov 2024 15:18:40 +0100 Subject: [PATCH] [fix] word-level search for line-level ALTO annotations --- src/builder/Digitized.ts | 8 +++++++- src/search/search.ts | 15 +++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/builder/Digitized.ts b/src/builder/Digitized.ts index 68f0fa8..bd3929f 100644 --- a/src/builder/Digitized.ts +++ b/src/builder/Digitized.ts @@ -115,7 +115,13 @@ export async function getAnnotationPage(item: RootItem, text: Text): Promise w.length > 0).length > 1){ + + annotation.setTextGranularity('line'); + }else{ + annotation.setTextGranularity('word'); + } + annotation.setCanvas(canvas, {x: word.x, y: word.y, w: word.width, h: word.height}); annotations.push(annotation); diff --git a/src/search/search.ts b/src/search/search.ts index 846f7f0..ee00862 100644 --- a/src/search/search.ts +++ b/src/search/search.ts @@ -138,10 +138,21 @@ function mapMatches(text: Text, hl: string, matchExact: string | null): SearchRe .split(/[\t\r\n\s]+/) .filter(token => token.length > 0); + let id = 0; + const extractedWords: TextWord[] = []; + for (const word of words) { + const splitAttempt = word.content.split(/\s+/).filter(w => w.length > 0); + if (splitAttempt.length > 1) { + extractedWords.push(...splitAttempt.map(w => { ...word, idx: id++, content: w, isHyphenated: false })); + } else { + extractedWords.push({ ...word, idx: id++ }); + } + } + let tokenIdx = 0, wordIdx = 0, curMatch: SearchResultMatch | null = null; while (tokenIdx < tokens.length) { const token = tokens[tokenIdx]; - const word = wordIdx < words.length ? words[wordIdx] : null; + const word = wordIdx < extractedWords.length ? extractedWords[wordIdx] : null; if (token.startsWith(PRE_TAG)) { if (curMatch == null) @@ -159,7 +170,7 @@ function mapMatches(text: Text, hl: string, matchExact: string | null): SearchRe if (word?.isHyphenated) { wordIdx++; - curMatch?.words.push(words[wordIdx]); + curMatch?.words.push(extractedWords[wordIdx]); } if (curMatch && (!matchExact || matchExact === curMatch.match)) {