From 491f51bc07520317f31416a68a9a221ccade03f9 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 5 May 2026 17:38:25 +0200 Subject: [PATCH 01/20] minimal changes for direct from 4CAT mapping --- js/lib.js | 16 +++++++++++++++- modules/_loader.js | 6 +++++- popup/interface.js | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/js/lib.js b/js/lib.js index 6199d01..1579195 100644 --- a/js/lib.js +++ b/js/lib.js @@ -57,4 +57,18 @@ class MissingMappedField { toString() { return `${this.value}`; } -} \ No newline at end of file +} + +/** + * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects. + * + * 4CAT's importer constructs: + * { ...item.data, __import_meta: { ...everything in item except data } } + * + * Mirroring that here means map_item functions auto-generated from 4CAT + * data sources can run against Zeeschuimer-stored items without translation. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} diff --git a/modules/_loader.js b/modules/_loader.js index 47697ca..afae2d7 100644 --- a/modules/_loader.js +++ b/modules/_loader.js @@ -17,11 +17,15 @@ async function load() { ]; for(const module of imported_modules) { + const mapper = module.map_item + ? (stored_item) => module.map_item(wrap_for_map_item(stored_item)) + : null; + zeeschuimer.register_module( module.MODULE_NAME, module.DOMAIN, module.capture, - module.map_item, + mapper, module.MODULE_ID ? module.MODULE_ID : module.MODULE_DOMAIN, module.overwrite_partial, module.TOOLTIP ? 
module.TOOLTIP : null, diff --git a/popup/interface.js b/popup/interface.js index 5cc7864..1ae60a2 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -619,7 +619,7 @@ async function get_csv_blob(platform) { let csv = []; const module = background.zeeschuimer.modules[platform]; await iterate_items(platform, function(item) { - item = module.mapper(item.data); + item = module.mapper(item); if(csv.length === 0) { csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); } From b06805f711a97fad6e9e3f6615db3a0cf936205e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 10:54:13 +0200 Subject: [PATCH 02/20] give me some standard helper functions --- js/lib.js | 54 +++++++++++++++++++++ modules/tiktok.js | 119 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/js/lib.js b/js/lib.js index 1579195..3b144d2 100644 --- a/js/lib.js +++ b/js/lib.js @@ -72,3 +72,57 @@ function wrap_for_map_item(stored_item) { const { data, ...meta } = stored_item; return { ...data, __import_meta: meta }; } + +/** + * Ports of 4CAT functions commonly used by `map_item` below + */ + +/** + * Strip HTML tags from a string. + * @param {string} html + * @param {boolean} convertNewlines Convert
<br> and </p>
tags to \n before stripping. + * @returns {string} + */ +function strip_tags(html, convertNewlines = true) { + if (!html) return ""; + if (convertNewlines) { + html = html.replace(//gi, "\n").replace(/<\/p>/gi, "

\n"); + html = html.replace(/\n+/g, "\n"); + } + const doc = new DOMParser().parseFromString(html, "text/html"); + return doc.body.textContent || ""; +} + +/** + * Normalize URL encoding for display and linking. + * Decodes percent-encoded URLs and re-encodes the query string canonically. + * Returns the original URL on parse failure. + * @param {string} url + * @returns {string} + */ +function normalize_url_encoding(url) { + if (!url) return ""; + try { + // Iterative decode handles double-encoded inputs. + let decoded = url; + let prev; + do { + prev = decoded; + try { + decoded = decodeURIComponent(prev); + } catch { + decoded = prev; + break; + } + } while (decoded !== prev); + const parsed = new URL(decoded); + // URL.toString() re-encodes the query/fragment correctly. + return parsed.toString(); + } catch { + return url; + } +} + +function formatUtcTimestamp(unixSeconds) { + return new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19); +} \ No newline at end of file diff --git a/modules/tiktok.js b/modules/tiktok.js index 55e6fbf..ea52532 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -1,3 +1,4 @@ + export const MODULE_NAME = 'TikTok (posts)'; export const DOMAIN = 'tiktok.com'; @@ -103,4 +104,120 @@ export function capture(response, source_platform_url, source_url) { } else { return []; } -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND === +// (regenerated from datasources/tiktok/search_tiktok.py) +export function map_item(post) { + // Zeeschuimer metadata + const metadata = post.__import_meta || {}; + + const challenges = Array.isArray(post.challenges) + ? post.challenges.map(ch => ch.title).filter(Boolean) + : []; + + const hashtags = Array.isArray(post.textExtra) + ? post.textExtra + .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName) + .map(e => e.hashtagName) + : []; + + const diversificationLabels = Array.isArray(post.diversificationLabels) + ? 
post.diversificationLabels.join(',') + : ''; + + let user_nickname = ''; + let user_fullname = ''; + let user_thumbnail = ''; + + if (post.author && typeof post.author === 'object') { + user_nickname = post.author.uniqueId || ''; + user_fullname = post.author.nickname || ''; + user_thumbnail = post.author.avatarThumb || ''; + } else if (post.author) { + user_nickname = post.author || ''; + user_fullname = post.nickname || ''; + user_thumbnail = ''; + } + + const thumbnailOptions = []; + + if (post.video && Array.isArray(post.video.shareCover)) { + thumbnailOptions.push(...post.video.shareCover); + } + + if (post.video && post.video.cover) { + thumbnailOptions.push(post.video.cover); + } + + const now = Math.floor(Date.now() / 1000); + + const validThumbnails = thumbnailOptions.filter(url => { + try { + const parsedUrl = new URL(url); + const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0; + return expires >= now; + } catch (e) { + return false; + } + }); + + const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ''; + + return new MappedItem({ + collected_from_url: metadata.source_platform_url + ? normalize_url_encoding(metadata.source_platform_url) + : '', + id: post.id || '', + thread_id: post.id || '', + author: user_nickname, + author_full: user_fullname, + author_followers: post.authorStats?.followerCount ?? '', + author_likes: post.authorStats?.diggCount ?? '', + author_videos: post.authorStats?.videoCount ?? '', + author_avatar: user_thumbnail, + body: post.desc || '', + stickers: Array.isArray(post.stickersOnItem) + ? post.stickersOnItem + .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : '')) + .filter(Boolean) + .join('') + : '', + timestamp: post.createTime + ? formatUtcTimestamp(parseInt(post.createTime, 10)) + : '', + unix_timestamp: post.createTime ? 
parseInt(post.createTime, 10) : 0, + is_duet: + post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0' + ? 'yes' + : 'no', + is_ad: post.isAd ? 'yes' : 'no', + is_paid_partnership: post.adAuthorization ? 'yes' : 'no', + is_sensitive: post.maskType === 3 ? 'yes' : 'no', + is_photosensitive: post.maskType === 4 ? 'yes' : 'no', + music_name: post.music?.title ?? '', + music_id: post.music?.id ?? '', + music_url: post.music?.playUrl ?? '', + music_thumbnail: post.music?.coverLarge ?? '', + music_author: post.music?.authorName ?? '', + video_url: post.video?.downloadAddr ?? '', + tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`, + thumbnail_url: thumbnail_url, + likes: post.stats?.diggCount ?? '', + comments: post.stats?.commentCount ?? '', + shares: post.stats?.shareCount ?? '', + plays: post.stats?.playCount ?? '', + hashtags: hashtags.join(','), + challenges: challenges.join(','), + diversification_labels: diversificationLabels, + location_created: post.locationCreated ?? '', + effects: Array.isArray(post.effectStickers) + ? post.effectStickers.map(e => e.name).join(',') + : '', + warning: Array.isArray(post.warnInfo) + ? 
post.warnInfo.map(w => w.text).join(',') + : '', + }); +} +// === end auto-generated === +// === end auto-generated === From f9a2405a0703bcadfdee7492ccd57af12917733e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 13:07:43 +0200 Subject: [PATCH 03/20] fix csv export --- popup/interface.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/popup/interface.js b/popup/interface.js index 1ae60a2..8afd1b1 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -595,7 +595,7 @@ const CSV_ESCAPED = `"${CSV_SEPARATOR}\n`; function csv_escape(value) { value = String(value); let needs_escape = false; - for(const character in CSV_ESCAPED) { + for(const character of CSV_ESCAPED) { if(value.indexOf(character) >= 0) { needs_escape = true; } From 2f084b9352c25a1034429bb05d8390b5961d35ef Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:19:18 +0200 Subject: [PATCH 04/20] another to CSV fix --- popup/interface.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/popup/interface.js b/popup/interface.js index 8afd1b1..94fff77 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -626,7 +626,7 @@ async function get_csv_blob(platform) { csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); }) - return new Blob([csv], {type: 'text/csv'}); + return new Blob(csv, {type: 'text/csv'}); } /** From d7870426c7765a6107c47c4fff062f5643725167 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:25:42 +0200 Subject: [PATCH 05/20] revert tiktok (mistaken test result commited) --- modules/tiktok.js | 119 +--------------------------------------------- 1 file changed, 1 insertion(+), 118 deletions(-) diff --git a/modules/tiktok.js b/modules/tiktok.js index ea52532..55e6fbf 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -1,4 +1,3 @@ - export const MODULE_NAME = 'TikTok (posts)'; export const DOMAIN = 'tiktok.com'; @@ -104,120 +103,4 @@ export function capture(response, 
source_platform_url, source_url) { } else { return []; } -} - -// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND === -// (regenerated from datasources/tiktok/search_tiktok.py) -export function map_item(post) { - // Zeeschuimer metadata - const metadata = post.__import_meta || {}; - - const challenges = Array.isArray(post.challenges) - ? post.challenges.map(ch => ch.title).filter(Boolean) - : []; - - const hashtags = Array.isArray(post.textExtra) - ? post.textExtra - .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName) - .map(e => e.hashtagName) - : []; - - const diversificationLabels = Array.isArray(post.diversificationLabels) - ? post.diversificationLabels.join(',') - : ''; - - let user_nickname = ''; - let user_fullname = ''; - let user_thumbnail = ''; - - if (post.author && typeof post.author === 'object') { - user_nickname = post.author.uniqueId || ''; - user_fullname = post.author.nickname || ''; - user_thumbnail = post.author.avatarThumb || ''; - } else if (post.author) { - user_nickname = post.author || ''; - user_fullname = post.nickname || ''; - user_thumbnail = ''; - } - - const thumbnailOptions = []; - - if (post.video && Array.isArray(post.video.shareCover)) { - thumbnailOptions.push(...post.video.shareCover); - } - - if (post.video && post.video.cover) { - thumbnailOptions.push(post.video.cover); - } - - const now = Math.floor(Date.now() / 1000); - - const validThumbnails = thumbnailOptions.filter(url => { - try { - const parsedUrl = new URL(url); - const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0; - return expires >= now; - } catch (e) { - return false; - } - }); - - const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ''; - - return new MappedItem({ - collected_from_url: metadata.source_platform_url - ? 
normalize_url_encoding(metadata.source_platform_url) - : '', - id: post.id || '', - thread_id: post.id || '', - author: user_nickname, - author_full: user_fullname, - author_followers: post.authorStats?.followerCount ?? '', - author_likes: post.authorStats?.diggCount ?? '', - author_videos: post.authorStats?.videoCount ?? '', - author_avatar: user_thumbnail, - body: post.desc || '', - stickers: Array.isArray(post.stickersOnItem) - ? post.stickersOnItem - .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : '')) - .filter(Boolean) - .join('') - : '', - timestamp: post.createTime - ? formatUtcTimestamp(parseInt(post.createTime, 10)) - : '', - unix_timestamp: post.createTime ? parseInt(post.createTime, 10) : 0, - is_duet: - post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0' - ? 'yes' - : 'no', - is_ad: post.isAd ? 'yes' : 'no', - is_paid_partnership: post.adAuthorization ? 'yes' : 'no', - is_sensitive: post.maskType === 3 ? 'yes' : 'no', - is_photosensitive: post.maskType === 4 ? 'yes' : 'no', - music_name: post.music?.title ?? '', - music_id: post.music?.id ?? '', - music_url: post.music?.playUrl ?? '', - music_thumbnail: post.music?.coverLarge ?? '', - music_author: post.music?.authorName ?? '', - video_url: post.video?.downloadAddr ?? '', - tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`, - thumbnail_url: thumbnail_url, - likes: post.stats?.diggCount ?? '', - comments: post.stats?.commentCount ?? '', - shares: post.stats?.shareCount ?? '', - plays: post.stats?.playCount ?? '', - hashtags: hashtags.join(','), - challenges: challenges.join(','), - diversification_labels: diversificationLabels, - location_created: post.locationCreated ?? '', - effects: Array.isArray(post.effectStickers) - ? post.effectStickers.map(e => e.name).join(',') - : '', - warning: Array.isArray(post.warnInfo) - ? 
post.warnInfo.map(w => w.text).join(',') - : '', - }); -} -// === end auto-generated === -// === end auto-generated === +} \ No newline at end of file From a9fba9a9caee86d8799ee35d11374fbb602c9a41 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:57:45 +0200 Subject: [PATCH 06/20] clean up UI (make download menu button) --- popup/interface.html | 32 +++++++++++++++++++++- popup/interface.js | 63 +++++++++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index 356f2b5..e9d9b3f 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -215,10 +215,39 @@ text-indent: 2em; } - td > button:not(:last-child) { + td > button:not(:last-child), + td > .download-menu:not(:last-child) { margin-right: 0.25em; } + /* download chooser: trigger is a regular button (inherits all button + styles); */ + .download-menu { + display: inline-block; + position: relative; + } + + /* :not([hidden]) so the explicit display:flex doesn't override the + [hidden] attribute's default display:none */ + .download-menu > .download-options:not([hidden]) { + position: absolute; + top: calc(100% + 0.25em); + left: 0; + display: flex; + flex-direction: column; + gap: 0.25em; + padding: 0.25em; + background: var(--neutral-contrast-alt); + border: 2px solid var(--neutral-contrast); + border-radius: 0.5em; + z-index: 10; + white-space: nowrap; + } + + .download-menu > .download-options > button { + margin: 0; + } + input:not([type=checkbox]):not([type=radio]), button { background: var(--neutral-contrast-alt); color: var(--accent); @@ -302,6 +331,7 @@ .toggle-switch input { -moz-appearance: none; + appearance: none; opacity: 0; } diff --git a/popup/interface.js b/popup/interface.js index 94fff77..3b8aaa9 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -119,7 +119,7 @@ async function set_4cat_url(e) { function activate_buttons() { document.querySelectorAll("td 
button").forEach(button => { let current = button.disabled; - let items = parseInt(button.parentNode.parentNode.querySelector('.num-items').innerText); + let items = parseInt(button.closest('tr').querySelector('.num-items').innerText); let new_status = current; if(button.classList.contains('upload-to-4cat') && !is_uploading) { @@ -132,7 +132,7 @@ function activate_buttons() { button.setAttribute('title', ''); } - } else if(button.classList.contains('download-ndjson') || button.classList.contains('reset') || button.classList.contains('download-csv')) { + } else if(button.classList.contains('download-format') || button.classList.contains('download-menu-trigger') || button.classList.contains('reset')) { new_status = !(items > 0); } @@ -234,21 +234,32 @@ async function get_stats() { let actions = createElement("td"); const clear_button = createElement("button", {"data-platform": platform, "class": "reset"}, "Delete"); - const csv_button = createElement("button", {"data-platform": platform, 'class': 'download-csv'}, '.csv'); - const download_button = createElement("button", { - "data-platform": platform, - "class": "download-ndjson" - }, ".ndjson"); + + // Render the download chooser as a button + popover panel, + // (even when only NDJSON is available as visual consistent) + const download_widget = createElement("span", {"class": "download-menu"}); + const trigger = createElement("button", { + "data-platform": platform, "class": "download-menu-trigger" + }, "Download"); + const options = createElement("div", {"class": "download-options", "hidden": ""}); + options.appendChild(createElement("button", { + "data-platform": platform, "data-format": "ndjson", "class": "download-format" + }, ".ndjson (original)")); + if(module.mapper) { + options.appendChild(createElement("button", { + "data-platform": platform, "data-format": "csv", "class": "download-format" + }, ".csv")); + } + download_widget.appendChild(trigger); + download_widget.appendChild(options); + const 
fourcat_button = createElement("button", { "data-platform": platform, "class": "upload-to-4cat", }, "to 4CAT"); actions.appendChild(clear_button); - if(module.mapper) { - actions.appendChild(csv_button); - } - actions.appendChild(download_button); + actions.appendChild(download_widget); actions.appendChild(fourcat_button); row.appendChild(actions); @@ -317,22 +328,38 @@ async function get_stats() { async function button_handler(event) { let status = document.getElementById('upload-status'); - if (event.target.matches('.reset')) { + // Close any open download-format popovers when clicking outside their host. + // Skip if the click is on a trigger or inside an options panel + if(!event.target.matches('.download-menu-trigger') && !event.target.closest('.download-options')) { + document.querySelectorAll('.download-options:not([hidden])').forEach(el => el.hidden = true); + } + + if (event.target.matches('.download-menu-trigger')) { + const widget = event.target.closest('.download-menu'); + const options = widget.querySelector('.download-options'); + const opening = options.hidden; + // close any other menus before opening this one + document.querySelectorAll('.download-options:not([hidden])').forEach(el => { + if(el !== options) el.hidden = true; + }); + options.hidden = !opening; + + } else if (event.target.matches('.reset')) { let platform = event.target.getAttribute('data-platform'); await background.db.items.where("source_platform").equals(platform).delete(); } else if (event.target.matches('.reset-all')) { await background.db.items.clear(); - } else if (event.target.matches('.download-ndjson') || event.target.matches('.download-csv')) { - const blobber = event.target.matches('.download-ndjson') ? get_ndjson_blob : get_csv_blob; - const extension = event.target.matches('.download-ndjson') ? 'ndjson' : 'csv'; + } else if (event.target.matches('.download-format')) { + const format = event.target.getAttribute('data-format'); + const blobber = format === 'csv' ? 
get_csv_blob : get_ndjson_blob; + const extension = format; let platform = event.target.getAttribute('data-platform'); let date = new Date(); event.target.classList.add('loading'); - //let blob = await download_blob(platform, 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.ndjson'); let blob = await blobber(platform); let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension; const downloadUrl = window.URL.createObjectURL(blob); @@ -345,6 +372,10 @@ async function button_handler(event) { event.target.classList.remove('loading'); + // collapse the popover menu after the download fires + const widget = event.target.closest('.download-menu'); + if(widget) widget.querySelector('.download-options').hidden = true; + } else if (event.target.matches('.upload-to-4cat')) { let platform = event.target.getAttribute('data-platform'); status.innerText = 'Creating data file for uploading...'; From 0980a56f0ba6872884bfc1e891efc2cb9f4e4c33 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 16:13:52 +0200 Subject: [PATCH 07/20] testing is hard in JS --- docs/test-plan.md | 162 ++++++++++++++++++++++ modules/package.json | 3 + tests/__pycache__/test.cpython-39.pyc | Bin 0 -> 7345 bytes tests/duplicate-behavior.test.js | 3 +- tests/{jest.config.js => jest.config.cjs} | 3 +- tests/map_item.test.js | 130 +++++++++++++++++ tests/package.json | 5 +- tests/setup-globals.cjs | 41 ++++++ 8 files changed, 343 insertions(+), 4 deletions(-) create mode 100644 docs/test-plan.md create mode 100644 modules/package.json create mode 100644 tests/__pycache__/test.cpython-39.pyc rename tests/{jest.config.js => jest.config.cjs} (64%) create mode 100644 tests/map_item.test.js create mode 100644 tests/setup-globals.cjs diff --git a/docs/test-plan.md b/docs/test-plan.md new file mode 100644 index 0000000..249a7e0 --- /dev/null +++ b/docs/test-plan.md @@ -0,0 +1,162 @@ +# Selenium 
Test Harness — Improvement Plan + +Date: 2026-04-30 + +Overview + +This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to: + +- Make profile handling reliable and reusable (so logged-in sessions persist across runs). +- Preserve and export captured data per platform for offline analysis and for passing to 4CAT. +- Add optional automated upload to a 4CAT instance for mapping/validation tests. +- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns). +- Improve robustness, error handling, and machine-readable results. + +Scope + +All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB). + +Phases & Changes + +Phase 1 — Profile management + +- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data. +- Changes: + - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running. + - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root. + - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run). 
+ +Implementation note (copytree ignore example): + +```python +def _profile_ignore(root, names): + # Only ignore these entries in the root profile dir + if os.path.abspath(root) == os.path.abspath(profile_dir): + return {"storage", "extensions", "signedInUser.json"} + return set() + +shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore) +``` + +Phase 2 — Data preservation & export + +- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests. +- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform. +- Changes: + - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`). + - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`. + - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL). + +Execute_async_script pattern (example): + +```python +script = ''' +const cb = arguments[0]; +background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)}))); +''' +items_json = driver.execute_async_script(script) +items = json.loads(items_json) +``` + +Phase 3 — 4CAT integration (optional) + +- Problem: mapping tests live in 4CAT and need NDJSON input. +- Changes: + - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. + - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). 
+ - Do not fail the test run on 4CAT errors — print status and continue. + +Example upload with `requests`: + +```python +import requests +with open(ndjson_path, 'rb') as f: + headers = { + 'X-Zeeschuimer-Platform': platform, + 'Authorization': f'Bearer {fourcat_key}' + } + r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) + # check r.status_code and r.text for details +``` + +Phase 4 — Interactive controls & popup dismissals + +- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures. +- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options. +- Changes: + - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning). + - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example: + +```json +"dismiss-selectors": ["button.cookie-accept", ".modal .close"] +``` + + - Add per-URL `timeout` (page load timeout override). + +Phase 5 — Runner robustness & reporting + +- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results. +- Changes: + - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue. + - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run. + - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted). + - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`). 
+ +tests.json schema additions + +- Per-URL optional fields: + - `dismiss-selectors`: array of CSS selectors to click after page load + - `timeout`: numeric page load timeout seconds for this URL + - `extra-wait`: per-URL additional wait seconds + +CLI flags (summary) + +- `--profiledir PATH` — explicit profile path (existing) +- `--profile-name NAME` — choose Firefox profile by display name +- `--save-profile PATH` — persist the copied profile for reuse +- `--no-cleanup` — keep `.temp-profile` +- `--export-dir PATH` — where to write NDJSON exports +- `--no-reset` — do not click `reset-all` between URLs +- `--4cat-url URL` — base URL for 4CAT server +- `--4cat-key KEY` — API key for 4CAT uploads +- `--4cat-per-url` — upload per URL instead of per platform (optional) +- `--no-interactive` — disable pausing (default is to pause per-platform) +- `--pause-before-url` — pause before each URL +- `--pause-on-fail` — pause when a test fails +- `--extra-wait N` — add N seconds to every URL wait +- `--screenshot-dir PATH` — save screenshots on fail/warning +- `--results-file PATH` — write machine-readable results JSON +- `--resume-from PLATFORM` — resume a run from a platform + +Verification checklist + +1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items. +2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`. +3. Run with default interactive behavior and confirm one pause per platform. +4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts. +5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`. + +Implementation steps (recommended order) + +1. Docs and small fixes (this document + tests.json typo fix). +2. 
Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection). +3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write. +4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement. +5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots). +6. 4CAT upload integration (optional, requires confirmation of auth header). + +Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs. + +Open questions / confirmations needed + +- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. +- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) + +Next steps + +- I have created a matching TODO list in the session tracker and written this document to `docs/test-plan.md`. +- If you want, I can start implementing Phase 1 (profile management) in `tests/test.py` now and submit incremental changes. 
+ +--- + +Requested file: `docs/test-plan.md` diff --git a/modules/package.json b/modules/package.json new file mode 100644 index 0000000..3dbc1ca --- /dev/null +++ b/modules/package.json @@ -0,0 +1,3 @@ +{ + "type": "module" +} diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..745e2b4aaad921a459372bb50b39980c50a68136 GIT binary patch literal 7345 zcmai3-E$k)b>CeqKnNl!lATnGrZeqKUpk()>F?ZK zfYgUo?Cjn9aqc#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT93{IL20jve)WFJ2Ur;?Rb^p|fd`7tSsMTX) znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq zmQDT6@3C5>zBh(m30UiHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@ z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k zj*D~U-e*Pih zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQ&#B%NzodA)+be-L|A4>0 zWz=u5dH#Wzhi?br&a$LC;2bI__ahHBlP7u zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6 zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)Gy*k-{)9n^1&QO@`RU##kU>0t^p zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l z>rY58EC28q#XtPNGW)a-Z*K0AxoAtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)% z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R= ziv{I}#s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1 zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv| z^_Qc;LZK=B6~`6akf++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@ zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU| z5Hr}>XxN27G;Il~@l)1v+e%J1qd^Li`JS-EI}l6-EOYzp zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uEo 
z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH z^{(DlgSgobgi6%csq+FF3M|+ScYVwDnQ_ zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2 zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyoqD0Q_<1q z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0 z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt( zS>Gm|S1G3ig=oK3^jeLA13wEOA;ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_ zYq`v_wydIt34U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF zZnhV>!SMIP4HA6vf=ZxszUj7l81M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5OhrELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8 zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+ z@gN4*Eki55$ABHD_UHCEaPat|QKoE# z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!? 
z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38 zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU) literal 0 HcmV?d00001 diff --git a/tests/duplicate-behavior.test.js b/tests/duplicate-behavior.test.js index 031f663..9f0662b 100644 --- a/tests/duplicate-behavior.test.js +++ b/tests/duplicate-behavior.test.js @@ -5,8 +5,9 @@ * update or merge behaviors to duplicates across navigation boundaries. */ +import 'fake-indexeddb/auto'; + let Dexie; -require('fake-indexeddb/auto'); // Mock browser extension APIs global.browser = { diff --git a/tests/jest.config.js b/tests/jest.config.cjs similarity index 64% rename from tests/jest.config.js rename to tests/jest.config.cjs index 7dd5b02..ea72b10 100644 --- a/tests/jest.config.js +++ b/tests/jest.config.cjs @@ -3,6 +3,7 @@ module.exports = { testMatch: ['**/*.test.js'], transform: {}, moduleFileExtensions: ['js', 'json'], - collectCoverageFrom: ['duplicate-behavior.test.js'], + collectCoverageFrom: ['*.test.js'], + setupFiles: ['/setup-globals.cjs'], verbose: true }; diff --git a/tests/map_item.test.js b/tests/map_item.test.js new file mode 100644 index 0000000..9dee6e8 --- /dev/null +++ b/tests/map_item.test.js @@ -0,0 +1,130 @@ +/** + * Auto-discovery test driver for module `map_item` functions. + * + * Convention: + * tests/fixtures//*.ndjson + * + * matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js). + * Each .ndjson line is one Zeeschuimer-stored item exported from the popup. 
+ * + * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer + * presents items to a map_item function, then run through the module's + * map_item. Tests assert: function returns a non-null object, and any fields + * listed in REQUIRED_NON_EMPTY for that module are present and non-empty. + */ + +import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { spawnSync } from 'node:child_process'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +/** + * Local mirror of wrap_for_map_item from js/lib.js. + * + * lib.js is loaded by the browser as a plain script (it defines globals + * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be + * imported from Node. The wrap is three trivial lines with no dependencies + * — duplicating it here is cheaper than restructuring lib.js into a module. + * If lib.js's wrap_for_map_item ever gains real logic, this needs to track. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const FIXTURE_ROOT = join(__dirname, 'fixtures'); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +/** + * Pre-validate module syntax before dynamic import. + * + * `await import()` on a module with a syntax error throws inside V8's module + * linker in a way Jest's experimental-vm-modules can't always recover from + * (worker retry loop or Node process exit). Running `node --check` first + * gives us a clean error string we can fail the test with. 
+ */ +function check_module_syntax(module_name) { + const module_path = join(MODULES_ROOT, `${module_name}.js`); + const result = spawnSync(process.execPath, ['--check', module_path], { + encoding: 'utf8', + }); + if (result.status === 0) return null; + return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); +} + +const REQUIRED_NON_EMPTY = { + tiktok: ['id', 'author', 'unix_timestamp'], +}; + +function list_module_dirs() { + if (!existsSync(FIXTURE_ROOT)) return []; + return readdirSync(FIXTURE_ROOT).filter(name => { + try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } + catch { return false; } + }); +} + +const module_dirs = list_module_dirs(); +let total_fixtures = 0; + +for (const module_name of module_dirs) { + const fixture_dir = join(FIXTURE_ROOT, module_name); + const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + + if (fixture_files.length === 0) continue; + total_fixtures += fixture_files.length; + + describe(`map_item: ${module_name}`, () => { + let map_item; + let import_error; + + beforeAll(async () => { + const syntax_error = check_module_syntax(module_name); + if (syntax_error) { + import_error = new Error(`syntax error:\n${syntax_error}`); + return; + } + try { + const mod = await import(`../modules/${module_name}.js`); + map_item = mod.map_item; + if (typeof map_item !== 'function') { + import_error = new Error(`modules/${module_name}.js does not export a map_item function`); + } + } catch (e) { + import_error = e; + } + }); + + for (const fixture_file of fixture_files) { + const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') + .split('\n') + .filter(line => line.trim().length > 0); + + describe(fixture_file, () => { + lines.forEach((line, i) => { + test(`item ${i} maps without throwing`, () => { + if (import_error) { + throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`); + } + const stored_item = JSON.parse(line); + const mapped 
= map_item(wrap_for_map_item(stored_item)); + expect(mapped).not.toBeNull(); + expect(typeof mapped).toBe('object'); + for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) { + expect(mapped[field]).toBeDefined(); + expect(mapped[field]).not.toBe(''); + expect(mapped[field]).not.toBeNull(); + } + }); + }); + }); + } + }); +} + +if (total_fixtures === 0) { + describe('map_item', () => { + test.skip('no fixtures found under tests/fixtures//*.ndjson', () => {}); + }); +} diff --git a/tests/package.json b/tests/package.json index dc3654c..6dd35fb 100644 --- a/tests/package.json +++ b/tests/package.json @@ -2,9 +2,10 @@ "name": "zeeschuimer-db-tests", "version": "1.0.0", "description": "Unit tests for Zeeschuimer duplicate handling logic", + "type": "module", "scripts": { - "test": "jest", - "test:watch": "jest --watch" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch" }, "devDependencies": { "dexie": "^3.2.4", diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs new file mode 100644 index 0000000..a19fb09 --- /dev/null +++ b/tests/setup-globals.cjs @@ -0,0 +1,41 @@ +/** + * Make js/lib.js's helpers available as globals inside the Jest test + * environment, mirroring how the browser sees them after the manifest + * loads lib.js as a plain script. + * + * map_item bodies reference these as free identifiers (MappedItem, + * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this + * shim they'd hit ReferenceError as soon as a test invokes map_item. + * + * Approach: read lib.js, wrap it in a new Function() body that returns the + * named helpers, call the function, and assign the returned object onto + * globalThis. (Earlier attempt with vm.runInThisContext failed because in + * the jsdom env the vm context's global differs from jsdom's window.) + * + * If a new helper is added to lib.js, append its name to EXPOSED_NAMES. 
+ */ + +const fs = require('node:fs'); +const path = require('node:path'); + +const EXPOSED_NAMES = [ + 'traverse_data', + 'MappedItem', + 'MissingMappedField', + 'wrap_for_map_item', + 'strip_tags', + 'normalize_url_encoding', + 'formatUtcTimestamp', +]; + +const lib_source = fs.readFileSync( + path.join(__dirname, '..', 'js', 'lib.js'), + 'utf8', +); + +const factory = new Function(` +${lib_source} +return { ${EXPOSED_NAMES.join(', ')} }; +`); + +Object.assign(globalThis, factory()); From 46b96c77ffd45f465f90880915e1f6d2836bd87e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 16:25:56 +0200 Subject: [PATCH 08/20] add fixtures folder and README.md to explain what I did --- tests/fixtures/.gitignore | 5 +++++ tests/fixtures/README.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 tests/fixtures/.gitignore create mode 100644 tests/fixtures/README.md diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore new file mode 100644 index 0000000..8e89a83 --- /dev/null +++ b/tests/fixtures/.gitignore @@ -0,0 +1,5 @@ +# Ignore everything in this directory +* +# Except these files +!.gitignore +!README.md \ No newline at end of file diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md new file mode 100644 index 0000000..d24fe06 --- /dev/null +++ b/tests/fixtures/README.md @@ -0,0 +1,29 @@ +# Test fixtures for `map_item` + +Real captured items used to exercise each module's auto-generated `map_item` +function. + +## Layout + +``` +tests/fixtures/ + / + .ndjson + .ndjson +``` + +`` matches the filename in `modules/` without `.js` — +e.g. `tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`. +You can drop multiple `.ndjson` files in a module folder; each gets its own +`describe` block and each line becomes its own `test`. + +Filenames are free-form — the auto-export filename from the popup +(`zeeschuimer-export--.ndjson`) is fine. 
+ +## Privacy / committing + +These files contain real captured platform data — usernames, post +content, URLs, sometimes images and other PII. + +If we want to create test exports or anonymize real exports, add them to +.gitignore. \ No newline at end of file From 487b5b618e4a989cbfca7dbfe2b30b1e78dc62ad Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 15:53:22 +0200 Subject: [PATCH 09/20] add MapItemException --- js/lib.js | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/js/lib.js b/js/lib.js index 3b144d2..e38430e 100644 --- a/js/lib.js +++ b/js/lib.js @@ -59,6 +59,19 @@ class MissingMappedField { } } +/** + * Raised by `map_item` to signal a known mapping failure. + * + * Mirrors 4CAT's MapItemException: callers should catch it, skip the item, + * and warn the user that the platform's format may have shifted. + */ +class MapItemException extends Error { + constructor(message) { + super(message); + this.name = "MapItemException"; + } +} + /** + * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects. 
* From b6f487dbfa017a79207726f04f059078aaf4c4b5 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 15:56:14 +0200 Subject: [PATCH 10/20] make a warning pop up --- popup/interface.html | 42 ++++++++++++++++++++++++++++++ popup/interface.js | 62 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 97 insertions(+), 7 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index e9d9b3f..0570e40 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -303,6 +303,42 @@ text-align: center; } + #csv-warning { + position: fixed; + inset: 0; + background: rgba(60, 60, 59, 0.55); + display: flex; + align-items: center; + justify-content: center; + z-index: 1000; + } + + #csv-warning[hidden] { + display: none; + } + + #csv-warning .csv-warning-content { + background: var(--accent); + color: var(--neutral-contrast); + border: 2px solid var(--accent-alt); + border-radius: 6px; + padding: 1.25em 1.25em 1em 1.25em; + max-width: 24em; + text-align: center; + box-shadow: 0 0 20px var(--neutral-contrast); + } + + #csv-warning .csv-warning-content p { + margin: 0 0 1em 0; + line-height: 1.4; + } + + #csv-warning .dismiss-csv-warning { + display: block; + margin: 0 auto; + padding: 0.3em 1.25em; + } + .tooltippable:not(a):not(button) { display: inline-block; background: var(--neutral-contrast); @@ -409,6 +445,12 @@ +

Zeeschuimer

diff --git a/popup/interface.js b/popup/interface.js index 3b8aaa9..c56375a 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -351,16 +351,29 @@ async function button_handler(event) { } else if (event.target.matches('.reset-all')) { await background.db.items.clear(); + } else if (event.target.matches('.dismiss-csv-warning')) { + const warning = document.getElementById('csv-warning'); + if(warning) warning.hidden = true; + } else if (event.target.matches('.download-format')) { const format = event.target.getAttribute('data-format'); - const blobber = format === 'csv' ? get_csv_blob : get_ndjson_blob; const extension = format; let platform = event.target.getAttribute('data-platform'); let date = new Date(); event.target.classList.add('loading'); - let blob = await blobber(platform); + let blob; + if(format === 'csv') { + const result = await get_csv_blob(platform); + blob = result.blob; + if(result.skipped > 0) { + console.warn(`Zeeschuimer: skipped ${result.skipped} ${platform} item(s) during CSV export. First reason: ${result.firstReason}`); + show_csv_warning(platform, result.skipped); + } + } else { + blob = await get_ndjson_blob(platform); + } let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension; const downloadUrl = window.URL.createObjectURL(blob); const downloadId = await browser.downloads.download({ @@ -637,27 +650,62 @@ function csv_escape(value) { return value; } +/** + * Surface a CSV-export skip warning in the popup. + * + * Shown when the platform's `map_item` raised MapItemException for one or + * more items — typically the platform's response shape has shifted and the + * mapper no longer recognises every field. The user is steered to the + * .ndjson export, which is unaffected because it skips the mapper entirely. 
+ */ +function show_csv_warning(platform, skipped) { + const warning = document.getElementById('csv-warning'); + if(!warning) return; + const message = warning.querySelector('p'); + message.innerText = `Skipped ${skipped} ${platform} item${skipped === 1 ? '' : 's'} in the CSV export — the platform's data format may have changed. Use the .ndjson export to get the full dataset until Zeeschuimer is updated.`; + warning.hidden = false; +} + /** * Get a CSV dump of items * * Returns a Blob with all items in it as CSV rows, mapped via the module's * registered mapper function. A header row is included. * + * Items whose mapper raises MapItemException are skipped and counted; any + * other error propagates. Skip count and the first skip reason are returned + * alongside the blob so the caller can warn the user. Just like 4CAT! + * * @param platform - * @returns {Promise} + * @returns {Promise<{blob: Blob, skipped: number, firstReason: string|null}>} */ async function get_csv_blob(platform) { let csv = []; + let skipped = 0; + let firstReason = null; const module = background.zeeschuimer.modules[platform]; await iterate_items(platform, function(item) { - item = module.mapper(item); + let mapped; + try { + mapped = module.mapper(item); + } catch(e) { + // More JS fun: Check tag rather than `instanceof`. + // Actual Exception lives in some other realm (where modules and lib.js live), and cross-realm + // `instanceof` is unreliable under Firefox's wrappers. 
+ if(e && e.name === 'MapItemException') { + skipped++; + if(firstReason === null) firstReason = e.message; + return; + } + throw e; + } if(csv.length === 0) { - csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); + csv.push(Object.keys(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); } - csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); + csv.push(Object.values(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); }) - return new Blob(csv, {type: 'text/csv'}); + return {blob: new Blob(csv, {type: 'text/csv'}), skipped, firstReason}; } /** From f28e310c8893bb49ac535d33cc94089e8d0686b2 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 16:42:19 +0200 Subject: [PATCH 11/20] add MapItemException --- tests/setup-globals.cjs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index a19fb09..4f54e34 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -22,6 +22,7 @@ const EXPOSED_NAMES = [ 'traverse_data', 'MappedItem', 'MissingMappedField', + 'MapItemException', 'wrap_for_map_item', 'strip_tags', 'normalize_url_encoding', From 5baff31ae49167d215a56cf16ead326b22d975f3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 15:16:06 +0200 Subject: [PATCH 12/20] add env variables for tests (to connect to 4CAT) --- .gitignore | 2 ++ tests/.env.example | 9 +++++++++ tests/package-lock.json | 14 ++++++++++++++ tests/package.json | 4 +++- 4 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 tests/.env.example diff --git a/.gitignore b/.gitignore index 6cf9326..fea65f3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ # Testing artefacts .temp-profile +tests/.env +tests/.env.local # logs geckodriver.log diff --git a/tests/.env.example b/tests/.env.example new file mode 100644 index 0000000..2e021bb --- /dev/null +++ b/tests/.env.example @@ -0,0 +1,9 @@ +# 4CAT API config for the map_item 
comparison tests. +# Copy this file to .env in this directory and fill in real values. +# .env is gitignored; .env.example is the committed template. + +# Base URL of the 4CAT instance to hit. No trailing slash. +FOURCAT_URL=http://localhost + +# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user. +FOURCAT_API_KEY=your-api-key-here diff --git a/tests/package-lock.json b/tests/package-lock.json index cc8f457..d055883 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -9,6 +9,7 @@ "version": "1.0.0", "devDependencies": { "dexie": "^3.2.4", + "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0" @@ -1758,6 +1759,19 @@ "node": ">=12" } }, + "node_modules/dotenv": { + "version": "16.6.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", + "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", diff --git a/tests/package.json b/tests/package.json index 6dd35fb..333564a 100644 --- a/tests/package.json +++ b/tests/package.json @@ -5,10 +5,12 @@ "type": "module", "scripts": { "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", - "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch" + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch", + "probe": "node probe-4cat.mjs" }, "devDependencies": { "dexie": "^3.2.4", + "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0" From 6a8ce3870f4e0b6c050d68573d8affa4cc46e37b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 15:16:34 +0200 Subject: [PATCH 13/20] 
mirror 4CAT API missing value --- js/lib.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/js/lib.js b/js/lib.js index e38430e..c618a6a 100644 --- a/js/lib.js +++ b/js/lib.js @@ -57,6 +57,12 @@ class MissingMappedField { toString() { return `${this.value}`; } + + // Mirror 4CAT's API serialization so JSON.stringify produces the same + // tagged form on both sides. See docs/4cat-map-item-api.md. + toJSON() { + return { __missing: true, value: this.value }; + } } /** From 0c3140376ebd6e37cb1706fc48a105168d84d089 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:41:52 +0200 Subject: [PATCH 14/20] test the 4cat API endpoint --- tests/probe-4cat.mjs | 140 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/probe-4cat.mjs diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs new file mode 100644 index 0000000..0bf4e4d --- /dev/null +++ b/tests/probe-4cat.mjs @@ -0,0 +1,140 @@ +/** + * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item. + * + * Usage: + * node probe-4cat.mjs [] [--index N] + * + * is the Zeeschuimer module filename without `.js` (e.g. + * "tiktok", "pinterest"). If is omitted, the first + * .ndjson in tests/fixtures// is used. --index selects which + * line of the fixture to send (default 0). + * + * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY. 
+ */ + +import 'dotenv/config'; +import { readFileSync, existsSync, readdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); +const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; + +if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') { + console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env'); + console.error(' (copy tests/.env.example to tests/.env and fill in real values)'); + process.exit(1); +} + +const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); +const ID_MAP = existsSync(ID_MAP_PATH) + ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) + : {}; + +function auth_headers() { + return { 'Authorization': `${FOURCAT_API_KEY}` }; +} + +async function list_datasources() { + const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() }); + if (!res.ok) { + throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`); + } + const body = await res.json(); + return body.datasources ?? 
[]; +} + +async function map_item(datasource_id, item) { + const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { + method: 'POST', + headers: { ...auth_headers(), 'Content-Type': 'application/json' }, + body: JSON.stringify({ item }), + }); + const text = await res.text(); + let body; + try { body = JSON.parse(text); } catch { body = { raw: text }; } + return { status_code: res.status, body }; +} + +function parse_args(argv) { + const args = { module: null, fixture: null, index: 0 }; + const positional = []; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === '--index') { + args.index = parseInt(argv[++i], 10); + } else if (argv[i].startsWith('--index=')) { + args.index = parseInt(argv[i].split('=')[1], 10); + } else { + positional.push(argv[i]); + } + } + args.module = positional[0]; + args.fixture = positional[1]; + return args; +} + +async function main() { + const args = parse_args(process.argv); + if (!args.module) { + console.error('Usage: node probe-4cat.mjs [] [--index N]'); + process.exit(1); + } + + const datasource_id = ID_MAP[args.module] ?? args.module; + const fixture_dir = join(__dirname, 'fixtures', args.module); + + if (!existsSync(fixture_dir)) { + console.error(`error: no fixture dir at ${fixture_dir}`); + process.exit(1); + } + + const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + if (candidates.length === 0) { + console.error(`error: no .ndjson fixtures under ${fixture_dir}`); + process.exit(1); + } + const fixture_name = args.fixture ?? 
candidates[0]; + const fixture_path = join(fixture_dir, fixture_name); + if (!existsSync(fixture_path)) { + console.error(`error: fixture ${fixture_path} not found`); + process.exit(1); + } + + const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0); + if (args.index >= lines.length) { + console.error(`error: --index ${args.index} but fixture has ${lines.length} items`); + process.exit(1); + } + const item = JSON.parse(lines[args.index]); + + console.log(`Module: ${args.module}`); + console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`); + console.log(`URL: ${FOURCAT_URL}/api/map-item/${datasource_id}/`); + console.log(`Fixture: ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`); + console.log(''); + + const { status_code, body } = await map_item(datasource_id, item); + console.log(`HTTP ${status_code}`); + console.log(JSON.stringify(body, null, 2)); + + if (status_code === 404) { + console.error(''); + console.error('Hint: datasource id may be wrong. 
Available Zeeschuimer-origin datasources:'); + try { + const datasources = await list_datasources(); + datasources + .filter(d => d.is_from_zeeschuimer && d.has_map_item) + .forEach(d => console.error(` - ${d.id} (${d.name})`)); + } catch (e) { + console.error(` (couldn't fetch list: ${e.message})`); + } + process.exit(2); + } +} + +main().catch(e => { + console.error(`probe failed: ${e.message}`); + process.exit(2); +}); From be2f3087d8dd5af07175101a808903604c84d78b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:43:04 +0200 Subject: [PATCH 15/20] update docs and packages --- docs/test-plan.md | 6 +++--- tests/package-lock.json | 13 ++++++++++++- tests/setup-globals.cjs | 11 +++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/docs/test-plan.md b/docs/test-plan.md index 249a7e0..a4265eb 100644 --- a/docs/test-plan.md +++ b/docs/test-plan.md @@ -63,7 +63,7 @@ Phase 3 — 4CAT integration (optional) - Problem: mapping tests live in 4CAT and need NDJSON input. - Changes: - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. - - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). + - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). - Do not fail the test run on 4CAT errors — print status and continue. 
Example upload with `requests`: @@ -73,7 +73,7 @@ import requests with open(ndjson_path, 'rb') as f: headers = { 'X-Zeeschuimer-Platform': platform, - 'Authorization': f'Bearer {fourcat_key}' + 'Authorization': f'{fourcat_key}' } r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) # check r.status_code and r.text for details @@ -149,7 +149,7 @@ Estimated effort: 6–10 hours of focused work to implement and test everything Open questions / confirmations needed -- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. +- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. - Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) 
Next steps diff --git a/tests/package-lock.json b/tests/package-lock.json index d055883..7758e9f 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -12,7 +12,8 @@ "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0" + "jest-environment-jsdom": "^29.7.0", + "undici": "^6.20.0" } }, "node_modules/@babel/code-frame": { @@ -4197,6 +4198,16 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/undici": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz", + "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index 4f54e34..6793cc0 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -40,3 +40,14 @@ return { ${EXPOSED_NAMES.join(', ')} }; `); Object.assign(globalThis, factory()); + +// jsdom doesn't expose fetch and Jest's jsdom env shadows Node's global +// fetch, so the comparator can't hit 4CAT without help. Polyfill from +// undici (a Node-friendly HTTP client, separately installable on npm — +// distinct from the undici bundled internally by Node, which isn't +// require()-able by name). +// Note: tests that use fetch (e.g. map_item_compare.test.js) declare +// `@jest-environment node` at the top of the file. Node env has fetch +// natively. Don't try to polyfill into jsdom — undici's internals use +// Node-specific globals that jsdom shadows (clearImmediate, +// markResourceTiming, fast timers), and polyfilling them all is brittle. 
From caf1c7f48a19524282c06b688c08001e534791db Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:43:17 +0200 Subject: [PATCH 16/20] some mapping for odd datasource names --- tests/zeeschuimer-to-4cat.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/zeeschuimer-to-4cat.json diff --git a/tests/zeeschuimer-to-4cat.json b/tests/zeeschuimer-to-4cat.json new file mode 100644 index 0000000..f7de942 --- /dev/null +++ b/tests/zeeschuimer-to-4cat.json @@ -0,0 +1,7 @@ +{ + "_comment": "Maps Zeeschuimer module filenames (without .js) to 4CAT datasource ids when they differ. Default behavior is identity — only include entries where the two diverge. Discovered via http://localhost/api/datasources/.", + "9gag": "ninegag", + "truth": "truthsocial", + "rednote": "xiaohongshu", + "rednote-comments": "xiaohongshu-comments" +} From f10fc492845051c87b96b75561eb91de2af99d18 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:05 +0200 Subject: [PATCH 17/20] update existing map_item tests and add helper --- tests/_module-info.js | 45 ++++++++++++++++++ tests/map_item.test.js | 105 +++++++++++++++++++---------------------- 2 files changed, 93 insertions(+), 57 deletions(-) create mode 100644 tests/_module-info.js diff --git a/tests/_module-info.js b/tests/_module-info.js new file mode 100644 index 0000000..e261e4e --- /dev/null +++ b/tests/_module-info.js @@ -0,0 +1,45 @@ +/** + * Shared helper for the map_item test drivers. + * + * Pre-validates a module by: + * 1. Running `node --check` on its file (syntax check; avoids the + * worker-killing experimental-ESM crash when a syntax error reaches + * the dynamic importer). + * 2. Dynamically importing it and checking for a `map_item` export. 
+ * + * Returns one of four states the test driver can branch on: + * { state: 'ok', map_item: <function> } + * { state: 'no_map_item' } + * { state: 'syntax_error', error: <string> } + * { state: 'import_error', error: <Error> } + */ + +import { spawnSync } from 'node:child_process'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +function check_module_syntax(module_name) { + const module_path = join(MODULES_ROOT, `${module_name}.js`); + const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' }); + if (result.status === 0) return null; + return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); +} + +export async function inspect_module(module_name) { + const syntax_error = check_module_syntax(module_name); + if (syntax_error) { + return { state: 'syntax_error', error: syntax_error }; + } + try { + const mod = await import(`../modules/${module_name}.js`); + if (typeof mod.map_item !== 'function') { + return { state: 'no_map_item' }; + } + return { state: 'ok', map_item: mod.map_item }; + } catch (e) { + return { state: 'import_error', error: e }; + } +} diff --git a/tests/map_item.test.js b/tests/map_item.test.js index 9dee6e8..2dc1bb6 100644 --- a/tests/map_item.test.js +++ b/tests/map_item.test.js @@ -1,5 +1,5 @@ /** - * Auto-discovery test driver for module `map_item` functions. + * Smoke test driver for module `map_item` functions. * * Convention: * tests/fixtures/<module>/*.ndjson @@ -11,52 +11,36 @@ * presents items to a map_item function, then run through the module's * map_item. Tests assert: function returns a non-null object, and any fields * listed in REQUIRED_NON_EMPTY for that module are present and non-empty. 
+ * + * Module-level state is determined upfront by inspect_module(): + * - 'ok' → register per-item tests + * - 'no_map_item' → register a single skipped test (not applicable) + * - 'syntax_error' → register a single failing test pointing at the line + * - 'import_error' → register a single failing test with the message */ import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; -import { spawnSync } from 'node:child_process'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; - -/** - * Local mirror of wrap_for_map_item from js/lib.js. - * - * lib.js is loaded by the browser as a plain script (it defines globals - * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be - * imported from Node. The wrap is three trivial lines with no dependencies - * — duplicating it here is cheaper than restructuring lib.js into a module. - * If lib.js's wrap_for_map_item ever gains real logic, this needs to track. - */ -function wrap_for_map_item(stored_item) { - const { data, ...meta } = stored_item; - return { ...data, __import_meta: meta }; -} +import { inspect_module } from './_module-info.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); const FIXTURE_ROOT = join(__dirname, 'fixtures'); -const MODULES_ROOT = join(__dirname, '..', 'modules'); - -/** - * Pre-validate module syntax before dynamic import. - * - * `await import()` on a module with a syntax error throws inside V8's module - * linker in a way Jest's experimental-vm-modules can't always recover from - * (worker retry loop or Node process exit). Running `node --check` first - * gives us a clean error string we can fail the test with. 
- */ -function check_module_syntax(module_name) { - const module_path = join(MODULES_ROOT, `${module_name}.js`); - const result = spawnSync(process.execPath, ['--check', module_path], { - encoding: 'utf8', - }); - if (result.status === 0) return null; - return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); -} const REQUIRED_NON_EMPTY = { tiktok: ['id', 'author', 'unix_timestamp'], }; +/** + * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by + * the browser as a plain script and so cannot be imported from Node; this + * three-line mirror is cheaper than restructuring lib.js into a module. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + function list_module_dirs() { if (!existsSync(FIXTURE_ROOT)) return []; return readdirSync(FIXTURE_ROOT).filter(name => { @@ -66,36 +50,46 @@ function list_module_dirs() { } const module_dirs = list_module_dirs(); + +// Pre-pass: synchronously determine each module's state so we can branch +// on it at describe/test registration time. Top-level await is supported +// in Jest's experimental-vm-modules mode. 
+const module_info = {}; +for (const module_name of module_dirs) { + module_info[module_name] = await inspect_module(module_name); +} + let total_fixtures = 0; for (const module_name of module_dirs) { const fixture_dir = join(FIXTURE_ROOT, module_name); const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (fixture_files.length === 0) continue; total_fixtures += fixture_files.length; - describe(`map_item: ${module_name}`, () => { - let map_item; - let import_error; - - beforeAll(async () => { - const syntax_error = check_module_syntax(module_name); - if (syntax_error) { - import_error = new Error(`syntax error:\n${syntax_error}`); - return; - } - try { - const mod = await import(`../modules/${module_name}.js`); - map_item = mod.map_item; - if (typeof map_item !== 'function') { - import_error = new Error(`modules/${module_name}.js does not export a map_item function`); - } - } catch (e) { - import_error = e; - } + const info = module_info[module_name]; + + if (info.state === 'no_map_item') { + describe(`map_item: ${module_name}`, () => { + test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {}); + }); + continue; + } + + if (info.state === 'syntax_error' || info.state === 'import_error') { + const msg = info.state === 'syntax_error' + ? 
`syntax error:\n${info.error}` + : `import failed: ${info.error.message}`; + describe(`map_item: ${module_name}`, () => { + test(`module loads`, () => { throw new Error(msg); }); }); + continue; + } + + // state === 'ok' — register per-item tests + const map_item = info.map_item; + describe(`map_item: ${module_name}`, () => { for (const fixture_file of fixture_files) { const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') .split('\n') @@ -104,9 +98,6 @@ for (const module_name of module_dirs) { describe(fixture_file, () => { lines.forEach((line, i) => { test(`item ${i} maps without throwing`, () => { - if (import_error) { - throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`); - } const stored_item = JSON.parse(line); const mapped = map_item(wrap_for_map_item(stored_item)); expect(mapped).not.toBeNull(); From 3633cde656da3f70880ae49a2909deba3a044953 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:23 +0200 Subject: [PATCH 18/20] comparison testing for datasources --- tests/map_item_compare.test.js | 283 +++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 tests/map_item_compare.test.js diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js new file mode 100644 index 0000000..37e3e4c --- /dev/null +++ b/tests/map_item_compare.test.js @@ -0,0 +1,283 @@ +/** + * @jest-environment node + * + * This file runs in Node test environment (not jsdom) because undici's + * fetch implementation uses Node-internal APIs (`clearImmediate`, + * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or + * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env + * has them all natively. + * + * Trade-off: no DOMParser in node env. The four modules that use + * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser + * polyfill (e.g. via linkedom) before the comparator can run against + * them. 
Other modules (including instagram) work as-is. + */ +/** + * Compare JS map_item output against 4CAT's Python map_item via the API. + * + * For every line in every fixture, runs the JS map_item locally AND sends + * the same stored item to 4CAT's /api/map-item/<datasource_id>/ endpoint, then + * diffs the two outputs field-by-field. Each item is its own Jest test — + * failures point at exactly which item and which fields diverge. + * + * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so + * `npm test` keeps working without 4CAT configuration. Drop real values in + * tests/.env to enable. + * + * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer + * module filename → 4CAT datasource id, for the few names that diverge). + * + * Module-level state is determined upfront by inspect_module() (no + * map_item / syntax errors / import errors are handled before tests are + * registered, so they appear once per module, not once per item). + */ + +import 'dotenv/config'; +import { jest } from '@jest/globals'; +import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { inspect_module } from './_module-info.js'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); +const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; +const HAS_4CAT = Boolean( + FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here' +); + +// When true (default), once any item in a module fails, subsequent items +// in that same module skip the HTTP + map_item work and fail fast with a +// "halted" message. Saves time when generator output is broken at the top. +// Set FAIL_FAST=0 in env to run all items regardless. +// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing +// space in the variable value, which would otherwise defeat `!== '0'`. 
+const FAIL_FAST = (process.env.FAIL_FAST ?? '').trim() !== '0'; +const halted_modules = new Set(); + +const FIXTURE_ROOT = join(__dirname, 'fixtures'); +const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); +const ID_MAP = existsSync(ID_MAP_PATH) + ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) + : {}; + +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + +async function call_4cat_map_item(datasource_id, item) { + const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { + method: 'POST', + headers: { + // 4CAT accepts the raw key without a `Bearer ` prefix, per probe + 'Authorization': FOURCAT_API_KEY, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ item }), + }); + const text = await res.text(); + if (!res.ok) { + throw new Error(`HTTP ${res.status} from 4CAT: ${text}`); + } + return JSON.parse(text); +} + +// Round-trip a value through JSON so MappedItem, MissingMappedField, etc. +// become plain JSON-compatible objects matching what 4CAT emits. +function normalize(value) { + return JSON.parse(JSON.stringify(value)); +} + +// Recursive structural equality. Doesn't care about object key order, which +// matters for nested values like {__missing: true, value: ""} where JS and +// Python might emit keys in different orders. 
+function deep_equal(a, b) { + if (a === b) return true; + if (a === null || b === null) return a === b; + if (typeof a !== typeof b) return false; + if (typeof a !== 'object') return false; + if (Array.isArray(a) !== Array.isArray(b)) return false; + if (Array.isArray(a)) { + if (a.length !== b.length) return false; + return a.every((v, i) => deep_equal(v, b[i])); + } + const a_keys = Object.keys(a); + const b_keys = Object.keys(b); + if (a_keys.length !== b_keys.length) return false; + return a_keys.every(k => k in b && deep_equal(a[k], b[k])); +} + +function diff_objects(js_obj, py_obj) { + const diffs = []; + const keys = new Set([...Object.keys(js_obj ?? {}), ...Object.keys(py_obj ?? {})]); + for (const key of keys) { + const in_js = js_obj && key in js_obj; + const in_py = py_obj && key in py_obj; + if (!in_js) { + diffs.push({ key, kind: 'only_python', python: py_obj[key] }); + } else if (!in_py) { + diffs.push({ key, kind: 'only_js', js: js_obj[key] }); + } else if (!deep_equal(js_obj[key], py_obj[key])) { + diffs.push({ key, kind: 'mismatch', js: js_obj[key], python: py_obj[key] }); + } + } + return diffs; +} + +function format_diffs(diffs) { + return diffs.map(d => { + if (d.kind === 'only_js') { + return ` + only in JS: ${d.key} = ${JSON.stringify(d.js)}`; + } + if (d.kind === 'only_python') { + return ` - only in Python: ${d.key} = ${JSON.stringify(d.python)}`; + } + return ` ~ ${d.key}\n JS: ${JSON.stringify(d.js)}\n Python: ${JSON.stringify(d.python)}`; + }).join('\n'); +} + +// Pull out the first few module-frame lines from an error's stack so the +// failure message points at where in modules/.js the throw happened. +function format_error_with_location(err) { + if (!err) return String(err); + const message = err.message || String(err); + const stack = err.stack || ''; + const module_frames = stack.split('\n') + .filter(l => l.includes('/modules/') || l.includes('\\modules\\')) + .slice(0, 3) + .map(l => l.trim()); + return module_frames.length + ? 
`${message}\n ${module_frames.join('\n ')}` + : message; +} + +function list_module_dirs() { + if (!existsSync(FIXTURE_ROOT)) return []; + return readdirSync(FIXTURE_ROOT).filter(name => { + try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } + catch { return false; } + }); +} + +// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's +// default 5s is tight under load. +jest.setTimeout(30000); + +if (!HAS_4CAT) { + describe('map_item compare (JS vs 4CAT Python)', () => { + test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {}); + }); +} else { + const module_dirs = list_module_dirs(); + + // Pre-pass: synchronously determine each module's state so we can branch + // on it at registration time. + const module_info = {}; + for (const module_name of module_dirs) { + module_info[module_name] = await inspect_module(module_name); + } + + let any_fixtures = false; + + for (const module_name of module_dirs) { + const fixture_dir = join(FIXTURE_ROOT, module_name); + const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + if (fixture_files.length === 0) continue; + any_fixtures = true; + + const datasource_id = ID_MAP[module_name] ?? module_name; + const info = module_info[module_name]; + + if (info.state === 'no_map_item') { + // eslint-disable-next-line no-console + console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`); + continue; + } + + if (info.state === 'syntax_error' || info.state === 'import_error') { + const msg = info.state === 'syntax_error' + ? 
`syntax error:\n${info.error}` + : `import failed: ${info.error.message}`; + describe(`map_item compare: ${module_name}`, () => { + test(`module loads`, () => { throw new Error(msg); }); + }); + continue; + } + + // state === 'ok' — register per-item comparison tests + const map_item = info.map_item; + + describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => { + for (const fixture_file of fixture_files) { + const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') + .split('\n') + .filter(line => line.trim().length > 0); + + describe(fixture_file, () => { + lines.forEach((line, i) => { + test(`item ${i}`, async () => { + if (FAIL_FAST && halted_modules.has(module_name)) { + throw new Error( + '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]' + ); + } + try { + const stored_item = JSON.parse(line); + + // 4CAT side + const response = await call_4cat_map_item(datasource_id, stored_item); + + // JS side + let js_result; + let js_error; + try { + js_result = map_item(wrap_for_map_item(stored_item)); + } catch (e) { + js_error = e; + } + + if (response.status === 'mapped') { + if (js_error) { + throw new Error( + `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}` + ); + } + const js_obj = normalize(js_result); + const py_obj = normalize(response.item); + const diffs = diff_objects(js_obj, py_obj); + if (diffs.length > 0) { + throw new Error( + `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}` + ); + } + } else if (response.status === 'skipped') { + if (!js_error) { + throw new Error( + `4CAT skipped this item ("${response.reason}") but JS produced a result` + ); + } + // Both rejected — good. Skip reasons may differ in wording. 
+ } else if (response.status === 'error') { + throw new Error(`4CAT errored on this item: ${response.message}`); + } else { + throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`); + } + } catch (e) { + if (FAIL_FAST) halted_modules.add(module_name); + throw e; + } + }); + }); + }); + } + }); + } + + if (!any_fixtures) { + describe('map_item compare (JS vs 4CAT Python)', () => { + test.skip('no fixtures under tests/fixtures//*.ndjson', () => {}); + }); + } +} From 7d97a0fe342e3b7f932c79fe22e9b8c6b3c25bb3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:35 +0200 Subject: [PATCH 19/20] list common translation errors --- tests/translation-errors.md | 430 ++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 tests/translation-errors.md diff --git a/tests/translation-errors.md b/tests/translation-errors.md new file mode 100644 index 0000000..fcc160d --- /dev/null +++ b/tests/translation-errors.md @@ -0,0 +1,430 @@ +# Auto-generator translation errors + +Patterns of incorrect Python → JavaScript translation observed in +auto-generated `modules/*.js` files. Each entry has a search pattern so +this doc doubles as a checklist when reviewing a new auto-generator PR. + +When an entry is fixed at the generator level (no longer appears in +fresh output), mark it `[fixed]` and keep the entry around — useful +history when something regresses. + +## How to use + +- Found a new pattern? Add an entry below following the template. +- Reviewing a generator PR? `grep` each `Search pattern` against the + changed module files. Anything that hits is worth a manual look. +- Iterating on the generator prompt? The "Why" lines are the + feedback to add — they describe the exact Python-vs-JS semantic + difference the LLM keeps missing. 
+ +## Template + +```` +### <Short name of the pattern> + +**Status:** open | fixed in generator | accepted + +**Why it happens:** <the Python-vs-JS semantic difference the generator misses> + +**Wrong JS:** +```js +<what the generator emitted> +``` + +**Correct JS:** +```js +<what it should have emitted> +``` + +**Example:** `modules/<module>.js:<line>` + +**Search pattern:** `<regex or grep command>` +```` + +--- + +## Observed patterns + +### `in` operator on strings + +**Status:** open + +**Why it happens:** In Python, `"x" in some_string` is a substring check. +In JavaScript, the `in` operator only works on **objects** and checks for +property/key existence; using it with a string on the right-hand side +throws `TypeError: cannot use 'in' operator to search for "x" in <string>`. + +**Wrong JS:** +```js +const is_polaris = '__typename' in item && 'polaris' in item.__typename.toLowerCase(); +``` + +**Correct JS:** +```js +const is_polaris = '__typename' in item && item.__typename.toLowerCase().includes('polaris'); +``` + +**Example:** `modules/instagram.js:513` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\.` — quoted string followed +by `in` followed by a method call. Quick rough check: `grep -E "' in [a-zA-Z]" modules/` + +**Watch out for partial fixes:** seen as `'polaris' in (item.__typename ?? '').toLowerCase()` +— adding `?? ''` guards against `undefined` but the `in` operator itself +still throws on the resulting *string*. The fix is `.includes()`, not just +defaulting the operand. + +--- + +### Python f-string syntax left in single-quoted JS strings + +**Status:** open + +**Why it happens:** Python `f"... {var} ..."` interpolates. JS uses +template literals (backticks) with `${var}`. The auto-generator leaves the +`{var}` notation in a regular single- or double-quoted JS string, which is +just literal text — no interpolation happens. 
+ +**Wrong JS:** +```js +throw new MapItemException('Unable to parse item: different user {user.id} and owner {owner.id}'); +``` + +**Correct JS:** +```js +throw new MapItemException(`Unable to parse item: different user ${user.id} and owner ${owner.id}`); +``` + +**Example:** `modules/instagram.js:754` + +**Search pattern:** `'[^']*\{[a-zA-Z_$][\w$.]*\}[^']*'` or `"[^"]*\{[a-zA-Z_$][\w$.]*\}[^"]*"` +— a non-template-literal string containing `{identifier}` or `{identifier.path}`. +Quick check: `grep -nE "['\"][^'\"]*\{[a-zA-Z_][a-zA-Z0-9_.]*\}[^'\"]*['\"]" modules/` + +--- + +### `?? {}` default that defeats subsequent truthy checks + +**Status:** open + +**Why it happens:** When porting Python's `node.get('user') or {}` (which is +intended to make subsequent code safe to call), the generator emits +`node.user ?? {}`. That's a *valid* Python-equivalent, **but** any following +`if (user && owner) { ... }` guard then never short-circuits because both +`{}` references are truthy. The check ends up reading "if user and owner +*objects* exist" when the intent was "if user and owner data exist." +Subsequent property accesses then compare real ids/usernames against +`undefined` on the missing side, often throwing. + +**Wrong JS:** +```js +const user = node.user ?? {}; +const owner = node.owner ?? {}; +if (user && owner) { + if (user.id === owner.id) { /* … */ } + else if (user.username !== owner.username) { + throw new MapItemException('different user and owner'); + } +} +``` + +**Correct JS** (depending on intent — pick one): +```js +// (a) drop the defaults so truthy guard means "both present" +const user = node.user; +const owner = node.owner; +if (user && owner) { /* compare */ } +``` +```js +// (b) check for actual content, not just object identity +const user = node.user ?? {}; +const owner = node.owner ?? 
{}; +if (Object.keys(user).length && Object.keys(owner).length) { /* compare */ } +``` + +**Example:** `modules/instagram.js:748-756` + +**Search pattern:** `\?\?\s*\{\s*\}` — any `?? {}` occurrence is worth a +review of subsequent guards. Quick check: `grep -nE "\?\?\s*\{\s*\}" modules/` + +--- + +### Bare relative path as a statement (junk auto-imports section) + +**Status:** open + +**Why it happens:** The generator emits an "auto-generated imports" marker +block at the top of the module but writes the import target as a bare +relative path on its own line (`../js/lib.js`) instead of a real `import` +statement. JS parses that as `..` then `.` then `/js/lib.js` — syntax error. + +**Wrong JS:** +```js +// === auto-generated imports for map_item — DO NOT EDIT BY HAND === +../js/lib.js +// === end auto-generated imports === +``` + +**Correct JS** (one of): +```js +// === auto-generated imports — DO NOT EDIT BY HAND === +// Provided as globals by js/lib.js (loaded via manifest.json): +// MappedItem, MissingMappedField, MapItemException, traverse_data, +// strip_tags, normalize_url_encoding, formatUtcTimestamp +// === end auto-generated imports === +``` + +Or, if a real import is intended, an ESM import with named bindings: +```js +import { MappedItem, MissingMappedField } from '../js/lib.js'; +``` + +**Example:** seen historically in `modules/tiktok.js:2` + +**Search pattern:** `^\.\./` at the start of a line in module files. +Quick check: `grep -nE "^\.\." modules/*.js` + +--- + +### Key-existence check (`'X' in obj`) used where Python intended value-truthiness (`obj.get('X')`) + +**Status:** open + +**Why it happens:** Python's `if node.get('usertags'):` is a *truthy check on +the value* — returns False if the key is missing **or** if the value is +`None`/empty/falsy. The generator translates this to `if ('usertags' in +node)`, which in JS is a *key-existence check* — returns True even when +the value is `null`. 
Subsequent property accesses on the null value then +throw `Cannot read properties of null`. + +**Wrong JS:** +```js +const usertags = 'usertags' in node ? node.usertags.in.map(...).join(',') : ''; +// node.usertags can be null → .in.map blows up +``` + +**Correct JS:** +```js +const usertags = node.usertags ? node.usertags.in.map(...).join(',') : ''; +``` + +**Example:** `modules/instagram.js:777` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\s*\?` — quoted-string `in` +identifier followed by `?` (ternary). Quick check: +`grep -nE "'[^']+' in [a-zA-Z_]+ \?" modules/` + +--- + +### Datetime serialization format mismatch + +**Status:** open + +**Why it happens:** Python's `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')` +produces `"2026-05-13 21:27:31"` — space-separated, no timezone marker. JS's +`new Date(t * 1000).toISOString()` produces `"2026-05-13T21:27:31.000Z"` — T +separator, milliseconds, Z. The generator emits the JS `.toISOString()` form +instead of using the existing `formatUtcTimestamp` helper from lib.js that +mimics Python's output exactly. + +**Wrong JS:** +```js +collected_at = new Date(node.taken_at * 1000).toISOString(); +``` + +**Correct JS:** +```js +collected_at = formatUtcTimestamp(node.taken_at); +// formatUtcTimestamp is defined in js/lib.js as: +// new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19) +``` + +**Example:** `modules/instagram.js:782` + +**Search pattern:** `new Date\([^)]+\)\.toISOString\(\)` — any use of +`.toISOString()`. The helper should be used instead. Quick check: +`grep -nE "\.toISOString\(\)" modules/` + +--- + +### `re.findall` capture groups vs JS `.match` with /g flag + +**Status:** open + +**Why it happens:** Python's `re.findall(r'#(\w+)', s)` returns the **capture +group contents**: `['lotr', 'woodart']`. JS's `s.match(/#(\w+)/g)` (with the +global flag) returns the **full matches**: `['#lotr', '#woodart']` — capture +groups are ignored. 
The generator translates the regex literally without +adjusting for this semantic difference, so the resulting strings keep +prefixes/wrappers that Python would have stripped. + +**Wrong JS:** +```js +hashtags: caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') +// produces "#lotr,#woodart" +``` + +**Correct JS:** +```js +// Option A: strip the literal prefix from each full match +hashtags: caption.match(/#([^\s...]+)/g)?.map(h => h.slice(1)).join(',') ?? '' +// Option B: use matchAll to get capture groups properly (join() always returns a string, so no fallback is needed) +hashtags: [...caption.matchAll(/#([^\s...]+)/g)].map(m => m[1]).join(',') +``` + +**Example:** `modules/instagram.js:812` (also 766, 870 — three copies) + +**Search pattern:** `\.match\(/[^/]*\([^/]*\)[^/]*/g\)` — any `.match()` with +a global-flag regex containing a capture group. Quick check: +`grep -nE "\.match\(/.*\(.*\).*\/g\)" modules/` + +--- + +### `undefined` field values get dropped from JSON, but Python's `None` becomes `null` + +**Status:** open + +**Why it happens:** When `JSON.stringify` encounters an object property whose +value is `undefined`, it **omits the key entirely** from the output. Python's +`json.dumps` serializes `None` as `null`, keeping the key. The generator +writes assignments like `location.city = node.location.city` where the +right-hand side can be `undefined`, producing missing keys in JS output +that show up as `only in Python: <field> = null` diffs against 4CAT. + +**Wrong JS:** +```js +location.city = node.location.city; // undefined if .city missing +// JSON.stringify({location_city: undefined}) → "{}" (key omitted) + +body: caption, // null if no caption — Python returns "" here, not null +``` + +**Correct JS:** +```js +// Whichever fallback Python uses for that specific field: +location.city = node.location.city ?? null; // some fields → null +body: caption ?? 
'', // other fields → "" +``` + +**Example:** `modules/instagram.js:745, 853` (`null` flavor), +559, 648, 798 (`""` flavor for `body`) + +**Note:** Python's choice of `None` vs `""` is per-field — there's no +universal rule. When the comparator reports `~ X JS: null Python: ""` use +`?? ''`. When it reports `- only in Python: X = null` use `?? null`. The +distinction matters because the JS output should match Python's choice +exactly for that field. + +**Search pattern:** harder to grep automatically — any property assignment +where the RHS could be `undefined`/`null` and the resulting field is +expected to appear in the mapped output. Look at "only in Python: X = null" +and "~ X JS: null Python: \"\"" diffs in the comparator output to find +specific cases. + +--- + +### Object-reference inequality used as type check + +**Status:** open + +**Why it happens:** The generator emits `caption !== new MissingMappedField('')` +to mean "caption is not a missing-marker", but `new MissingMappedField('')` +creates a fresh object every time, and `!==` on objects compares references. +The expression is **always true**, so the conditional never takes the +"missing" branch. Likely originates from Python idioms like `caption != ""` +or `caption is not None`, mistranslated through the MissingMappedField +abstraction. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') ? caption.match(...) : '', +// !== between two different object references is always true +``` + +**Correct JS:** +```js +// If the intent was "if caption has content", just truthy-check it: +hashtags: caption ? caption.match(...) : '', +// If the intent was "if caption is not a MissingMappedField instance": +hashtags: !(caption instanceof MissingMappedField) ? caption.match(...) : '', +``` + +**Example:** `modules/instagram.js:812` (and two other copies) + +**Search pattern:** `!== new [A-Z]` or `=== new [A-Z]` — any equality +comparison with a freshly-constructed object. 
Quick check: +`grep -nE "(!==|===) new [A-Z]" modules/` + +--- + +### `.method()` chain on potentially-null result + +**Status:** open + +**Why it happens:** In Python, calling a method on `None` raises +`AttributeError`, which 4CAT sometimes catches. In JS, calling a method on +`null`/`undefined` throws `TypeError: Cannot read properties of null +(reading '<method>')`. The generator emits the same dotted chain without +optional-chaining (`?.`) protection. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') + : '', +``` +(here `caption` is allowed to be `null`, so `caption.match(...)` blows up +on null caption) + +**Correct JS:** +```js +hashtags: caption + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') ?? '' + : '', +``` + +**Example:** `modules/instagram.js:809` + +**Search pattern:** harder to grep — needs reading. Worth manual review of +any field that uses `caption.match`, `something.split`, `something.join` +without `?.` on a value that could be null/undefined. + +--- + +## Generator prompt feedback (running list) + +Concrete things to fold into the generator's prompt over time: + +1. **Python `x in y` where `y` is a string** → use `y.includes(x)` in JS, + never `x in y`. +2. **Python f-strings** → use JS template literals (backticks) with + `${...}` syntax. Never leave `{...}` in single- or double-quoted strings. +3. **`?? {}` after a `.get(...) or {}` translation** → only use this if the + following code does property-access. If the following code does a + truthy guard (`if (x && y)`), drop the default and use just `node.user`. +4. **Method chains on possibly-null values** → use `?.` (optional + chaining) instead of `.` whenever the receiver could be null/undefined. +5. **The auto-imports header block** → emit either real `import { ... }` + statements with valid relative paths, or a comment-only header. + Never emit bare paths as JS statements. +6. 
**Python `node.get('X')` truthy check** → in JS, use `node.X` (or + `node.X != null`), not `'X' in node`. The `in` operator checks key + existence, which is True even for explicit-null values. +7. **Datetime serialization** → use the `formatUtcTimestamp` helper from + lib.js (which mimics Python's `strftime('%Y-%m-%d %H:%M:%S')` format), + not `new Date(...).toISOString()` (which has a different output shape: + T separator, milliseconds, Z suffix). +8. **`re.findall` with capture groups** → in JS, `.match(/.../g)` returns + full matches, NOT capture groups. To get capture-group behavior, use + either `[...s.matchAll(/.../g)].map(m => m[1])` or post-process the + full matches with `.map(...)` to strip the literal parts. +9. **Object-reference equality (`!== new X(...)`)** → never. Creating an + object with `new` produces a fresh reference; `===`/`!==` compares + identity. Use `instanceof X` for type checks, or compare values + directly. The MissingMappedField "is this missing?" check should be + `caption instanceof MissingMappedField` or just truthy-check the value. +10. **Python `None` → JSON `null` vs JS `undefined` → omitted** — when a + field's value could be missing and Python returns `null` for it, + JS must explicitly assign `null` (not leave the value as `undefined`). + `JSON.stringify` drops `undefined` keys silently. Use `value ?? null` + when the field is expected to appear in the mapped output. From 6ad4c134cf35d0993b2968f3b2dc832e2766794d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:45:52 +0200 Subject: [PATCH 20/20] package.json fix --- tests/package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/package.json b/tests/package.json index 333564a..390fdd3 100644 --- a/tests/package.json +++ b/tests/package.json @@ -13,6 +13,7 @@ "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0" + "jest-environment-jsdom": "^29.7.0", + "undici": "^6.20.0" } }