From 491f51bc07520317f31416a68a9a221ccade03f9 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Tue, 5 May 2026 17:38:25 +0200
Subject: [PATCH 01/20] minimal changes for direct from 4CAT mapping
---
js/lib.js | 16 +++++++++++++++-
modules/_loader.js | 6 +++++-
popup/interface.js | 2 +-
3 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/js/lib.js b/js/lib.js
index 6199d01..1579195 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -57,4 +57,18 @@ class MissingMappedField {
toString() {
return `${this.value}`;
}
-}
\ No newline at end of file
+}
+
+/**
+ * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects.
+ *
+ * 4CAT's importer constructs:
+ * { ...item.data, __import_meta: { ...everything in item except data } }
+ *
+ * Mirroring that here means map_item functions auto-generated from 4CAT
+ * data sources can run against Zeeschuimer-stored items without translation.
+ */
+function wrap_for_map_item(stored_item) {
+ const { data, ...meta } = stored_item;
+ return { ...data, __import_meta: meta };
+}
diff --git a/modules/_loader.js b/modules/_loader.js
index 47697ca..afae2d7 100644
--- a/modules/_loader.js
+++ b/modules/_loader.js
@@ -17,11 +17,15 @@ async function load() {
];
for(const module of imported_modules) {
+ const mapper = module.map_item
+ ? (stored_item) => module.map_item(wrap_for_map_item(stored_item))
+ : null;
+
zeeschuimer.register_module(
module.MODULE_NAME,
module.DOMAIN,
module.capture,
- module.map_item,
+ mapper,
module.MODULE_ID ? module.MODULE_ID : module.MODULE_DOMAIN,
module.overwrite_partial,
module.TOOLTIP ? module.TOOLTIP : null,
diff --git a/popup/interface.js b/popup/interface.js
index 5cc7864..1ae60a2 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -619,7 +619,7 @@ async function get_csv_blob(platform) {
let csv = [];
const module = background.zeeschuimer.modules[platform];
await iterate_items(platform, function(item) {
- item = module.mapper(item.data);
+ item = module.mapper(item);
if(csv.length === 0) {
csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
}
From b06805f711a97fad6e9e3f6615db3a0cf936205e Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 6 May 2026 10:54:13 +0200
Subject: [PATCH 02/20] give me some standard helper functions
---
js/lib.js | 54 +++++++++++++++++++++
modules/tiktok.js | 119 +++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/js/lib.js b/js/lib.js
index 1579195..3b144d2 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -72,3 +72,57 @@ function wrap_for_map_item(stored_item) {
const { data, ...meta } = stored_item;
return { ...data, __import_meta: meta };
}
+
+/**
+ * Ports of 4CAT functions commonly used by `map_item` below
+ */
+
+/**
+ * Strip HTML tags from a string.
+ * @param {string} html
+ * @param {boolean} convertNewlines Convert <br> and </p> tags to \n before stripping.
+ * @returns {string}
+ */
+function strip_tags(html, convertNewlines = true) {
+ if (!html) return "";
+ if (convertNewlines) {
+ html = html.replace(/<br\s*\/?>/gi, "\n").replace(/<\/p>/gi, "\n");
+ html = html.replace(/\n+/g, "\n");
+ }
+ const doc = new DOMParser().parseFromString(html, "text/html");
+ return doc.body.textContent || "";
+}
+
+/**
+ * Normalize URL encoding for display and linking.
+ * Decodes percent-encoded URLs and re-encodes the query string canonically.
+ * Returns the original URL on parse failure.
+ * @param {string} url
+ * @returns {string}
+ */
+function normalize_url_encoding(url) {
+ if (!url) return "";
+ try {
+ // Iterative decode handles double-encoded inputs.
+ let decoded = url;
+ let prev;
+ do {
+ prev = decoded;
+ try {
+ decoded = decodeURIComponent(prev);
+ } catch {
+ decoded = prev;
+ break;
+ }
+ } while (decoded !== prev);
+ const parsed = new URL(decoded);
+ // URL.toString() re-encodes the query/fragment correctly.
+ return parsed.toString();
+ } catch {
+ return url;
+ }
+}
+
+function formatUtcTimestamp(unixSeconds) {
+ return new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19);
+}
\ No newline at end of file
diff --git a/modules/tiktok.js b/modules/tiktok.js
index 55e6fbf..ea52532 100644
--- a/modules/tiktok.js
+++ b/modules/tiktok.js
@@ -1,3 +1,4 @@
+
export const MODULE_NAME = 'TikTok (posts)';
export const DOMAIN = 'tiktok.com';
@@ -103,4 +104,120 @@ export function capture(response, source_platform_url, source_url) {
} else {
return [];
}
-}
\ No newline at end of file
+}
+
+// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND ===
+// (regenerated from datasources/tiktok/search_tiktok.py)
+export function map_item(post) {
+ // Zeeschuimer metadata
+ const metadata = post.__import_meta || {};
+
+ const challenges = Array.isArray(post.challenges)
+ ? post.challenges.map(ch => ch.title).filter(Boolean)
+ : [];
+
+ const hashtags = Array.isArray(post.textExtra)
+ ? post.textExtra
+ .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName)
+ .map(e => e.hashtagName)
+ : [];
+
+ const diversificationLabels = Array.isArray(post.diversificationLabels)
+ ? post.diversificationLabels.join(',')
+ : '';
+
+ let user_nickname = '';
+ let user_fullname = '';
+ let user_thumbnail = '';
+
+ if (post.author && typeof post.author === 'object') {
+ user_nickname = post.author.uniqueId || '';
+ user_fullname = post.author.nickname || '';
+ user_thumbnail = post.author.avatarThumb || '';
+ } else if (post.author) {
+ user_nickname = post.author || '';
+ user_fullname = post.nickname || '';
+ user_thumbnail = '';
+ }
+
+ const thumbnailOptions = [];
+
+ if (post.video && Array.isArray(post.video.shareCover)) {
+ thumbnailOptions.push(...post.video.shareCover);
+ }
+
+ if (post.video && post.video.cover) {
+ thumbnailOptions.push(post.video.cover);
+ }
+
+ const now = Math.floor(Date.now() / 1000);
+
+ const validThumbnails = thumbnailOptions.filter(url => {
+ try {
+ const parsedUrl = new URL(url);
+ const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0;
+ return expires >= now;
+ } catch (e) {
+ return false;
+ }
+ });
+
+ const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : '';
+
+ return new MappedItem({
+ collected_from_url: metadata.source_platform_url
+ ? normalize_url_encoding(metadata.source_platform_url)
+ : '',
+ id: post.id || '',
+ thread_id: post.id || '',
+ author: user_nickname,
+ author_full: user_fullname,
+ author_followers: post.authorStats?.followerCount ?? '',
+ author_likes: post.authorStats?.diggCount ?? '',
+ author_videos: post.authorStats?.videoCount ?? '',
+ author_avatar: user_thumbnail,
+ body: post.desc || '',
+ stickers: Array.isArray(post.stickersOnItem)
+ ? post.stickersOnItem
+ .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : ''))
+ .filter(Boolean)
+ .join('')
+ : '',
+ timestamp: post.createTime
+ ? formatUtcTimestamp(parseInt(post.createTime, 10))
+ : '',
+ unix_timestamp: post.createTime ? parseInt(post.createTime, 10) : 0,
+ is_duet:
+ post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0'
+ ? 'yes'
+ : 'no',
+ is_ad: post.isAd ? 'yes' : 'no',
+ is_paid_partnership: post.adAuthorization ? 'yes' : 'no',
+ is_sensitive: post.maskType === 3 ? 'yes' : 'no',
+ is_photosensitive: post.maskType === 4 ? 'yes' : 'no',
+ music_name: post.music?.title ?? '',
+ music_id: post.music?.id ?? '',
+ music_url: post.music?.playUrl ?? '',
+ music_thumbnail: post.music?.coverLarge ?? '',
+ music_author: post.music?.authorName ?? '',
+ video_url: post.video?.downloadAddr ?? '',
+ tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`,
+ thumbnail_url: thumbnail_url,
+ likes: post.stats?.diggCount ?? '',
+ comments: post.stats?.commentCount ?? '',
+ shares: post.stats?.shareCount ?? '',
+ plays: post.stats?.playCount ?? '',
+ hashtags: hashtags.join(','),
+ challenges: challenges.join(','),
+ diversification_labels: diversificationLabels,
+ location_created: post.locationCreated ?? '',
+ effects: Array.isArray(post.effectStickers)
+ ? post.effectStickers.map(e => e.name).join(',')
+ : '',
+ warning: Array.isArray(post.warnInfo)
+ ? post.warnInfo.map(w => w.text).join(',')
+ : '',
+ });
+}
+// === end auto-generated ===
+// === end auto-generated ===
From f9a2405a0703bcadfdee7492ccd57af12917733e Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 6 May 2026 13:07:43 +0200
Subject: [PATCH 03/20] fix csv export
---
popup/interface.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/popup/interface.js b/popup/interface.js
index 1ae60a2..8afd1b1 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -595,7 +595,7 @@ const CSV_ESCAPED = `"${CSV_SEPARATOR}\n`;
function csv_escape(value) {
value = String(value);
let needs_escape = false;
- for(const character in CSV_ESCAPED) {
+ for(const character of CSV_ESCAPED) {
if(value.indexOf(character) >= 0) {
needs_escape = true;
}
From 2f084b9352c25a1034429bb05d8390b5961d35ef Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 6 May 2026 15:19:18 +0200
Subject: [PATCH 04/20] another to CSV fix
---
popup/interface.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/popup/interface.js b/popup/interface.js
index 8afd1b1..94fff77 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -626,7 +626,7 @@ async function get_csv_blob(platform) {
csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
})
- return new Blob([csv], {type: 'text/csv'});
+ return new Blob(csv, {type: 'text/csv'});
}
/**
From d7870426c7765a6107c47c4fff062f5643725167 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 6 May 2026 15:25:42 +0200
Subject: [PATCH 05/20] revert tiktok (mistaken test result committed)
---
modules/tiktok.js | 119 +---------------------------------------------
1 file changed, 1 insertion(+), 118 deletions(-)
diff --git a/modules/tiktok.js b/modules/tiktok.js
index ea52532..55e6fbf 100644
--- a/modules/tiktok.js
+++ b/modules/tiktok.js
@@ -1,4 +1,3 @@
-
export const MODULE_NAME = 'TikTok (posts)';
export const DOMAIN = 'tiktok.com';
@@ -104,120 +103,4 @@ export function capture(response, source_platform_url, source_url) {
} else {
return [];
}
-}
-
-// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND ===
-// (regenerated from datasources/tiktok/search_tiktok.py)
-export function map_item(post) {
- // Zeeschuimer metadata
- const metadata = post.__import_meta || {};
-
- const challenges = Array.isArray(post.challenges)
- ? post.challenges.map(ch => ch.title).filter(Boolean)
- : [];
-
- const hashtags = Array.isArray(post.textExtra)
- ? post.textExtra
- .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName)
- .map(e => e.hashtagName)
- : [];
-
- const diversificationLabels = Array.isArray(post.diversificationLabels)
- ? post.diversificationLabels.join(',')
- : '';
-
- let user_nickname = '';
- let user_fullname = '';
- let user_thumbnail = '';
-
- if (post.author && typeof post.author === 'object') {
- user_nickname = post.author.uniqueId || '';
- user_fullname = post.author.nickname || '';
- user_thumbnail = post.author.avatarThumb || '';
- } else if (post.author) {
- user_nickname = post.author || '';
- user_fullname = post.nickname || '';
- user_thumbnail = '';
- }
-
- const thumbnailOptions = [];
-
- if (post.video && Array.isArray(post.video.shareCover)) {
- thumbnailOptions.push(...post.video.shareCover);
- }
-
- if (post.video && post.video.cover) {
- thumbnailOptions.push(post.video.cover);
- }
-
- const now = Math.floor(Date.now() / 1000);
-
- const validThumbnails = thumbnailOptions.filter(url => {
- try {
- const parsedUrl = new URL(url);
- const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0;
- return expires >= now;
- } catch (e) {
- return false;
- }
- });
-
- const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : '';
-
- return new MappedItem({
- collected_from_url: metadata.source_platform_url
- ? normalize_url_encoding(metadata.source_platform_url)
- : '',
- id: post.id || '',
- thread_id: post.id || '',
- author: user_nickname,
- author_full: user_fullname,
- author_followers: post.authorStats?.followerCount ?? '',
- author_likes: post.authorStats?.diggCount ?? '',
- author_videos: post.authorStats?.videoCount ?? '',
- author_avatar: user_thumbnail,
- body: post.desc || '',
- stickers: Array.isArray(post.stickersOnItem)
- ? post.stickersOnItem
- .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : ''))
- .filter(Boolean)
- .join('')
- : '',
- timestamp: post.createTime
- ? formatUtcTimestamp(parseInt(post.createTime, 10))
- : '',
- unix_timestamp: post.createTime ? parseInt(post.createTime, 10) : 0,
- is_duet:
- post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0'
- ? 'yes'
- : 'no',
- is_ad: post.isAd ? 'yes' : 'no',
- is_paid_partnership: post.adAuthorization ? 'yes' : 'no',
- is_sensitive: post.maskType === 3 ? 'yes' : 'no',
- is_photosensitive: post.maskType === 4 ? 'yes' : 'no',
- music_name: post.music?.title ?? '',
- music_id: post.music?.id ?? '',
- music_url: post.music?.playUrl ?? '',
- music_thumbnail: post.music?.coverLarge ?? '',
- music_author: post.music?.authorName ?? '',
- video_url: post.video?.downloadAddr ?? '',
- tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`,
- thumbnail_url: thumbnail_url,
- likes: post.stats?.diggCount ?? '',
- comments: post.stats?.commentCount ?? '',
- shares: post.stats?.shareCount ?? '',
- plays: post.stats?.playCount ?? '',
- hashtags: hashtags.join(','),
- challenges: challenges.join(','),
- diversification_labels: diversificationLabels,
- location_created: post.locationCreated ?? '',
- effects: Array.isArray(post.effectStickers)
- ? post.effectStickers.map(e => e.name).join(',')
- : '',
- warning: Array.isArray(post.warnInfo)
- ? post.warnInfo.map(w => w.text).join(',')
- : '',
- });
-}
-// === end auto-generated ===
-// === end auto-generated ===
+}
\ No newline at end of file
From a9fba9a9caee86d8799ee35d11374fbb602c9a41 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 6 May 2026 15:57:45 +0200
Subject: [PATCH 06/20] clean up UI (make download menu button)
---
popup/interface.html | 32 +++++++++++++++++++++-
popup/interface.js | 63 +++++++++++++++++++++++++++++++++-----------
2 files changed, 78 insertions(+), 17 deletions(-)
diff --git a/popup/interface.html b/popup/interface.html
index 356f2b5..e9d9b3f 100644
--- a/popup/interface.html
+++ b/popup/interface.html
@@ -215,10 +215,39 @@
text-indent: 2em;
}
- td > button:not(:last-child) {
+ td > button:not(:last-child),
+ td > .download-menu:not(:last-child) {
margin-right: 0.25em;
}
+ /* download chooser: trigger is a regular button (inherits all button
+ styles); */
+ .download-menu {
+ display: inline-block;
+ position: relative;
+ }
+
+ /* :not([hidden]) so the explicit display:flex doesn't override the
+ [hidden] attribute's default display:none */
+ .download-menu > .download-options:not([hidden]) {
+ position: absolute;
+ top: calc(100% + 0.25em);
+ left: 0;
+ display: flex;
+ flex-direction: column;
+ gap: 0.25em;
+ padding: 0.25em;
+ background: var(--neutral-contrast-alt);
+ border: 2px solid var(--neutral-contrast);
+ border-radius: 0.5em;
+ z-index: 10;
+ white-space: nowrap;
+ }
+
+ .download-menu > .download-options > button {
+ margin: 0;
+ }
+
input:not([type=checkbox]):not([type=radio]), button {
background: var(--neutral-contrast-alt);
color: var(--accent);
@@ -302,6 +331,7 @@
.toggle-switch input {
-moz-appearance: none;
+ appearance: none;
opacity: 0;
}
diff --git a/popup/interface.js b/popup/interface.js
index 94fff77..3b8aaa9 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -119,7 +119,7 @@ async function set_4cat_url(e) {
function activate_buttons() {
document.querySelectorAll("td button").forEach(button => {
let current = button.disabled;
- let items = parseInt(button.parentNode.parentNode.querySelector('.num-items').innerText);
+ let items = parseInt(button.closest('tr').querySelector('.num-items').innerText);
let new_status = current;
if(button.classList.contains('upload-to-4cat') && !is_uploading) {
@@ -132,7 +132,7 @@ function activate_buttons() {
button.setAttribute('title', '');
}
- } else if(button.classList.contains('download-ndjson') || button.classList.contains('reset') || button.classList.contains('download-csv')) {
+ } else if(button.classList.contains('download-format') || button.classList.contains('download-menu-trigger') || button.classList.contains('reset')) {
new_status = !(items > 0);
}
@@ -234,21 +234,32 @@ async function get_stats() {
let actions = createElement("td");
const clear_button = createElement("button", {"data-platform": platform, "class": "reset"}, "Delete");
- const csv_button = createElement("button", {"data-platform": platform, 'class': 'download-csv'}, '.csv');
- const download_button = createElement("button", {
- "data-platform": platform,
- "class": "download-ndjson"
- }, ".ndjson");
+
+ // Render the download chooser as a button + popover panel,
+ // (even when only NDJSON is available as visual consistent)
+ const download_widget = createElement("span", {"class": "download-menu"});
+ const trigger = createElement("button", {
+ "data-platform": platform, "class": "download-menu-trigger"
+ }, "Download");
+ const options = createElement("div", {"class": "download-options", "hidden": ""});
+ options.appendChild(createElement("button", {
+ "data-platform": platform, "data-format": "ndjson", "class": "download-format"
+ }, ".ndjson (original)"));
+ if(module.mapper) {
+ options.appendChild(createElement("button", {
+ "data-platform": platform, "data-format": "csv", "class": "download-format"
+ }, ".csv"));
+ }
+ download_widget.appendChild(trigger);
+ download_widget.appendChild(options);
+
const fourcat_button = createElement("button", {
"data-platform": platform,
"class": "upload-to-4cat",
}, "to 4CAT");
actions.appendChild(clear_button);
- if(module.mapper) {
- actions.appendChild(csv_button);
- }
- actions.appendChild(download_button);
+ actions.appendChild(download_widget);
actions.appendChild(fourcat_button);
row.appendChild(actions);
@@ -317,22 +328,38 @@ async function get_stats() {
async function button_handler(event) {
let status = document.getElementById('upload-status');
- if (event.target.matches('.reset')) {
+ // Close any open download-format popovers when clicking outside their host.
+ // Skip if the click is on a trigger or inside an options panel
+ if(!event.target.matches('.download-menu-trigger') && !event.target.closest('.download-options')) {
+ document.querySelectorAll('.download-options:not([hidden])').forEach(el => el.hidden = true);
+ }
+
+ if (event.target.matches('.download-menu-trigger')) {
+ const widget = event.target.closest('.download-menu');
+ const options = widget.querySelector('.download-options');
+ const opening = options.hidden;
+ // close any other menus before opening this one
+ document.querySelectorAll('.download-options:not([hidden])').forEach(el => {
+ if(el !== options) el.hidden = true;
+ });
+ options.hidden = !opening;
+
+ } else if (event.target.matches('.reset')) {
let platform = event.target.getAttribute('data-platform');
await background.db.items.where("source_platform").equals(platform).delete();
} else if (event.target.matches('.reset-all')) {
await background.db.items.clear();
- } else if (event.target.matches('.download-ndjson') || event.target.matches('.download-csv')) {
- const blobber = event.target.matches('.download-ndjson') ? get_ndjson_blob : get_csv_blob;
- const extension = event.target.matches('.download-ndjson') ? 'ndjson' : 'csv';
+ } else if (event.target.matches('.download-format')) {
+ const format = event.target.getAttribute('data-format');
+ const blobber = format === 'csv' ? get_csv_blob : get_ndjson_blob;
+ const extension = format;
let platform = event.target.getAttribute('data-platform');
let date = new Date();
event.target.classList.add('loading');
- //let blob = await download_blob(platform, 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.ndjson');
let blob = await blobber(platform);
let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension;
const downloadUrl = window.URL.createObjectURL(blob);
@@ -345,6 +372,10 @@ async function button_handler(event) {
event.target.classList.remove('loading');
+ // collapse the popover menu after the download fires
+ const widget = event.target.closest('.download-menu');
+ if(widget) widget.querySelector('.download-options').hidden = true;
+
} else if (event.target.matches('.upload-to-4cat')) {
let platform = event.target.getAttribute('data-platform');
status.innerText = 'Creating data file for uploading...';
From 0980a56f0ba6872884bfc1e891efc2cb9f4e4c33 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 6 May 2026 16:13:52 +0200
Subject: [PATCH 07/20] testing is hard in JS
---
docs/test-plan.md | 162 ++++++++++++++++++++++
modules/package.json | 3 +
tests/__pycache__/test.cpython-39.pyc | Bin 0 -> 7345 bytes
tests/duplicate-behavior.test.js | 3 +-
tests/{jest.config.js => jest.config.cjs} | 3 +-
tests/map_item.test.js | 130 +++++++++++++++++
tests/package.json | 5 +-
tests/setup-globals.cjs | 41 ++++++
8 files changed, 343 insertions(+), 4 deletions(-)
create mode 100644 docs/test-plan.md
create mode 100644 modules/package.json
create mode 100644 tests/__pycache__/test.cpython-39.pyc
rename tests/{jest.config.js => jest.config.cjs} (64%)
create mode 100644 tests/map_item.test.js
create mode 100644 tests/setup-globals.cjs
diff --git a/docs/test-plan.md b/docs/test-plan.md
new file mode 100644
index 0000000..249a7e0
--- /dev/null
+++ b/docs/test-plan.md
@@ -0,0 +1,162 @@
+# Selenium Test Harness — Improvement Plan
+
+Date: 2026-04-30
+
+Overview
+
+This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to:
+
+- Make profile handling reliable and reusable (so logged-in sessions persist across runs).
+- Preserve and export captured data per platform for offline analysis and for passing to 4CAT.
+- Add optional automated upload to a 4CAT instance for mapping/validation tests.
+- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns).
+- Improve robustness, error handling, and machine-readable results.
+
+Scope
+
+All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB).
+
+Phases & Changes
+
+Phase 1 — Profile management
+
+- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data.
+- Changes:
+ - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running.
+ - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root.
+ - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run).
+
+Implementation note (copytree ignore example):
+
+```python
+def _profile_ignore(root, names):
+ # Only ignore these entries in the root profile dir
+ if os.path.abspath(root) == os.path.abspath(profile_dir):
+ return {"storage", "extensions", "signedInUser.json"}
+ return set()
+
+shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore)
+```
+
+Phase 2 — Data preservation & export
+
+- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests.
+- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform.
+- Changes:
+ - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`).
+ - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`.
+ - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL).
+
+Execute_async_script pattern (example):
+
+```python
+script = '''
+const cb = arguments[0];
+background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)})));
+'''
+items_json = driver.execute_async_script(script)
+items = json.loads(items_json)
+```
+
+Phase 3 — 4CAT integration (optional)
+
+- Problem: mapping tests live in 4CAT and need NDJSON input.
+- Changes:
+ - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload.
+ - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required).
+ - Do not fail the test run on 4CAT errors — print status and continue.
+
+Example upload with `requests`:
+
+```python
+import requests
+with open(ndjson_path, 'rb') as f:
+ headers = {
+ 'X-Zeeschuimer-Platform': platform,
+ 'Authorization': f'Bearer {fourcat_key}'
+ }
+ r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f)
+ # check r.status_code and r.text for details
+```
+
+Phase 4 — Interactive controls & popup dismissals
+
+- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures.
+- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options.
+- Changes:
+ - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning).
+ - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example:
+
+```json
+"dismiss-selectors": ["button.cookie-accept", ".modal .close"]
+```
+
+ - Add per-URL `timeout` (page load timeout override).
+
+Phase 5 — Runner robustness & reporting
+
+- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results.
+- Changes:
+ - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue.
+ - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run.
+ - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted).
+ - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`).
+
+tests.json schema additions
+
+- Per-URL optional fields:
+ - `dismiss-selectors`: array of CSS selectors to click after page load
+ - `timeout`: numeric page load timeout seconds for this URL
+ - `extra-wait`: per-URL additional wait seconds
+
+CLI flags (summary)
+
+- `--profiledir PATH` — explicit profile path (existing)
+- `--profile-name NAME` — choose Firefox profile by display name
+- `--save-profile PATH` — persist the copied profile for reuse
+- `--no-cleanup` — keep `.temp-profile`
+- `--export-dir PATH` — where to write NDJSON exports
+- `--no-reset` — do not click `reset-all` between URLs
+- `--4cat-url URL` — base URL for 4CAT server
+- `--4cat-key KEY` — API key for 4CAT uploads
+- `--4cat-per-url` — upload per URL instead of per platform (optional)
+- `--no-interactive` — disable pausing (default is to pause per-platform)
+- `--pause-before-url` — pause before each URL
+- `--pause-on-fail` — pause when a test fails
+- `--extra-wait N` — add N seconds to every URL wait
+- `--screenshot-dir PATH` — save screenshots on fail/warning
+- `--results-file PATH` — write machine-readable results JSON
+- `--resume-from PLATFORM` — resume a run from a platform
+
+Verification checklist
+
+1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items.
+2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`.
+3. Run with default interactive behavior and confirm one pause per platform.
+4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts.
+5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`.
+
+Implementation steps (recommended order)
+
+1. Docs and small fixes (this document + tests.json typo fix).
+2. Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection).
+3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write.
+4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement.
+5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots).
+6. 4CAT upload integration (optional, requires confirmation of auth header).
+
+Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs.
+
+Open questions / confirmations needed
+
+- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead.
+- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.)
+
+Next steps
+
+- I have created a matching TODO list in the session tracker and written this document to `docs/test-plan.md`.
+- If you want, I can start implementing Phase 1 (profile management) in `tests/test.py` now and submit incremental changes.
+
+---
+
+Requested file: `docs/test-plan.md`
diff --git a/modules/package.json b/modules/package.json
new file mode 100644
index 0000000..3dbc1ca
--- /dev/null
+++ b/modules/package.json
@@ -0,0 +1,3 @@
+{
+ "type": "module"
+}
diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..745e2b4aaad921a459372bb50b39980c50a68136
GIT binary patch
literal 7345
zcmai3-E$k)b>CeqKnNl!lATnGrZeqKUpk()>F?ZK
zfYgUo?Cjn9aqc#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn
zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT93{IL20jve)WFJ2Ur;?Rb^p|fd`7tSsMTX)
znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq
zmQDT6@3C5>zBh(m30UiHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@
z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k
zj*D~U-e*Pih
zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQB%NzodA)+be-L|A4>0
zWz=u5dH#Wzhi?br&a$LC;2bI__ahHBlP7u
zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6
zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U
z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS
z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)Gy*k-{)9n^1&QO@`RU##kU>0t^p
zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL
zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E
z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l
z>rY58EC28q#XtPNGW)a-Z*K0AxoAtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)%
z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R=
ziv{I}#s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1
zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv|
z^_Qc;LZK=B6~`6akf++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@
zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU|
z5Hr}>XxN27G;Il~@l)1v+e%J1qd^Li`JS-EI}l6-EOYzp
zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc
zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uEo
z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH
z^{(DlgSgobgi6%csq+FF3M|+ScYVwDnQ_
zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k
z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z
z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY
z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2
zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyoqD0Q_<1q
z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza
z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW
z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0
z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt(
zS>Gm|S1G3ig=oK3^jeLA13wEOA;ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_
zYq`v_wydIt34U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF
zZnhV>!SMIP4HA6vf=ZxszUj7l81M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5OhrELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD
zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8
zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+
z@gN4*Eki55$ABHD_UHCEaPat|QKoE#
z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!?
z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F
z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO
z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E
z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp
zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38
zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G
zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP
z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti
r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU)
literal 0
HcmV?d00001
diff --git a/tests/duplicate-behavior.test.js b/tests/duplicate-behavior.test.js
index 031f663..9f0662b 100644
--- a/tests/duplicate-behavior.test.js
+++ b/tests/duplicate-behavior.test.js
@@ -5,8 +5,9 @@
* update or merge behaviors to duplicates across navigation boundaries.
*/
+import 'fake-indexeddb/auto';
+
let Dexie;
-require('fake-indexeddb/auto');
// Mock browser extension APIs
global.browser = {
diff --git a/tests/jest.config.js b/tests/jest.config.cjs
similarity index 64%
rename from tests/jest.config.js
rename to tests/jest.config.cjs
index 7dd5b02..ea72b10 100644
--- a/tests/jest.config.js
+++ b/tests/jest.config.cjs
@@ -3,6 +3,7 @@ module.exports = {
testMatch: ['**/*.test.js'],
transform: {},
moduleFileExtensions: ['js', 'json'],
- collectCoverageFrom: ['duplicate-behavior.test.js'],
+ collectCoverageFrom: ['*.test.js'],
+ setupFiles: ['<rootDir>/setup-globals.cjs'],
verbose: true
};
diff --git a/tests/map_item.test.js b/tests/map_item.test.js
new file mode 100644
index 0000000..9dee6e8
--- /dev/null
+++ b/tests/map_item.test.js
@@ -0,0 +1,130 @@
+/**
+ * Auto-discovery test driver for module `map_item` functions.
+ *
+ * Convention:
+ * tests/fixtures/<module>/*.ndjson
+ *
+ * <module> matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js).
+ * Each .ndjson line is one Zeeschuimer-stored item exported from the popup.
+ *
+ * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer
+ * presents items to a map_item function, then run through the module's
+ * map_item. Tests assert: function returns a non-null object, and any fields
+ * listed in REQUIRED_NON_EMPTY for that module are present and non-empty.
+ */
+
+import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
+import { spawnSync } from 'node:child_process';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+/**
+ * Local mirror of wrap_for_map_item from js/lib.js.
+ *
+ * lib.js is loaded by the browser as a plain script (it defines globals
+ * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be
+ * imported from Node. The wrap is three trivial lines with no dependencies
+ * — duplicating it here is cheaper than restructuring lib.js into a module.
+ * If lib.js's wrap_for_map_item ever gains real logic, this needs to track.
+ */
+function wrap_for_map_item(stored_item) {
+ const { data, ...meta } = stored_item;
+ return { ...data, __import_meta: meta };
+}
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const FIXTURE_ROOT = join(__dirname, 'fixtures');
+const MODULES_ROOT = join(__dirname, '..', 'modules');
+
+/**
+ * Pre-validate module syntax before dynamic import.
+ *
+ * `await import()` on a module with a syntax error throws inside V8's module
+ * linker in a way Jest's experimental-vm-modules can't always recover from
+ * (worker retry loop or Node process exit). Running `node --check` first
+ * gives us a clean error string we can fail the test with.
+ */
+function check_module_syntax(module_name) {
+ const module_path = join(MODULES_ROOT, `${module_name}.js`);
+ const result = spawnSync(process.execPath, ['--check', module_path], {
+ encoding: 'utf8',
+ });
+ if (result.status === 0) return null;
+ return (result.stderr || result.stdout || `exit code ${result.status}`).trim();
+}
+
+const REQUIRED_NON_EMPTY = {
+ tiktok: ['id', 'author', 'unix_timestamp'],
+};
+
+function list_module_dirs() {
+ if (!existsSync(FIXTURE_ROOT)) return [];
+ return readdirSync(FIXTURE_ROOT).filter(name => {
+ try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); }
+ catch { return false; }
+ });
+}
+
+const module_dirs = list_module_dirs();
+let total_fixtures = 0;
+
+for (const module_name of module_dirs) {
+ const fixture_dir = join(FIXTURE_ROOT, module_name);
+ const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
+
+ if (fixture_files.length === 0) continue;
+ total_fixtures += fixture_files.length;
+
+ describe(`map_item: ${module_name}`, () => {
+ let map_item;
+ let import_error;
+
+ beforeAll(async () => {
+ const syntax_error = check_module_syntax(module_name);
+ if (syntax_error) {
+ import_error = new Error(`syntax error:\n${syntax_error}`);
+ return;
+ }
+ try {
+ const mod = await import(`../modules/${module_name}.js`);
+ map_item = mod.map_item;
+ if (typeof map_item !== 'function') {
+ import_error = new Error(`modules/${module_name}.js does not export a map_item function`);
+ }
+ } catch (e) {
+ import_error = e;
+ }
+ });
+
+ for (const fixture_file of fixture_files) {
+ const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
+ .split('\n')
+ .filter(line => line.trim().length > 0);
+
+ describe(fixture_file, () => {
+ lines.forEach((line, i) => {
+ test(`item ${i} maps without throwing`, () => {
+ if (import_error) {
+ throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`);
+ }
+ const stored_item = JSON.parse(line);
+ const mapped = map_item(wrap_for_map_item(stored_item));
+ expect(mapped).not.toBeNull();
+ expect(typeof mapped).toBe('object');
+ for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) {
+ expect(mapped[field]).toBeDefined();
+ expect(mapped[field]).not.toBe('');
+ expect(mapped[field]).not.toBeNull();
+ }
+ });
+ });
+ });
+ }
+ });
+}
+
+if (total_fixtures === 0) {
+ describe('map_item', () => {
+ test.skip('no fixtures found under tests/fixtures//*.ndjson', () => {});
+ });
+}
diff --git a/tests/package.json b/tests/package.json
index dc3654c..6dd35fb 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -2,9 +2,10 @@
"name": "zeeschuimer-db-tests",
"version": "1.0.0",
"description": "Unit tests for Zeeschuimer duplicate handling logic",
+ "type": "module",
"scripts": {
- "test": "jest",
- "test:watch": "jest --watch"
+ "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
+ "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch"
},
"devDependencies": {
"dexie": "^3.2.4",
diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs
new file mode 100644
index 0000000..a19fb09
--- /dev/null
+++ b/tests/setup-globals.cjs
@@ -0,0 +1,41 @@
+/**
+ * Make js/lib.js's helpers available as globals inside the Jest test
+ * environment, mirroring how the browser sees them after the manifest
+ * loads lib.js as a plain script.
+ *
+ * map_item bodies reference these as free identifiers (MappedItem,
+ * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this
+ * shim they'd hit ReferenceError as soon as a test invokes map_item.
+ *
+ * Approach: read lib.js, wrap it in a new Function() body that returns the
+ * named helpers, call the function, and assign the returned object onto
+ * globalThis. (Earlier attempt with vm.runInThisContext failed because in
+ * the jsdom env the vm context's global differs from jsdom's window.)
+ *
+ * If a new helper is added to lib.js, append its name to EXPOSED_NAMES.
+ */
+
+const fs = require('node:fs');
+const path = require('node:path');
+
+const EXPOSED_NAMES = [
+ 'traverse_data',
+ 'MappedItem',
+ 'MissingMappedField',
+ 'wrap_for_map_item',
+ 'strip_tags',
+ 'normalize_url_encoding',
+ 'formatUtcTimestamp',
+];
+
+const lib_source = fs.readFileSync(
+ path.join(__dirname, '..', 'js', 'lib.js'),
+ 'utf8',
+);
+
+const factory = new Function(`
+${lib_source}
+return { ${EXPOSED_NAMES.join(', ')} };
+`);
+
+Object.assign(globalThis, factory());
From 46b96c77ffd45f465f90880915e1f6d2836bd87e Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 6 May 2026 16:25:56 +0200
Subject: [PATCH 08/20] add fixtures folder and README.md to explain what I did
---
tests/fixtures/.gitignore | 5 +++++
tests/fixtures/README.md | 29 +++++++++++++++++++++++++++++
2 files changed, 34 insertions(+)
create mode 100644 tests/fixtures/.gitignore
create mode 100644 tests/fixtures/README.md
diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore
new file mode 100644
index 0000000..8e89a83
--- /dev/null
+++ b/tests/fixtures/.gitignore
@@ -0,0 +1,5 @@
+# Ignore everything in this directory
+*
+# Except these files
+!.gitignore
+!README.md
\ No newline at end of file
diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md
new file mode 100644
index 0000000..d24fe06
--- /dev/null
+++ b/tests/fixtures/README.md
@@ -0,0 +1,29 @@
+# Test fixtures for `map_item`
+
+Real captured items used to exercise each module's auto-generated `map_item`
+function.
+
+## Layout
+
+```
+tests/fixtures/
+  <module>/
+    <export-1>.ndjson
+    <export-2>.ndjson
+```
+
+`<module>` matches the filename in `modules/` without `.js` —
+e.g. `tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`.
+You can drop multiple `.ndjson` files in a module folder; each gets its own
+`describe` block and each line becomes its own `test`.
+
+Filenames are free-form — the auto-export filename from the popup
+(`zeeschuimer-export-<platform>-<timestamp>.ndjson`) is fine.
+
+## Privacy / committing
+
+These files contain real captured platform data — usernames, post
+content, URLs, sometimes images and other PII.
+
+If we want to commit synthetic test exports or anonymized real exports, add them
+as exceptions (`!<path>`) in `.gitignore`, which otherwise ignores everything here.
\ No newline at end of file
From 487b5b618e4a989cbfca7dbfe2b30b1e78dc62ad Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Thu, 7 May 2026 15:53:22 +0200
Subject: [PATCH 09/20] add MapItemException
---
js/lib.js | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/js/lib.js b/js/lib.js
index 3b144d2..e38430e 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -59,6 +59,19 @@ class MissingMappedField {
}
}
+/**
+ * Raised by `map_item` to signal a known mapping failure.
+ *
+ * Mirrors 4CAT's MapItemException: callers should catch it, skip the item,
+ * and warn the user that the platform's format may have shifted.
+ */
+class MapItemException extends Error {
+ constructor(message) {
+ super(message);
+ this.name = "MapItemException";
+ }
+}
+
/**
* Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects.
*
From b6f487dbfa017a79207726f04f059078aaf4c4b5 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Thu, 7 May 2026 15:56:14 +0200
Subject: [PATCH 10/20] make a warning pop up
---
popup/interface.html | 42 ++++++++++++++++++++++++++++++
popup/interface.js | 62 +++++++++++++++++++++++++++++++++++++++-----
2 files changed, 97 insertions(+), 7 deletions(-)
diff --git a/popup/interface.html b/popup/interface.html
index e9d9b3f..0570e40 100644
--- a/popup/interface.html
+++ b/popup/interface.html
@@ -303,6 +303,42 @@
text-align: center;
}
+ #csv-warning {
+ position: fixed;
+ inset: 0;
+ background: rgba(60, 60, 59, 0.55);
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ z-index: 1000;
+ }
+
+ #csv-warning[hidden] {
+ display: none;
+ }
+
+ #csv-warning .csv-warning-content {
+ background: var(--accent);
+ color: var(--neutral-contrast);
+ border: 2px solid var(--accent-alt);
+ border-radius: 6px;
+ padding: 1.25em 1.25em 1em 1.25em;
+ max-width: 24em;
+ text-align: center;
+ box-shadow: 0 0 20px var(--neutral-contrast);
+ }
+
+ #csv-warning .csv-warning-content p {
+ margin: 0 0 1em 0;
+ line-height: 1.4;
+ }
+
+ #csv-warning .dismiss-csv-warning {
+ display: block;
+ margin: 0 auto;
+ padding: 0.3em 1.25em;
+ }
+
.tooltippable:not(a):not(button) {
display: inline-block;
background: var(--neutral-contrast);
@@ -409,6 +445,12 @@
+
Zeeschuimer
diff --git a/popup/interface.js b/popup/interface.js
index 3b8aaa9..c56375a 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -351,16 +351,29 @@ async function button_handler(event) {
} else if (event.target.matches('.reset-all')) {
await background.db.items.clear();
+ } else if (event.target.matches('.dismiss-csv-warning')) {
+ const warning = document.getElementById('csv-warning');
+ if(warning) warning.hidden = true;
+
} else if (event.target.matches('.download-format')) {
const format = event.target.getAttribute('data-format');
- const blobber = format === 'csv' ? get_csv_blob : get_ndjson_blob;
const extension = format;
let platform = event.target.getAttribute('data-platform');
let date = new Date();
event.target.classList.add('loading');
- let blob = await blobber(platform);
+ let blob;
+ if(format === 'csv') {
+ const result = await get_csv_blob(platform);
+ blob = result.blob;
+ if(result.skipped > 0) {
+ console.warn(`Zeeschuimer: skipped ${result.skipped} ${platform} item(s) during CSV export. First reason: ${result.firstReason}`);
+ show_csv_warning(platform, result.skipped);
+ }
+ } else {
+ blob = await get_ndjson_blob(platform);
+ }
let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension;
const downloadUrl = window.URL.createObjectURL(blob);
const downloadId = await browser.downloads.download({
@@ -637,27 +650,62 @@ function csv_escape(value) {
return value;
}
+/**
+ * Surface a CSV-export skip warning in the popup.
+ *
+ * Shown when the platform's `map_item` raised MapItemException for one or
+ * more items — typically the platform's response shape has shifted and the
+ * mapper no longer recognises every field. The user is steered to the
+ * .ndjson export, which is unaffected because it skips the mapper entirely.
+ */
+function show_csv_warning(platform, skipped) {
+ const warning = document.getElementById('csv-warning');
+ if(!warning) return;
+ const message = warning.querySelector('p');
+ message.innerText = `Skipped ${skipped} ${platform} item${skipped === 1 ? '' : 's'} in the CSV export — the platform's data format may have changed. Use the .ndjson export to get the full dataset until Zeeschuimer is updated.`;
+ warning.hidden = false;
+}
+
/**
* Get a CSV dump of items
*
* Returns a Blob with all items in it as CSV rows, mapped via the module's
* registered mapper function. A header row is included.
*
+ * Items whose mapper raises MapItemException are skipped and counted; any
+ * other error propagates. Skip count and the first skip reason are returned
+ * alongside the blob so the caller can warn the user. Just like 4CAT!
+ *
* @param platform
- * @returns {Promise}
+ * @returns {Promise<{blob: Blob, skipped: number, firstReason: string|null}>}
*/
async function get_csv_blob(platform) {
let csv = [];
+ let skipped = 0;
+ let firstReason = null;
const module = background.zeeschuimer.modules[platform];
await iterate_items(platform, function(item) {
- item = module.mapper(item);
+ let mapped;
+ try {
+ mapped = module.mapper(item);
+ } catch(e) {
+ // More JS fun: Check tag rather than `instanceof`.
+ // Actual Exception lives in some other realm (where modules and lib.js live), and cross-realm
+ // `instanceof` is unreliable under Firefox's wrappers.
+ if(e && e.name === 'MapItemException') {
+ skipped++;
+ if(firstReason === null) firstReason = e.message;
+ return;
+ }
+ throw e;
+ }
if(csv.length === 0) {
- csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
+ csv.push(Object.keys(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
}
- csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
+ csv.push(Object.values(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
})
- return new Blob(csv, {type: 'text/csv'});
+ return {blob: new Blob(csv, {type: 'text/csv'}), skipped, firstReason};
}
/**
From f28e310c8893bb49ac535d33cc94089e8d0686b2 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Thu, 7 May 2026 16:42:19 +0200
Subject: [PATCH 11/20] add MapItemException
---
tests/setup-globals.cjs | 1 +
1 file changed, 1 insertion(+)
diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs
index a19fb09..4f54e34 100644
--- a/tests/setup-globals.cjs
+++ b/tests/setup-globals.cjs
@@ -22,6 +22,7 @@ const EXPOSED_NAMES = [
'traverse_data',
'MappedItem',
'MissingMappedField',
+ 'MapItemException',
'wrap_for_map_item',
'strip_tags',
'normalize_url_encoding',
From 5baff31ae49167d215a56cf16ead326b22d975f3 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 15:16:06 +0200
Subject: [PATCH 12/20] add env variables for tests (to connect to 4CAT)
---
.gitignore | 2 ++
tests/.env.example | 9 +++++++++
tests/package-lock.json | 14 ++++++++++++++
tests/package.json | 4 +++-
4 files changed, 28 insertions(+), 1 deletion(-)
create mode 100644 tests/.env.example
diff --git a/.gitignore b/.gitignore
index 6cf9326..fea65f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@
# Testing artefacts
.temp-profile
+tests/.env
+tests/.env.local
# logs
geckodriver.log
diff --git a/tests/.env.example b/tests/.env.example
new file mode 100644
index 0000000..2e021bb
--- /dev/null
+++ b/tests/.env.example
@@ -0,0 +1,9 @@
+# 4CAT API config for the map_item comparison tests.
+# Copy this file to .env in this directory and fill in real values.
+# .env is gitignored; .env.example is the committed template.
+
+# Base URL of the 4CAT instance to hit. No trailing slash.
+FOURCAT_URL=http://localhost
+
+# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user.
+FOURCAT_API_KEY=your-api-key-here
diff --git a/tests/package-lock.json b/tests/package-lock.json
index cc8f457..d055883 100644
--- a/tests/package-lock.json
+++ b/tests/package-lock.json
@@ -9,6 +9,7 @@
"version": "1.0.0",
"devDependencies": {
"dexie": "^3.2.4",
+ "dotenv": "^16.4.5",
"fake-indexeddb": "^5.0.1",
"jest": "^29.7.0",
"jest-environment-jsdom": "^29.7.0"
@@ -1758,6 +1759,19 @@
"node": ">=12"
}
},
+ "node_modules/dotenv": {
+ "version": "16.6.1",
+ "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
+ "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
+ "dev": true,
+ "license": "BSD-2-Clause",
+ "engines": {
+ "node": ">=12"
+ },
+ "funding": {
+ "url": "https://dotenvx.com"
+ }
+ },
"node_modules/dunder-proto": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
diff --git a/tests/package.json b/tests/package.json
index 6dd35fb..333564a 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -5,10 +5,12 @@
"type": "module",
"scripts": {
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
- "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch"
+ "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch",
+ "probe": "node probe-4cat.mjs"
},
"devDependencies": {
"dexie": "^3.2.4",
+ "dotenv": "^16.4.5",
"fake-indexeddb": "^5.0.1",
"jest": "^29.7.0",
"jest-environment-jsdom": "^29.7.0"
From 6a8ce3870f4e0b6c050d68573d8affa4cc46e37b Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 15:16:34 +0200
Subject: [PATCH 13/20] mirror 4CAT API missing value
---
js/lib.js | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/js/lib.js b/js/lib.js
index e38430e..c618a6a 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -57,6 +57,12 @@ class MissingMappedField {
toString() {
return `${this.value}`;
}
+
+ // Mirror 4CAT's API serialization so JSON.stringify produces the same
+ // tagged form on both sides. See docs/4cat-map-item-api.md.
+ toJSON() {
+ return { __missing: true, value: this.value };
+ }
}
/**
From 0c3140376ebd6e37cb1706fc48a105168d84d089 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 18:41:52 +0200
Subject: [PATCH 14/20] test the 4cat API endpoint
---
tests/probe-4cat.mjs | 140 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 140 insertions(+)
create mode 100644 tests/probe-4cat.mjs
diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs
new file mode 100644
index 0000000..0bf4e4d
--- /dev/null
+++ b/tests/probe-4cat.mjs
@@ -0,0 +1,140 @@
+/**
+ * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item.
+ *
+ * Usage:
+ * node probe-4cat.mjs <module> [<fixture>] [--index N]
+ *
+ * <module> is the Zeeschuimer module filename without `.js` (e.g.
+ * "tiktok", "pinterest"). If <fixture> is omitted, the first
+ * .ndjson in tests/fixtures/<module>/ is used. --index selects which
+ * line of the fixture to send (default 0).
+ *
+ * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY.
+ */
+
+import 'dotenv/config';
+import { readFileSync, existsSync, readdirSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, '');
+const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY;
+
+if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') {
+ console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env');
+ console.error(' (copy tests/.env.example to tests/.env and fill in real values)');
+ process.exit(1);
+}
+
+const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json');
+const ID_MAP = existsSync(ID_MAP_PATH)
+ ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8'))
+ : {};
+
+function auth_headers() {
+ return { 'Authorization': `${FOURCAT_API_KEY}` };
+}
+
+async function list_datasources() {
+ const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() });
+ if (!res.ok) {
+ throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`);
+ }
+ const body = await res.json();
+ return body.datasources ?? [];
+}
+
+async function map_item(datasource_id, item) {
+ const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, {
+ method: 'POST',
+ headers: { ...auth_headers(), 'Content-Type': 'application/json' },
+ body: JSON.stringify({ item }),
+ });
+ const text = await res.text();
+ let body;
+ try { body = JSON.parse(text); } catch { body = { raw: text }; }
+ return { status_code: res.status, body };
+}
+
+function parse_args(argv) {
+ const args = { module: null, fixture: null, index: 0 };
+ const positional = [];
+ for (let i = 2; i < argv.length; i++) {
+ if (argv[i] === '--index') {
+ args.index = parseInt(argv[++i], 10);
+ } else if (argv[i].startsWith('--index=')) {
+ args.index = parseInt(argv[i].split('=')[1], 10);
+ } else {
+ positional.push(argv[i]);
+ }
+ }
+ args.module = positional[0];
+ args.fixture = positional[1];
+ return args;
+}
+
+async function main() {
+ const args = parse_args(process.argv);
+ if (!args.module) {
+ console.error('Usage: node probe-4cat.mjs [] [--index N]');
+ process.exit(1);
+ }
+
+ const datasource_id = ID_MAP[args.module] ?? args.module;
+ const fixture_dir = join(__dirname, 'fixtures', args.module);
+
+ if (!existsSync(fixture_dir)) {
+ console.error(`error: no fixture dir at ${fixture_dir}`);
+ process.exit(1);
+ }
+
+ const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
+ if (candidates.length === 0) {
+ console.error(`error: no .ndjson fixtures under ${fixture_dir}`);
+ process.exit(1);
+ }
+ const fixture_name = args.fixture ?? candidates[0];
+ const fixture_path = join(fixture_dir, fixture_name);
+ if (!existsSync(fixture_path)) {
+ console.error(`error: fixture ${fixture_path} not found`);
+ process.exit(1);
+ }
+
+ const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0);
+ if (args.index >= lines.length) {
+ console.error(`error: --index ${args.index} but fixture has ${lines.length} items`);
+ process.exit(1);
+ }
+ const item = JSON.parse(lines[args.index]);
+
+ console.log(`Module: ${args.module}`);
+ console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`);
+ console.log(`URL: ${FOURCAT_URL}/api/map-item/${datasource_id}/`);
+ console.log(`Fixture: ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`);
+ console.log('');
+
+ const { status_code, body } = await map_item(datasource_id, item);
+ console.log(`HTTP ${status_code}`);
+ console.log(JSON.stringify(body, null, 2));
+
+ if (status_code === 404) {
+ console.error('');
+ console.error('Hint: datasource id may be wrong. Available Zeeschuimer-origin datasources:');
+ try {
+ const datasources = await list_datasources();
+ datasources
+ .filter(d => d.is_from_zeeschuimer && d.has_map_item)
+ .forEach(d => console.error(` - ${d.id} (${d.name})`));
+ } catch (e) {
+ console.error(` (couldn't fetch list: ${e.message})`);
+ }
+ process.exit(2);
+ }
+}
+
+main().catch(e => {
+ console.error(`probe failed: ${e.message}`);
+ process.exit(2);
+});
From be2f3087d8dd5af07175101a808903604c84d78b Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 18:43:04 +0200
Subject: [PATCH 15/20] update docs and packages
---
docs/test-plan.md | 6 +++---
tests/package-lock.json | 13 ++++++++++++-
tests/setup-globals.cjs | 11 +++++++++++
3 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/docs/test-plan.md b/docs/test-plan.md
index 249a7e0..a4265eb 100644
--- a/docs/test-plan.md
+++ b/docs/test-plan.md
@@ -63,7 +63,7 @@ Phase 3 — 4CAT integration (optional)
- Problem: mapping tests live in 4CAT and need NDJSON input.
- Changes:
- Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload.
- - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required).
+ - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required).
- Do not fail the test run on 4CAT errors — print status and continue.
Example upload with `requests`:
@@ -73,7 +73,7 @@ import requests
with open(ndjson_path, 'rb') as f:
headers = {
'X-Zeeschuimer-Platform': platform,
- 'Authorization': f'Bearer {fourcat_key}'
+ 'Authorization': f'{fourcat_key}'
}
r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f)
# check r.status_code and r.text for details
@@ -149,7 +149,7 @@ Estimated effort: 6–10 hours of focused work to implement and test everything
Open questions / confirmations needed
-- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead.
+- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead.
- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.)
Next steps
diff --git a/tests/package-lock.json b/tests/package-lock.json
index d055883..7758e9f 100644
--- a/tests/package-lock.json
+++ b/tests/package-lock.json
@@ -12,7 +12,8 @@
"dotenv": "^16.4.5",
"fake-indexeddb": "^5.0.1",
"jest": "^29.7.0",
- "jest-environment-jsdom": "^29.7.0"
+ "jest-environment-jsdom": "^29.7.0",
+ "undici": "^6.20.0"
}
},
"node_modules/@babel/code-frame": {
@@ -4197,6 +4198,16 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
+ "node_modules/undici": {
+ "version": "6.26.0",
+ "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz",
+ "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==",
+ "dev": true,
+ "license": "MIT",
+ "engines": {
+ "node": ">=18.17"
+ }
+ },
"node_modules/undici-types": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs
index 4f54e34..6793cc0 100644
--- a/tests/setup-globals.cjs
+++ b/tests/setup-globals.cjs
@@ -40,3 +40,14 @@ return { ${EXPOSED_NAMES.join(', ')} };
`);
Object.assign(globalThis, factory());
+
+// jsdom doesn't expose fetch and Jest's jsdom env shadows Node's global
+// fetch, so the comparator can't hit 4CAT from a jsdom test file. No
+// polyfill is installed here, not even from the npm `undici` package (a
+// Node-friendly HTTP client, distinct from the undici bundled internally
+// by Node, which isn't require()-able by name).
+// Instead, tests that use fetch (e.g. map_item_compare.test.js) declare
+// `@jest-environment node` at the top of the file. Node env has fetch
+// natively. Don't try to polyfill into jsdom — undici's internals use
+// Node-specific globals that jsdom shadows (clearImmediate,
+// markResourceTiming, fast timers), and polyfilling them all is brittle.
From caf1c7f48a19524282c06b688c08001e534791db Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 18:43:17 +0200
Subject: [PATCH 16/20] some mapping for odd datasource names
---
tests/zeeschuimer-to-4cat.json | 7 +++++++
1 file changed, 7 insertions(+)
create mode 100644 tests/zeeschuimer-to-4cat.json
diff --git a/tests/zeeschuimer-to-4cat.json b/tests/zeeschuimer-to-4cat.json
new file mode 100644
index 0000000..f7de942
--- /dev/null
+++ b/tests/zeeschuimer-to-4cat.json
@@ -0,0 +1,7 @@
+{
+ "_comment": "Maps Zeeschuimer module filenames (without .js) to 4CAT datasource ids when they differ. Default behavior is identity — only include entries where the two diverge. Discovered via http://localhost/api/datasources/.",
+ "9gag": "ninegag",
+ "truth": "truthsocial",
+ "rednote": "xiaohongshu",
+ "rednote-comments": "xiaohongshu-comments"
+}
From f10fc492845051c87b96b75561eb91de2af99d18 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 18:44:05 +0200
Subject: [PATCH 17/20] update existing map_item tests and add helper
---
tests/_module-info.js | 45 ++++++++++++++++++
tests/map_item.test.js | 105 +++++++++++++++++++----------------------
2 files changed, 93 insertions(+), 57 deletions(-)
create mode 100644 tests/_module-info.js
diff --git a/tests/_module-info.js b/tests/_module-info.js
new file mode 100644
index 0000000..e261e4e
--- /dev/null
+++ b/tests/_module-info.js
@@ -0,0 +1,45 @@
+/**
+ * Shared helper for the map_item test drivers.
+ *
+ * Pre-validates a module by:
+ * 1. Running `node --check` on its file (syntax check; avoids the
+ * worker-killing experimental-ESM crash when a syntax error reaches
+ * the dynamic importer).
+ * 2. Dynamically importing it and checking for a `map_item` export.
+ *
+ * Returns one of four states the test driver can branch on:
+ * { state: 'ok', map_item: <function> }
+ * { state: 'no_map_item' }
+ * { state: 'syntax_error', error: <string> }
+ * { state: 'import_error', error: <thrown error> }
+ */
+
+import { spawnSync } from 'node:child_process';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const MODULES_ROOT = join(__dirname, '..', 'modules');
+
+function check_module_syntax(module_name) {
+ const module_path = join(MODULES_ROOT, `${module_name}.js`);
+ const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' });
+ if (result.status === 0) return null;
+ return (result.stderr || result.stdout || `exit code ${result.status}`).trim();
+}
+
+export async function inspect_module(module_name) {
+ const syntax_error = check_module_syntax(module_name);
+ if (syntax_error) {
+ return { state: 'syntax_error', error: syntax_error };
+ }
+ try {
+ const mod = await import(`../modules/${module_name}.js`);
+ if (typeof mod.map_item !== 'function') {
+ return { state: 'no_map_item' };
+ }
+ return { state: 'ok', map_item: mod.map_item };
+ } catch (e) {
+ return { state: 'import_error', error: e };
+ }
+}
diff --git a/tests/map_item.test.js b/tests/map_item.test.js
index 9dee6e8..2dc1bb6 100644
--- a/tests/map_item.test.js
+++ b/tests/map_item.test.js
@@ -1,5 +1,5 @@
/**
- * Auto-discovery test driver for module `map_item` functions.
+ * Smoke test driver for module `map_item` functions.
*
* Convention:
* tests/fixtures//*.ndjson
@@ -11,52 +11,36 @@
* presents items to a map_item function, then run through the module's
* map_item. Tests assert: function returns a non-null object, and any fields
* listed in REQUIRED_NON_EMPTY for that module are present and non-empty.
+ *
+ * Module-level state is determined upfront by inspect_module():
+ * - 'ok' → register per-item tests
+ * - 'no_map_item' → register a single skipped test (not applicable)
+ * - 'syntax_error' → register a single failing test pointing at the line
+ * - 'import_error' → register a single failing test with the message
*/
import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
-import { spawnSync } from 'node:child_process';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
-
-/**
- * Local mirror of wrap_for_map_item from js/lib.js.
- *
- * lib.js is loaded by the browser as a plain script (it defines globals
- * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be
- * imported from Node. The wrap is three trivial lines with no dependencies
- * — duplicating it here is cheaper than restructuring lib.js into a module.
- * If lib.js's wrap_for_map_item ever gains real logic, this needs to track.
- */
-function wrap_for_map_item(stored_item) {
- const { data, ...meta } = stored_item;
- return { ...data, __import_meta: meta };
-}
+import { inspect_module } from './_module-info.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const FIXTURE_ROOT = join(__dirname, 'fixtures');
-const MODULES_ROOT = join(__dirname, '..', 'modules');
-
-/**
- * Pre-validate module syntax before dynamic import.
- *
- * `await import()` on a module with a syntax error throws inside V8's module
- * linker in a way Jest's experimental-vm-modules can't always recover from
- * (worker retry loop or Node process exit). Running `node --check` first
- * gives us a clean error string we can fail the test with.
- */
-function check_module_syntax(module_name) {
- const module_path = join(MODULES_ROOT, `${module_name}.js`);
- const result = spawnSync(process.execPath, ['--check', module_path], {
- encoding: 'utf8',
- });
- if (result.status === 0) return null;
- return (result.stderr || result.stdout || `exit code ${result.status}`).trim();
-}
const REQUIRED_NON_EMPTY = {
tiktok: ['id', 'author', 'unix_timestamp'],
};
+/**
+ * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by
+ * the browser as a plain script and so cannot be imported from Node; this
+ * three-line mirror is cheaper than restructuring lib.js into a module.
+ */
+function wrap_for_map_item(stored_item) {
+ const { data, ...meta } = stored_item;
+ return { ...data, __import_meta: meta };
+}
+
function list_module_dirs() {
if (!existsSync(FIXTURE_ROOT)) return [];
return readdirSync(FIXTURE_ROOT).filter(name => {
@@ -66,36 +50,46 @@ function list_module_dirs() {
}
const module_dirs = list_module_dirs();
+
+// Pre-pass: resolve each module's state up front (awaited one at a time)
+// so we can branch on it at describe/test registration time. Top-level
+// await is supported in Jest's experimental-vm-modules mode.
+const module_info = {};
+for (const module_name of module_dirs) {
+ module_info[module_name] = await inspect_module(module_name);
+}
+
let total_fixtures = 0;
for (const module_name of module_dirs) {
const fixture_dir = join(FIXTURE_ROOT, module_name);
const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
-
if (fixture_files.length === 0) continue;
total_fixtures += fixture_files.length;
- describe(`map_item: ${module_name}`, () => {
- let map_item;
- let import_error;
-
- beforeAll(async () => {
- const syntax_error = check_module_syntax(module_name);
- if (syntax_error) {
- import_error = new Error(`syntax error:\n${syntax_error}`);
- return;
- }
- try {
- const mod = await import(`../modules/${module_name}.js`);
- map_item = mod.map_item;
- if (typeof map_item !== 'function') {
- import_error = new Error(`modules/${module_name}.js does not export a map_item function`);
- }
- } catch (e) {
- import_error = e;
- }
+ const info = module_info[module_name];
+
+ if (info.state === 'no_map_item') {
+ describe(`map_item: ${module_name}`, () => {
+ test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {});
+ });
+ continue;
+ }
+
+ if (info.state === 'syntax_error' || info.state === 'import_error') {
+ const msg = info.state === 'syntax_error'
+ ? `syntax error:\n${info.error}`
+ : `import failed: ${info.error.message}`;
+ describe(`map_item: ${module_name}`, () => {
+ test(`module loads`, () => { throw new Error(msg); });
});
+ continue;
+ }
+
+ // state === 'ok' — register per-item tests
+ const map_item = info.map_item;
+ describe(`map_item: ${module_name}`, () => {
for (const fixture_file of fixture_files) {
const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
.split('\n')
@@ -104,9 +98,6 @@ for (const module_name of module_dirs) {
describe(fixture_file, () => {
lines.forEach((line, i) => {
test(`item ${i} maps without throwing`, () => {
- if (import_error) {
- throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`);
- }
const stored_item = JSON.parse(line);
const mapped = map_item(wrap_for_map_item(stored_item));
expect(mapped).not.toBeNull();
From 3633cde656da3f70880ae49a2909deba3a044953 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 18:44:23 +0200
Subject: [PATCH 18/20] comparison testing for datasources
---
tests/map_item_compare.test.js | 283 +++++++++++++++++++++++++++++++++
1 file changed, 283 insertions(+)
create mode 100644 tests/map_item_compare.test.js
diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js
new file mode 100644
index 0000000..37e3e4c
--- /dev/null
+++ b/tests/map_item_compare.test.js
@@ -0,0 +1,283 @@
+/**
+ * @jest-environment node
+ *
+ * This file runs in Node test environment (not jsdom) because undici's
+ * fetch implementation uses Node-internal APIs (`clearImmediate`,
+ * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or
+ * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env
+ * has them all natively.
+ *
+ * Trade-off: no DOMParser in node env. The four modules that use
+ * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser
+ * polyfill (e.g. via linkedom) before the comparator can run against
+ * them. Other modules (including instagram) work as-is.
+ */
+/**
+ * Compare JS map_item output against 4CAT's Python map_item via the API.
+ *
+ * For every line in every fixture, runs the JS map_item locally AND sends
+ * the same stored item to 4CAT's /api/map-item/<datasource_id>/ endpoint, then
+ * diffs the two outputs field-by-field. Each item is its own Jest test —
+ * failures point at exactly which item and which fields diverge.
+ *
+ * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so
+ * `npm test` keeps working without 4CAT configuration. Drop real values in
+ * tests/.env to enable.
+ *
+ * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer
+ * module filename → 4CAT datasource id, for the few names that diverge).
+ *
+ * Module-level state is determined upfront by inspect_module(): a missing
+ * map_item, syntax errors and import errors are handled before tests are
+ * registered, so they appear once per module, not once per item.
+ */
+
+import 'dotenv/config';
+import { jest } from '@jest/globals';
+import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { inspect_module } from './_module-info.js';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, '');
+const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY;
+const HAS_4CAT = Boolean(
+ FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here'
+);
+
+// When true (default), once any item in a module fails, subsequent items
+// in that same module skip the HTTP + map_item work and fail fast with a
+// "halted" message. Saves time when generator output is broken at the top.
+// Set FAIL_FAST=0 in env to run all items regardless.
+// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing
+// space in the variable value, which would otherwise defeat `!== '0'`.
+const FAIL_FAST = (process.env.FAIL_FAST ?? '').trim() !== '0';
+const halted_modules = new Set();
+
+const FIXTURE_ROOT = join(__dirname, 'fixtures');
+const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json');
+const ID_MAP = existsSync(ID_MAP_PATH)
+ ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8'))
+ : {};
+
+function wrap_for_map_item(stored_item) {
+ const { data, ...meta } = stored_item;
+ return { ...data, __import_meta: meta };
+}
+
+async function call_4cat_map_item(datasource_id, item) {
+ const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, {
+ method: 'POST',
+ headers: {
+ // 4CAT accepts the raw key without a `Bearer ` prefix, per probe
+ 'Authorization': FOURCAT_API_KEY,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({ item }),
+ });
+ const text = await res.text();
+ if (!res.ok) {
+ throw new Error(`HTTP ${res.status} from 4CAT: ${text}`);
+ }
+ return JSON.parse(text);
+}
+
+// Round-trip a value through JSON so MappedItem, MissingMappedField, etc.
+// become plain JSON-compatible objects matching what 4CAT emits.
+function normalize(value) {
+ return JSON.parse(JSON.stringify(value));
+}
+
+// Recursive structural equality. Doesn't care about object key order, which
+// matters for nested values like {__missing: true, value: ""} where JS and
+// Python might emit keys in different orders.
+function deep_equal(a, b) {
+ if (a === b) return true;
+ if (a === null || b === null) return a === b;
+ if (typeof a !== typeof b) return false;
+ if (typeof a !== 'object') return false;
+ if (Array.isArray(a) !== Array.isArray(b)) return false;
+ if (Array.isArray(a)) {
+ if (a.length !== b.length) return false;
+ return a.every((v, i) => deep_equal(v, b[i]));
+ }
+ const a_keys = Object.keys(a);
+ const b_keys = Object.keys(b);
+ if (a_keys.length !== b_keys.length) return false;
+ return a_keys.every(k => k in b && deep_equal(a[k], b[k]));
+}
+
+function diff_objects(js_obj, py_obj) {
+ const diffs = [];
+ const keys = new Set([...Object.keys(js_obj ?? {}), ...Object.keys(py_obj ?? {})]);
+ for (const key of keys) {
+ const in_js = js_obj && key in js_obj;
+ const in_py = py_obj && key in py_obj;
+ if (!in_js) {
+ diffs.push({ key, kind: 'only_python', python: py_obj[key] });
+ } else if (!in_py) {
+ diffs.push({ key, kind: 'only_js', js: js_obj[key] });
+ } else if (!deep_equal(js_obj[key], py_obj[key])) {
+ diffs.push({ key, kind: 'mismatch', js: js_obj[key], python: py_obj[key] });
+ }
+ }
+ return diffs;
+}
+
+function format_diffs(diffs) {
+ return diffs.map(d => {
+ if (d.kind === 'only_js') {
+ return ` + only in JS: ${d.key} = ${JSON.stringify(d.js)}`;
+ }
+ if (d.kind === 'only_python') {
+ return ` - only in Python: ${d.key} = ${JSON.stringify(d.python)}`;
+ }
+ return ` ~ ${d.key}\n JS: ${JSON.stringify(d.js)}\n Python: ${JSON.stringify(d.python)}`;
+ }).join('\n');
+}
+
+// Pull out the first few module-frame lines from an error's stack so the
+// failure message points at where in modules/<module>.js the throw happened.
+function format_error_with_location(err) {
+ if (!err) return String(err);
+ const message = err.message || String(err);
+ const stack = err.stack || '';
+ const module_frames = stack.split('\n')
+ .filter(l => l.includes('/modules/') || l.includes('\\modules\\'))
+ .slice(0, 3)
+ .map(l => l.trim());
+ return module_frames.length
+ ? `${message}\n ${module_frames.join('\n ')}`
+ : message;
+}
+
+function list_module_dirs() {
+ if (!existsSync(FIXTURE_ROOT)) return [];
+ return readdirSync(FIXTURE_ROOT).filter(name => {
+ try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); }
+ catch { return false; }
+ });
+}
+
+// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's
+// default 5s is tight under load.
+jest.setTimeout(30000);
+
+if (!HAS_4CAT) {
+ describe('map_item compare (JS vs 4CAT Python)', () => {
+ test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {});
+ });
+} else {
+ const module_dirs = list_module_dirs();
+
+ // Pre-pass: resolve each module's state up front (awaited one at a time)
+ // so we can branch on it at registration time.
+ const module_info = {};
+ for (const module_name of module_dirs) {
+ module_info[module_name] = await inspect_module(module_name);
+ }
+
+ let any_fixtures = false;
+
+ for (const module_name of module_dirs) {
+ const fixture_dir = join(FIXTURE_ROOT, module_name);
+ const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
+ if (fixture_files.length === 0) continue;
+ any_fixtures = true;
+
+ const datasource_id = ID_MAP[module_name] ?? module_name;
+ const info = module_info[module_name];
+
+ if (info.state === 'no_map_item') {
+ // eslint-disable-next-line no-console
+ console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`);
+ continue;
+ }
+
+ if (info.state === 'syntax_error' || info.state === 'import_error') {
+ const msg = info.state === 'syntax_error'
+ ? `syntax error:\n${info.error}`
+ : `import failed: ${info.error.message}`;
+ describe(`map_item compare: ${module_name}`, () => {
+ test(`module loads`, () => { throw new Error(msg); });
+ });
+ continue;
+ }
+
+ // state === 'ok' — register per-item comparison tests
+ const map_item = info.map_item;
+
+ describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => {
+ for (const fixture_file of fixture_files) {
+ const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
+ .split('\n')
+ .filter(line => line.trim().length > 0);
+
+ describe(fixture_file, () => {
+ lines.forEach((line, i) => {
+ test(`item ${i}`, async () => {
+ if (FAIL_FAST && halted_modules.has(module_name)) {
+ throw new Error(
+ '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]'
+ );
+ }
+ try {
+ const stored_item = JSON.parse(line);
+
+ // 4CAT side
+ const response = await call_4cat_map_item(datasource_id, stored_item);
+
+ // JS side
+ let js_result;
+ let js_error;
+ try {
+ js_result = map_item(wrap_for_map_item(stored_item));
+ } catch (e) {
+ js_error = e;
+ }
+
+ if (response.status === 'mapped') {
+ if (js_error) {
+ throw new Error(
+ `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}`
+ );
+ }
+ const js_obj = normalize(js_result);
+ const py_obj = normalize(response.item);
+ const diffs = diff_objects(js_obj, py_obj);
+ if (diffs.length > 0) {
+ throw new Error(
+ `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`
+ );
+ }
+ } else if (response.status === 'skipped') {
+ if (!js_error) {
+ throw new Error(
+ `4CAT skipped this item ("${response.reason}") but JS produced a result`
+ );
+ }
+ // Both rejected — good. Skip reasons may differ in wording.
+ } else if (response.status === 'error') {
+ throw new Error(`4CAT errored on this item: ${response.message}`);
+ } else {
+ throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`);
+ }
+ } catch (e) {
+ if (FAIL_FAST) halted_modules.add(module_name);
+ throw e;
+ }
+ });
+ });
+ });
+ }
+ });
+ }
+
+ if (!any_fixtures) {
+ describe('map_item compare (JS vs 4CAT Python)', () => {
+ test.skip('no fixtures under tests/fixtures//*.ndjson', () => {});
+ });
+ }
+}
From 7d97a0fe342e3b7f932c79fe22e9b8c6b3c25bb3 Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 18:44:35 +0200
Subject: [PATCH 19/20] list common translation errors
---
tests/translation-errors.md | 430 ++++++++++++++++++++++++++++++++++++
1 file changed, 430 insertions(+)
create mode 100644 tests/translation-errors.md
diff --git a/tests/translation-errors.md b/tests/translation-errors.md
new file mode 100644
index 0000000..fcc160d
--- /dev/null
+++ b/tests/translation-errors.md
@@ -0,0 +1,430 @@
+# Auto-generator translation errors
+
+Patterns of incorrect Python → JavaScript translation observed in
+auto-generated `modules/*.js` files. Each entry has a search pattern so
+this doc doubles as a checklist when reviewing a new auto-generator PR.
+
+When an entry is fixed at the generator level (no longer appears in
+fresh output), mark it `[fixed]` and keep the entry around — useful
+history when something regresses.
+
+## How to use
+
+- Found a new pattern? Add an entry below following the template.
+- Reviewing a generator PR? `grep` each `Search pattern` against the
+ changed module files. Anything that hits is worth a manual look.
+- Iterating on the generator prompt? The "Why" lines are the
+ feedback to add — they describe the exact Python-vs-JS semantic
+ difference the LLM keeps missing.
+
+## Template
+
+```
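+### <Short name of the pattern>
+
+**Status:** open | fixed in generator | accepted
+
+**Why it happens:** <the Python-vs-JS semantic difference being missed>
+
+**Wrong JS:**
+```js
+<snippet as emitted by the generator>
+```
+
+**Correct JS:**
+```js
+<corrected snippet>
+```
+
+**Example:** `modules/<module>.js:<line>`
+
+**Search pattern:** `<regex or grep command>`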
+```
+
+---
+
+## Observed patterns
+
+### `in` operator on strings
+
+**Status:** open
+
+**Why it happens:** In Python, `"x" in some_string` is a substring check.
+In JavaScript, the `in` operator only works on **objects** and checks for
+property/key existence; using it with a string on the right-hand side
+throws `TypeError: cannot use 'in' operator to search for "x" in <string>`.
+
+**Wrong JS:**
+```js
+const is_polaris = '__typename' in item && 'polaris' in item.__typename.toLowerCase();
+```
+
+**Correct JS:**
+```js
+const is_polaris = '__typename' in item && item.__typename.toLowerCase().includes('polaris');
+```
+
+**Example:** `modules/instagram.js:513`
+
+**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\.` — quoted string followed
+by `in` followed by a method call. Quick rough check: `grep -E "' in [a-zA-Z]" modules/`
+
+**Watch out for partial fixes:** seen as `'polaris' in (item.__typename ?? '').toLowerCase()`
+— adding `?? ''` guards against `undefined` but the `in` operator itself
+still throws on the resulting *string*. The fix is `.includes()`, not just
+defaulting the operand.
+
+---
+
+### Python f-string syntax left in single-quoted JS strings
+
+**Status:** open
+
+**Why it happens:** Python `f"... {var} ..."` interpolates. JS uses
+template literals (backticks) with `${var}`. The auto-generator leaves the
+`{var}` notation in a regular single- or double-quoted JS string, which is
+just literal text — no interpolation happens.
+
+**Wrong JS:**
+```js
+throw new MapItemException('Unable to parse item: different user {user.id} and owner {owner.id}');
+```
+
+**Correct JS:**
+```js
+throw new MapItemException(`Unable to parse item: different user ${user.id} and owner ${owner.id}`);
+```
+
+**Example:** `modules/instagram.js:754`
+
+**Search pattern:** `'[^']*\{[a-zA-Z_$][\w$.]*\}[^']*'` or `"[^"]*\{[a-zA-Z_$][\w$.]*\}[^"]*"`
+— a non-template-literal string containing `{identifier}` or `{identifier.path}`.
+Quick check: `grep -nE "['\"][^'\"]*\{[a-zA-Z_][a-zA-Z0-9_.]*\}[^'\"]*['\"]" modules/`
+
+---
+
+### `?? {}` default that defeats subsequent truthy checks
+
+**Status:** open
+
+**Why it happens:** When porting Python's `node.get('user') or {}` (which is
+intended to make subsequent code safe to call), the generator emits
+`node.user ?? {}`. That's a *valid* Python-equivalent, **but** any following
+`if (user && owner) { ... }` guard then never short-circuits because both
+`{}` references are truthy. The check ends up reading "if user and owner
+*objects* exist" when the intent was "if user and owner data exist."
+Subsequent property accesses then compare real ids/usernames against
+`undefined` on the missing side, often throwing.
+
+**Wrong JS:**
+```js
+const user = node.user ?? {};
+const owner = node.owner ?? {};
+if (user && owner) {
+ if (user.id === owner.id) { /* … */ }
+ else if (user.username !== owner.username) {
+ throw new MapItemException('different user and owner');
+ }
+}
+```
+
+**Correct JS** (depending on intent — pick one):
+```js
+// (a) drop the defaults so truthy guard means "both present"
+const user = node.user;
+const owner = node.owner;
+if (user && owner) { /* compare */ }
+```
+```js
+// (b) check for actual content, not just object identity
+const user = node.user ?? {};
+const owner = node.owner ?? {};
+if (Object.keys(user).length && Object.keys(owner).length) { /* compare */ }
+```
+
+**Example:** `modules/instagram.js:748-756`
+
+**Search pattern:** `\?\?\s*\{\s*\}` — any `?? {}` occurrence is worth a
+review of subsequent guards. Quick check: `grep -nE "\?\?\s*\{\s*\}" modules/`
+
+---
+
+### Bare relative path as a statement (junk auto-imports section)
+
+**Status:** open
+
+**Why it happens:** The generator emits an "auto-generated imports" marker
+block at the top of the module but writes the import target as a bare
+relative path on its own line (`../js/lib.js`) instead of a real `import`
+statement. JS parses that as `..` then `.` then `/js/lib.js` — syntax error.
+
+**Wrong JS:**
+```js
+// === auto-generated imports for map_item — DO NOT EDIT BY HAND ===
+../js/lib.js
+// === end auto-generated imports ===
+```
+
+**Correct JS** (one of):
+```js
+// === auto-generated imports — DO NOT EDIT BY HAND ===
+// Provided as globals by js/lib.js (loaded via manifest.json):
+// MappedItem, MissingMappedField, MapItemException, traverse_data,
+// strip_tags, normalize_url_encoding, formatUtcTimestamp
+// === end auto-generated imports ===
+```
+
+Or, if a real import is intended, an ESM import with named bindings:
+```js
+import { MappedItem, MissingMappedField } from '../js/lib.js';
+```
+
+**Example:** seen historically in `modules/tiktok.js:2`
+
+**Search pattern:** `^\.\./` at the start of a line in module files.
+Quick check: `grep -nE "^\.\." modules/*.js`
+
+---
+
+### Key-existence check (`'X' in obj`) used where Python intended value-truthiness (`obj.get('X')`)
+
+**Status:** open
+
+**Why it happens:** Python's `if node.get('usertags'):` is a *truthy check on
+the value* — returns False if the key is missing **or** if the value is
+`None`/empty/falsy. The generator translates this to `if ('usertags' in
+node)`, which in JS is a *key-existence check* — returns True even when
+the value is `null`. Subsequent property accesses on the null value then
+throw `Cannot read properties of null`.
+
+**Wrong JS:**
+```js
+const usertags = 'usertags' in node ? node.usertags.in.map(...).join(',') : '';
+// node.usertags can be null → .in.map blows up
+```
+
+**Correct JS:**
+```js
+const usertags = node.usertags ? node.usertags.in.map(...).join(',') : '';
+```
+
+**Example:** `modules/instagram.js:777`
+
+**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\s*\?` — quoted-string `in`
+identifier followed by `?` (ternary). Quick check:
+`grep -nE "'[^']+' in [a-zA-Z_]+ \?" modules/`
+
+---
+
+### Datetime serialization format mismatch
+
+**Status:** open
+
+**Why it happens:** Python's `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')`
+produces `"2026-05-13 21:27:31"` — space-separated, no timezone marker. JS's
+`new Date(t * 1000).toISOString()` produces `"2026-05-13T21:27:31.000Z"` — T
+separator, milliseconds, Z. The generator emits the JS `.toISOString()` form
+instead of using the existing `formatUtcTimestamp` helper from lib.js that
+mimics Python's output exactly.
+
+**Wrong JS:**
+```js
+collected_at = new Date(node.taken_at * 1000).toISOString();
+```
+
+**Correct JS:**
+```js
+collected_at = formatUtcTimestamp(node.taken_at);
+// formatUtcTimestamp is defined in js/lib.js as:
+// new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19)
+```
+
+**Example:** `modules/instagram.js:782`
+
+**Search pattern:** `new Date\([^)]+\)\.toISOString\(\)` — any use of
+`.toISOString()`. The helper should be used instead. Quick check:
+`grep -nE "\.toISOString\(\)" modules/`
+
+---
+
+### `re.findall` capture groups vs JS `.match` with /g flag
+
+**Status:** open
+
+**Why it happens:** Python's `re.findall(r'#(\w+)', s)` returns the **capture
+group contents**: `['lotr', 'woodart']`. JS's `s.match(/#(\w+)/g)` (with the
+global flag) returns the **full matches**: `['#lotr', '#woodart']` — capture
+groups are ignored. The generator translates the regex literally without
+adjusting for this semantic difference, so the resulting strings keep
+prefixes/wrappers that Python would have stripped.
+
+**Wrong JS:**
+```js
+hashtags: caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',')
+// produces "#lotr,#woodart"
+```
+
+**Correct JS:**
+```js
+// Option A: strip the literal prefix from each full match
+hashtags: caption.match(/#([^\s...]+)/g)?.map(h => h.slice(1)).join(',') ?? ''
+// Option B: use matchAll to get capture groups properly
+hashtags: [...caption.matchAll(/#([^\s...]+)/g)].map(m => m[1]).join(',') ?? ''
+```
+
+**Example:** `modules/instagram.js:812` (also 766, 870 — three copies)
+
+**Search pattern:** `\.match\(/[^/]*\([^/]*\)[^/]*/g\)` — any `.match()` with
+a global-flag regex containing a capture group. Quick check:
+`grep -nE "\.match\(/.*\(.*\).*\/g\)" modules/`
+
+---
+
+### `undefined` field values get dropped from JSON, but Python's `None` becomes `null`
+
+**Status:** open
+
+**Why it happens:** When `JSON.stringify` encounters an object property whose
+value is `undefined`, it **omits the key entirely** from the output. Python's
+`json.dumps` serializes `None` as `null`, keeping the key. The generator
+writes assignments like `location.city = node.location.city` where the
+right-hand side can be `undefined`, producing missing keys in JS output
+that show up as `only in Python: <field> = null` diffs against 4CAT.
+
+**Wrong JS:**
+```js
+location.city = node.location.city; // undefined if .city missing
+// JSON.stringify({location_city: undefined}) → "{}" (key omitted)
+
+body: caption, // null if no caption — Python returns "" here, not null
+```
+
+**Correct JS:**
+```js
+// Whichever fallback Python uses for that specific field:
+location.city = node.location.city ?? null; // some fields → null
+body: caption ?? '', // other fields → ""
+```
+
+**Example:** `modules/instagram.js:745, 853` (`null` flavor),
+559, 648, 798 (`""` flavor for `body`)
+
+**Note:** Python's choice of `None` vs `""` is per-field — there's no
+universal rule. When the comparator reports `~ X JS: null Python: ""` use
+`?? ''`. When it reports `- only in Python: X = null` use `?? null`. The
+distinction matters because the JS output should match Python's choice
+exactly for that field.
+
+**Search pattern:** harder to grep automatically — any property assignment
+where the RHS could be `undefined`/`null` and the resulting field is
+expected to appear in the mapped output. Look at "only in Python: X = null"
+and "~ X JS: null Python: \"\"" diffs in the comparator output to find
+specific cases.
+
+---
+
+### Object-reference inequality used as type check
+
+**Status:** open
+
+**Why it happens:** The generator emits `caption !== new MissingMappedField('')`
+to mean "caption is not a missing-marker", but `new MissingMappedField('')`
+creates a fresh object every time, and `!==` on objects compares references.
+The expression is **always true**, so the conditional never takes the
+"missing" branch. Likely originates from Python idioms like `caption != ""`
+or `caption is not None`, mistranslated through the MissingMappedField
+abstraction.
+
+**Wrong JS:**
+```js
+hashtags: caption !== new MissingMappedField('') ? caption.match(...) : '',
+// !== between two different object references is always true
+```
+
+**Correct JS:**
+```js
+// If the intent was "if caption has content", just truthy-check it:
+hashtags: caption ? caption.match(...) : '',
+// If the intent was "if caption is not a MissingMappedField instance":
+hashtags: !(caption instanceof MissingMappedField) ? caption.match(...) : '',
+```
+
+**Example:** `modules/instagram.js:812` (and two other copies)
+
+**Search pattern:** `!== new [A-Z]` or `=== new [A-Z]` — any equality
+comparison with a freshly-constructed object. Quick check:
+`grep -nE "(!==|===) new [A-Z]" modules/`
+
+---
+
+### `.method()` chain on potentially-null result
+
+**Status:** open
+
+**Why it happens:** In Python, calling a method on `None` raises
+`AttributeError`, which 4CAT sometimes catches. In JS, calling a method on
+`null`/`undefined` throws `TypeError: Cannot read properties of null
+(reading 'match')`. The generator emits the same dotted chain without
+optional-chaining (`?.`) protection.
+
+**Wrong JS:**
+```js
+hashtags: caption !== new MissingMappedField('')
+ ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',')
+ : '',
+```
+(here `caption` may be `null`; the always-true guard does not stop
+`caption.match(...)` from being called, so it throws a `TypeError`)
+
+**Correct JS:**
+```js
+hashtags: caption
+ ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') ?? ''
+ : '',
+```
+
+**Example:** `modules/instagram.js:809`
+
+**Search pattern:** harder to grep — needs reading. Worth manual review of
+any field that uses `caption.match`, `something.split`, `something.join`
+without `?.` on a value that could be null/undefined.
+
+---
+
+## Generator prompt feedback (running list)
+
+Concrete things to fold into the generator's prompt over time:
+
+1. **Python `x in y` where `y` is a string** → use `y.includes(x)` in JS,
+ never `x in y`.
+2. **Python f-strings** → use JS template literals (backticks) with
+ `${...}` syntax. Never leave `{...}` in single- or double-quoted strings.
+3. **`?? {}` after a `.get(...) or {}` translation** → only use this if the
+ following code does property access. If it does a truthy guard
+ (`if (x && y)`), drop the default (`{}` is truthy in JS): use just `node.user`.
+4. **Method chains on possibly-null values** → use `?.` (optional
+ chaining) instead of `.` whenever the receiver could be null/undefined.
+5. **The auto-imports header block** → emit either real `import { ... }`
+ statements with valid relative paths, or a comment-only header.
+ Never emit bare paths as JS statements.
+6. **Python `node.get('X')` truthy check** → in JS, use `node.X` (or
+ `node.X != null`), not `'X' in node`. The `in` operator checks key
+ existence, which is `true` even for explicit-null values.
+7. **Datetime serialization** → use the `formatUtcTimestamp` helper from
+ lib.js (which mimics Python's `strftime('%Y-%m-%d %H:%M:%S')` format),
+ not `new Date(...).toISOString()` (which has a different output shape:
+ T separator, milliseconds, Z suffix).
+8. **`re.findall` with capture groups** → in JS, `.match(/.../g)` returns
+ full matches, NOT capture groups. To get capture-group behavior, use
+ either `[...s.matchAll(/.../g)].map(m => m[1])` or post-process the
+ full matches with `.map(...)` to strip the literal parts.
+9. **Object-reference equality (`!== new X(...)`)** → never. Creating an
+ object with `new` produces a fresh reference; `===`/`!==` compares
+ identity. Use `instanceof X` for type checks, or compare values
+ directly. The MissingMappedField "is this missing?" check should be
+ `caption instanceof MissingMappedField` or just truthy-check the value.
+10. **Python `None` → JSON `null` vs JS `undefined` → omitted** — when a
+ field's value could be missing and Python returns `None` for it,
+ JS must explicitly assign `null` (not leave the value as `undefined`).
+ `JSON.stringify` drops `undefined` keys silently. Use `value ?? null`
+ when the field is expected to appear in the mapped output.
From 6ad4c134cf35d0993b2968f3b2dc832e2766794d Mon Sep 17 00:00:00 2001
From: Dale Wahl
Date: Wed, 27 May 2026 18:45:52 +0200
Subject: [PATCH 20/20] package.json fix
---
tests/package.json | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/package.json b/tests/package.json
index 333564a..390fdd3 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -13,6 +13,7 @@
"dotenv": "^16.4.5",
"fake-indexeddb": "^5.0.1",
"jest": "^29.7.0",
- "jest-environment-jsdom": "^29.7.0"
+ "jest-environment-jsdom": "^29.7.0",
+ "undici": "^6.20.0"
}
}