Skip to content
This repository was archived by the owner on Sep 29, 2025. It is now read-only.

Commit 7f4e4a8

Browse files
authored
(EAI-627): Skip snooty pages with noindex meta.robots tag (#577)
* ingest snooty docs facets and meta
* page prefix on keys
* remove trailing/leading whitespace
* Support concurrent embedding
* page transform and exclude
* test chunk transformer
* skip noindex pages
* fix length b/c noindex page
1 parent 370a2d5 commit 7f4e4a8

File tree

5 files changed: +184 −14 lines changed

packages/ingest-mongodb-public/src/sources/snooty/SnootyDataSource.test.ts

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import nock from "nock";
2+
import { Readable } from "stream";
23
import fs from "fs";
34
import Path from "path";
45
import JSONL from "jsonl-parse-stringify";
@@ -56,7 +57,7 @@ describe("SnootyDataSource", () => {
5657
});
5758

5859
const pages = await source.fetchPages();
59-
expect(pages.length).toBe(12);
60+
expect(pages).toHaveLength(11);
6061
const astPages = JSONL.parse<{ type: string; data: { ast: SnootyNode } }>(
6162
fs.readFileSync(sampleDataPath, "utf8")
6263
);
@@ -82,7 +83,7 @@ describe("SnootyDataSource", () => {
8283
snootyDataApiBaseUrl,
8384
});
8485
const pages = await source.fetchPages();
85-
expect(pages.length).toBe(12);
86+
expect(pages.length).toBe(11);
8687
expect(pages[0]).toMatchObject({
8788
format: "md",
8889
sourceName: "snooty-docs",
@@ -174,8 +175,45 @@ describe("SnootyDataSource", () => {
174175
)
175176
).toBeUndefined();
176177
});
178+
179+
it("skips noindex page", async () => {
180+
const mockUrl = "https://example.com";
181+
const noIndexMock = nock(mockUrl);
182+
// Use normal sample data (no deletes)
183+
const source = await makeSnootyDataSource({
184+
name: `snooty-test`,
185+
project,
186+
snootyDataApiBaseUrl: mockUrl,
187+
});
188+
noIndexMock
189+
.get(`/projects/${project.name}/${project.currentBranch}/documents`)
190+
.reply(200, () => {
191+
const noIndexAst = jsonLify(
192+
Path.resolve(SRC_ROOT, "../testData/noindex.json")
193+
);
194+
195+
const astWithIndex = jsonLify(
196+
Path.resolve(SRC_ROOT, "../testData/samplePage.json")
197+
);
198+
199+
const stream = new Readable();
200+
stream.push(noIndexAst + "\n");
201+
stream.push(astWithIndex + "\n");
202+
stream.push(null); // End the stream
203+
return stream;
204+
});
205+
206+
const pages = await source.fetchPages();
207+
// only captures the astWithIndex page, not the noIndexAst page
208+
expect(pages).toHaveLength(1);
209+
noIndexMock.done();
210+
});
177211
});
178212
});
213+
214+
function jsonLify(path: string) {
215+
return JSON.stringify(JSON.parse(fs.readFileSync(path, "utf-8")));
216+
}
179217
describe("handlePage()", () => {
180218
it("should correctly parse openapi spec page", async () => {
181219
const apiSpecPage = JSON.parse(
@@ -220,6 +258,6 @@ describe("handlePage()", () => {
220258
version: "1.0",
221259
},
222260
});
223-
expect(result.body).toContain("# $merge (aggregation)");
261+
expect(result?.body).toContain("# $merge (aggregation)");
224262
});
225263
});

packages/ingest-mongodb-public/src/sources/snooty/SnootyDataSource.ts

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,12 @@ export type SnootyMetaNode = SnootyNode & {
7979
*/
8080
description: string;
8181
[key: string]: string | undefined;
82+
83+
/**
84+
Robots meta tag value for the page.
85+
@example "noindex, nofollow"
86+
*/
87+
robots?: string;
8288
};
8389
};
8490

@@ -221,7 +227,9 @@ export const makeSnootyDataSource = ({
221227
productName,
222228
version,
223229
});
224-
pages.push(page);
230+
if (page !== undefined) {
231+
pages.push(page);
232+
}
225233
} catch (error) {
226234
// Log the error and discard this document, but don't break the
227235
// overall fetchPages() call.
@@ -333,7 +341,7 @@ export const handlePage = async (
333341
productName?: string;
334342
version?: string;
335343
}
336-
): Promise<Page> => {
344+
): Promise<Page | undefined> => {
337345
// Strip first three path segments - according to Snooty team, they'll always
338346
// be ${property}/docsworker-xlarge/${branch}
339347
const pagePath = page.page_id
@@ -361,7 +369,12 @@ export const handlePage = async (
361369
body = snootyAstToMd(page.ast);
362370
title = getTitleFromSnootyAst(page.ast);
363371
}
364-
const pageMetadata = getMetadataFromSnootyAst(page.ast);
372+
const { metadata: pageMetadata, noIndex } = getMetadataFromSnootyAst(
373+
page.ast
374+
);
375+
if (noIndex) {
376+
return;
377+
}
365378

366379
return {
367380
url: new URL(pagePath, baseUrl.replace(/\/?$/, "/")).href.replace(

packages/ingest-mongodb-public/src/sources/snooty/snootyAstToMd.test.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -315,22 +315,37 @@ describe("getMetadataFromSnootyAst", () => {
315315
)
316316
);
317317
it("extracts meta directives", () => {
318-
const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
318+
const { metadata } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
319319
expect(metadata).toMatchObject({
320320
description: expect.any(String),
321321
});
322322
});
323323
it("extracts meta.keyword directives as string[]", () => {
324-
const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
324+
const { metadata } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
325325
expect(metadata).toMatchObject({
326326
keywords: expect.arrayContaining([expect.any(String)]),
327327
});
328328
});
329329
it("extracts facet directives", () => {
330-
const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
330+
const { metadata } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
331331
expect(metadata).toMatchObject({
332332
genre: "tutorial",
333333
foo: "bar",
334334
});
335335
});
336+
337+
it("doesn't extract noindex if not present", () => {
338+
const { noIndex } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
339+
expect(noIndex).toBe(false);
340+
});
341+
342+
it("extracts noindex if present", () => {
343+
const sampleMetadataPage = JSON.parse(
344+
fs.readFileSync(Path.resolve(SRC_ROOT, "../testData/noindex.json"), {
345+
encoding: "utf-8",
346+
})
347+
);
348+
const { noIndex } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
349+
expect(noIndex).toBe(true);
350+
});
336351
});

packages/ingest-mongodb-public/src/sources/snooty/snootyAstToMd.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,7 @@ export const getTitleFromSnootyAst = (node: SnootyNode): string | undefined => {
211211
return textNodes.map(({ value }) => value).join("");
212212
};
213213

214-
export const getMetadataFromSnootyAst = (
215-
node: SnootyNode
216-
): Record<string, unknown> => {
214+
export const getMetadataFromSnootyAst = (node: SnootyNode) => {
217215
const facetAndMetaNodes = findAll(
218216
node,
219217
({ name }) => name === "facet" || name === "meta"
@@ -238,6 +236,7 @@ export const getMetadataFromSnootyAst = (
238236
return acc;
239237
}, {} as Record<string, string>);
240238

239+
let noIndex = false;
241240
const meta = metaNodes.reduce((acc, metaNode) => {
242241
if (!metaNode.options) {
243242
return acc;
@@ -248,13 +247,18 @@ export const getMetadataFromSnootyAst = (
248247
acc[key] = value.split(",").map((s) => s.trim());
249248
} else if (key === "description" && value) {
250249
acc[key] = value;
250+
} else if (key === "robots" && value) {
251+
noIndex = value.includes("noindex");
251252
}
252253
}
253254

254255
return acc;
255256
}, {} as Record<string, string | string[]>);
256257
return {
257-
...facets,
258-
...meta,
258+
metadata: {
259+
...facets,
260+
...meta,
261+
},
262+
noIndex,
259263
};
260264
};
testData/noindex.json

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
{
2+
"type": "page",
3+
"data": {
4+
"_id": "64e8d24124fcc731b479906d",
5+
"github_username": "docs-builder-bot",
6+
"page_id": "docs/docsworker-xlarge/master/release-notes/1.2",
7+
"ast": {
8+
"type": "root",
9+
"position": { "start": { "line": 0 } },
10+
"children": [
11+
{
12+
"type": "comment",
13+
"position": { "start": { "line": 1 } },
14+
"children": [
15+
{
16+
"type": "text",
17+
"position": { "start": { "line": 1 } },
18+
"value": "This page is hidden from the TOC and search indexing."
19+
}
20+
]
21+
},
22+
{
23+
"type": "directive",
24+
"position": { "start": { "line": 4 } },
25+
"children": [],
26+
"domain": "",
27+
"name": "meta",
28+
"argument": [],
29+
"options": { "robots": "noindex, nosnippet" }
30+
},
31+
{
32+
"type": "target",
33+
"position": { "start": { "line": 7 } },
34+
"children": [
35+
{
36+
"type": "target_identifier",
37+
"position": { "start": { "line": 7 } },
38+
"children": [
39+
{
40+
"type": "text",
41+
"position": { "start": { "line": 11 } },
42+
"value": "Release Notes for MongoDB 1.2.x"
43+
}
44+
],
45+
"ids": ["release-notes-1.2"]
46+
}
47+
],
48+
"domain": "std",
49+
"name": "label",
50+
"html_id": "std-label-release-notes-1.2"
51+
},
52+
{
53+
"type": "section",
54+
"position": { "start": { "line": 11 } },
55+
"children": [
56+
{
57+
"type": "heading",
58+
"position": { "start": { "line": 11 } },
59+
"children": [
60+
{
61+
"type": "text",
62+
"position": { "start": { "line": 11 } },
63+
"value": "Release Notes for MongoDB 1.2.x"
64+
}
65+
],
66+
"id": "release-notes-for-mongodb-1.2.x"
67+
},
68+
{
69+
"type": "directive",
70+
"position": { "start": { "line": 15 } },
71+
"children": [],
72+
"domain": "",
73+
"name": "contents",
74+
"argument": [
75+
{
76+
"type": "text",
77+
"position": { "start": { "line": 15 } },
78+
"value": "On this page"
79+
}
80+
],
81+
"options": {
82+
"local": true,
83+
"backlinks": "none",
84+
"depth": 1,
85+
"class": "singlecol"
86+
}
87+
}
88+
]
89+
}
90+
],
91+
"fileid": "release-notes/1.2.txt"
92+
},
93+
"created_at": "2023-08-25T16:09:35.577Z",
94+
"deleted": true,
95+
"filename": "release-notes/1.2.txt",
96+
"static_assets": [],
97+
"updated_at": "2024-02-01T21:50:41.225Z",
98+
"build_id": "65bc1166bdcf995e0c6983bb"
99+
}
100+
}

0 commit comments

Comments
 (0)