From 436ce9171733dea50738e16eee5d7a9b45b53e18 Mon Sep 17 00:00:00 2001 From: Tim Hostetler <6970899+thostetler@users.noreply.github.com> Date: Tue, 2 Jun 2026 14:17:54 -0400 Subject: [PATCH] SCIX-884 feat(links): encode DOI/OpenURL special chars; fix abs path rewrite for multi-segment DOIs --- .../__tests__/createUrlByType.test.ts | 33 ++++++++++ .../__tests__/linkGenerator.test.ts | 4 +- .../__tests__/openUrlGenerator.test.ts | 13 ++++ .../AbstractSources/linkGenerator.ts | 3 +- .../AbstractSources/openUrlGenerator.ts | 24 ++++---- src/components/Layout/AbsLayout.tsx | 15 +++++ .../__tests__/identifier.test.ts | 20 ++++-- .../Metatags/json-ld-abstract/identifiers.ts | 3 +- .../__tests__/absCanonicalization.test.ts | 61 +++++++++++++++++-- src/lib/serverside/absCanonicalization.ts | 41 +++++++++++++ src/middleware.ts | 19 +++++- src/middleware/__tests__/rewrite.test.ts | 56 +++++++++++++---- .../__tests__/middleware-helpers.test.ts | 38 ++++++------ src/utils/common/__tests__/encodeDOI.test.ts | 34 +++++++++++ src/utils/common/encodeDOI.ts | 10 +++ 15 files changed, 314 insertions(+), 60 deletions(-) create mode 100644 src/components/AbstractSources/__tests__/createUrlByType.test.ts create mode 100644 src/utils/common/__tests__/encodeDOI.test.ts create mode 100644 src/utils/common/encodeDOI.ts diff --git a/src/components/AbstractSources/__tests__/createUrlByType.test.ts b/src/components/AbstractSources/__tests__/createUrlByType.test.ts new file mode 100644 index 000000000..63e4912a4 --- /dev/null +++ b/src/components/AbstractSources/__tests__/createUrlByType.test.ts @@ -0,0 +1,33 @@ +import { describe, expect, test } from 'vitest'; +import { createUrlByType } from '@/components/AbstractSources/linkGenerator'; + +describe('createUrlByType', () => { + test('encodes # in a DOI identifier', () => { + const url = createUrlByType( + '1999AN....320..163M', + 'doi', + '10.1002/1521-3994(199908)320:4/5<163::AID-ASNA163>3.0.CO;2-#', + ); + expect(url).not.toContain('#'); + expect(url).toContain('%23'); + }); + + test('encodes < and > in a DOI identifier', () => { + const url = createUrlByType('test', 'doi', '10.1000/foo'); + expect(url).toContain('%3C'); + expect(url).toContain('%3E'); + expect(url).not.toContain('<'); + expect(url).not.toContain('>'); + }); + + test('preserves / in DOI identifiers', () => { + const url = createUrlByType('test', 'doi', '10.48550/arXiv.2507.19320'); + expect(url).toContain('10.48550/arXiv.2507.19320'); + }); + + test('returns empty string for non-string arguments', () => { + expect(createUrlByType(null as unknown as string, 'doi', '10.1000/x')).toBe(''); + expect(createUrlByType('bib', null as unknown as string, '10.1000/x')).toBe(''); + expect(createUrlByType('bib', 'doi', null as unknown as string)).toBe(''); + }); +}); diff --git a/src/components/AbstractSources/__tests__/linkGenerator.test.ts b/src/components/AbstractSources/__tests__/linkGenerator.test.ts index b176aedf5..d550636d1 100644 --- a/src/components/AbstractSources/__tests__/linkGenerator.test.ts +++ b/src/components/AbstractSources/__tests__/linkGenerator.test.ts @@ -108,7 +108,7 @@ describe('processLinkData', () => { rawType: 'INSTITUTION', shortName: 'My Institution', type: 'INSTITUTION', - url: 'http://my-link-server.com/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:article&rfr_id=info:sid/ADS&sid=ADS&id=doi:foo&genre=article&rft_id=info:doi/foo&rft.degree=false&rft.genre=article', + url: 'http://my-link-server.com/?url_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Aarticle&rfr_id=info%3Asid%2FADS&sid=ADS&id=doi%3Afoo&genre=article&rft_id=info%3Adoi%2Ffoo&rft.degree=false&rft.genre=article', }, ], dataProducts: [], @@ -163,7 +163,7 @@ describe('processLinkData', () => { rawType: 'INSTITUTION', shortName: 'My Institution', type: 'INSTITUTION', - url: 'http://my-link-server.com/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:article&rfr_id=info:sid/ADS&sid=ADS&id=doi:foo&genre=article&rft_id=info:doi/foo&rft_id=info:bibcode/test&rft.genre=article', + url: 'http://my-link-server.com/?url_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Aarticle&rfr_id=info%3Asid%2FADS&sid=ADS&id=doi%3Afoo&genre=article&rft_id=info%3Adoi%2Ffoo&rft_id=info%3Abibcode%2Ftest&rft.genre=article', }, { description: 'Publisher PDF', diff --git a/src/components/AbstractSources/__tests__/openUrlGenerator.test.ts b/src/components/AbstractSources/__tests__/openUrlGenerator.test.ts index 7b3521796..50ac92502 100644 --- a/src/components/AbstractSources/__tests__/openUrlGenerator.test.ts +++ b/src/components/AbstractSources/__tests__/openUrlGenerator.test.ts @@ -1,5 +1,6 @@ import { expect, test } from 'vitest'; import { processLinkData } from '@/components/AbstractSources/linkGenerator'; +import { getOpenUrl } from '@/components/AbstractSources/openUrlGenerator'; test('processLinkData produces correct output', () => { expect( @@ -151,3 +152,15 @@ test('processLinkData can handle empty input', () => { ), ).toEqual(defaultReturn); }); + +test('encodes # in DOI when building OpenURL', () => { + const url = getOpenUrl({ + metadata: { + doi: ['10.1002/1521-3994(199908)320:4/5<163::AID-ASNA163>3.0.CO;2-#'], + bibcode: '1999AN....320..163M', + }, + linkServer: 'https://example.com/openurl', + }); + expect(url).not.toContain('#'); + expect(url).toContain('%23'); +}); diff --git a/src/components/AbstractSources/linkGenerator.ts b/src/components/AbstractSources/linkGenerator.ts index 994831798..a5341a733 100644 --- a/src/components/AbstractSources/linkGenerator.ts +++ b/src/components/AbstractSources/linkGenerator.ts @@ -4,6 +4,7 @@ import { getOpenUrl } from './openUrlGenerator'; import { isNilOrEmpty, isNonEmptyString } from 'ramda-adjunct'; import { IDataProductSource, IFullTextSource, ProcessLinkDataReturns } from '@/components/AbstractSources/types'; import { Esources, IDocsEntity } from '@/api/search/types'; +import { encodeDOIPath } from '@/utils/common/encodeDOI'; /** * Create the resolver url @@ -139,7 +140,7 @@ export const processLinkData = (doc: IDocsEntity, linkServer?: string): ProcessL */ export const createUrlByType = function (bibcode: string, type: string, identifier: string): string { if (typeof bibcode === 'string' && typeof type === 'string' && typeof identifier === 'string') { - return `${GATEWAY_BASE_URL + bibcode}/${type}:${identifier}`; + return `${GATEWAY_BASE_URL + bibcode}/${type}:${encodeDOIPath(identifier)}`; } return ''; }; diff --git a/src/components/AbstractSources/openUrlGenerator.ts b/src/components/AbstractSources/openUrlGenerator.ts index 276b2f5e5..31214e3d1 100644 --- a/src/components/AbstractSources/openUrlGenerator.ts +++ b/src/components/AbstractSources/openUrlGenerator.ts @@ -74,15 +74,15 @@ export const getOpenUrl = (options: IGetOpenUrlOptions): string => { }; interface IContext extends Partial { - spage: typeof parsed['rft.spage']; - volume: typeof parsed['rft.volume']; - title: typeof parsed['rft.jtitle']; - atitle: typeof parsed['rft.atitle']; - aulast: typeof parsed['rft.aulast']; - aufirst: typeof parsed['rft.aufirst']; - date: typeof parsed['rft.date']; - isbn: typeof parsed['rft.isbn']; - issn: typeof parsed['rft.issn']; + spage: (typeof parsed)['rft.spage']; + volume: (typeof parsed)['rft.volume']; + title: (typeof parsed)['rft.jtitle']; + atitle: (typeof parsed)['rft.atitle']; + aulast: (typeof parsed)['rft.aulast']; + aufirst: (typeof parsed)['rft.aufirst']; + date: (typeof parsed)['rft.date']; + isbn: (typeof parsed)['rft.isbn']; + issn: (typeof parsed)['rft.issn']; } // add extra fields to context object @@ -110,11 +110,11 @@ export const getOpenUrl = (options: IGetOpenUrlOptions): string => { if (isArray(val)) { return val .filter((v) => v) - .map((v) => `${k}=${v}`) + .map((v) => `${k}=${encodeURIComponent(String(v))}`) .join('&'); } - return `${k}=${val}`; + return `${k}=${encodeURIComponent(String(val))}`; }); - return encodeURI(openUrl + fields.join('&')); + return openUrl + fields.join('&'); }; diff --git a/src/components/Layout/AbsLayout.tsx b/src/components/Layout/AbsLayout.tsx index fe0df6681..d2a66ef07 100644 --- a/src/components/Layout/AbsLayout.tsx +++ b/src/components/Layout/AbsLayout.tsx @@ -21,6 +21,21 @@ interface IAbsLayoutProps { export const AbsLayout: FC = ({ children, doc, titleDescription, label }) => { const rawTitle = doc ? unwrapStringValue(doc.title) : ''; + useEffect(() => { + // After a server-side redirect from a legacy DOI URL whose '#' was stripped by the + // browser as a fragment, the browser re-attaches the original fragment to the redirect + // destination (RFC 9110). Strip it when it's just a redundant view name. + const hash = window.location.hash; + if ( + hash && + /^#\/?(?:abstract|citations|references|credits|mentions|coreads|similar|graphics|metrics|toc|exportcitation)/.test( + hash, + ) + ) { + history.replaceState(null, '', window.location.pathname + window.location.search); + } + }, []); + useEffect(() => { if (!doc?.bibcode) { return; diff --git a/src/components/Metatags/json-ld-abstract/__tests__/identifier.test.ts b/src/components/Metatags/json-ld-abstract/__tests__/identifier.test.ts index 330abc8b5..4a2f07d2c 100644 --- a/src/components/Metatags/json-ld-abstract/__tests__/identifier.test.ts +++ b/src/components/Metatags/json-ld-abstract/__tests__/identifier.test.ts @@ -1,8 +1,8 @@ -import { describe, expect, it } from 'vitest'; +import { describe, expect, test } from 'vitest'; import { collectIdentifiersFromArray } from '../identifiers'; describe('collectIdentifiersFromArray', () => { - it('parses common IDs from identifier[] only', () => { + test('parses common IDs from identifier[] only', () => { const { identifiers, sameAs } = collectIdentifiersFromArray({ identifier: [ 'arXiv:2503.12263', @@ -40,7 +40,7 @@ describe('collectIdentifiersFromArray', () => { expect(sameAs).toContain('https://www.semanticscholar.org/paper/abcdef'); }); - it('ignores junk without throwing', () => { + test('ignores junk without throwing', () => { const { identifiers, sameAs } = collectIdentifiersFromArray({ identifier: ['', ' ', 'not-an-id', 0 as unknown as string], }); @@ -48,7 +48,7 @@ describe('collectIdentifiersFromArray', () => { expect(Array.isArray(sameAs)).toBe(true); }); - it('dedupes duplicate identifiers and trims spaces/tags', () => { + test('dedupes duplicate identifiers and trims spaces/tags', () => { const { identifiers, sameAs } = collectIdentifiersFromArray({ identifier: [ 'arXiv:2503.12263', @@ -73,4 +73,16 @@ describe('collectIdentifiersFromArray', () => { expect(sa.has('https://hdl.handle.net/1234/abc')).toBe(true); expect(sa.size).toBe(3); }); + + test('encodes special characters in DOI sameAs URL', () => { + const { sameAs } = collectIdentifiersFromArray({ + identifier: ['10.1002/1521-3994(199908)320:4/5<163::AID-ASNA163>3.0.CO;2-#'], + }); + const doiLink = sameAs.find((u) => u.startsWith('https://doi.org/')); + expect(doiLink).toBeDefined(); + expect(doiLink).not.toContain('#'); + expect(doiLink).toContain('%23'); + expect(doiLink).not.toContain('<'); + expect(doiLink).toContain('%3C'); + }); }); diff --git a/src/components/Metatags/json-ld-abstract/identifiers.ts b/src/components/Metatags/json-ld-abstract/identifiers.ts index 08fb41d55..6dd6870b1 100644 --- a/src/components/Metatags/json-ld-abstract/identifiers.ts +++ b/src/components/Metatags/json-ld-abstract/identifiers.ts @@ -1,4 +1,5 @@ import type { PropertyValue } from 'schema-dts'; +import { encodeDOIPath } from '@/utils/common/encodeDOI'; /** * Minimal structure containing only identifiers we parse. @@ -68,7 +69,7 @@ function buildSameAs(pvs: PropertyValue[]) { const v = String(value); switch (propertyID) { case 'DOI': - out.add(`https://doi.org/${v}`); + out.add(`https://doi.org/${encodeDOIPath(v)}`); break; case 'arXiv': out.add(`https://arxiv.org/abs/${v}`); diff --git a/src/lib/serverside/__tests__/absCanonicalization.test.ts b/src/lib/serverside/__tests__/absCanonicalization.test.ts index 545575595..1b96b2d0a 100644 --- a/src/lib/serverside/__tests__/absCanonicalization.test.ts +++ b/src/lib/serverside/__tests__/absCanonicalization.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it, beforeEach, beforeAll, afterAll, vi } from 'vitest'; +import { describe, expect, test, beforeEach, beforeAll, afterAll, vi } from 'vitest'; import type { GetServerSidePropsContext } from 'next'; import { createAbsGetServerSideProps } from '../absCanonicalization'; @@ -77,7 +77,7 @@ beforeEach(() => { }); describe('createAbsGetServerSideProps', () => { - it('redirects to canonical bibcode with encoding and preserves query', async () => { + test('redirects to canonical bibcode with encoding and preserves query', async () => { fetchMock.mockResolvedValue({ ok: true, json: async () => ({ @@ -100,7 +100,7 @@ describe('createAbsGetServerSideProps', () => { } }); - it('redirects for other views', async () => { + test('redirects for other views', async () => { fetchMock.mockResolvedValue({ ok: true, json: async () => ({ @@ -123,7 +123,7 @@ describe('createAbsGetServerSideProps', () => { } }); - it('returns props when identifier is already canonical', async () => { + test('returns props when identifier is already canonical', async () => { const bibcode = 'MATCHING'; fetchMock.mockResolvedValue({ ok: true, @@ -150,7 +150,7 @@ describe('createAbsGetServerSideProps', () => { ); }); - it('forwards tracing headers to the search API', async () => { + test('forwards tracing headers to the search API', async () => { fetchMock.mockResolvedValue({ ok: true, json: async () => ({ @@ -182,7 +182,7 @@ describe('createAbsGetServerSideProps', () => { ); }); - it('does not redirect when no docs are returned', async () => { + test('does not redirect when no docs are returned', async () => { fetchMock.mockResolvedValue({ ok: true, json: async () => ({ @@ -201,4 +201,53 @@ describe('createAbsGetServerSideProps', () => { expect(result).not.toHaveProperty('redirect'); expect(result).toHaveProperty('props'); }); + + test('retries with # appended for DOIs when not found and redirects to canonical bibcode', async () => { + // DOIs like 10.1002/...3.0.CO;2-# end with '#', which browsers strip as a URL + // fragment when the character is not percent-encoded. The fallback detects this + // by retrying with '#' appended (DOIs only) and redirecting to the canonical bibcode URL. + fetchMock + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ response: { docs: [] } }), + }) + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ response: { docs: [{ bibcode: '1999AN....320..163H' }] } }), + }); + + const ctx = buildCtx({ + id: '10.1002/1521-3994(199908)320:4/5<163::AID-ASNA163>3.0.CO;2-', + resolvedUrl: '/abs/10.1002/1521-3994(199908)320:4/5<163::AID-ASNA163>3.0.CO;2-/abstract', + }); + + const gssp = createAbsGetServerSideProps('abstract'); + const result = await gssp(ctx); + + expect(fetchMock).toHaveBeenCalledTimes(2); + expect(result).toHaveProperty('redirect'); + if ('redirect' in result) { + expect(result.redirect?.destination).toBe('/abs/1999AN....320..163H/abstract'); + expect(result.redirect?.statusCode).toBe(302); + } + }); + + test('does not retry with # for non-DOI identifiers', async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => ({ response: { docs: [] } }), + }); + + const ctx = buildCtx({ + id: '2024ApJ...123..456X', + resolvedUrl: '/abs/2024ApJ...123..456X/abstract', + }); + + const gssp = createAbsGetServerSideProps('abstract'); + const result = await gssp(ctx); + + expect(fetchMock).toHaveBeenCalledTimes(1); + expect(result).not.toHaveProperty('redirect'); + expect(result).toHaveProperty('props'); + }); }); diff --git a/src/lib/serverside/absCanonicalization.ts b/src/lib/serverside/absCanonicalization.ts index 8579f57cb..2304b2282 100644 --- a/src/lib/serverside/absCanonicalization.ts +++ b/src/lib/serverside/absCanonicalization.ts @@ -152,6 +152,47 @@ const absCanonicalize = (viewPathResolver: ViewPathResolver): IncomingGSSP => { ctx.res.setHeader('Cache-Control', 's-maxage=60, stale-while-revalidate=300'); const initialDoc = data?.response?.docs?.[0] ?? null; + + // Some Wiley DOIs end with '#', which browsers strip as a URL fragment when the + // character is not percent-encoded. Retry once with '#' appended and redirect to + // the canonical bibcode URL so the page always loads correctly. + // Scoped to DOIs only (10.NNNN/ prefix) to avoid the extra round-trip for + // bibcodes, arXiv IDs, and other identifiers that can never end with '#'. + const isDoi = /^10\.\d{4,}\//.test(requestedId); + if (!initialDoc && isDoi && !requestedId.endsWith('#')) { + try { + const retryId = requestedId + '#'; + const retryUrl = new URL(`${process.env.API_HOST_SERVER}${ApiTargets.SEARCH}`); + retryUrl.search = stringifySearchParams(getAbstractParams(retryId)); + const retryResponse = await fetch(retryUrl, { + headers: { + Authorization: `Bearer ${bootstrapResult.token.access_token}`, + ...tracingHeaders, + }, + }); + if (retryResponse.ok) { + const retryData = (await retryResponse.json()) as IADSApiSearchResponse; + const retryDoc = retryData?.response?.docs?.[0] ?? null; + if (retryDoc?.bibcode) { + const requestUrl = new URL(ctx.req.url ?? ctx.resolvedUrl, 'http://adsabs.local'); + log.info({ requestedId, retryId, bibcode: retryDoc.bibcode, viewPath }, 'Hash fallback redirect'); + return { + redirect: { + destination: buildRedirect({ + canonicalIdentifier: retryDoc.bibcode, + viewPath, + search: requestUrl.search, + }), + statusCode: 302, + }, + }; + } + } + } catch (retryError) { + log.warn({ err: retryError, requestedId }, 'Hash fallback retry failed'); + } + } + const canonicalIdentifier = initialDoc?.bibcode; if (canonicalIdentifier && canonicalIdentifier !== requestedId) { diff --git a/src/middleware.ts b/src/middleware.ts index 5af8d0c99..e8d4719b2 100644 --- a/src/middleware.ts +++ b/src/middleware.ts @@ -147,7 +147,7 @@ export const normalizeAbsPath = ( let view = 'abstract'; let idSegments: string[] = []; - const hasEncodedId = parts.some((segment, idx) => idx > 0 && segment.includes('%2F')); + const hasEncodedId = parts.some((segment, idx) => idx > 0 && /%2F/i.test(segment)); if (parts.length >= 3 && parts[parts.length - 2] === 'exportcitation') { view = `exportcitation/${parts[parts.length - 1]}`; @@ -163,7 +163,11 @@ export const normalizeAbsPath = ( } else if (parts.length === 3) { idSegments = parts.slice(1); // treat as no explicit view, keep both segments } else if (parts.length > 3) { - idSegments = parts.slice(1, -1); // drop trailing unknown segment + // Keep all segments: there is no reliable way to distinguish an unknown + // view token from a legitimate DOI suffix component. Preserving the full + // identifier is safer — a 404 for a bad DOI is better than silently + // routing to the wrong paper. + idSegments = parts.slice(1); } else { idSegments = parts.slice(1); } @@ -181,7 +185,16 @@ export const normalizeAbsPath = ( } const rawIdentifier = idSegments.join('/'); - const encodedIdentifier = hasEncodedId ? rawIdentifier : encodeURIComponent(rawIdentifier); + // Decode first to avoid double-encoding any %-sequences already present in the + // path (e.g. %3C/%3E for < and > in a DOI), then re-encode cleanly. + const safeDecoded = (() => { + try { + return decodeURIComponent(rawIdentifier); + } catch { + return rawIdentifier; + } + })(); + const encodedIdentifier = hasEncodedId ? rawIdentifier : encodeURIComponent(safeDecoded); const rewrittenPath = `/abs/${encodedIdentifier}/${view}`; return { shouldRewrite: true, rewrittenPath, rawIdentifier, view }; diff --git a/src/middleware/__tests__/rewrite.test.ts b/src/middleware/__tests__/rewrite.test.ts index a77da56d2..990e177b0 100644 --- a/src/middleware/__tests__/rewrite.test.ts +++ b/src/middleware/__tests__/rewrite.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it } from 'vitest'; +import { describe, expect, test } from 'vitest'; import { NextRequest } from 'next/server'; import { normalizeAbsPath, rewriteAbsIdentifier } from '@/middleware'; @@ -6,61 +6,93 @@ import { normalizeAbsPath, rewriteAbsIdentifier } from '@/middleware'; const buildRequest = (path: string) => new NextRequest(`http://localhost:8000${path}`); describe.concurrent('normalizeAbsPath', () => { - it('rewrites multi-segment DOI with view', () => { + test('rewrites multi-segment DOI with view', () => { const result = normalizeAbsPath('/abs/10.48550/arXiv.2507.19320/citations'); expect(result.shouldRewrite).toBe(true); expect(result.rewrittenPath).toBe('/abs/10.48550%2FarXiv.2507.19320/citations'); }); - it('rewrites exportcitation with format', () => { + test('rewrites exportcitation with format', () => { const result = normalizeAbsPath('/abs/10.48550/arXiv.2507.19320/exportcitation/bibtex'); expect(result.shouldRewrite).toBe(true); expect(result.rewrittenPath).toBe('/abs/10.48550%2FarXiv.2507.19320/exportcitation/bibtex'); }); - it('does not rewrite single-segment id with known view', () => { + test('does not rewrite single-segment id with known view', () => { const result = normalizeAbsPath('/abs/2026JHEAp..5000470P/abstract'); expect(result.shouldRewrite).toBe(false); }); - it('rewrites multi-segment id without explicit view to abstract', () => { + test('rewrites multi-segment id without explicit view to abstract', () => { const result = normalizeAbsPath('/abs/10.48550/arXiv.2507.19320'); expect(result.shouldRewrite).toBe(true); expect(result.rewrittenPath).toBe('/abs/10.48550%2FarXiv.2507.19320/abstract'); }); - it('rewrites unknown view to abstract when multi-segment id', () => { + test('preserves unknown trailing segment as part of DOI identifier', () => { + // There is no safe way to distinguish an unknown view token from a DOI suffix + // component — keep all segments rather than risk dropping a real DOI part. const result = normalizeAbsPath('/abs/10.48550/arXiv.2507.19320/unknown'); expect(result.shouldRewrite).toBe(true); - expect(result.rewrittenPath).toBe('/abs/10.48550%2FarXiv.2507.19320/abstract'); + expect(result.rewrittenPath).toBe('/abs/10.48550%2FarXiv.2507.19320%2Funknown/abstract'); }); - it('rewrites encoded id with unknown view to abstract', () => { + test('rewrites encoded id with unknown view to abstract', () => { const result = normalizeAbsPath('/abs/10.48550%2FarXiv.2507.19320/unknown'); expect(result.shouldRewrite).toBe(true); expect(result.rewrittenPath).toBe('/abs/10.48550%2FarXiv.2507.19320/abstract'); }); - it('skips rewrite for already encoded id', () => { + test('skips rewrite for already encoded id', () => { const result = normalizeAbsPath('/abs/10.48550%2FarXiv.2507.19320/metrics'); expect(result.shouldRewrite).toBe(false); }); - it('ignores non-abs paths', () => { + test('ignores non-abs paths', () => { const result = normalizeAbsPath('/search?q=test'); expect(result.shouldRewrite).toBe(false); }); + + test('reassembles multi-segment DOI with special chars when no view is present', () => { + // Browser strips '#/abstract' from .../3.0.CO;2-#/abstract before sending to server. + // The server receives the path without '#' and without the view segment. + // The middleware should keep all segments (special chars rule out a view token). + const result = normalizeAbsPath('/abs/10.1002/1521-3994(199908)320:4/5<163::AID-ASNA163>3.0.CO;2-'); + expect(result.shouldRewrite).toBe(true); + expect(result.rewrittenPath).toBe( + '/abs/10.1002%2F1521-3994(199908)320%3A4%2F5%3C163%3A%3AAID-ASNA163%3E3.0.CO%3B2-/abstract', + ); + }); + + test('handles pre-encoded special chars in DOI without double-encoding', () => { + // <> encoded as %3C/%3E in the URL — the middleware must not further encode the % + // into %25, which would produce %253C and cause a no-results lookup. + const result = normalizeAbsPath('/abs/10.1002/1521-3994(199908)320:4/5%3C163::AID-ASNA163%3E3.0.CO;2-'); + expect(result.shouldRewrite).toBe(true); + expect(result.rewrittenPath).toBe( + '/abs/10.1002%2F1521-3994(199908)320%3A4%2F5%3C163%3A%3AAID-ASNA163%3E3.0.CO%3B2-/abstract', + ); + }); + + test('skips rewrite for pre-encoded # DOI — treated as single-segment id', () => { + // When the link was generated with encodeURIComponent the # becomes %23 and the + // whole DOI is a single path segment; no multi-segment rewrite needed. + const result = normalizeAbsPath( + '/abs/10.1002%2F1521-3994(199908)320%3A4%2F5%3C163%3A%3AAID-ASNA163%3E3.0.CO%3B2-%23/abstract', + ); + expect(result.shouldRewrite).toBe(false); + }); }); describe.concurrent('rewriteAbsIdentifier', () => { - it('sets middleware rewrite header for multi-segment DOI', () => { + test('sets middleware rewrite header for multi-segment DOI', () => { const req = buildRequest('/abs/10.48550/arXiv.2507.19320/citations'); const res = rewriteAbsIdentifier(req); expect(res).not.toBeNull(); expect(res?.headers.get('x-middleware-rewrite')).toContain('/abs/10.48550%2FarXiv.2507.19320/citations'); }); - it('returns null for single-segment id + view', () => { + test('returns null for single-segment id + view', () => { const req = buildRequest('/abs/2026JHEAp..5000470P/abstract'); const res = rewriteAbsIdentifier(req); expect(res).toBeNull(); diff --git a/src/middlewares/__tests__/middleware-helpers.test.ts b/src/middlewares/__tests__/middleware-helpers.test.ts index cff7b4e2f..12f550183 100644 --- a/src/middlewares/__tests__/middleware-helpers.test.ts +++ b/src/middlewares/__tests__/middleware-helpers.test.ts @@ -1,4 +1,4 @@ -import { afterAll, afterEach, beforeAll, describe, expect, it, vi } from 'vitest'; +import { afterAll, afterEach, beforeAll, describe, expect, test, vi } from 'vitest'; import { NextRequest } from 'next/server'; import { webcrypto } from 'crypto'; import { normalizeAbsPath } from '@/middleware'; @@ -8,12 +8,12 @@ import { isFromLegacyApp } from '@/utils/legacyAppDetection'; import { extractToken } from '@/middlewares/verifyMiddleware'; describe('normalizeAbsPath', () => { - it('returns shouldRewrite=false for non-abs paths or insufficient segments', () => { + test('returns shouldRewrite=false for non-abs paths or insufficient segments', () => { expect(normalizeAbsPath('/search')).toEqual({ shouldRewrite: false }); expect(normalizeAbsPath('/abs/123')).toEqual({ shouldRewrite: false }); }); - it('rewrites multi-part identifiers without explicit view', () => { + test('rewrites multi-part identifiers without explicit view', () => { const result = normalizeAbsPath('/abs/123/456'); expect(result).toEqual({ shouldRewrite: true, @@ -23,7 +23,7 @@ describe('normalizeAbsPath', () => { }); }); - it('rewrites known view paths with multi-part IDs', () => { + test('rewrites known view paths with multi-part IDs', () => { const result = normalizeAbsPath('/abs/123/456/abstract'); expect(result).toEqual({ shouldRewrite: true, @@ -33,7 +33,7 @@ describe('normalizeAbsPath', () => { }); }); - it('handles exportcitation special-case view', () => { + test('handles exportcitation special-case view', () => { const result = normalizeAbsPath('/abs/123/456/exportcitation/csl'); expect(result).toEqual({ shouldRewrite: true, @@ -43,7 +43,7 @@ describe('normalizeAbsPath', () => { }); }); - it('preserves already-encoded identifiers', () => { + test('preserves already-encoded identifiers', () => { const result = normalizeAbsPath('/abs/123%2F456/abstract'); expect(result).toEqual({ shouldRewrite: false }); }); @@ -62,7 +62,7 @@ describe('session helpers', () => { }); describe('isUserData', () => { - it('validates presence of required fields', () => { + test('validates presence of required fields', () => { const validUser = { access_token: 'token', expires_at: '9999999999', @@ -89,7 +89,7 @@ describe('session helpers', () => { vi.useRealTimers(); }); - it('returns false when expiry is in the future and true when in the past', () => { + test('returns false when expiry is in the future and true when in the past', () => { vi.setSystemTime(new Date('2023-01-01T00:00:00Z')); const nowSeconds = Math.floor(new Date('2023-01-01T00:00:00Z').getTime() / 1000); expect(isTokenExpired(String(nowSeconds + 10))).toBe(false); @@ -99,7 +99,7 @@ describe('session helpers', () => { }); describe('isValidToken', () => { - it('requires valid user data and non-expired token', () => { + test('requires valid user data and non-expired token', () => { const validUser = { access_token: 'token', expires_at: `${Math.floor(Date.now() / 1000) + 60}`, @@ -113,7 +113,7 @@ describe('session helpers', () => { }); describe('isAuthenticated', () => { - it('treats non-anonymous or non-default anonymous users as authenticated', () => { + test('treats non-anonymous or non-default anonymous users as authenticated', () => { const baseUser = { access_token: 'token', expires_at: '9999999999', @@ -127,16 +127,16 @@ describe('session helpers', () => { }); describe('hash', () => { - it('returns SHA-1 hex digest for a string', async () => { + test('returns SHA-1 hex digest for a string', async () => { await expect(hash('abc')).resolves.toBe('a9993e364706816aba3e25717850c26c9cd0d89d'); }); - it('returns empty string for empty input', async () => { + test('returns empty string for empty input', async () => { await expect(hash('')).resolves.toBe(''); await expect(hash()).resolves.toBe(''); }); - it('returns empty string when digest throws', async () => { + test('returns empty string when digest throws', async () => { const digestSpy = vi.spyOn(globalThis.crypto.subtle, 'digest').mockRejectedValueOnce(new Error('boom')); await expect(hash('abc')).resolves.toBe(''); digestSpy.mockRestore(); @@ -145,14 +145,14 @@ describe('session helpers', () => { }); describe('legacy search helpers', () => { - it('detects legacy /search paths that should redirect', () => { + test('detects legacy /search paths that should redirect', () => { expect(isLegacySearchURL(new NextRequest('https://example.com/search/q=star'))).toBe(true); expect(isLegacySearchURL(new NextRequest('https://example.com/search/'))).toBe(false); expect(isLegacySearchURL(new NextRequest('https://example.com/search/exportcitation'))).toBe(false); expect(isLegacySearchURL(new NextRequest('https://example.com/abs/123'))).toBe(false); }); - it('redirects legacy /search paths to canonical query string', () => { + test('redirects legacy /search paths to canonical query string', () => { const req = new NextRequest('https://example.com/search/q=foo+bar&fl=abstract'); const res = legacySearchURLMiddleware(req); @@ -167,14 +167,14 @@ describe('legacy search helpers', () => { }); describe('referer helpers', () => { - it('identifies referers from legacy app domains', () => { + test('identifies referers from legacy app domains', () => { expect(isFromLegacyApp('https://ui.adsabs.harvard.edu/search')).toBe(true); expect(isFromLegacyApp('https://devui.adsabs.harvard.edu/search')).toBe(true); expect(isFromLegacyApp('https://qa.adsabs.harvard.edu/search')).toBe(true); expect(isFromLegacyApp('https://dev.adsabs.harvard.edu/search')).toBe(true); }); - it('returns false for missing or invalid referer', () => { + test('returns false for missing or invalid referer', () => { expect(isFromLegacyApp()).toBe(false); expect(isFromLegacyApp('')).toBe(false); expect(isFromLegacyApp(':::::')).toBe(false); @@ -183,7 +183,7 @@ describe('referer helpers', () => { }); describe('extractToken', () => { - it('extracts route and token segments from verification path', () => { + test('extracts route and token segments from verification path', () => { expect(extractToken('/user/account/verify/change-email/abc123')).toEqual({ route: 'change-email', token: 'abc123', @@ -194,7 +194,7 @@ describe('extractToken', () => { }); }); - it('returns empty strings for non-string paths', () => { + test('returns empty strings for non-string paths', () => { expect(extractToken(undefined as unknown as string)).toEqual({ route: '', token: '' }); }); }); diff --git a/src/utils/common/__tests__/encodeDOI.test.ts b/src/utils/common/__tests__/encodeDOI.test.ts new file mode 100644 index 000000000..766997dbf --- /dev/null +++ b/src/utils/common/__tests__/encodeDOI.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, test } from 'vitest'; +import { encodeDOIPath } from '../encodeDOI'; + +describe('encodeDOIPath', () => { + test('encodes # as %23', () => { + expect(encodeDOIPath('10.1002/1521-3994(199908)320:4/5<163::AID-ASNA163>3.0.CO;2-#')).toBe( + '10.1002/1521-3994(199908)320%3A4/5%3C163%3A%3AAID-ASNA163%3E3.0.CO%3B2-%23', + ); + }); + + test('encodes < and > as %3C and %3E', () => { + expect(encodeDOIPath('10.1000/foobaz')).toBe('10.1000/foo%3Cbar%3Ebaz'); + }); + + test('encodes ? as %3F', () => { + expect(encodeDOIPath('10.1000/foo?bar')).toBe('10.1000/foo%3Fbar'); + }); + + test('encodes space as %20', () => { + expect(encodeDOIPath('10.1000/foo bar')).toBe('10.1000/foo%20bar'); + }); + + test('preserves / as a path separator', () => { + expect(encodeDOIPath('10.48550/arXiv.2507.19320')).toBe('10.48550/arXiv.2507.19320'); + }); + + test('encodes % in pre-encoded input (not idempotent — pass raw DOIs only)', () => { + expect(encodeDOIPath('10.1000/foo%23bar')).toBe('10.1000/foo%2523bar'); + }); + + test('leaves a plain DOI unchanged', () => { + expect(encodeDOIPath('10.1086/345794')).toBe('10.1086/345794'); + }); +}); diff --git a/src/utils/common/encodeDOI.ts b/src/utils/common/encodeDOI.ts new file mode 100644 index 000000000..daf3f5143 --- /dev/null +++ b/src/utils/common/encodeDOI.ts @@ -0,0 +1,10 @@ +/** + * Encodes a DOI value for safe insertion into a URL path. + * + * Uses encodeURIComponent to encode all URL-unsafe characters (#, ?, <, >, [, ], + * spaces, etc.) then restores '/' which is a legitimate path separator in both + * doi.org URLs and ADS gateway URLs. + */ +export function encodeDOIPath(doi: string): string { + return encodeURIComponent(doi).replace(/%2F/gi, '/'); +}