diff --git a/src/Document/Dictionary/DictionaryValue/TextString/TextStringValue.php b/src/Document/Dictionary/DictionaryValue/TextString/TextStringValue.php index b159e0b4..7ef8420c 100644 --- a/src/Document/Dictionary/DictionaryValue/TextString/TextStringValue.php +++ b/src/Document/Dictionary/DictionaryValue/TextString/TextStringValue.php @@ -5,6 +5,7 @@ use Override; use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\DictionaryValue; +use PrinsFrank\PdfParser\Document\Encoding\PDFDocEncoding; use PrinsFrank\PdfParser\Exception\ParseFailureException; /** @api */ @@ -15,42 +16,29 @@ public function __construct( /** @throws ParseFailureException */ public function getText(): string { - if (str_starts_with($this->textStringValue, '(') && str_ends_with($this->textStringValue, ')')) { - $value = preg_replace_callback( - '/\\\\([0-7]{1,3})/', - fn(array $matches) => mb_chr((int) octdec($matches[1])), - substr($this->textStringValue, 1, -1), + if (str_starts_with($this->textStringValue, '/')) { + return preg_replace_callback( + '/#([0-9A-F]{2})/', + fn(array $matches) => chr((int) hexdec($matches[1])), + $this->textStringValue, ) ?? 
throw new ParseFailureException(); - - return str_replace( - ['\\\\', '\n', '\r', '\t', '\b', '\f', '\(', '\)'], - ['\\', "\n", "\r", "\t", "\x08", "\f", '(', ')'], - $value, - ); } - if (str_starts_with($this->textStringValue, '<') && str_ends_with($this->textStringValue, '>')) { - $binaryValue = hex2bin(substr($this->textStringValue, 1, -1)); - if ($binaryValue === false) { - throw new ParseFailureException('Invalid hex string'); - } + $binaryValue = $this->getBinaryString(); - if (str_starts_with($binaryValue, "\xFE\xFF")) { - return mb_convert_encoding(substr($binaryValue, 2), 'UTF-8', 'UTF-16BE'); - } + if (str_starts_with($binaryValue, "\xFE\xFF")) { + return mb_convert_encoding(substr($binaryValue, 2), 'UTF-8', 'UTF-16BE'); + } - return $binaryValue; + if (str_starts_with($binaryValue, "\xFF\xFE")) { + return mb_convert_encoding(substr($binaryValue, 2), 'UTF-8', 'UTF-16LE'); } - if (str_starts_with($this->textStringValue, '/')) { - return preg_replace_callback( - '/#([0-9A-F]{2})/', - fn(array $matches) => chr((int) hexdec($matches[1])), - $this->textStringValue, - ) ?? 
throw new ParseFailureException(); + if (str_starts_with($binaryValue, "\xEF\xBB\xBF")) { + return substr($binaryValue, 3); } - throw new ParseFailureException(sprintf('Unrecognized format %s', $this->textStringValue)); + return PDFDocEncoding::textToUnicode($binaryValue); } public function getBinaryString(): string { diff --git a/tests/Samples/files/issue-127/contents.yml b/tests/Samples/files/issue-127/contents.yml index 573773f1..624173b2 100644 --- a/tests/Samples/files/issue-127/contents.yml +++ b/tests/Samples/files/issue-127/contents.yml @@ -3,7 +3,7 @@ version: '2.0' userPassword: null ownerPassword: null fileEncryptionKey: null -title: !!binary /v8AMQAuADIAIABUAGEAYgBlAGwAbABlAF8AMQBfAEsAQQBfADEAOQAtADAANwAwADUAOQ== +title: '1.2 Tabelle_1_KA_19-07059' producer: 'Kofax Power PDF' author: 'Rehm, Martin (LSN)' creator: 'Kofax Power PDF' diff --git a/tests/Samples/files/issue-235/contents.yml b/tests/Samples/files/issue-235/contents.yml index 66e64c6a..933fee63 100644 --- a/tests/Samples/files/issue-235/contents.yml +++ b/tests/Samples/files/issue-235/contents.yml @@ -6,7 +6,7 @@ fileEncryptionKey: null title: '' producer: 'Adobe PDF Library 20.1.50' author: null -creator: !!binary QWNyb2JhdCBQREZNYWtlciAyMCBm/HIgV29yZA== +creator: 'Acrobat PDFMaker 20 für Word' creationDate: 2025-04-10T10:26:21+02:00 modificationDate: 2025-04-10T10:28:29+02:00 pages: diff --git a/tests/Samples/files/issue-291/contents.yml b/tests/Samples/files/issue-291/contents.yml index ab4214c6..6b308958 100644 --- a/tests/Samples/files/issue-291/contents.yml +++ b/tests/Samples/files/issue-291/contents.yml @@ -4,9 +4,9 @@ userPassword: null ownerPassword: null fileEncryptionKey: null title: "The Netherlands courts have jurisdiction to hear a representative action concerning the alleged anticompetitive conduct of Apple in relation to its App Store aimed at the Netherlands market\r" -producer: !!binary /v8ATQBpAGMAcgBvAHMAbwBmAHQArgAgAFcAbwByAGQAIAAyADAAMgAx +producer: 'Microsoft® Word 
2021' author: null -creator: !!binary /v8ATQBpAGMAcgBvAHMAbwBmAHQArgAgAFcAbwByAGQAIAAyADAAMgAx +creator: 'Microsoft® Word 2021' creationDate: 2025-12-01T16:55:05+01:00 modificationDate: 2025-12-01T16:55:05+01:00 pages: diff --git a/tests/Unit/Document/Dictionary/DictionaryValue/TextString/TextStringValueTest.php b/tests/Unit/Document/Dictionary/DictionaryValue/TextString/TextStringValueTest.php index dac00361..ba495cb6 100644 --- a/tests/Unit/Document/Dictionary/DictionaryValue/TextString/TextStringValueTest.php +++ b/tests/Unit/Document/Dictionary/DictionaryValue/TextString/TextStringValueTest.php @@ -80,6 +80,56 @@ public function testGetTextWithOctalCharacters(): void { ); } + /** @see 7.9.2.2 Text string type — UTF-16BE with a leading byte order mark */ + public function testGetTextConvertsUTF16BEToUTF8(): void { + // "Tïtle" as UTF-16BE (FE FF BOM) written as a hex string + static::assertSame( + 'Tïtle', + (new TextStringValue('<FEFF005400EF0074006C0065>'))->getText(), + ); + + // The same UTF-16BE bytes written as a literal string with octal escapes + static::assertSame( + 'Tïtle', + (new TextStringValue("(\376\377\000T\000\357\000t\000l\000e)"))->getText(), + ); + } + + /** @see 7.9.2.2 Text string type — UTF-16LE with a leading byte order mark */ + public function testGetTextConvertsUTF16LEToUTF8(): void { + static::assertSame( + 'Tïtle', + (new TextStringValue('<FFFE5400EF0074006C006500>'))->getText(), + ); + } + + /** @see 7.9.2.2.1 Text string type — UTF-8 with a leading byte order mark (PDF 2.0) */ + public function testGetTextStripsUTF8BOM(): void { + static::assertSame( + 'Tïtle', + (new TextStringValue('<EFBBBF54C3AF746C65>'))->getText(), + ); + } + + /** @see 7.9.2.2 Text string type — PDFDocEncoding (no byte order mark) is normalized to valid UTF-8 */ + public function testGetTextNormalizesPDFDocEncodingToUTF8(): void { + // 0xFC ("ü", shared with Latin-1) written as a literal octal escape and as a hex string + static::assertSame( + 
'für', + (new TextStringValue('<66FC72>'))->getText(), + ); + + // 0x80 ("•") and 0xA0 ("€") sit in the range where PDFDocEncoding diverges from Latin-1 + static::assertSame( + '•€', + (new TextStringValue('<80A0>'))->getText(), + ); + } + /** @see 7.3.5, table 4 */ public function testGetTextLiteralNames(): void { static::assertSame(