diff --git a/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java b/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java index 953bd8d251f..dc42aca0e9b 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java @@ -759,16 +759,25 @@ private void insertDiacritic(int i, TextPosition diacritic) float[] widths2 = new float[widths.length + 1]; System.arraycopy(widths, 0, widths2, 0, i); + // First we add a zero-width entry for the diacritic in the widths array + widths2[i] = widths[i]; + widths2[i + 1] = 0; + System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1); + // Unicode combining diacritics always go after the base character, regardless of whether // the string is in presentation order or logical order sb.append(unicode.charAt(i)); - widths2[i] = widths[i]; + + // If a surrogate starts at the current position, make sure we preserve it + if (i < unicode.length() - 1 && Character.isSurrogatePair(unicode.charAt(i), unicode.charAt(i + 1))) { + sb.append(unicode.charAt(i + 1)); + i++; + } + sb.append(combineDiacritic(diacritic.getUnicode())); - widths2[i + 1] = 0; // get the rest of the string sb.append(unicode.substring(i + 1)); - System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1); unicode = sb.toString(); widths = widths2; diff --git a/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf b/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf new file mode 100644 index 00000000000..9fc1d4069ca Binary files /dev/null and b/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf differ diff --git a/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf-sorted.txt b/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf-sorted.txt new file mode 100644 index 00000000000..2dd520823d4 --- /dev/null +++ b/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf-sorted.txt @@ -0,0 +1,3 @@ +Firefox file:///home/pablo/invchar +𝑋̂ +1 of 1 18/12/2023, 12:49 diff --git a/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf.txt b/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf.txt new file mode 100644 index 00000000000..03e4dbea436 --- /dev/null +++ b/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf.txt @@ -0,0 +1,3 @@ +𝑋̂ +Firefox file:///home/pablo/invchar +1 of 1 18/12/2023, 12:49