From e4ecc9e065ef1e7e170e7c3472758cd16fef25e2 Mon Sep 17 00:00:00 2001 From: Matteo Polito Date: Wed, 20 Aug 2025 00:59:49 +0200 Subject: [PATCH] Bundle Tesseract native libs --- AGENTS.md | 9 +- README.md | 9 +- docs/funsd_comparison.md | 1 - src/MarkItDownNet/MarkItDownConverter.cs | 94 +++++------------- src/MarkItDownNet/MarkItDownNet.csproj | 27 +---- .../x64/libleptonica-1.85.0.dll.so | 1 - .../TesseractOCR/x64/libtesseract55.dll.so | 1 - .../linux-x64/native}/libdl.so | Bin .../linux-x64/native/libleptonica-1.82.0.so} | Bin .../linux-x64/native}/libopenjp2.so.7 | Bin .../linux-x64/native/libtesseract50.so} | Bin tests/MarkItDownNet.Tests/LeptonicaTests.cs | 73 -------------- .../MarkItDownNet.Tests.csproj | 2 +- tests/MarkItDownNet.Tests/OcrImageTests.cs | 9 -- tests/MarkItDownNet.Tests/OcrPdfTests.cs | 7 -- tests/MarkItDownNet.Tests/OcrTestHelpers.cs | 15 --- .../DoclingComparison.csproj | 2 +- tools/DoclingComparison/Program.cs | 2 - tools/DoclingTimings/DoclingTimings.csproj | 2 +- tools/DoclingTimings/Program.cs | 2 - tools/FunsdComparison/FunsdComparison.csproj | 2 +- tools/FunsdComparison/Program.cs | 4 +- 22 files changed, 34 insertions(+), 228 deletions(-) delete mode 100644 src/MarkItDownNet/TesseractOCR/x64/libleptonica-1.85.0.dll.so delete mode 100644 src/MarkItDownNet/TesseractOCR/x64/libtesseract55.dll.so rename src/MarkItDownNet/{TesseractOCR/x64 => runtimes/linux-x64/native}/libdl.so (100%) rename src/MarkItDownNet/{TesseractOCR/x64/liblept.so.5 => runtimes/linux-x64/native/libleptonica-1.82.0.so} (100%) rename src/MarkItDownNet/{TesseractOCR/x64 => runtimes/linux-x64/native}/libopenjp2.so.7 (100%) rename src/MarkItDownNet/{TesseractOCR/x64/libtesseract.so.5 => runtimes/linux-x64/native/libtesseract50.so} (100%) delete mode 100644 tests/MarkItDownNet.Tests/LeptonicaTests.cs delete mode 100644 tests/MarkItDownNet.Tests/OcrTestHelpers.cs diff --git a/AGENTS.md b/AGENTS.md index bbe311e..dd66019 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,14 
+27,7 @@ - Use `Serilog__MinimumLevel=Verbose` to enable detailed timings and counts. ## Operations -Le librerie native minime per Linux `x64` sono disponibili in `src/MarkItDownNet/TesseractOCR/x64` e vengono copiate accanto ai binari (`x64`) ad eccezione di `libdl.so`, posizionata in `runtimes/linux-x64/native`: - -* `libopenjp2.so.7` -* `liblept.so.5` con il symlink `libleptonica-1.85.0.dll.so` -* `libtesseract.so.5` con il symlink `libtesseract55.dll.so` -* `libdl.so` - -Grazie a queste dipendenze la libreria è auto‑consistente e **non richiede l'installazione di Tesseract o Leptonica**. +Le dipendenze native minime per Linux `x64` (Tesseract e Leptonica) sono incluse nel repository in `runtimes/linux-x64/native` e vengono copiate accanto ai binari. Non è richiesta l'installazione separata di Tesseract. Per l'OCR servono solo i dati delle lingue. Su Ubuntu 24.04 possono essere installati con: diff --git a/README.md b/README.md index f8f2eec..2b7a19f 100644 --- a/README.md +++ b/README.md @@ -46,14 +46,7 @@ All build and test commands must use the locally installed `dotnet`: ## Tesseract and leptonica -Per l'esecuzione su Linux `x64` il progetto include solo le librerie native strettamente necessarie. Tesseract e Leptonica risiedono nella sottocartella `x64` accanto ai binari, mentre `libdl.so` è collocata sotto `runtimes/linux-x64/native` per soddisfare il loader di `TesseractOCR`: - -* `libopenjp2.so.7` -* `liblept.so.5` e il symlink `libleptonica-1.85.0.dll.so` -* `libtesseract.so.5` e il symlink `libtesseract55.dll.so` -* `libdl.so` - -Grazie a queste dipendenze pre‑caricate la libreria è *auto‑consistente* e **non richiede l'installazione di Tesseract o Leptonica sul sistema**. +La libreria include le dipendenze native minime per Linux `x64` in `runtimes/linux-x64/native` e non richiede l'installazione di Tesseract o Leptonica sul sistema. Per eseguire l'OCR è necessario soltanto fornire i file `tessdata` delle lingue. 
Su Ubuntu 24.04 è sufficiente installare i pacchetti delle lingue desiderate, ad esempio: diff --git a/docs/funsd_comparison.md b/docs/funsd_comparison.md index ce92dd3..e5e2316 100644 --- a/docs/funsd_comparison.md +++ b/docs/funsd_comparison.md @@ -61,6 +61,5 @@ Confronto tra le bounding box delle parole annotate nel dataset di test FUNSD e ```bash export PATH=$HOME/.dotnet:$PATH -export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$PWD/tools/FunsdComparison/bin/Debug/net9.0/runtimes/linux-x64/native dotnet run --project tools/FunsdComparison ``` diff --git a/src/MarkItDownNet/MarkItDownConverter.cs b/src/MarkItDownNet/MarkItDownConverter.cs index ffbe6c7..eb34aae 100644 --- a/src/MarkItDownNet/MarkItDownConverter.cs +++ b/src/MarkItDownNet/MarkItDownConverter.cs @@ -2,14 +2,11 @@ using System.Collections.Generic; using System.IO; using System.Linq; -using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using Markdig; using Serilog; -using TesseractOCR; -using TesseractOCR.Enums; -using TesseractOCR.InteropDotNet; +using Tesseract; using UglyToad.PdfPig; using UglyToad.PdfPig.Content; using PDFtoImage; @@ -23,40 +20,6 @@ public class MarkItDownConverter private readonly MarkItDownOptions _options; private readonly ILogger _logger; - static MarkItDownConverter() - { - if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - var libDir = Path.Combine(AppContext.BaseDirectory, "x64"); - var ldPath = Environment.GetEnvironmentVariable("LD_LIBRARY_PATH"); - Environment.SetEnvironmentVariable("LD_LIBRARY_PATH", string.IsNullOrEmpty(ldPath) ? 
libDir : libDir + ":" + ldPath); - LibraryLoader.Instance.CustomSearchPath = libDir; - LoadNative("libopenjp2.so.7"); - LoadNative("libleptonica-1.85.0.dll.so"); - LoadNative("libtesseract.so.5"); - } - } - - private const int RTLD_NOW = 2; - private const int RTLD_GLOBAL = 0x100; - - [DllImport("libdl.so.2")] - private static extern IntPtr dlopen(string fileName, int flags); - - private static void LoadNative(string name) - { - var path = Path.Combine(AppContext.BaseDirectory, "x64", name); - if (!File.Exists(path)) - { - throw new DllNotFoundException($"Unable to find '{name}' at '{path}'"); - } - var handle = dlopen(path, RTLD_NOW | RTLD_GLOBAL); - if (handle == IntPtr.Zero) - { - throw new DllNotFoundException($"Unable to load '{name}' from '{path}'"); - } - } - public MarkItDownConverter(MarkItDownOptions? options = null, ILogger? logger = null) { _options = options ?? new MarkItDownOptions(); @@ -136,7 +99,7 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) pages.Add(new Page(pages.Count + 1, bitmap.Width, bitmap.Height)); using var image = SKImage.FromBitmap(bitmap); using var data = image.Encode(SKEncodedImageFormat.Png, 100); - using var pix = TesseractOCR.Pix.Image.LoadFromMemory(data.ToArray()); + using var pix = Pix.LoadFromMemory(data.ToArray()); var result = ProcessPix(pix, pages.Count, ct); lines.AddRange(result.lines); words.AddRange(result.words); @@ -149,51 +112,44 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) private MarkItDownResult ProcessImage(string path, CancellationToken ct) { - using var pix = TesseractOCR.Pix.Image.LoadFromFile(path); + using var pix = Pix.LoadFromFile(path); var (lines, words) = ProcessPix(pix, 1, ct); var pages = new List { new Page(1, pix.Width, pix.Height) }; var markdown = BuildMarkdown(lines); return new MarkItDownResult(markdown, pages, lines, words); } - private (List lines, List words) ProcessPix(TesseractOCR.Pix.Image pix, int pageNumber, 
CancellationToken ct) + private (List lines, List words) ProcessPix(Pix pix, int pageNumber, CancellationToken ct) { var lines = new List(); var words = new List(); - using var engine = new Engine(_options.OcrDataPath ?? string.Empty, _options.OcrLanguages, EngineMode.Default); + using var engine = new TesseractEngine(_options.OcrDataPath ?? string.Empty, _options.OcrLanguages, EngineMode.Default); using var page = engine.Process(pix); - - foreach (var block in page.Layout) + using var iter = page.GetIterator(); + iter.Begin(); + do { - foreach (var paragraph in block.Paragraphs) + ct.ThrowIfCancellationRequested(); + + if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine) && + iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out var rectLine)) { - foreach (var textLine in paragraph.TextLines) + var text = iter.GetText(PageIteratorLevel.TextLine)?.Trim() ?? string.Empty; + if (!string.IsNullOrEmpty(text)) { - ct.ThrowIfCancellationRequested(); - if (textLine.BoundingBox is Rect rectLine) - { - var text = textLine.Text?.Trim() ?? string.Empty; - if (!string.IsNullOrEmpty(text)) - { - lines.Add(new Line(pageNumber, text, Normalize(rectLine, pix.Width, pix.Height))); - } - } - - foreach (var word in textLine.Words) - { - ct.ThrowIfCancellationRequested(); - if (word.BoundingBox is Rect rectWord) - { - var wText = word.Text?.Trim() ?? string.Empty; - if (!string.IsNullOrEmpty(wText)) - { - words.Add(new Word(pageNumber, wText, Normalize(rectWord, pix.Width, pix.Height))); - } - } - } + lines.Add(new Line(pageNumber, text, Normalize(rectLine, pix.Width, pix.Height))); } } - } + + if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var rectWord)) + { + var wText = iter.GetText(PageIteratorLevel.Word)?.Trim() ?? 
string.Empty; + if (!string.IsNullOrEmpty(wText)) + { + words.Add(new Word(pageNumber, wText, Normalize(rectWord, pix.Width, pix.Height))); + } + } + } while (iter.Next(PageIteratorLevel.Word)); return (lines, words); } diff --git a/src/MarkItDownNet/MarkItDownNet.csproj b/src/MarkItDownNet/MarkItDownNet.csproj index cb863b4..af500df 100644 --- a/src/MarkItDownNet/MarkItDownNet.csproj +++ b/src/MarkItDownNet/MarkItDownNet.csproj @@ -13,36 +13,13 @@ - + - - x64/libtesseract.so.5 - PreserveNewest - - - x64/libtesseract55.dll.so - PreserveNewest - - - x64/liblept.so.5 - PreserveNewest - - - x64/libleptonica-1.85.0.dll.so - PreserveNewest - - - x64/libopenjp2.so.7 - PreserveNewest - - - runtimes/linux-x64/native/libdl.so - PreserveNewest - + diff --git a/src/MarkItDownNet/TesseractOCR/x64/libleptonica-1.85.0.dll.so b/src/MarkItDownNet/TesseractOCR/x64/libleptonica-1.85.0.dll.so deleted file mode 100644 index b962e75..0000000 --- a/src/MarkItDownNet/TesseractOCR/x64/libleptonica-1.85.0.dll.so +++ /dev/null @@ -1 +0,0 @@ -liblept.so.5 \ No newline at end of file diff --git a/src/MarkItDownNet/TesseractOCR/x64/libtesseract55.dll.so b/src/MarkItDownNet/TesseractOCR/x64/libtesseract55.dll.so deleted file mode 100644 index 68a9149..0000000 --- a/src/MarkItDownNet/TesseractOCR/x64/libtesseract55.dll.so +++ /dev/null @@ -1 +0,0 @@ -libtesseract.so.5 \ No newline at end of file diff --git a/src/MarkItDownNet/TesseractOCR/x64/libdl.so b/src/MarkItDownNet/runtimes/linux-x64/native/libdl.so similarity index 100% rename from src/MarkItDownNet/TesseractOCR/x64/libdl.so rename to src/MarkItDownNet/runtimes/linux-x64/native/libdl.so diff --git a/src/MarkItDownNet/TesseractOCR/x64/liblept.so.5 b/src/MarkItDownNet/runtimes/linux-x64/native/libleptonica-1.82.0.so similarity index 100% rename from src/MarkItDownNet/TesseractOCR/x64/liblept.so.5 rename to src/MarkItDownNet/runtimes/linux-x64/native/libleptonica-1.82.0.so diff --git a/src/MarkItDownNet/TesseractOCR/x64/libopenjp2.so.7 
b/src/MarkItDownNet/runtimes/linux-x64/native/libopenjp2.so.7 similarity index 100% rename from src/MarkItDownNet/TesseractOCR/x64/libopenjp2.so.7 rename to src/MarkItDownNet/runtimes/linux-x64/native/libopenjp2.so.7 diff --git a/src/MarkItDownNet/TesseractOCR/x64/libtesseract.so.5 b/src/MarkItDownNet/runtimes/linux-x64/native/libtesseract50.so similarity index 100% rename from src/MarkItDownNet/TesseractOCR/x64/libtesseract.so.5 rename to src/MarkItDownNet/runtimes/linux-x64/native/libtesseract50.so diff --git a/tests/MarkItDownNet.Tests/LeptonicaTests.cs b/tests/MarkItDownNet.Tests/LeptonicaTests.cs deleted file mode 100644 index 0f4a13b..0000000 --- a/tests/MarkItDownNet.Tests/LeptonicaTests.cs +++ /dev/null @@ -1,73 +0,0 @@ -using System; -using System.Runtime.InteropServices; -using System.Runtime.CompilerServices; -using Xunit; -using MarkItDownNet; - -namespace MarkItDownNet.Tests; - -public class LeptonicaTests -{ - private const string LeptonicaDll = "x64/libleptonica-1.85.0.dll.so"; - - static LeptonicaTests() - { - RuntimeHelpers.RunClassConstructor(typeof(MarkItDownConverter).TypeHandle); - } - - [DllImport(LeptonicaDll, CallingConvention = CallingConvention.Cdecl)] - private static extern IntPtr pixCreate(int width, int height, int depth); - - [DllImport(LeptonicaDll, CallingConvention = CallingConvention.Cdecl)] - private static extern int pixGetWidth(IntPtr pix); - - [DllImport(LeptonicaDll, CallingConvention = CallingConvention.Cdecl)] - private static extern int pixGetHeight(IntPtr pix); - - [DllImport(LeptonicaDll, CallingConvention = CallingConvention.Cdecl)] - private static extern int pixGetDepth(IntPtr pix); - - [DllImport(LeptonicaDll, CallingConvention = CallingConvention.Cdecl)] - private static extern int pixSetPixel(IntPtr pix, int x, int y, uint value); - - [DllImport(LeptonicaDll, CallingConvention = CallingConvention.Cdecl)] - private static extern int pixGetPixel(IntPtr pix, int x, int y, out uint value); - - [DllImport(LeptonicaDll, 
CallingConvention = CallingConvention.Cdecl)] - private static extern void pixDestroy(ref IntPtr pix); - - [Fact] - public void PixCreate_ShouldReturnCorrectDimensions() - { - IntPtr pix = pixCreate(100, 200, 8); - try - { - Assert.NotEqual(IntPtr.Zero, pix); - Assert.Equal(100, pixGetWidth(pix)); - Assert.Equal(200, pixGetHeight(pix)); - Assert.Equal(8, pixGetDepth(pix)); - } - finally - { - pixDestroy(ref pix); - } - } - - [Fact] - public void PixSetPixel_ShouldRoundTripValue() - { - IntPtr pix = pixCreate(1, 1, 8); - try - { - uint expected = 123; - Assert.Equal(0, pixSetPixel(pix, 0, 0, expected)); - uint actual; - Assert.Equal(0, pixGetPixel(pix, 0, 0, out actual)); - Assert.Equal(expected, actual); - } - finally - { - pixDestroy(ref pix); - } - } -} diff --git a/tests/MarkItDownNet.Tests/MarkItDownNet.Tests.csproj b/tests/MarkItDownNet.Tests/MarkItDownNet.Tests.csproj index bf48396..3d5e84e 100644 --- a/tests/MarkItDownNet.Tests/MarkItDownNet.Tests.csproj +++ b/tests/MarkItDownNet.Tests/MarkItDownNet.Tests.csproj @@ -14,7 +14,7 @@ - + diff --git a/tests/MarkItDownNet.Tests/OcrImageTests.cs b/tests/MarkItDownNet.Tests/OcrImageTests.cs index 55cc553..77aa4d7 100644 --- a/tests/MarkItDownNet.Tests/OcrImageTests.cs +++ b/tests/MarkItDownNet.Tests/OcrImageTests.cs @@ -11,15 +11,6 @@ public class OcrImageTests [Fact] public async Task Can_extract_text_from_simple_png() { - try - { - OcrTestHelpers.EnsureOcrLibraries(); - } - catch (Exception) - { - return; - } - using var surface = SKSurface.Create(new SKImageInfo(120, 40)); var canvas = surface.Canvas; canvas.Clear(SKColors.White); diff --git a/tests/MarkItDownNet.Tests/OcrPdfTests.cs b/tests/MarkItDownNet.Tests/OcrPdfTests.cs index 359a50d..7ececd5 100644 --- a/tests/MarkItDownNet.Tests/OcrPdfTests.cs +++ b/tests/MarkItDownNet.Tests/OcrPdfTests.cs @@ -5,8 +5,6 @@ using System.Text.Json; using System.Threading.Tasks; using MarkItDownNet; -using TesseractOCR; -using TesseractOCR.InteropDotNet; using Xunit; 
namespace MarkItDownNet.Tests; @@ -16,11 +14,6 @@ public class OcrPdfTests [Fact] public async Task OcrTestPdfMatchesGroundTruth() { - try { - OcrTestHelpers.EnsureOcrLibraries(); - } catch (Exception) { - return; - } using var http = new HttpClient(); var baseUrl = "https://raw.githubusercontent.com/docling-project/docling/main/tests/data_scanned"; diff --git a/tests/MarkItDownNet.Tests/OcrTestHelpers.cs b/tests/MarkItDownNet.Tests/OcrTestHelpers.cs deleted file mode 100644 index 800ad44..0000000 --- a/tests/MarkItDownNet.Tests/OcrTestHelpers.cs +++ /dev/null @@ -1,15 +0,0 @@ -using System; -using System.IO; -using TesseractOCR.InteropDotNet; - -namespace MarkItDownNet.Tests; - -internal static class OcrTestHelpers -{ - public static void EnsureOcrLibraries() - { - var dir = Path.Combine(AppContext.BaseDirectory, "x64"); - Environment.SetEnvironmentVariable("LD_LIBRARY_PATH", dir); - LibraryLoader.Instance.CustomSearchPath = dir; - } -} diff --git a/tools/DoclingComparison/DoclingComparison.csproj b/tools/DoclingComparison/DoclingComparison.csproj index 84259cf..5e0156a 100644 --- a/tools/DoclingComparison/DoclingComparison.csproj +++ b/tools/DoclingComparison/DoclingComparison.csproj @@ -9,7 +9,7 @@ - + diff --git a/tools/DoclingComparison/Program.cs b/tools/DoclingComparison/Program.cs index c51e4ab..f248b9a 100644 --- a/tools/DoclingComparison/Program.cs +++ b/tools/DoclingComparison/Program.cs @@ -3,7 +3,6 @@ using SkiaSharp; using System.Text; using System.Text.Json; -using TesseractOCR.InteropDotNet; namespace DoclingComparison; @@ -19,7 +18,6 @@ static async Task Main(string[] args) var groundRoot = Path.Combine(dataRoot, "groundtruth", "docling_v2"); var reportPath = Path.Combine(repoRoot, "docs", "docling_comparison.md"); - LibraryLoader.Instance.CustomSearchPath = "/lib/x86_64-linux-gnu"; Environment.SetEnvironmentVariable("TESSDATA_PREFIX", "/usr/share/tesseract-ocr/5/tessdata"); var files = args.Length > 0 ? 
args : Directory.GetFiles(Path.Combine(dataRoot, "pdf"), "*.pdf") diff --git a/tools/DoclingTimings/DoclingTimings.csproj b/tools/DoclingTimings/DoclingTimings.csproj index 41f9e98..e749004 100644 --- a/tools/DoclingTimings/DoclingTimings.csproj +++ b/tools/DoclingTimings/DoclingTimings.csproj @@ -7,6 +7,6 @@ - + diff --git a/tools/DoclingTimings/Program.cs b/tools/DoclingTimings/Program.cs index ad56431..ed2d6c9 100644 --- a/tools/DoclingTimings/Program.cs +++ b/tools/DoclingTimings/Program.cs @@ -1,6 +1,5 @@ using MarkItDownNet; using System.Diagnostics; -using TesseractOCR.InteropDotNet; record TimingRecord(string FileName, string Type, double MarkdownMs, double BBoxMs); @@ -12,7 +11,6 @@ static async Task Main(string[] args) var repoRoot = Path.GetFullPath(Path.Combine(baseDir, "..", "..", "..", "..", "..")); var dataRoot = Path.Combine(repoRoot, "docling", "tests", "data"); - LibraryLoader.Instance.CustomSearchPath = "/usr/lib/x86_64-linux-gnu"; Environment.SetEnvironmentVariable("TESSDATA_PREFIX", "/usr/share/tesseract-ocr/5/tessdata"); var pdfFiles = Directory.GetFiles(Path.Combine(dataRoot, "pdf"), "*.pdf"); diff --git a/tools/FunsdComparison/FunsdComparison.csproj b/tools/FunsdComparison/FunsdComparison.csproj index d77195e..7b09b06 100644 --- a/tools/FunsdComparison/FunsdComparison.csproj +++ b/tools/FunsdComparison/FunsdComparison.csproj @@ -7,7 +7,7 @@ - + diff --git a/tools/FunsdComparison/Program.cs b/tools/FunsdComparison/Program.cs index 10ea38f..7491fcd 100644 --- a/tools/FunsdComparison/Program.cs +++ b/tools/FunsdComparison/Program.cs @@ -2,7 +2,6 @@ using SkiaSharp; using System.Text; using System.Text.Json; -using TesseractOCR.InteropDotNet; namespace FunsdComparison; @@ -23,7 +22,7 @@ static async Task Main(string[] args) Directory.CreateDirectory(outputRoot); - // TesseractOCR relies on system libraries; no custom search path required on Linux + // Native Tesseract/Leptonica libraries are bundled with MarkItDownNet (runtimes/linux-x64/native); no custom search path required on Linux 
Environment.SetEnvironmentVariable("TESSDATA_PREFIX", "/usr/share/tesseract-ocr/5/tessdata"); var converter = new MarkItDownConverter(new MarkItDownOptions { NormalizeMarkdown = false }); @@ -119,7 +118,6 @@ static async Task Main(string[] args) sb.AppendLine(); sb.AppendLine("```bash"); sb.AppendLine("export PATH=$HOME/.dotnet:$PATH"); - sb.AppendLine("export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$PWD/tools/FunsdComparison/bin/Debug/net9.0/runtimes/linux-x64/native"); sb.AppendLine("dotnet run --project tools/FunsdComparison"); sb.AppendLine("```");