Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,7 @@
- Use `Serilog__MinimumLevel=Verbose` to enable detailed timings and counts.

## Operations
Le librerie native minime per Linux `x64` sono disponibili in `src/MarkItDownNet/TesseractOCR/x64` e vengono copiate accanto ai binari (`x64`) ad eccezione di `libdl.so`, posizionata in `runtimes/linux-x64/native`:

* `libopenjp2.so.7`
* `liblept.so.5` con il symlink `libleptonica-1.85.0.dll.so`
* `libtesseract.so.5` con il symlink `libtesseract55.dll.so`
* `libdl.so`

Grazie a queste dipendenze la libreria è auto‑consistente e **non richiede l'installazione di Tesseract o Leptonica**.
Le dipendenze native minime per Linux `x64` (Tesseract e Leptonica) sono incluse nel repository in `runtimes/linux-x64/native` e vengono copiate accanto ai binari. Non è richiesta l'installazione separata di Tesseract.

Per l'OCR servono solo i dati delle lingue. Su Ubuntu 24.04 possono essere installati con:

Expand Down
9 changes: 1 addition & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,7 @@ All build and test commands must use the locally installed `dotnet`:

## Tesseract and Leptonica

Per l'esecuzione su Linux `x64` il progetto include solo le librerie native strettamente necessarie. Tesseract e Leptonica risiedono nella sottocartella `x64` accanto ai binari, mentre `libdl.so` è collocata sotto `runtimes/linux-x64/native` per soddisfare il loader di `TesseractOCR`:

* `libopenjp2.so.7`
* `liblept.so.5` e il symlink `libleptonica-1.85.0.dll.so`
* `libtesseract.so.5` e il symlink `libtesseract55.dll.so`
* `libdl.so`

Grazie a queste dipendenze pre‑caricate la libreria è *auto‑consistente* e **non richiede l'installazione di Tesseract o Leptonica sul sistema**.
La libreria include le dipendenze native minime per Linux `x64` in `runtimes/linux-x64/native` e non richiede l'installazione di Tesseract o Leptonica sul sistema.

Per eseguire l'OCR è necessario soltanto fornire i file `tessdata` delle lingue. Su Ubuntu 24.04 è sufficiente installare i pacchetti delle lingue desiderate, ad esempio:

Expand Down
1 change: 0 additions & 1 deletion docs/funsd_comparison.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,5 @@ Confronto tra le bounding box delle parole annotate nel dataset di test FUNSD e

```bash
export PATH=$HOME/.dotnet:$PATH
export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$PWD/tools/FunsdComparison/bin/Debug/net9.0/runtimes/linux-x64/native
dotnet run --project tools/FunsdComparison
```
94 changes: 25 additions & 69 deletions src/MarkItDownNet/MarkItDownConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using Markdig;
using Serilog;
using TesseractOCR;
using TesseractOCR.Enums;
using TesseractOCR.InteropDotNet;
using Tesseract;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using PDFtoImage;
Expand All @@ -23,40 +20,6 @@ public class MarkItDownConverter
private readonly MarkItDownOptions _options;
private readonly ILogger _logger;

/// <summary>
/// Type initializer. On Linux it prepends the bundled <c>x64</c> directory (next to the
/// application binaries) to <c>LD_LIBRARY_PATH</c>, points the TesseractOCR library loader
/// at that directory and pre-loads the bundled native libraries in a fixed order.
/// On any other platform it does nothing.
/// </summary>
static MarkItDownConverter()
{
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
    {
        return;
    }

    var nativeDir = Path.Combine(AppContext.BaseDirectory, "x64");

    // Keep whatever search path was already configured, but put the bundled directory first.
    var existing = Environment.GetEnvironmentVariable("LD_LIBRARY_PATH");
    string combined;
    if (string.IsNullOrEmpty(existing))
    {
        combined = nativeDir;
    }
    else
    {
        combined = nativeDir + ":" + existing;
    }
    Environment.SetEnvironmentVariable("LD_LIBRARY_PATH", combined);

    LibraryLoader.Instance.CustomSearchPath = nativeDir;

    // The load order is preserved exactly as listed.
    var bundled = new[]
    {
        "libopenjp2.so.7",
        "libleptonica-1.85.0.dll.so",
        "libtesseract.so.5",
    };
    foreach (var library in bundled)
    {
        LoadNative(library);
    }
}

private const int RTLD_NOW = 2;
private const int RTLD_GLOBAL = 0x100;

[DllImport("libdl.so.2")]
private static extern IntPtr dlopen(string fileName, int flags);

/// <summary>
/// Loads one bundled native library from the <c>x64</c> directory under the application
/// base directory by calling <c>dlopen</c> with <c>RTLD_NOW | RTLD_GLOBAL</c>.
/// </summary>
/// <param name="name">File name of the native library inside the <c>x64</c> directory.</param>
/// <exception cref="DllNotFoundException">
/// The file does not exist, or <c>dlopen</c> returned a null handle.
/// </exception>
private static void LoadNative(string name)
{
    var path = Path.Combine(AppContext.BaseDirectory, "x64", name);

    if (File.Exists(path))
    {
        // A non-null handle means the library was loaded; nothing else to do.
        if (dlopen(path, RTLD_NOW | RTLD_GLOBAL) != IntPtr.Zero)
        {
            return;
        }

        throw new DllNotFoundException($"Unable to load '{name}' from '{path}'");
    }

    throw new DllNotFoundException($"Unable to find '{name}' at '{path}'");
}

public MarkItDownConverter(MarkItDownOptions? options = null, ILogger? logger = null)
{
_options = options ?? new MarkItDownOptions();
Expand Down Expand Up @@ -136,7 +99,7 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct)
pages.Add(new Page(pages.Count + 1, bitmap.Width, bitmap.Height));
using var image = SKImage.FromBitmap(bitmap);
using var data = image.Encode(SKEncodedImageFormat.Png, 100);
using var pix = TesseractOCR.Pix.Image.LoadFromMemory(data.ToArray());
using var pix = Pix.LoadFromMemory(data.ToArray());
var result = ProcessPix(pix, pages.Count, ct);
lines.AddRange(result.lines);
words.AddRange(result.words);
Expand All @@ -149,51 +112,44 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct)

/// <summary>
/// Runs OCR on a single image file and returns the result as a one-page document.
/// </summary>
/// <param name="path">Path of the image file to load.</param>
/// <param name="ct">Cancellation token forwarded to <c>ProcessPix</c>.</param>
/// <returns>
/// A <c>MarkItDownResult</c> holding the Markdown built from the recognised lines, a single
/// page entry (number 1, sized from the loaded image), and the recognised lines and words.
/// </returns>
// NOTE(review): this span shows two variants of the image-loading statement
// (TesseractOCR.Pix.Image vs. Pix); confirm which one the file is meant to keep.
private MarkItDownResult ProcessImage(string path, CancellationToken ct)
{
using var pix = TesseractOCR.Pix.Image.LoadFromFile(path);
using var pix = Pix.LoadFromFile(path);
// A standalone image is always treated as page 1.
var (lines, words) = ProcessPix(pix, 1, ct);
var pages = new List<Page> { new Page(1, pix.Width, pix.Height) };
var markdown = BuildMarkdown(lines);
return new MarkItDownResult(markdown, pages, lines, words);
}

/// <summary>
/// Runs the OCR engine over one in-memory image and collects the recognised text lines and
/// words for the given page number.
/// </summary>
/// <remarks>
/// The engine is created from <c>_options.OcrDataPath</c> (empty string when null) and
/// <c>_options.OcrLanguages</c> with <c>EngineMode.Default</c>. Text is trimmed and empty
/// entries are dropped; each bounding box is passed through <c>Normalize</c> together with
/// the image width and height. Cancellation is checked while iterating.
/// </remarks>
/// <param name="pix">Image to recognise.</param>
/// <param name="pageNumber">Page number stored on every produced line and word.</param>
/// <param name="ct">Token checked via <c>ThrowIfCancellationRequested</c> during iteration.</param>
/// <returns>The collected lines and words.</returns>
// NOTE(review): this span interleaves two variants of the body (a traversal of
// page.Layout blocks/paragraphs/text lines and a traversal via page.GetIterator());
// confirm which one the file is meant to keep.
private (List<Line> lines, List<Word> words) ProcessPix(TesseractOCR.Pix.Image pix, int pageNumber, CancellationToken ct)
private (List<Line> lines, List<Word> words) ProcessPix(Pix pix, int pageNumber, CancellationToken ct)
{
var lines = new List<Line>();
var words = new List<Word>();
using var engine = new Engine(_options.OcrDataPath ?? string.Empty, _options.OcrLanguages, EngineMode.Default);
using var engine = new TesseractEngine(_options.OcrDataPath ?? string.Empty, _options.OcrLanguages, EngineMode.Default);
using var page = engine.Process(pix);

foreach (var block in page.Layout)
using var iter = page.GetIterator();
iter.Begin();
do
{
foreach (var paragraph in block.Paragraphs)
ct.ThrowIfCancellationRequested();

if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine) &&
iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out var rectLine))
{
foreach (var textLine in paragraph.TextLines)
var text = iter.GetText(PageIteratorLevel.TextLine)?.Trim() ?? string.Empty;
if (!string.IsNullOrEmpty(text))
{
ct.ThrowIfCancellationRequested();
if (textLine.BoundingBox is Rect rectLine)
{
var text = textLine.Text?.Trim() ?? string.Empty;
if (!string.IsNullOrEmpty(text))
{
lines.Add(new Line(pageNumber, text, Normalize(rectLine, pix.Width, pix.Height)));
}
}

foreach (var word in textLine.Words)
{
ct.ThrowIfCancellationRequested();
if (word.BoundingBox is Rect rectWord)
{
var wText = word.Text?.Trim() ?? string.Empty;
if (!string.IsNullOrEmpty(wText))
{
words.Add(new Word(pageNumber, wText, Normalize(rectWord, pix.Width, pix.Height)));
}
}
}
lines.Add(new Line(pageNumber, text, Normalize(rectLine, pix.Width, pix.Height)));
}
}
}

if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var rectWord))
{
var wText = iter.GetText(PageIteratorLevel.Word)?.Trim() ?? string.Empty;
if (!string.IsNullOrEmpty(wText))
{
words.Add(new Word(pageNumber, wText, Normalize(rectWord, pix.Width, pix.Height)));
}
}
} while (iter.Next(PageIteratorLevel.Word));

return (lines, words);
}
Expand Down
27 changes: 2 additions & 25 deletions src/MarkItDownNet/MarkItDownNet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,13 @@
<PackageReference Include="Serilog.Settings.Configuration" Version="9.0.0" />
<PackageReference Include="Serilog.Sinks.Console" Version="6.0.0" />
<PackageReference Include="Serilog.Sinks.File" Version="7.0.0" />
<PackageReference Include="TesseractOCR" Version="5.5.1" />
<PackageReference Include="Tesseract" Version="5.2.0" />
<PackageReference Include="PdfPig" Version="0.1.11" />
</ItemGroup>

<ItemGroup>
<!-- Bundled native libraries for Linux (x64) -->
<None Include="TesseractOCR/x64/libtesseract.so.5">
<Link>x64/libtesseract.so.5</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="TesseractOCR/x64/libtesseract55.dll.so">
<Link>x64/libtesseract55.dll.so</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="TesseractOCR/x64/liblept.so.5">
<Link>x64/liblept.so.5</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="TesseractOCR/x64/libleptonica-1.85.0.dll.so">
<Link>x64/libleptonica-1.85.0.dll.so</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="TesseractOCR/x64/libopenjp2.so.7">
<Link>x64/libopenjp2.so.7</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="TesseractOCR/x64/libdl.so">
<Link>runtimes/linux-x64/native/libdl.so</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="runtimes/linux-x64/native/*" CopyToOutputDirectory="PreserveNewest" Pack="true" PackagePath="runtimes/linux-x64/native" />
</ItemGroup>

</Project>

This file was deleted.

1 change: 0 additions & 1 deletion src/MarkItDownNet/TesseractOCR/x64/libtesseract55.dll.so

This file was deleted.

73 changes: 0 additions & 73 deletions tests/MarkItDownNet.Tests/LeptonicaTests.cs

This file was deleted.

2 changes: 1 addition & 1 deletion tests/MarkItDownNet.Tests/MarkItDownNet.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
<PackageReference Include="xunit" Version="2.9.3" />
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.3" />
<PackageReference Include="TesseractOCR" Version="5.5.1" />
<PackageReference Include="Tesseract" Version="5.2.0" />
<PackageReference Include="SkiaSharp" Version="3.119.0" />
<PackageReference Include="SkiaSharp.NativeAssets.Linux" Version="3.119.0" />
</ItemGroup>
Expand Down
9 changes: 0 additions & 9 deletions tests/MarkItDownNet.Tests/OcrImageTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,6 @@ public class OcrImageTests
[Fact]
public async Task Can_extract_text_from_simple_png()
{
try
{
OcrTestHelpers.EnsureOcrLibraries();
}
catch (Exception)
{
return;
}

using var surface = SKSurface.Create(new SKImageInfo(120, 40));
var canvas = surface.Canvas;
canvas.Clear(SKColors.White);
Expand Down
7 changes: 0 additions & 7 deletions tests/MarkItDownNet.Tests/OcrPdfTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
using System.Text.Json;
using System.Threading.Tasks;
using MarkItDownNet;
using TesseractOCR;
using TesseractOCR.InteropDotNet;
using Xunit;

namespace MarkItDownNet.Tests;
Expand All @@ -16,11 +14,6 @@ public class OcrPdfTests
[Fact]
public async Task OcrTestPdfMatchesGroundTruth()
{
try {
OcrTestHelpers.EnsureOcrLibraries();
} catch (Exception) {
return;
}
using var http = new HttpClient();
var baseUrl = "https://raw.githubusercontent.com/docling-project/docling/main/tests/data_scanned";

Expand Down
15 changes: 0 additions & 15 deletions tests/MarkItDownNet.Tests/OcrTestHelpers.cs

This file was deleted.

2 changes: 1 addition & 1 deletion tools/DoclingComparison/DoclingComparison.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

<ItemGroup>
<ProjectReference Include="../../src/MarkItDownNet/MarkItDownNet.csproj" />
<PackageReference Include="TesseractOCR" Version="5.5.1" />
<PackageReference Include="Tesseract" Version="5.2.0" />
<PackageReference Include="BitMiracle.LibTiff.NET" Version="2.4.660" />
<PackageReference Include="SkiaSharp" Version="3.119.0" />
<PackageReference Include="SkiaSharp.NativeAssets.Linux" Version="3.119.0" />
Expand Down
2 changes: 0 additions & 2 deletions tools/DoclingComparison/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
using SkiaSharp;
using System.Text;
using System.Text.Json;
using TesseractOCR.InteropDotNet;

namespace DoclingComparison;

Expand All @@ -19,7 +18,6 @@ static async Task Main(string[] args)
var groundRoot = Path.Combine(dataRoot, "groundtruth", "docling_v2");
var reportPath = Path.Combine(repoRoot, "docs", "docling_comparison.md");

LibraryLoader.Instance.CustomSearchPath = "/lib/x86_64-linux-gnu";
Environment.SetEnvironmentVariable("TESSDATA_PREFIX", "/usr/share/tesseract-ocr/5/tessdata");

var files = args.Length > 0 ? args : Directory.GetFiles(Path.Combine(dataRoot, "pdf"), "*.pdf")
Expand Down
2 changes: 1 addition & 1 deletion tools/DoclingTimings/DoclingTimings.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="../../src/MarkItDownNet/MarkItDownNet.csproj" />
<PackageReference Include="TesseractOCR" Version="5.5.1" />
<PackageReference Include="Tesseract" Version="5.2.0" />
</ItemGroup>
</Project>
Loading