diff --git a/README.md b/README.md index 47429cc..90a5cc5 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ var options = new MarkItDownOptions { OcrDataPath = "/usr/share/tesseract-ocr/5/tessdata", OcrLanguages = "eng", - PdfRasterDpi = 300 + OcrUserDpi = 300 }; var converter = new MarkItDownConverter(options); var result = await converter.ConvertAsync("sample.pdf", "application/pdf"); @@ -77,7 +77,7 @@ Console.WriteLine(result.Markdown); * `OcrDataPath` – location of Tesseract language data (`TESSDATA_PREFIX`) * `OcrLanguages` – languages passed to Tesseract (e.g. `ita+eng`) -* `PdfRasterDpi` – DPI for rasterising PDFs during OCR fallback +* `OcrUserDpi` – DPI for rasterising PDFs during OCR fallback * `MinimumNativeWordThreshold` – minimum words before OCR is triggered * `NormalizeMarkdown` – toggle Markdig normalisation @@ -108,6 +108,13 @@ Docling's image samples are distributed as TIFF files. The comparison tool conve These large arXiv PDFs showed a 99.37% word match rate and a 10.74% mean absolute error in bounding boxes. +## OCR benchmark + +```bash +dotnet run --project tools/OcrBench -- extract --input-dir dataset/validation --out-dir dataset/validation/_ocr --threads 1 --langs eng --psm 6 --refresh markitdownnet +dotnet run --project tools/OcrBench -- compare --ocr-dir dataset/validation/_ocr --out-json artifacts/validation/OCR/bench-ocr.json --out-md artifacts/validation/OCR/summary-ocr.md +``` + ## Docling comparison The `tests` project verifies Markdown and bounding box accuracy against the [Docling](https://github.com/docling-project/docling) ground truth for `ocr_test.pdf`. diff --git a/artifacts/validation/OCR/bench-ocr.json b/artifacts/validation/OCR/bench-ocr.json index 9e619df..4636108 100644 --- a/artifacts/validation/OCR/bench-ocr.json +++ b/artifacts/validation/OCR/bench-ocr.json @@ -13,186 +13,134 @@ }, "files": [ { - "dataset": "FUNSD", - "file": "82250337_0338", - "cer_char": 0.011834319526627219, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 27, - "line_count_hyp": 27, - "line_f1": 1, - "timing_markitdownnet": 511, - "timing_pytesseract": 1845 + "dataset": "MARMOT", + "file": "10.1.1.1.2013_64", + "cer_char": 0.11729323308270677, + "token_precision": 0.7592592592592593, + "token_recall": 0.7522935779816514, + "token_f1": 0.7557603686635944, + "line_count_ref": 21, + "line_count_hyp": 21, + "line_f1": 0.47619047619047616, + "timing_markitdownnet": 1305, + "timing_pytesseract": 0 }, { - "dataset": "FUNSD", - "file": "82200067_0069", - "cer_char": 0.020602218700475437, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 28, - "line_count_hyp": 28, - "line_f1": 1, - "timing_markitdownnet": 917, - "timing_pytesseract": 2055 + "dataset": "MARMOT", + "file": "10.1.1.1.2014_4", + "cer_char": 0.11923556294142086, + "token_precision": 0.8087557603686636, + "token_recall": 0.8013698630136986, + "token_f1": 0.805045871559633, + "line_count_ref": 37, + "line_count_hyp": 36, + "line_f1": 0.273972602739726, + "timing_markitdownnet": 1221, + "timing_pytesseract": 0 }, { - "dataset": "FUNSD", - "file": "82092117", - "cer_char": 0.01079734219269103, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 32, - "line_count_hyp": 32, - "line_f1": 1, - "timing_markitdownnet": 681, - "timing_pytesseract": 1963 + "dataset": "MARMOT", + "file": "10.1.1.1.2013_63", + "cer_char": 0.04133545310015898, + "token_precision": 0.8972332015810277, + "token_recall": 0.8937007874015748, + "token_f1": 0.8954635108481263, + "line_count_ref": 38, + "line_count_hyp": 38, + "line_f1": 0.5789473684210527, + "timing_markitdownnet": 982, + "timing_pytesseract": 0 }, { - "dataset": "FUNSD", - "file": "82251504", - "cer_char": 0.015776699029126214, - "token_precision": 1, - "token_recall": 1, - "token_f1": 1, - "line_count_ref": 28, - "line_count_hyp": 28, - "line_f1": 1, - "timing_markitdownnet": 680, - "timing_pytesseract": 2042 + "dataset": "MARMOT", + "file": "10.1.1.1.2006_3", + "cer_char": 0.04483471074380165, + "token_precision": 0.8641975308641975, + "token_recall": 0.8782936010037641, + "token_f1": 0.8711885500933415, + "line_count_ref": 60, + "line_count_hyp": 61, + "line_f1": 0.28099173553719003, + "timing_markitdownnet": 2511, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", - "file": "X00016469670", - "cer_char": 0.09898107714701601, - "token_precision": 0.8468468468468469, - "token_recall": 0.8103448275862069, - "token_f1": 0.8281938325991189, - "line_count_ref": 29, - "line_count_hyp": 28, - "line_f1": 0.6315789473684211, - "timing_markitdownnet": 464, - "timing_pytesseract": 1928 + "file": "X51005200931", + "cer_char": 0.03384279475982533, + "token_precision": 0.891566265060241, + "token_recall": 0.8862275449101796, + "token_f1": 0.888888888888889, + "line_count_ref": 41, + "line_count_hyp": 41, + "line_f1": 0.6585365853658537, + "timing_markitdownnet": 801, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", "file": "X00016469671", - "cer_char": 0.0802675585284281, - "token_precision": 0.83, - "token_recall": 0.8645833333333334, - "token_f1": 0.8469387755102041, + "cer_char": 0.18394648829431437, + "token_precision": 0.7227722772277227, + "token_recall": 0.7604166666666666, + "token_f1": 0.7411167512690354, "line_count_ref": 26, "line_count_hyp": 26, - "line_f1": 0.7307692307692306, - "timing_markitdownnet": 408, - "timing_pytesseract": 1755 + "line_f1": 0.6153846153846154, + "timing_markitdownnet": 745, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", "file": "X51005230605", - "cer_char": 0.03929273084479371, - "token_precision": 0.9574468085106383, - "token_recall": 0.967741935483871, - "token_f1": 0.9625668449197862, + "cer_char": 0.023575638506876228, + "token_precision": 0.978494623655914, + "token_recall": 0.978494623655914, + "token_f1": 0.978494623655914, "line_count_ref": 25, "line_count_hyp": 25, - "line_f1": 0.92, - "timing_markitdownnet": 458, - "timing_pytesseract": 1849 + "line_f1": 0.96, + "timing_markitdownnet": 478, + "timing_pytesseract": 0 }, { "dataset": "SROIE2019", - "file": "X51005200931", - "cer_char": 0.03384279475982533, - "token_precision": 0.891566265060241, - "token_recall": 0.8862275449101796, - "token_f1": 0.888888888888889, - "line_count_ref": 41, - "line_count_hyp": 41, - "line_f1": 0.6585365853658537, - "timing_markitdownnet": 642, - "timing_pytesseract": 2056 - }, - { - "dataset": "ICDAR", - "file": "cTDaR_t00016", - "cer_char": 0.707796852646638, - "token_precision": 0.46397694524495675, - "token_recall": 0.3950920245398773, - "token_f1": 0.4267726971504307, - "line_count_ref": 51, - "line_count_hyp": 44, - "line_f1": 0, - "timing_markitdownnet": 3818, - "timing_pytesseract": 5629 - }, - { - "dataset": "ICDAR", - "file": "cTDaR_t00014", - "cer_char": 0.6630648330058939, - "token_precision": 0.36752136752136755, - "token_recall": 0.3173431734317343, - "token_f1": 0.3405940594059406, - "line_count_ref": 40, - "line_count_hyp": 36, - "line_f1": 0, - "timing_markitdownnet": 2935, - "timing_pytesseract": 4447 - }, - { - "dataset": "ICDAR", - "file": "cTDaR_t00080", - "cer_char": 0.7515723270440252, - "token_precision": 0.3641732283464567, - "token_recall": 0.3798767967145791, - "token_f1": 0.37185929648241206, - "line_count_ref": 50, - "line_count_hyp": 48, - "line_f1": 0, - "timing_markitdownnet": 3762, - "timing_pytesseract": 5772 - }, - { - "dataset": "ICDAR", - "file": "cTDaR_t00015", - "cer_char": 0.7768157768157768, - "token_precision": 0.3161904761904762, - "token_recall": 0.30018083182640143, - "token_f1": 0.30797773654916516, - "line_count_ref": 36, - "line_count_hyp": 36, - "line_f1": 0, - "timing_markitdownnet": 3387, - "timing_pytesseract": 4978 + "file": "X00016469670", + "cer_char": 0.1848617176128093, + "token_precision": 0.7350427350427351, + "token_recall": 0.7413793103448276, + "token_f1": 0.7381974248927039, + "line_count_ref": 29, + "line_count_hyp": 30, + "line_f1": 0.5423728813559322, + "timing_markitdownnet": 806, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", - "file": "PMC1064078_table_0", - "cer_char": 0.03770739064856712, - "token_precision": 0.8282828282828283, - "token_recall": 0.8367346938775511, - "token_f1": 0.8324873096446701, - "line_count_ref": 10, - "line_count_hyp": 10, - "line_f1": 0.10000000000000002, - "timing_markitdownnet": 350, - "timing_pytesseract": 1813 + "file": "PMC1064082_table_1", + "cer_char": 0.06351183063511831, + "token_precision": 0.7819548872180451, + "token_recall": 0.7938931297709924, + "token_f1": 0.7878787878787878, + "line_count_ref": 26, + "line_count_hyp": 26, + "line_f1": 0.3076923076923077, + "timing_markitdownnet": 448, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", - "file": "PMC1064078_table_3", - "cer_char": 0.4519906323185012, - "token_precision": 0.5797101449275363, - "token_recall": 0.43010752688172044, - "token_f1": 0.49382716049382724, - "line_count_ref": 14, - "line_count_hyp": 13, + "file": "PMC1064082_table_0", + "cer_char": 0.6911764705882353, + "token_precision": 0.3076923076923077, + "token_recall": 0.14814814814814814, + "token_f1": 0.2, + "line_count_ref": 5, + "line_count_hyp": 3, "line_f1": 0, - "timing_markitdownnet": 331, - "timing_pytesseract": 1844 + "timing_markitdownnet": 156, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", @@ -204,8 +152,21 @@ "line_count_ref": 18, "line_count_hyp": 18, "line_f1": 0.05555555555555555, - "timing_markitdownnet": 390, - "timing_pytesseract": 1791 + "timing_markitdownnet": 610, + "timing_pytesseract": 0 + }, + { + "dataset": "PUBTABLES", + "file": "PMC1064078_table_4", + "cer_char": 0.3125, + "token_precision": 0.4267515923566879, + "token_recall": 0.44666666666666666, + "token_f1": 0.4364820846905537, + "line_count_ref": 20, + "line_count_hyp": 21, + "line_f1": 0.04878048780487805, + "timing_markitdownnet": 732, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", @@ -217,34 +178,21 @@ "line_count_ref": 16, "line_count_hyp": 15, "line_f1": 0.06451612903225808, - "timing_markitdownnet": 522, - "timing_pytesseract": 2071 + "timing_markitdownnet": 790, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", - "file": "PMC1064082_table_0", - "cer_char": 0.6911764705882353, - "token_precision": 0.3076923076923077, - "token_recall": 0.14814814814814814, - "token_f1": 0.2, - "line_count_ref": 5, - "line_count_hyp": 3, + "file": "PMC1064078_table_3", + "cer_char": 0.4519906323185012, + "token_precision": 0.5797101449275363, + "token_recall": 0.43010752688172044, + "token_f1": 0.49382716049382724, + "line_count_ref": 14, + "line_count_hyp": 13, "line_f1": 0, - "timing_markitdownnet": 116, - "timing_pytesseract": 1539 - }, - { - "dataset": "PUBTABLES", - "file": "PMC1064078_table_4", - "cer_char": 0.3125, - "token_precision": 0.4267515923566879, - "token_recall": 0.44666666666666666, - "token_f1": 0.4364820846905537, - "line_count_ref": 20, - "line_count_hyp": 21, - "line_f1": 0.04878048780487805, - "timing_markitdownnet": 489, - "timing_pytesseract": 1842 + "timing_markitdownnet": 468, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", @@ -256,112 +204,164 @@ "line_count_ref": 7, "line_count_hyp": 7, "line_f1": 0.14285714285714285, - "timing_markitdownnet": 304, - "timing_pytesseract": 1864 + "timing_markitdownnet": 363, + "timing_pytesseract": 0 }, { "dataset": "PUBTABLES", - "file": "PMC1064082_table_1", - "cer_char": 0.06351183063511831, - "token_precision": 0.7819548872180451, - "token_recall": 0.7938931297709924, - "token_f1": 0.7878787878787878, - "line_count_ref": 26, - "line_count_hyp": 26, - "line_f1": 0.3076923076923077, - "timing_markitdownnet": 385, - "timing_pytesseract": 1721 + "file": "PMC1064078_table_0", + "cer_char": 0.03770739064856712, + "token_precision": 0.8282828282828283, + "token_recall": 0.8367346938775511, + "token_f1": 0.8324873096446701, + "line_count_ref": 10, + "line_count_hyp": 10, + "line_f1": 0.10000000000000002, + "timing_markitdownnet": 376, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2013_63", - "cer_char": 0.04133545310015898, - "token_precision": 0.8972332015810277, - "token_recall": 0.8937007874015748, - "token_f1": 0.8954635108481263, - "line_count_ref": 38, - "line_count_hyp": 38, - "line_f1": 0.5789473684210527, - "timing_markitdownnet": 3631, - "timing_pytesseract": 2309 + "dataset": "ICDAR", + "file": "cTDaR_t00016", + "cer_char": 0.707796852646638, + "token_precision": 0.46397694524495675, + "token_recall": 0.3950920245398773, + "token_f1": 0.4267726971504307, + "line_count_ref": 51, + "line_count_hyp": 44, + "line_f1": 0, + "timing_markitdownnet": 5336, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2013_64", - "cer_char": 0.11729323308270677, - "token_precision": 0.7592592592592593, - "token_recall": 0.7522935779816514, - "token_f1": 0.7557603686635944, - "line_count_ref": 21, - "line_count_hyp": 21, - "line_f1": 0.47619047619047616, - "timing_markitdownnet": 626, - "timing_pytesseract": 1912 + "dataset": "ICDAR", + "file": "cTDaR_t00015", + "cer_char": 0.7768157768157768, + "token_precision": 0.3161904761904762, + "token_recall": 0.30018083182640143, + "token_f1": 0.30797773654916516, + "line_count_ref": 36, + "line_count_hyp": 36, + "line_f1": 0, + "timing_markitdownnet": 5144, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2006_3", - "cer_char": 0.03347107438016529, - "token_precision": 0.8811013767209012, - "token_recall": 0.8833124215809285, - "token_f1": 0.8822055137844611, - "line_count_ref": 60, - "line_count_hyp": 60, - "line_f1": 0.31666666666666665, - "timing_markitdownnet": 8684, - "timing_pytesseract": 3309 + "dataset": "ICDAR", + "file": "cTDaR_t00080", + "cer_char": 0.7647798742138365, + "token_precision": 0.3570057581573896, + "token_recall": 0.38193018480492813, + "token_f1": 0.36904761904761907, + "line_count_ref": 50, + "line_count_hyp": 48, + "line_f1": 0, + "timing_markitdownnet": 3394, + "timing_pytesseract": 0 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2014_4", - "cer_char": 0.11923556294142086, - "token_precision": 0.8087557603686636, - "token_recall": 0.8013698630136986, - "token_f1": 0.805045871559633, - "line_count_ref": 37, + "dataset": "ICDAR", + "file": "cTDaR_t00014", + "cer_char": 0.6630648330058939, + "token_precision": 0.36752136752136755, + "token_recall": 0.3173431734317343, + "token_f1": 0.3405940594059406, + "line_count_ref": 40, "line_count_hyp": 36, - "line_f1": 0.273972602739726, - "timing_markitdownnet": 991, - "timing_pytesseract": 2616 + "line_f1": 0, + "timing_markitdownnet": 4396, + "timing_pytesseract": 0 + }, + { + "dataset": "FUNSD", + "file": "82251504", + "cer_char": 0.015776699029126214, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 28, + "line_count_hyp": 28, + "line_f1": 1, + "timing_markitdownnet": 899, + "timing_pytesseract": 0 + }, + { + "dataset": "FUNSD", + "file": "82200067_0069", + "cer_char": 0.020602218700475437, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 28, + "line_count_hyp": 28, + "line_f1": 1, + "timing_markitdownnet": 1016, + "timing_pytesseract": 0 + }, + { + "dataset": "FUNSD", + "file": "82092117", + "cer_char": 0.01079734219269103, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 32, + "line_count_hyp": 32, + "line_f1": 1, + "timing_markitdownnet": 775, + "timing_pytesseract": 0 + }, + { + "dataset": "FUNSD", + "file": "82250337_0338", + "cer_char": 0.011834319526627219, + "token_precision": 1, + "token_recall": 1, + "token_f1": 1, + "line_count_ref": 27, + "line_count_hyp": 27, + "line_f1": 1, + "timing_markitdownnet": 737, + "timing_pytesseract": 0 } ], "aggregate": { "by_dataset": { - "FUNSD": { - "cer_avg": 0.014752644862229975, - "token_f1_avg": 1, - "line_f1_avg": 1, + "MARMOT": { + "cer_avg": 0.08067473996702205, + "token_f1_avg": 0.8318645752911739, + "line_f1_avg": 0.4025255457221112, "n_files": 4 }, "SROIE2019": { - "cer_avg": 0.06309604032001578, - "token_f1_avg": 0.8816470854794994, - "line_f1_avg": 0.7352211908758763, - "n_files": 4 - }, - "ICDAR": { - "cer_avg": 0.7248124473780835, - "token_f1_avg": 0.36180094739698715, - "line_f1_avg": 0, + "cer_avg": 0.1065566597934563, + "token_f1_avg": 0.8366744221766356, + "line_f1_avg": 0.6940735205266003, "n_files": 4 }, "PUBTABLES": { "cer_avg": 0.2689109317835886, "token_f1_avg": 0.5714839904880525, - "line_f1_avg": 0.08992520286776778, + "line_f1_avg": 0.08992520286776777, "n_files": 8 }, - "MARMOT": { - "cer_avg": 0.07783383087611297, - "token_f1_avg": 0.8346188162139537, - "line_f1_avg": 0.4114442785044804, + "ICDAR": { + "cer_avg": 0.7281143341705363, + "token_f1_avg": 0.3610980280382889, + "line_f1_avg": 0, + "n_files": 4 + }, + "FUNSD": { + "cer_avg": 0.014752644862229975, + "token_f1_avg": 1, + "line_f1_avg": 1, "n_files": 4 } }, "global": { - "cer_avg": 0.23638613783393658, - "token_f1_avg": 0.7035058050110908, - "line_f1_avg": 0.3877526458526486, + "cer_avg": 0.24465337372673698, + "token_f1_avg": 0.695434167747034, + "line_f1_avg": 0.37940824533070777, "n_files": 24 } } diff --git a/artifacts/validation/OCR/summary-ocr.md b/artifacts/validation/OCR/summary-ocr.md index c42a76e..581a448 100644 --- a/artifacts/validation/OCR/summary-ocr.md +++ b/artifacts/validation/OCR/summary-ocr.md @@ -2,20 +2,20 @@ ## Global | scope | CER | Token-F1 | line_F1 | n_files | -| Global | 0.2364 | 0.7035 | 0.3878 | 24 | +| Global | 0.2447 | 0.6954 | 0.3794 | 24 | ## By dataset | scope | CER | Token-F1 | line_F1 | n_files | -| FUNSD | 0.0148 | 1.0000 | 1.0000 | 4 | -| SROIE2019 | 0.0631 | 0.8816 | 0.7352 | 4 | -| ICDAR | 0.7248 | 0.3618 | 0.0000 | 4 | +| MARMOT | 0.0807 | 0.8319 | 0.4025 | 4 | +| SROIE2019 | 0.1066 | 0.8367 | 0.6941 | 4 | | PUBTABLES | 0.2689 | 0.5715 | 0.0899 | 8 | -| MARMOT | 0.0778 | 0.8346 | 0.4114 | 4 | +| ICDAR | 0.7281 | 0.3611 | 0.0000 | 4 | +| FUNSD | 0.0148 | 1.0000 | 1.0000 | 4 | ## Top-5 worst files | dataset/file | cer_char | token_f1 | line_f1 | note | | ICDAR/cTDaR_t00015 | 0.7768 | 0.3080 | 0.0000 | | -| ICDAR/cTDaR_t00080 | 0.7516 | 0.3719 | 0.0000 | | +| ICDAR/cTDaR_t00080 | 0.7648 | 0.3690 | 0.0000 | | | ICDAR/cTDaR_t00016 | 0.7078 | 0.4268 | 0.0000 | | | PUBTABLES/PMC1064082_table_0 | 0.6912 | 0.2000 | 0.0000 | | | ICDAR/cTDaR_t00014 | 0.6631 | 0.3406 | 0.0000 | | diff --git a/dataset/validation/_ocr/markitdownnet/FUNSD/82092117.txt b/dataset/validation/_ocr/markitdownnet/FUNSD/82092117.txt index e17b21d..f064294 100644 --- a/dataset/validation/_ocr/markitdownnet/FUNSD/82092117.txt +++ b/dataset/validation/_ocr/markitdownnet/FUNSD/82092117.txt @@ -1,6 +1,6 @@ y -ATT, GEN. ADMIN, OFFICE Fax:614~466-508? Dec 1098 17646 POL +ATT, GEN. ADMIN, OFFICE Fax:614~466-508? Dec 1098 17646 POL SAA) Attorney General C oe. =ae#7 Getty D. Montgomery @@ -12,16 +12,16 @@ FAX NO, (614) 466-5087 ‘TO: _ George Baroody -FAX NUMBER: (336) 335-7392 PHONE NUMBER: (336) 335-7363 -DATE: 2/10/98 +FAX NUMBER: (336) 335-7392 PHONE NUMBER: (336) 335-7363 +DATE: 2/10/98 -NUMBER OF PAGES INCLUDING COVER SHEET: 3 +NUMBER OF PAGES INCLUDING COVER SHEET: 3 SENDER/PHONE NUMBER: __June Flynn for Erie Brown/(614) 466-8980 -e SPECIAL INSTRUCTIONS: —— +e SPECIAL INSTRUCTIONS: —— -you. OF OP} +you. OF OP} PLEASE CONTACT SENDER ‘ASSOON AS POSSIBLE @@ -33,8 +33,8 @@ the message to the intended recipient, you are hereby notified that any dissemin corn. or comefng cfs eamueon In) amet setety poh" you hv rrogrel tis comuaion la cor, pase wei ts inmediely Wy liens ea ara | D Grgnal mexage w us oe ales low via he US. Poul Serv, Thankyou fer your] NO -al me =S -eee S +al me =S +eee S S 3 @@ -42,4 +42,4 @@ S Sete Ofice Tower! 0 East Broad Sree Coumbus, Oho 42215-0428 wenagatuochun An Oppo Eye -own | \ No newline at end of file +own | \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/FUNSD/82200067_0069.txt b/dataset/validation/_ocr/markitdownnet/FUNSD/82200067_0069.txt index 2d99212..917aba9 100644 --- a/dataset/validation/_ocr/markitdownnet/FUNSD/82200067_0069.txt +++ b/dataset/validation/_ocr/markitdownnet/FUNSD/82200067_0069.txt @@ -1,9 +1,9 @@ -. 09/17/97 10:85 ‘503 641 1696 ‘LORILLARD PTLD Boor +. 09/17/97 10:85 ‘503 641 1696 ‘LORILLARD PTLD Boor -re ss Sa -FROM: Tp any way] wea +re ss Sa +FROM: Tp any way] wea wus] sees] -‘SUBJECT: (OLD GOLD MENTHOL LIGHTS & ULTRA LIGHTS 7100'S. PROGRESS REPORT +‘SUBJECT: (OLD GOLD MENTHOL LIGHTS & ULTRA LIGHTS 7100'S. PROGRESS REPORT necion: @@ -11,11 +11,11 @@ necion: brvsion: -DvisO Ponind 9S 8 ONION: ene Seah PREPS: 7 +DvisO Ponind 9S 8 ONION: ene Seah PREPS: 7 -‘DIVISION: Boise ‘# REPS: 2.5 DIVISION: Seaitle North ‘PREPS: 4 +‘DIVISION: Boise ‘# REPS: 2.5 DIVISION: Seaitle North ‘PREPS: 4 -DISION: eugene PRPS nvsON lna aes: « +DISION: eugene PRPS nvsON lna aes: « DIRECT ACCOUNTS AND CHAINS HEADQUARTERED WITHIN THE REGION (ns + STORES) STOCKING No OLD GOLD MENTHOL LIGHTS OR ULTRA LIGHTS 100°S @@ -41,4 +41,4 @@ S s g -Bt: OGMUS-TEte Page1 of 3 Pages \ No newline at end of file +Bt: OGMUS-TEte Page1 of 3 Pages \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/FUNSD/82250337_0338.txt b/dataset/validation/_ocr/markitdownnet/FUNSD/82250337_0338.txt index 74b644e..9649741 100644 --- a/dataset/validation/_ocr/markitdownnet/FUNSD/82250337_0338.txt +++ b/dataset/validation/_ocr/markitdownnet/FUNSD/82250337_0338.txt @@ -1,12 +1,12 @@ COMPETITIVE PRODUCT INTRODUCTION PROGRESS REPORT -To: Sam Zolot MANUFACTURER: B&W -FROM: D.. Lando BRAND: Kool Waterfall -DATE: 2.Dec.97 ‘TYPE OF PACKINGS: All Packings -REPORTING PERIODS: oct. Nov. x Des. Jan, +To: Sam Zolot MANUFACTURER: B&W +FROM: D.. Lando BRAND: Kool Waterfall +DATE: 2.Dec.97 ‘TYPE OF PACKINGS: All Packings +REPORTING PERIODS: oct. Nov. x Des. Jan, -‘TEST MARKET GEOGRAPHY: Divisions 621 andar (us iscon sim ) +‘TEST MARKET GEOGRAPHY: Divisions 621 andar (us iscon sim ) PRICE POINT: FULLS _PIVS (Indicate Distributor's Cost Per Carton) @@ -26,11 +26,11 @@ Very well received. The ol packs are being consolidated and promoted in select e fVS4.00 off cartons co -ADVERTISING - EFFECTIVENESS OF POS. Ss -“The theme “B" Kool has replaced all previous POS. They have effectively replaced all old POS. New x -oor signage, hour signs, postr mats, and clocks have the new design. "B* Kool also appears on blboards ° -in Winois. On -ining oO +ADVERTISING - EFFECTIVENESS OF POS. Ss +“The theme “B" Kool has replaced all previous POS. They have effectively replaced all old POS. New x +oor signage, hour signs, postr mats, and clocks have the new design. "B* Kool also appears on blboards ° +in Winois. On +ining oO 3 PAGE 1 0F 2 \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/FUNSD/82251504.txt b/dataset/validation/_ocr/markitdownnet/FUNSD/82251504.txt index 8654219..3062e79 100644 --- a/dataset/validation/_ocr/markitdownnet/FUNSD/82251504.txt +++ b/dataset/validation/_ocr/markitdownnet/FUNSD/82251504.txt @@ -1,13 +1,13 @@ -“41y0s/e7 11:03 epe1a 884 oses LORILLARD_TAXPA +++ GREENSBOR @oo2/003 +“41y0s/e7 11:03 epe1a 884 oses LORILLARD_TAXPA +++ GREENSBOR @oo2/003 Retail Excel Progress Report -Submission fr: Distribution by/to: -uly 3 ) (OM to RSM 1st of Month -August29() To: R.W.Caldarella RSM to RW.C, 10th -September 30 ( ) oo: DOS. -October 31 (x) From: Kent 8, Mills +Submission fr: Distribution by/to: +uly 3 ) (OM to RSM 1st of Month +August29() To: R.W.Caldarella RSM to RW.C, 10th +September 30 ( ) oo: DOS. +October 31 (x) From: Kent 8, Mills November28 ( ) -December30 ( ) Area: § Region: 17 +December30 ( ) Area: § Region: 17 Acceptance/Response: What is the retailers response to Lorillard's Excel Merchandising plan? @@ -18,7 +18,7 @@ ne a Independents: -~ — +~ — SSS @@ -30,11 +30,11 @@ sanceming the inability to be flush with the counter andior up against the regis ne Pennanent Advartising Evaluation/Effectiveness/Acceptance: (P-1/P-5 & C-5_ Plans Only: -u , +u , a a -— © +— © nD ~ a diff --git a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00014.txt b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00014.txt index cb8c49b..0dd4096 100644 --- a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00014.txt +++ b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00014.txt @@ -1,36 +1,36 @@ -2 TR Se seen tener ree P es i eer e 27: -| : CONNOTATIO REDINTEGRATIONIS co; an -| ; 4d. Be Appertinertiarum infrafertis Sy) (fuinnen Lehinect 7h hegre Colonis in | -| Ji €¢ adjuflationem Seffionis affignatarum. . : | -: ee ae le (ie) tie | 8 ieee Te -| a Dele hho oo, PA , | lor ica ae Fundis Defertis; conga Deraina lit | Remanentia | rem Competen || -ren; Vetdanrve Doe pacfone na Droge nepohte ce BT Lappertinentis | __ [ety ern | | | a -| i colar wera a eee fey MR TT Tue Ti -hi a fromete $2 2 fre Doanajtts’ Prev’ Vefenrh, | EAL : -ali Bridzpehz lei: Drove pofckteh Gybhie [1 Wrermalbseersnlimialigcerin|ierermijececcy | i -kijn ante amu ovem Mohit na tmefito Dovenne | i ‘ | a >| i ia -oo Davai Milefivione doter Sane | | @ Mal | | : Pie -He bude: eaberve Taken: me jedan pt, nage | Ler ra yt yy Baeee | -' [fond wu Thhed Gednih Kah Vazneh Tepahov | I 9 ga, | 7 OME | I. Js [Raat bag . Al BS -thas’ Dlacodines fe james’ Seapodi deny A Seog oe | bap -f JS i di Ve ] ae oe j | | | fe -chens tmferwerimud. | 2)| Clidguoe aie a se |. Te 4 -' t N : | ha S| ls | Katte alice be euler ce legion Fao ie 3 -/ — Vraainns pon Bri 24 $0 | Pal leg «| 24 «| | mane Wy -é PZ, ~ One S$ppo Bret 107 + veneer YES * » ; sd «| Al fe +] - LEE gis PO a -) . tleamuo Sygnaturn Ki WCC: Die i Gian | + | | | | eRe -en a “s de Ue coe GE GRR eae: -A= 1778: BN AC ch, Ve | / Pees. cd woes ala J. -Diet ra cabd tosh 17] Aoi. Verh, lw | 1 || -f- si ae | | *) -itofer. nb Merb ! i! el | | Hd tee -| Keni “| al Vande, Wey. eg fh || | > it ey ig -eo” eae | 4 Sele Hr the 8 | ‘| mid -. Pe OG ee 10} ely: Juin eo | -, Leemncsovag thin Baek h Hotes Fae 49* | | | i || ie +2 TR Se seen tener ree P es i eer e 27: +| : CONNOTATIO REDINTEGRATIONIS co; an +| ; 4d. Be Appertinertiarum infrafertis Sy) (fuinnen Lehinect 7h hegre Colonis in | +| Ji €¢ adjuflationem Seffionis affignatarum. . : | +: ee ae le (ie) tie | 8 ieee Te +| a Dele hho oo, PA , | lor ica ae Fundis Defertis; conga Deraina lit | Remanentia | rem Competen || +ren; Vetdanrve Doe pacfone na Droge nepohte ce BT Lappertinentis | __ [ety ern | | | a +| i colar wera a eee fey MR TT Tue Ti +hi a fromete $2 2 fre Doanajtts’ Prev’ Vefenrh, | EAL : +ali Bridzpehz lei: Drove pofckteh Gybhie [1 Wrermalbseersnlimialigcerin|ierermijececcy | i +kijn ante amu ovem Mohit na tmefito Dovenne | i ‘ | a >| i ia +oo Davai Milefivione doter Sane | | @ Mal | | : Pie +He bude: eaberve Taken: me jedan pt, nage | Ler ra yt yy Baeee | +' [fond wu Thhed Gednih Kah Vazneh Tepahov | I 9 ga, | 7 OME | I. Js [Raat bag . Al BS +thas’ Dlacodines fe james’ Seapodi deny A Seog oe | bap +f JS i di Ve ] ae oe j | | | fe +chens tmferwerimud. | 2)| Clidguoe aie a se |. Te 4 +' t N : | ha S| ls | Katte alice be euler ce legion Fao ie 3 +/ — Vraainns pon Bri 24 $0 | Pal leg «| 24 «| | mane Wy +é PZ, ~ One S$ppo Bret 107 + veneer YES * » ; sd «| Al fe +] - LEE gis PO a +) . tleamuo Sygnaturn Ki WCC: Die i Gian | + | | | | eRe +en a “s de Ue coe GE GRR eae: +A= 1778: BN AC ch, Ve | / Pees. cd woes ala J. +Diet ra cabd tosh 17] Aoi. Verh, lw | 1 || -f- si ae | | *) +itofer. nb Merb ! i! el | | Hd tee +| Keni “| al Vande, Wey. eg fh || | > it ey ig +eo” eae | 4 Sele Hr the 8 | ‘| mid +. Pe OG ee 10} ely: Juin eo | +, Leemncsovag thin Baek h Hotes Fae 49* | | | i || ie ng A pO esti abides, || EE t & -: ro, / as a h |G Din |< Thonn -> ty ams | % a2 beh b4.|-|-| 4 EG; Jibt Hated -| eee AEpfoc pZ | | mt Sata Memeo |=) 4) €) fo) I ie -aa 08 gift | | | moti ttt tale obded |e -[ Ee Al vine scien al Wi) it | 4 -Pe | a ! {ial \ No newline at end of file +: ro, / as a h |G Din |< Thonn +> ty ams | % a2 beh b4.|-|-| 4 EG; Jibt Hated +| eee AEpfoc pZ | | mt Sata Memeo |=) 4) €) fo) I ie +aa 08 gift | | | moti ttt tale obded |e +[ Ee Al vine scien al Wi) it | 4 +Pe | a ! {ial \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00015.txt b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00015.txt index 8c711fb..f9c93f7 100644 --- a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00015.txt +++ b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00015.txt @@ -1,38 +1,38 @@ -| wo mry' as fier pete cata, SSM Some TY oe fe ae | tem Geral Hy - A| (812 B| (eS -| Ales tafmdcy |. Ce meh ak: ee a Meee es Huge -: li Jeff effi ebefe} ed i ba | 1 a i ae lan Waals -eta atl Maa Bent tay -ie aia Fai =a 0 g Pde yd a | i. WEA: 9% Br Ch. | is | -| G\ hal. FeQetled dlaval ||. | aia hs Bis. atk | | el ahogl | i 4 -Awa | i Oa | Wye Chil, Rrech | ila Hf SPs ae | -A Had | | t2\| Phila: ah fl oe yal os | -c cameo nnn TTT, Wee ee asl balal salelat (alate | | -| _——— | Alii WO] Fegehe’ Bebeen PA miles bused gh ae oAly\ | CE 74 9) - -oo | (ma. ast lone 1-1-1 al de Lt Weal ack | ALOR. | % Al | 4 2117 big thes fal be | -} | AbapanePAT PAGAL la) il ag | ars eg | 4 cou blaalilal.| dali +| ‘etc A| (812 B| (eS +| Ales tafmdcy |. Ce meh ak: ee a Meee es Huge +: li Jeff effi ebefe} ed i ba | 1 a i ae lan Waals +eta atl Maa Bent tay +ie aia Fai =a 0 g Pde yd a | i. WEA: 9% Br Ch. | is | +| G\ hal. FeQetled dlaval ||. | aia hs Bis. atk | | el ahogl | i 4 +Awa | i Oa | Wye Chil, Rrech | ila Hf SPs ae | +A Had | | t2\| Phila: ah fl oe yal os | +c cameo nnn TTT, Wee ee asl balal salelat (alate | | +| _——— | Alii WO] Fegehe’ Bebeen PA miles bused gh ae oAly\ | CE 74 9) - +oo | (ma. ast lone 1-1-1 al de Lt Weal ack | ALOR. | % Al | 4 2117 big thes fal be | +} | AbapanePAT PAGAL la) il ag | ars eg | 4 cou blaalilal.| dali Bl ga nesrseptcotpanipnl Perea ‘pane ias yf -Hi i} BY | “ne 4 1, \ fa.6 4 #2 . | } he . Zoey i -| Deki ried sped et sacaal lipamog b 444-|elslolaalel el atta -| pate y, 4 | Abell aed ll BATE | | Mpg | ae bi ass Bb’ -iT S ‘ WL each S63 eae | eal r Taal y - Gk Vad ail ‘Bug -At? eg A AY [2 7 A UZ, | ‘ Vauk .. jr 192 sire ies Bye 9. 34 s | -| ee ee TT lh (Re) ae Ho. A ea tA sili onl 7. «| Ms - -| | bbled te) kato ele ye. | NG a2 e5 4 #004 & 14 | 0424 24 te. | -: | | M tH is ; | f di Suxiefinect. uals i air a) a ‘ -| f eg sh it lef | SH | 2) A\1928 05) 74 481 7. r -| | beg het L AGIA J OT |: ce dacaees a bn ’ ae. we 4 14 4 74 | -i | y YE ey A Wopetan, Hal and ) 4 | G94 o9.Me gl 7 eg ow -| | | u4 Ze M1 TAD Zale ha Al 4 | Sabie! Bede; 9 Os HE a | gat ated ' -t 1 Ae aah 2 ai Go bes oe) | 7 -| | ae (od 2u3 Las (ad a sedges) -| | | SD i bee Gimsiy) 22 || danas 9) 2 3 ! -7 | | tT eee ga cl sll ol ig -| 2 -W) } Wie heh gy 4 “ bias) 2 / -Vee ‘ yy ma‘ | one a 3 .. Aq f i -on | | I | [SS at a - . -ae , | | | | | | | (ame A 2s Whe a 7 ; -ny ee bes | { ¢ | | | i ‘ - | PH Ey, 394 . I oe pt at \ No newline at end of file +Hi i} BY | “ne 4 1, \ fa.6 4 #2 . | } he . Zoey i +| Deki ried sped et sacaal lipamog b 444-|elslolaalel el atta +| pate y, 4 | Abell aed ll BATE | | Mpg | ae bi ass Bb’ +iT S ‘ WL each S63 eae | eal r Taal y - Gk Vad ail ‘Bug +At? eg A AY [2 7 A UZ, | ‘ Vauk .. jr 192 sire ies Bye 9. 34 s | +| ee ee TT lh (Re) ae Ho. A ea tA sili onl 7. «| Ms - +| | bbled te) kato ele ye. | NG a2 e5 4 #004 & 14 | 0424 24 te. | +: | | M tH is ; | f di Suxiefinect. uals i air a) a ‘ +| f eg sh it lef | SH | 2) A\1928 05) 74 481 7. r +| | beg het L AGIA J OT |: ce dacaees a bn ’ ae. we 4 14 4 74 | +i | y YE ey A Wopetan, Hal and ) 4 | G94 o9.Me gl 7 eg ow +| | | u4 Ze M1 TAD Zale ha Al 4 | Sabie! Bede; 9 Os HE a | gat ated ' +t 1 Ae aah 2 ai Go bes oe) | 7 +| | ae (od 2u3 Las (ad a sedges) +| | | SD i bee Gimsiy) 22 || danas 9) 2 3 ! +7 | | tT eee ga cl sll ol ig -| 2 +W) } Wie heh gy 4 “ bias) 2 / +Vee ‘ yy ma‘ | one a 3 .. Aq f i +on | | I | [SS at a - . +ae , | | | | | | | (ame A 2s Whe a 7 ; +ny ee bes | { ¢ | | | i ‘ - | PH Ey, 394 . I oe pt at \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt index 19d11c4..5816dc2 100644 --- a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt +++ b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt @@ -1,50 +1,52 @@ -Ba otr alorrire, mn Cathe L ares ; Fg 4 es Page Le - -LE as 3 SORE ST Sera Syprr stack aite Heya, -= oe ee 9 iy A ah ; , 2A: -fy |Peoee 2 7 : : rinse a -Too i en = base GLE, : / -pth Aye — ssf -eae ON I % ia. eee -nfs] sp ak f- OY te OO, ee MPEP — —— a see. -ye WO en |e eaten | faa! ae 1 Tie -om GA f i oe ae ) i ' Bas 7 -AU | v AYN tL LIP ru oY 2 spl 2m g 7 pe Sofi Hob ’ -hod} mins i/ Bf. V4 Gdns ie M Al ff // : -# | Lietigtia a. é wr di iS Luh if / J <7} . ff vy 4 a -five | Lu Tap, Lom ling Mlseelors thn ym ) Faas H -na PEW tan | Peep lil, » eae -Te os > 44 ref 2 » WF Fe ile ae ; ( ots e A q i -0 seal Dp on Pkeanaby Sai. | -5 eat ey IE ADR | WY, 24 . @ -: : [= | * a . oe BE bd sar jee v pod A a -be Xh > f Oy r VLnMNALBY wr REEL. “le a -| Gogh. Wy Pre! AT Ae anv pam -ay 4 prov. heb a Yow kath s ge fv br : 7 U/ zs : ;, -cl Lv" | Aix wher. _ 4 sey of ) | | — -oes Pi 8 de oe ee —— 4a -Vig ee by’ 4 pages ES oat | iB SS : : -tc 9 9t AS Z it padlox voa- fire tin soe a Wy eh -TEBE : eae a | -ee 4 a re hy is Z bp if A. Di i -Fees at = ade Let ooh flor ft |B ae -f { f j Jie f- . e 4 -ipcion aig eter | ae ae . £L,. ey -4 sub. by. “LYS Px : Lt 7 : : ae : i -re ae eee) >, -| ee acs Pflomnn heed yt nto ae i — - a ‘4 -ut 6 Us 4 Z Py 4 3 ef -YP hi oy , bey sit a “Ay s6Uy Sack ase: & a | -eae ter flay ay 2be / A -De nh ; 90 —— ! a -fa Hafli Rabat Bint SyhegYouyinem Be isto? @ -4 SES eo, ale 2 bm «fe 7 ee, .. i +Si! eee Se ee +cig 77zZ Cache Ja 2a rocaala Igy ie hae by > A387. ; +5 ayy : al > 7 < . -ff : Ree. -b2 1G. a Aid v7 = Me -i eithe Aero ee | oe -if La 3° VA , of " : J OH 3 Rese -fe te pee [tig | ~ a Aha AP IF7 : y PLD) sa cf y ’ ¥ Bo gh Z P PES. -| spilt Bisping nt) | oe -Za ak ae Fas 4 FA EB “ age Lb, : ane ee oe =e +soy iene a Ny AOP i +Tuy é eee | ope Ke . SiN aay oy i +Tie 0 ‘ one s Ai : Pua¢t7sz2t , * g thy ad —<— FF +i. es 4 fi : BPCLLS q +rin Pose f | . 7 5 ce eee ome oe +Pov". aunaber ss A Gls Moa pi pice TN CHa fanf Vit. ee > oe +Uhm Bry. eas On / 7, é : ry Ss oe haisdd 4 , f SU rine Gy § +J Wi | v ALY sess ALAS 14 esasdit act gay, nl $f i} Y) fe ‘ gp | : ; +{wd | eh Es / il ping “4 VA a rarer / h| '/ é /, 5 i f +mas TiO» Ie Ce. , 7 — ‘it —~Mlawbei. than yr ) a i ' +its pil csobetip bs, é par Arig Balbo 4’ | VG Baa) | +tay { ene —— A Serssta ot ppg ya a ae i - VA y &* ‘. +pvr. / 7 Urng~ if ay i +eas By oe TE PW GY yo . +if lon MAMA PAO. WA ‘ t, 24 fr a +Fie Porshe y gilyorud~ a. +IG BN eee eee He +bicNh > (Og {$+ W aap haee . A 1 = +LW ~ | 8: sess / bi Ba Mbt Dorn a! } “6 . Ts EBE AS Py dt i +/; | PO 7 wh o/s oe f fh) Pf, y 7 } fh i | +66 Bon i, Whine eg) prey A Tle +Us Sane io 0 a Sa ne ce a = +ee PANG 4 iil ee ee +fee on ULROS . | tind. j i oe Hi 2 | +fee 9h a LN testy p “2 ae, . oA st ieee en q +la Ii f ees gh ost a : (Poor 4 tl ; +be 4 sg i) * y he. : s - ae -poe col (eee xe ee Cy ag x ae es \ No newline at end of file +EH bef. rare fbdais fF? he > an +eee LOO ff i| +F on Mason | Ghar ghee Wn gle | | Diy og I + +pit Tied pele ue isto +pas Mave PT I pat on~ Gi BE a | +er ee Bee ree 9, Mg a +Feet ne AL. fed G8 fo + +oe af 4 +{ ne AF, ae : : E | +$= Gollah Phase) Slaw 8) F, 7 : | +tls pe §> Cof Oh 9.8 77 fj Vis A Pr ae ‘i be oe x +fe gs 1 aye ae? a 4 / ‘ © 4 : A (7 ; Ps of 58 are ; Sy 1 +be, qd. We Hla i” by (Oe ) ike +: : th cnl GRA Pe pe «ft {/. » See +MS yg Set OS ae) a a +D: Ritcarlefer fod en MR ne +* hs gee flor tae YU Cc ae eens MORSE 5 Nope eg +Eee See co ee ae + A eee Se eee Se eee ee Sei i \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt index 6cfe12e..99bcb7f 100644 --- a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt +++ b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt @@ -1,52 +1,52 @@ -09 these consume 0,99 CPU. That is, task3- misses -deadlines and the inverted pendulum! falls down, This -an explains the fact that the cost function in Figure 2 goes -. to infinite. Note that under RM, the task is not -Bod schedulable -a. + EDP: EDF is « dynamic alorihm in open bop -Which assigns priorities to tasks according to their - +09 these consume 0,99 CPU. That is, task3- misses +a isaaise deadlines and the inverted pendulum! falls down, This +og EF c explains the fact thatthe cost function in Figure 2 goes +. to infinite. Note that under RM, the task is not +Bod schedulable +i + EDP: EDF is « dynamic alorihm in open bop +§ Which assigns priorities to tasks according to their +i, Beavis dedine, Une BG’ holler te schedulabiity condition is given by U = 1. For our -wg simulations, since U <1, the task set is schedulable +4 simulations, since U <1, the task set is schedulable and the three pendulums can be controlled as it can be -A seen in Figure 2: the accumulated cost reaches a finite -a a oe a! value, which means that the deviation caused by each -ii Ail sal aed perturbation thet affected each of the three pendulums +A seen in Figure 2: the accumulated cost reaches a finite +a a oe i value, which means that the deviation caused by each +sig cic al ad perturbation thet affected each of the three pendulums could be adequately comected. ‘The performance with the difference in performance due 10 the we of sehieved by EDF is also given in Figure 2 in tems of -different scheduling polices, which is the objective of our tie coat BocHonsreadting a valus oF W2754 at tho -simulations. completion of the evaluation interval +different scheduling polices, which is the objective of our tie coat BocHonsreadting a valus oF W2754 at tho +simulations. completion of the evaluation interval ‘The perfomance of each pendulum is obtained by the following cost fmetion J, which measures the enor ¢,of 4 LER: LEF is a dynamie scheduling algorithm in -the pendulum weighted with time ¢ closed-loop, which adjusts the schedule based on -= fre2enae continuous feedback of each control loop. Under this -vin= |ehom scheduler, inthe simulation we ebain thatthe thee -Note that 1 sets the duration of the evaluation period, inverted pendulums can be perfectly controlled. As it -‘hich typically should go fen the’ perturbation sirival can be seen in Figure 2 the cost function of LEF goes -fine (0m the integral) wo the setling tine As higher blow the cost function of EDF, meaning that for this +the pendulum weighted with time ¢ closed-loop, which adjusts the schedule based on += fre2enae continuous feedback of each control loop. Under this +vin= |ehom scheduler, inthe simulation we ebain thatthe thee +Note that 1 sets the duration of the evaluation period, inverted pendulums can be perfectly controlled. As it +‘hich typically should go fen the’ perturbation sirival can be seen in Figure 2 the cost function of LEF goes +fine (0m the integral) wo the setling tine As higher blow the cost function of EDF, meaning that for this values the cost function gets, the worst the control palticular simple simulation set-up, LEF performs -performance (because major deviations occur or because it better that EDP. Note that the final cost of LEF is -fakes more time forthe inverted pendulum to recover from 0.2490 +performance (because major deviations occur or because it better that EDP. Note that the final cost of LEF is +fakes more time forthe inverted pendulum to recover from 0.2490 the perturbations) C. Discussion -B Results ‘The exact cost for each one of the three pendulums +B Results ‘The exact cost for each one of the three pendulums The simulation we run Keeps the following time under each scheduling algorithm is summarized in the sequence: at time =O, only the control task controlling the Table 1. RM fails in controlling the thitd pendulum, EDF fist pendulum is released. After executing alone, at time and LEF are able to control the three pendulums, However 12. another contol task is released to control the second LEF gives the best control in terms of the cost function pendulum, Finally the third control task is released at time f4, The three controllers run in parallel until t=7. Before Tate 1, Cost forthe tac inverted pendulum under cash -the release time of each control task, the corresponding different scheduling policy -pendulum isin equilibrium, At release time of each control ——— -fask, the pendulum suffers a perturbation (of equal Scheduling Jy Sp -magnitude for each pendulum). This simulation pattern is RM 00033009308 -repeated for different scheduling algorithm: RM, EDF and EDF 0033. 00930 0.1769 -LEF. During each experiment, the cost funtion presented LEF 000330812 _o.1645 +the release time of each control task, the corresponding different scheduling policy +pendulum isin equilibrium, At release time of each control ——— +fask, the pendulum suffers a perturbation (of equal Scheduling Jy Sp +magnitude for each pendulum). This simulation pattern is RM 00033009308 +repeated for different scheduling algorithm: RM, EDF and EDF 0033. 00930 0.1769 +LEF. During each experiment, the cost funtion presented LEF 000330812 _o.1645 earlier and the resulting schedule are recorded. The cost function evaluation period goes from the beginning of each This results shows that scheduling polices that take simulation, =0, to the simulation completion, at t=7. For advantage of the application dynamics and are able to cach scheduling policy, the results we obtained are agjust the schedule accordingly ean provide, in. some -summarized inthe following: Specific scenarios, better performance in terms of the +summarized inthe following: Specific scenarios, better performance in terms of the application (control performance in our case). + RM: RMisastatic scheduling algorithm in open loop, For opertoop scheduling approaches we identified (see which assigns priorities to tasks according to their Section 1) three main negative aspects: low real CPU diff --git a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_63.txt b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_63.txt index 9ed055e..8c0a1a5 100644 --- a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_63.txt +++ b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_63.txt @@ -3,40 +3,40 @@ Table 1—Activity, employment, and unemployment rates forages 15-64, by sex and urban/rural location, 1990-95 Cs -Rate 1990 1991 199219931994 1995 +Rate 1990 1991 199219931994 1995 Activity rate (15-64) -Urban Male no 108 69.8 108 105 699 -Female 24 203 19.1 203 202 195 -Total 473 457 444 457 454 448 -Rural Male 782 168 163 163 76.1 163 -Female 347 293 261 240 254 230 -Total 564 530 50.8 50.2 510 497 -Total Male 753 749 2 7 BS B3 -Female 290 252 2.8 23 230 214 -Total 52. 49.6 478 48.1 484 474 +Urban Male no 108 69.8 108 105 699 +Female 24 203 19.1 203 202 195 +Total 473 457 444 457 454 448 +Rural Male 782 168 163 163 76.1 163 +Female 347 293 261 240 254 230 +Total 564 530 50.8 50.2 510 497 +Total Male 753 749 2 7 BS B3 +Female 290 252 2.8 23 230 214 +Total 52. 49.6 478 48.1 484 474 Employment rate (15-64)** -Urban Male 670 653 647 649 649 646 -Female 168 154 143 146 146 14.1 -Total 42.1 405 395 399 308 304 -Rural Male 45 na 18 109 107 106 -Female 316 262 28 195 204 181 -Total 330 493 469 453 458 443 -Total Male 110 69.1 68.5 68.1 68.0 678 -Female 247 aul 189 172 117 162 -Total 479 452 434 428 43.0 420 +Urban Male 670 653 647 649 649 646 +Female 168 154 143 146 146 14.1 +Total 42.1 405 395 399 308 304 +Rural Male 45 na 18 109 107 106 +Female 316 262 28 195 204 181 +Total 330 493 469 453 458 443 +Total Male 110 69.1 68.5 68.1 68.0 678 +Female 247 aul 189 172 117 162 +Total 479 452 434 428 43.0 420 Unemployment Rate (15-64)** -Urban Male 69 1 13 84 19 16 -Female 248 244 249 279 280 276 -Total Ma 14 na 127 124 119 -Rural Male 47 57 59 10 A 1s -Female 9.0 108 125 187 19.6 214 -Total 60 mM 16 98 10.1 107 -Total Male 57 66 63 16 1s 1s -Female 147 159 173 27 2.1 241 -Total 82 89 om i Te 113 +Urban Male 69 1 13 84 19 16 +Female 248 244 249 279 280 276 +Total Ma 14 na 127 124 119 +Rural Male 47 57 59 10 A 1s +Female 9.0 108 125 187 19.6 214 +Total 60 mM 16 98 10.1 107 +Total Male 57 66 63 16 1s 1s +Female 147 159 173 27 2.1 241 +Total 82 89 om i Te 113 Jouce:CAPMAS,LFSS,——SSSCSCSSSSSSSSSSSSS Notes: Activity rate = labor force/population x 100 percent; employment rate = employment/population x 100 percent; unemployment rate = unemployment/labor force x 100 percent \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_64.txt b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_64.txt index 6795c00..e50a605 100644 --- a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_64.txt +++ b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2013_64.txt @@ -6,24 +6,24 @@ Sg _ SSeS 99S SCSCOSIOT Wale han SSSC~SSSSC~—~OCSSSC -Rural 792 76.3 154 +Rural 792 76.3 154 -Female Urban 293 195 262 +Female Urban 293 195 262 -Rural 539 23.0 173 +Rural 539 23.0 173 ‘Table 3—Unemployment rate, by sex, education, and region, economically active population aged 15 ~64 ,_.-. TRS 19—S—S—~—~—~—~“—siES TTS -With search With search Without search +With search With search Without search “Tran Runt Tos! Urban Run Tot “Urban Rural Total Mie -Below secondary 13, 04 07 5402033 M3300 47 +Below secondary 13, 04 07 5402033 M3300 47 Secondary andabove 14126001893. «12.7s9 130169147 -Total 16° 75 78 84 S54 68 101-7788 +Total 16° 75 78 84 S54 68 101-7788 Females -Below secondary 4103 «0.70 «9816S 7S 1732-202 +Below secondary 4103 «0.70 «9816S 7S 1732-202 Secondary and above 30.9 $7.7 40.0232 «405 8S 53.2384 -Total 276 214 24.0 2012302139732. \ No newline at end of file +Total 276 214 24.0 2012302139732. \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2014_4.txt b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2014_4.txt index 63908f7..61f9237 100644 --- a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2014_4.txt +++ b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2014_4.txt @@ -7,39 +7,39 @@ afresh from p,. Table 1. Parameters for deciding outlicrsichanges. Pa rameters with * are user-provided -Tae | -Sina | 2 tat a ea a is -sntrortrait | enim ny i tr W +Tae | +Sina | 2 tat a ea a is +sntrortrait | enim ny i tr W -—_ Scigcwsencame "=" S +—_ Scigcwsencame "=" S -een mare soutien amd dens om f Figure 1. Effect of older points in RLS-based -ey Re Date enuccemone b outlier/change point detection. (a) saw-tooth -___ | Sesto ac ofertas data stream. (b) slopes of the line segments -crenimseniot | LeRoi cr cn roe in the sawtooth. (c) slope estimate using -crmrovrdeds | ute miner of pon wat the nena RLS. (d) slope estimate using a moving win- +een mare soutien amd dens om f Figure 1. Effect of older points in RLS-based +ey Re Date enuccemone b outlier/change point detection. (a) saw-tooth +___ | Sesto ac ofertas data stream. (b) slopes of the line segments +crenimseniot | LeRoi cr cn roe in the sawtooth. (c) slope estimate using +crmrovrdeds | ute miner of pon wat the nena RLS. (d) slope estimate using a moving win- mecefleatececimet cries | dow of alze5, -ea Foeiee See ae Tee One approach to forget older points is to have a moving +ea Foeiee See ae Tee One approach to forget older points is to have a moving -oo Sear ece oc EEA ‘model over this window. At the same time, we would like +oo Sear ece oc EEA ‘model over this window. At the same time, we would like 3.2.1 Using Forward-Backward RLS ‘One method to achieve this would be to derive a backward -3.1 The Naive Approach recursion and altemately use the forward and backward re- +3.1 The Naive Approach recursion and altemately use the forward and backward re- In this approach the RLS algorithm is used to compute sions as follows,” Suppose the current window spans the model parameters over all the data seen so far (unre-_‘{lts-- fw) When a new data point comes in at fy 41, we -Geeaciae) use the forward recursion (equations 7-9) to derive a model +Geeaciae) use the forward recursion (equations 7-9) to derive a model Inivally (forthe fist few points) this approach may be "the points in [fs..» fy ty 11) Then, we use the back- succesfl in detecting outicrlchonge pots, As more Ward ecursion to remove the effets ofthe point at and points come in, RLS estimates the model parameters con- ‘its get a model forthe points in [fzy...«fovfust In what follows, we derive the backward recursion for -sidering all the points seen so far, and thus it gives equal 5 +sidering all the points seen so far, and thus it gives equal 5 ‘weight to all the points in the unrestricted window, Asa RLS. Suppose we have Ania and Pia over certain data S, result, the changes in the dynamics of the system may be W#eTE -hhidden by the dominance of the older points. Figure 3.1 il- a -lustrates this phenomenon. As a result RLS by itselfis not Pua BT) 10 -sdequne, Te pblem w de tothe fet dat RLS esto Hey ane) an +hhidden by the dominance of the older points. Figure 3.1 il- a +lustrates this phenomenon. As a result RLS by itselfis not Pua BT) 10 +sdequne, Te pblem w de tothe fet dat RLS esto Hey ane) an 4 \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_0.txt b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_0.txt index 28df793..8533a23 100644 --- a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_0.txt +++ b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_0.txt @@ -1,15 +1,15 @@ Cases and controls in Carolina Breast Cancer Study (CBCS) analytic datasets by race -Atican-American write +Atican-American write -Anayic dataset Main exposure Cases(%s} Controls 6) Cases (8) Contos (8) +Anayic dataset Main exposure Cases(%s} Controls 6) Cases (8) Contos (8) -BCS, entre Beth order 385 (1000) $92(1000) 526(1000) 488 (100.0) +BCS, entre Beth order 385 (1000) $92(1000) 526(1000) 488 (100.0) -(©BCS, bom 1948 or later 191 (59.1) 198407) 235 (447) 181 (3085) +(©BCS, bom 1948 or later 191 (59.1) 198407) 235 (447) 181 (3085) -Maternal age dataset Bithorder, Matemal age —-«107(81.)116(349) 173 (829) 121 (26.4) -Paterna age dataset Paternal age 95 (284) 100(90.1) 171 (928) 118.(25.8) -(©BCS, NC bom 1949 o ater 90/206) 96 (289) 112 (218) 85 (18.6) -Birthweight fll datasot Bithweight 861257) 89268) 110209) 7817.0) -Binhweight, resticted dataset Bithweight 49(146) 970013) 98(186) 63188) \ No newline at end of file +Maternal age dataset Bithorder, Matemal age —-«107(81.)116(349) 173 (829) 121 (26.4) +Paterna age dataset Paternal age 95 (284) 100(90.1) 171 (928) 118.(25.8) +(©BCS, NC bom 1949 o ater 90/206) 96 (289) 112 (218) 85 (18.6) +Birthweight fll datasot Bithweight 861257) 89268) 110209) 7817.0) +Binhweight, resticted dataset Bithweight 49(146) 970013) 98(186) 63188) \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_2.txt b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_2.txt index 5e046a6..b60b94b 100644 --- a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_2.txt +++ b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_2.txt @@ -1,19 +1,19 @@ Birthweight slistbutions and odd alos for breast cance in Aican-Amercan and white women combined -Minnay ated OR Ful aja OR +Minnay ated OR Ful aja OR Canes Conte ——OR" «SHC Gatee Canto ORY 95H -Ful bthwight dteoot a=08 | n=t07 nate nator -Lower tree 7 7 09 0618 72 10 oer -Conta te 20 st at © st Ret -Upper tre 8s 86 07 o&t2 80 807 ona -Mean 8D (@) anon #850 -Median) sea -Range () toat—4ea -Resticted dataset antay—— n=t00 nava nnor -Lower trae se “ os os17 88 4 10 os09 -Conta te ae a fet ” a Ret -Upper tre 0 25 10 0s21 98 2508 oat -Mean #80 (@) ano. a2 -Median () eae -Rage) 2081-4621 +Ful bthwight dteoot a=08 | n=t07 nate nator +Lower tree 7 7 09 0618 72 10 oer +Conta te 20 st at © st Ret +Upper tre 8s 86 07 o&t2 80 807 ona +Mean 8D (@) anon #850 +Median) sea +Range () toat—4ea +Resticted dataset antay—— n=t00 nava nnor +Lower trae se “ os os17 88 4 10 os09 +Conta te ae a fet ” a Ret +Upper tre 0 25 10 0s21 98 2508 oat +Mean #80 (@) ano. a2 +Median () eae +Rage) 2081-4621 ‘body mass index >25 kg/m?, 3458 g; \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_3.txt b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_3.txt index d3d41ec..9bef231 100644 --- a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_3.txt +++ b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_3.txt @@ -1,23 +1,23 @@ Birhweiht strbutons and os tos for breast cancer among Aicar-Ameian andwhte women byece -en rao +en rao Gan Gatch OF HG Gam Ganah OF HG Gm Gomes OF WHO Caw Goma OF HONG -Comune 8 oe am 7 MM +Comune 8 oe am 7 MM -messes ome +messes ome -sn on ne +sn on ne -oe 4734 +oe 4734 -sioess0) sates cae +sioess0) sates cae i -sn oe ne +sn oe ne -nents ai a +nents ai a ay ee en en SE ea a ae oe rs acre ae, we hi ee \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_4.txt b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_4.txt index 1a677f8..13b8b49 100644 --- a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_4.txt +++ b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_4.txt @@ -1,24 +1,24 @@ Parental age distributions and odds ratios for breast cancer among African-American and white women combined -Minimal acusted OR* Ful adjusted OR® -case Conti OR-—«8BHC!_—Case—=—Contol-—«OR—— BHI +Minimal acusted OR* Ful adjusted OR® +case Conti OR-—«8BHC!_—Case—=—Contol-—«OR—— BHI -Maternal age ears n= 260 n= 296 n=263 =a +Maternal age ears n= 260 n= 296 n=263 =a -1948 at a1 18 0934 90 2 18 og-a5 -19-22) st 80 Ret 4 7 Rel -29-27 a 49 20 1880 as 495 20-80 -28-44 m n 25 16-40 100 830182 -Mean # SD 2estes 252467 -Median 28 23 -Range 17-48 15-49 -Paternal age ears n= 206 na2ie n=251 n= 200 -15-22 39 38 10 0648 38 318 Ban -29-27 (ro) 73 0 Ret © Ret -29-34 27 o 19 06-21 a3 e212 aren -95-56 o 4s 16 0926 61 415 oraz -Moan # 80 sorz1) 298278) -Median 29 28 -Range 17-83 15-86 +1948 at a1 18 0934 90 2 18 og-a5 +19-22) st 80 Ret 4 7 Rel +29-27 a 49 20 1880 as 495 20-80 +28-44 m n 25 16-40 100 830182 +Mean # SD 2estes 252467 +Median 28 23 +Range 17-48 15-49 +Paternal age ears n= 206 na2ie n=251 n= 200 +15-22 39 38 10 0648 38 318 Ban +29-27 (ro) 73 0 Ret © Ret +29-34 27 o 19 06-21 a3 e212 aren +95-56 o 4s 16 0926 61 415 oraz +Moan # 80 sorz1) 298278) +Median 29 28 +Range 17-83 15-86 Soetatos or 1) sjased GR powsahocl cam £680,000 hd ith crdeAdstoralcovattcs or aly agusted OR aa boy ass index CORES 25: UY BRINE Ses, Beaies Ole INCRE emmy, Riel ONT \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_5.txt b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_5.txt index b94eeeb..88fd679 100644 --- a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_5.txt +++ b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_5.txt @@ -1,25 +1,25 @@ Parental age distributions and odd ratios for breast cancer among African-American and white women by race -Moimayaustes OR Fal agurted OR +Moimayaustes OR Fal agurted OR Cue Contol OR 9B5CI Case _——Contol- OR BWI Cass Conteh OR 9BHCI Case Coaval OR 95M CI -yews" i = a es eae SUTURE: -15-18 10 7 2 0955 12 4 41 0529 18 15 27 10-70 12440 04-87 +yews" i = a es eae SUTURE: +15-18 10 7 2 0955 12 4 41 0529 18 15 27 10-70 12440 04-87 -wea 18 2 Ret 29 47 Re 789 Ret a 4k Ret +wea 18 2 Ret 29 47 Re 789 Ret a 4k Ret -2-27 28 2 19 094s 88 242 21-89 27 2824 10-84 88208 23-08 +2-27 28 2 19 094s 88 242 21-89 27 2824 10-84 88208 23-08 -Mein#8D 286279 255270 vests 249269 -Medan 26 2 26 2 +Mein#8D 286279 255270 vests 249269 +Medan 26 2 26 2 -Petoaiage = 95 n= 100 nai pate nao n= 94 ne te2 nena +Petoaiage = 95 n= 100 nai pate nao n= 94 ne te2 nena ‘yeu -15-22 16 15 14 06-99 29 2 07 0345 18 27 10S 21814 05-28 +15-22 16 15 14 06-99 29 2 07 0345 18 27 10S 21814 05-28 -an 2 20 Rt s 40 Rt 2 Ret 98 Ret +an 2 20 Rt s 40 Rt 2 Ret 98 Ret -Mon280 90926 204280 msz68 90269 +Mon280 90926 204280 msz68 90269 Saeco snd Geter sAdecorlcovrates or fly sduted OR ada body macs index 928 ln Nowsehod income S8S0,0, bh Oder ane maternal spe \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_6.txt b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_6.txt index 30b2fe7..ac5078d 100644 --- a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_6.txt +++ b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064078_table_6.txt @@ -1,10 +1,10 @@ Bir order stb and odds rato fr beast cancer among whit and Atean-Ameriean women by ace -a et raya +a et raya Goat WAG Ge tot WHE “Gen Gas OF ONG Ge Gm RNG -mona a8 wae +mona a8 wae Ty diff --git a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064082_table_0.txt b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064082_table_0.txt index 12a91f2..8c068ba 100644 --- a/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064082_table_0.txt +++ b/dataset/validation/_ocr/markitdownnet/PUBTABLES/PMC1064082_table_0.txt @@ -1,5 +1,5 @@ -Imunchtochemy ndings alate mRNA epesson P +Imunchtochemy ndings alate mRNA epesson P -000 >2000 +000 >2000 -Hah erosion 3 " await Task.Run(() => ProcessImage(path, can }; } - private MarkItDownResult ProcessPdf(string path, CancellationToken ct) - { - using var stream = File.OpenRead(path); - using var document = PdfDocument.Open(stream); - var pages = new List(); - var lines = new List(); - var words = new List(); - - foreach (var page in document.GetPages()) - { - ct.ThrowIfCancellationRequested(); - pages.Add(new Page(page.Number, page.Width, page.Height)); - - var pageWords = page.GetWords() - .Select(w => new Word(page.Number, w.Text, BoundingBox.FromPdf(w.BoundingBox, page.Width, page.Height))) - .ToList(); - - words.AddRange(pageWords); - - foreach (var lineWords in GroupWordsIntoLines(pageWords)) - { - var text = string.Join(" ", lineWords.Select(w => w.Text)); - var union = Union(lineWords.Select(w => w.BBox)); - lines.Add(new Line(page.Number, text, union)); - } - } - - // If there are not enough words, fall back to OCR - if (words.Count < _options.MinimumNativeWordThreshold) - { - _logger.Information("Native text too small ({Count}), attempting OCR fallback", words.Count); - return ProcessPdfWithOcr(path, ct); - } - - var markdown = BuildMarkdown(lines); - return new MarkItDownResult(markdown, pages, lines, words); - } + public double LastDeskewAngle { get; private set; } + public bool LastDeskewApplied { get; private set; } - private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) + private MarkItDownResult ProcessPdf(string path, CancellationToken ct) { var pages = new List(); var lines = new List(); var words = new List(); - // Rasterize PDF into images using PDFtoImage - var renderOptions = new RenderOptions { Dpi = _options.PdfRasterDpi }; - using var stream = File.OpenRead(path); - foreach (var bitmap in Conversion.ToImages(stream, leaveOpen: false, password: null, renderOptions)) + int pageNum = 0; + foreach (var item in Rasterizer.FromPdf(path, _options)) { - ct.ThrowIfCancellationRequested(); - using (bitmap) + using (item.pix) { - pages.Add(new Page(pages.Count + 1, bitmap.Width, bitmap.Height)); - using var image = SKImage.FromBitmap(bitmap); - using var data = image.Encode(SKEncodedImageFormat.Png, 100); - using var pix = Pix.LoadFromMemory(data.ToArray()); - var result = ProcessPix(pix, pages.Count, ct); + LastDeskewAngle = item.angle; + LastDeskewApplied = item.deskewed; + ct.ThrowIfCancellationRequested(); + pageNum++; + pages.Add(new Page(pageNum, item.pix.Width, item.pix.Height)); + var result = ProcessPix(item.pix, pageNum, ct); lines.AddRange(result.lines); words.AddRange(result.words); } @@ -125,7 +84,10 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) private MarkItDownResult ProcessImage(string path, CancellationToken ct) { - using var pix = Pix.LoadFromFile(path); + var item = Rasterizer.FromImage(path, _options); + using var pix = item.pix; + LastDeskewAngle = item.angle; + LastDeskewApplied = item.deskewed; var (lines, words) = ProcessPix(pix, 1, ct); var pages = new List { new Page(1, pix.Width, pix.Height) }; var markdown = BuildMarkdown(lines); @@ -136,11 +98,13 @@ private MarkItDownResult ProcessImage(string path, CancellationToken ct) { var lines = new List(); var words = new List(); + Environment.SetEnvironmentVariable("OMP_THREAD_LIMIT", _options.OcrThreads.ToString()); using var engine = new TesseractEngine( _options.OcrDataPath ?? string.Empty, _options.OcrLanguages, - EngineMode.LstmOnly); - engine.DefaultPageSegMode = _options.PageSegMode; + _options.OcrOem); + engine.DefaultPageSegMode = (PageSegMode)_options.OcrPsm; + engine.SetVariable("preserve_interword_spaces", "1"); using var page = engine.Process(pix); using var iter = page.GetIterator(); iter.Begin(); @@ -176,45 +140,6 @@ private static BoundingBox Normalize(Rect rect, int width, int height) return new BoundingBox((double)rect.X1 / width, (double)rect.Y1 / height, (double)rect.Width / width, (double)rect.Height / height); } - private static IEnumerable> GroupWordsIntoLines(IReadOnlyList words) - { - const double tolerance = 0.02; // normalized units - var result = new List>(); - var sorted = words.OrderBy(w => w.BBox.Y).ThenBy(w => w.BBox.X).ToList(); - - var current = new List(); - double? currentTop = null; - foreach (var w in sorted) - { - if (currentTop == null || Math.Abs(w.BBox.Y - currentTop.Value) <= tolerance) - { - currentTop = w.BBox.Y; - current.Add(w); - } - else - { - result.Add(current); - current = new List { w }; - currentTop = w.BBox.Y; - } - } - if (current.Count > 0) - { - result.Add(current); - } - - return result; - } - - private static BoundingBox Union(IEnumerable rects) - { - var left = rects.Min(r => r.X); - var top = rects.Min(r => r.Y); - var right = rects.Max(r => r.X + r.Width); - var bottom = rects.Max(r => r.Y + r.Height); - return new BoundingBox(left, top, right - left, bottom - top); - } - private string BuildMarkdown(IEnumerable lines) { var ordered = lines diff --git a/src/MarkItDownNet/MarkItDownOptions.cs b/src/MarkItDownNet/MarkItDownOptions.cs index e1b86d7..bf07a62 100644 --- a/src/MarkItDownNet/MarkItDownOptions.cs +++ b/src/MarkItDownNet/MarkItDownOptions.cs @@ -2,6 +2,12 @@ namespace MarkItDownNet; using Tesseract; +public enum OcrColorDepth +{ + Grayscale8bpp, + Bgra32bpp +} + /// Runtime options for conversion. public class MarkItDownOptions { @@ -11,11 +17,32 @@ public class MarkItDownOptions /// Languages for OCR, e.g. "eng" or "ita+eng". public string OcrLanguages { get; set; } = "eng"; - /// Page segmentation mode used by Tesseract. - public PageSegMode PageSegMode { get; set; } = PageSegMode.SingleBlock; + /// User-specified DPI for OCR rasterization. + public int OcrUserDpi { get; set; } = 300; + + /// Page segmentation mode. + public int OcrPsm { get; set; } = 6; + + /// OCR engine mode. + public EngineMode OcrOem { get; set; } = EngineMode.LstmOnly; + + /// Maximum number of OCR threads. + public int OcrThreads { get; set; } = 1; + + /// Force rasterization even for digital PDFs. + public bool OcrForceRaster { get; set; } = true; + + /// Apply Otsu binarization before Tesseract. + public bool OcrPreBinarize { get; set; } = false; + + /// Deskew only if |angle| exceeds this threshold. + public double OcrDeskewMinAngleDeg { get; set; } = 2.0; + + /// Color depth for images passed to Tesseract. + public OcrColorDepth OcrColorDepth { get; set; } = OcrColorDepth.Grayscale8bpp; - /// DPI used when rasterizing PDFs for OCR fallback. - public int PdfRasterDpi { get; set; } = 300; + /// Set X/Y resolution on Pix before OCR. + public bool OcrSetDpiMetadata { get; set; } = true; /// Minimum number of native words required before falling back to OCR. public int MinimumNativeWordThreshold { get; set; } = 1; diff --git a/src/MarkItDownNet/Rasterizer.cs b/src/MarkItDownNet/Rasterizer.cs new file mode 100644 index 0000000..2fef2cf --- /dev/null +++ b/src/MarkItDownNet/Rasterizer.cs @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using System.IO; +using PDFtoImage; +using SkiaSharp; +using Tesseract; + +namespace MarkItDownNet; + +static class Rasterizer +{ + public static IEnumerable<(Pix pix, double angle, bool deskewed)> FromPdf(string path, MarkItDownOptions opt) + { + var ropt = new RenderOptions { Dpi = opt.OcrUserDpi }; + using var stream = File.OpenRead(path); + foreach (var bmp in Conversion.ToImages(stream, leaveOpen: false, password: null, ropt)) + { + yield return FromBitmap(bmp, opt); + } + } + + public static (Pix pix, double angle, bool deskewed) FromImage(string path, MarkItDownOptions opt) + { + using var src = Pix.LoadFromFile(path); + var xres = src.XRes; + if (xres <= 0) xres = opt.OcrUserDpi; + float scale = xres < 220 ? opt.OcrUserDpi / (float)xres : 1f; + Pix scaled = scale == 1f ? src.Clone() : src.Scale(scale, scale); + return Preprocess(scaled, opt); + } + + static (Pix pix, double angle, bool deskewed) FromBitmap(SKBitmap bmp, MarkItDownOptions opt) + { + using (bmp) + { + using var gray = bmp.Copy(SKColorType.Gray8); + using var img = SKImage.FromBitmap(gray); + using var data = img.Encode(SKEncodedImageFormat.Png, 100); + var pix = Pix.LoadFromMemory(data.ToArray()); + return Preprocess(pix, opt); + } + } + + static (Pix pix, double angle, bool deskewed) Preprocess(Pix pix, MarkItDownOptions opt) + { + if (opt.OcrSetDpiMetadata) + { + pix.XRes = opt.OcrUserDpi; + pix.YRes = opt.OcrUserDpi; + } + Pix work = pix.Depth == 8 ? pix : pix.ConvertTo8(0); + if (!ReferenceEquals(work, pix)) pix.Dispose(); + Pix result = work; + double angle = 0; + bool deskewed = false; + if (opt.OcrPreBinarize) + { + try + { + var bin = result.BinarizeOtsuAdaptiveThreshold(0, 0, 0, 0, 0); + if (!ReferenceEquals(bin, result)) + { + result.Dispose(); + result = bin; + } + } + catch { } + } + try + { + var desk = result.Deskew(out var skew); + angle = skew.Angle; + if (Math.Abs(angle) >= opt.OcrDeskewMinAngleDeg) + { + result.Dispose(); + result = desk; + deskewed = true; + } + else + { + desk.Dispose(); + } + } + catch { } + return (result, angle, deskewed); + } +} diff --git a/tools/OcrBench/Program.cs b/tools/OcrBench/Program.cs index be3f5e2..5f868f5 100644 --- a/tools/OcrBench/Program.cs +++ b/tools/OcrBench/Program.cs @@ -40,24 +40,44 @@ static void Extract(Dictionary o) var langs = o["--langs"]; var psm = o["--psm"]; var threads = o["--threads"]; - var python = o["--python-exe"]; + var python = o.GetValueOrDefault("--python-exe", "python3"); + var refresh = o.GetValueOrDefault("--refresh", "markitdownnet"); + + var refreshSet = new HashSet(refresh.Split(',', StringSplitOptions.RemoveEmptyEntries), StringComparer.OrdinalIgnoreCase); Environment.SetEnvironmentVariable("OMP_THREAD_LIMIT", threads); - if (Directory.Exists(outDir)) Directory.Delete(outDir, true); - Directory.CreateDirectory(Path.Combine(outDir, "markitdownnet")); - Directory.CreateDirectory(Path.Combine(outDir, "pytesseract")); + Directory.CreateDirectory(outDir); + if (refreshSet.Contains("markitdownnet")) + { + var md = Path.Combine(outDir, "markitdownnet"); + if (Directory.Exists(md)) Directory.Delete(md, true); + Directory.CreateDirectory(md); + } + if (refreshSet.Contains("pytesseract")) + { + var py = Path.Combine(outDir, "pytesseract"); + if (Directory.Exists(py)) Directory.Delete(py, true); + Directory.CreateDirectory(py); + } var options = new MarkItDownOptions { OcrLanguages = langs, OcrDataPath = "/usr/share/tesseract-ocr/5/tessdata", - PageSegMode = (Tesseract.PageSegMode)6, + OcrPsm = int.Parse(psm), + OcrOem = Tesseract.EngineMode.LstmOnly, + OcrThreads = int.Parse(threads), NormalizeMarkdown = false, DetectBulletLists = false, MergeLines = false, MinimumNativeWordThreshold = int.MaxValue, - PdfRasterDpi = 300 + OcrForceRaster = true, + OcrUserDpi = 300, + OcrPreBinarize = false, + OcrDeskewMinAngleDeg = 2.0, + OcrColorDepth = OcrColorDepth.Grayscale8bpp, + OcrSetDpiMetadata = true }; var converter = new MarkItDownConverter(options); @@ -75,35 +95,44 @@ static void Extract(Dictionary o) { var name = Path.GetFileNameWithoutExtension(file) + ".txt"; var rel = dataset + "/" + name; - var images = GetImages(file).ToList(); - - var sw = Stopwatch.StartNew(); - var textMark = OcrMark(converter, images); - sw.Stop(); - var tMark = sw.ElapsedMilliseconds; - var outMarkDir = Path.Combine(outDir, "markitdownnet", dataset); - Directory.CreateDirectory(outMarkDir); - File.WriteAllText(Path.Combine(outMarkDir, name), textMark); - totalMark += tMark; - Console.WriteLine($"{dataset} | {Path.GetFileName(file)} | markitdownnet | {tMark} ms"); - - sw.Restart(); - var textPy = OcrPy(images, python, langs, psm); - sw.Stop(); - var tPy = sw.ElapsedMilliseconds; - var outPyDir = Path.Combine(outDir, "pytesseract", dataset); - Directory.CreateDirectory(outPyDir); - File.WriteAllText(Path.Combine(outPyDir, name), textPy); - totalPy += tPy; - Console.WriteLine($"{dataset} | {Path.GetFileName(file)} | pytesseract | {tPy} ms"); - - timings[rel] = new Dictionary { { "markitdownnet", tMark }, { "pytesseract", tPy } }; + + if (refreshSet.Contains("markitdownnet")) + { + var sw = Stopwatch.StartNew(); + var textMark = OcrMark(converter, file, out var angle, out var desk); + sw.Stop(); + var tMark = sw.ElapsedMilliseconds; + var outMarkDir = Path.Combine(outDir, "markitdownnet", dataset); + Directory.CreateDirectory(outMarkDir); + File.WriteAllText(Path.Combine(outMarkDir, name), textMark); + totalMark += tMark; + var deskTxt = desk ? $"{angle:F2}°" : "skipped"; + Console.WriteLine($"{dataset}/{Path.GetFileName(file)} | DPI {options.OcrUserDpi} | depth {options.OcrColorDepth} | deskew {deskTxt} | PSM {options.OcrPsm} | OEM {options.OcrOem} | {tMark} ms"); + if (!timings.TryGetValue(rel, out var dict)) timings[rel] = dict = new(); + dict["markitdownnet"] = tMark; + } + + if (refreshSet.Contains("pytesseract")) + { + var images = GetImages(file).ToList(); + var sw = Stopwatch.StartNew(); + var textPy = OcrPy(images, python, langs, psm); + sw.Stop(); + var tPy = sw.ElapsedMilliseconds; + var outPyDir = Path.Combine(outDir, "pytesseract", dataset); + Directory.CreateDirectory(outPyDir); + File.WriteAllText(Path.Combine(outPyDir, name), textPy); + totalPy += tPy; + Console.WriteLine($"{dataset}/{Path.GetFileName(file)} | pytesseract | {tPy} ms"); + if (!timings.TryGetValue(rel, out var dict)) timings[rel] = dict = new(); + dict["pytesseract"] = tPy; + } } } File.WriteAllText(Path.Combine(outDir, "timings.json"), JsonSerializer.Serialize(timings, new JsonSerializerOptions { WriteIndented = true })); - Console.WriteLine($"TOTAL markitdownnet {totalMark} ms"); - Console.WriteLine($"TOTAL pytesseract {totalPy} ms"); + if (refreshSet.Contains("markitdownnet")) Console.WriteLine($"TOTAL markitdownnet {totalMark} ms"); + if (refreshSet.Contains("pytesseract")) Console.WriteLine($"TOTAL pytesseract {totalPy} ms"); } static IEnumerable GetImages(string path) @@ -129,15 +158,12 @@ static IEnumerable GetImages(string path) } } -static string OcrMark(MarkItDownConverter conv, IEnumerable images) +static string OcrMark(MarkItDownConverter conv, string file, out double angle, out bool desk) { - var sb = new StringBuilder(); - foreach (var img in images) - { - var res = conv.ConvertAsync(img, GetMime(img)).Result; - sb.AppendLine(res.Markdown.Trim()); - } - return sb.ToString().Trim(); + var res = conv.ConvertAsync(file, GetMime(file)).Result; + angle = conv.LastDeskewAngle; + desk = conv.LastDeskewApplied; + return res.Markdown.Trim(); } static string OcrPy(IEnumerable images, string py, string lang, string psm) @@ -205,6 +231,12 @@ static void Compare(Dictionary o) var (tp, tr, tf) = TokenScores(gt, hyp); var (lcRef, lcHyp, lf) = LineScores(gt, hyp); timings.TryGetValue(rel, out var t); + long tm = 0, tpyt = 0; + if (t != null) + { + t.TryGetValue("markitdownnet", out tm); + t.TryGetValue("pytesseract", out tpyt); + } files.Add(new FileMetrics { dataset = ds, @@ -216,8 +248,8 @@ static void Compare(Dictionary o) line_count_ref = lcRef, line_count_hyp = lcHyp, line_f1 = lf, - timing_markitdownnet = t?["markitdownnet"] ?? 0, - timing_pytesseract = t?["pytesseract"] ?? 0 + timing_markitdownnet = tm, + timing_pytesseract = tpyt }); } } @@ -237,6 +269,21 @@ static void Compare(Dictionary o) n_files = files.Count }; + var icdarSmoke = new HashSet { "cTDaR_t00014", "cTDaR_t00015", "cTDaR_t00016" }; + var pubSmoke = new HashSet { "PMC1064078_table_0", "PMC1064078_table_2", "PMC1064078_table_6" }; + static void CheckSmoke(List files, string ds, HashSet set) + { + var sub = files.Where(f => f.dataset == ds && set.Contains(f.file)).ToList(); + if (sub.Count == 0) return; + var tf = sub.Average(f => f.token_f1); + var lf = sub.Average(f => f.line_f1); + if (tf < 0.80 || lf < 0.50) + { + Console.Error.WriteLine($"Smoke test failed for {ds}: token_f1={tf:F2} line_f1={lf:F2}"); + Environment.Exit(1); + } + } + var runConfig = new Dictionary { ["os"] = RuntimeInformation.OSDescription.Trim(), @@ -281,6 +328,17 @@ static void Compare(Dictionary o) sb.AppendLine($"- {kv.Key}: {kv.Value}"); Directory.CreateDirectory(Path.GetDirectoryName(outMd)!); File.WriteAllText(outMd, sb.ToString()); + + foreach (var kv in byDataset) + Console.WriteLine($"{kv.Key} | token_F1 {kv.Value.token_f1_avg:F4} | line_F1 {kv.Value.line_f1_avg:F4}"); + Console.WriteLine($"Global | token_F1 {global.token_f1_avg:F4} | line_F1 {global.line_f1_avg:F4}"); + CheckSmoke(files, "ICDAR", icdarSmoke); + CheckSmoke(files, "PUBTABLES", pubSmoke); + if (global.token_f1_avg < 0.80 || global.line_f1_avg < 0.50) + { + Console.Error.WriteLine("Global metrics below threshold"); + Environment.Exit(1); + } } static string Normalize(string text)