diff --git a/artifacts/validation/OCR/bench-ocr.json b/artifacts/validation/OCR/bench-ocr.json index 9e619df..0dce991 100644 --- a/artifacts/validation/OCR/bench-ocr.json +++ b/artifacts/validation/OCR/bench-ocr.json @@ -14,29 +14,29 @@ "files": [ { "dataset": "FUNSD", - "file": "82250337_0338", - "cer_char": 0.011834319526627219, + "file": "82251504", + "cer_char": 0.015776699029126214, "token_precision": 1, "token_recall": 1, "token_f1": 1, - "line_count_ref": 27, - "line_count_hyp": 27, + "line_count_ref": 28, + "line_count_hyp": 28, "line_f1": 1, - "timing_markitdownnet": 511, - "timing_pytesseract": 1845 + "timing_markitdownnet": 605, + "timing_pytesseract": 2042 }, { "dataset": "FUNSD", - "file": "82200067_0069", - "cer_char": 0.020602218700475437, + "file": "82250337_0338", + "cer_char": 0.011834319526627219, "token_precision": 1, "token_recall": 1, "token_f1": 1, - "line_count_ref": 28, - "line_count_hyp": 28, + "line_count_ref": 27, + "line_count_hyp": 27, "line_f1": 1, - "timing_markitdownnet": 917, - "timing_pytesseract": 2055 + "timing_markitdownnet": 516, + "timing_pytesseract": 1845 }, { "dataset": "FUNSD", @@ -48,73 +48,21 @@ "line_count_ref": 32, "line_count_hyp": 32, "line_f1": 1, - "timing_markitdownnet": 681, + "timing_markitdownnet": 848, "timing_pytesseract": 1963 }, { "dataset": "FUNSD", - "file": "82251504", - "cer_char": 0.015776699029126214, + "file": "82200067_0069", + "cer_char": 0.020602218700475437, "token_precision": 1, "token_recall": 1, "token_f1": 1, "line_count_ref": 28, "line_count_hyp": 28, "line_f1": 1, - "timing_markitdownnet": 680, - "timing_pytesseract": 2042 - }, - { - "dataset": "SROIE2019", - "file": "X00016469670", - "cer_char": 0.09898107714701601, - "token_precision": 0.8468468468468469, - "token_recall": 0.8103448275862069, - "token_f1": 0.8281938325991189, - "line_count_ref": 29, - "line_count_hyp": 28, - "line_f1": 0.6315789473684211, - "timing_markitdownnet": 464, - "timing_pytesseract": 1928 - }, - { - "dataset": "SROIE2019", - "file": "X00016469671", - "cer_char": 0.0802675585284281, - "token_precision": 0.83, - "token_recall": 0.8645833333333334, - "token_f1": 0.8469387755102041, - "line_count_ref": 26, - "line_count_hyp": 26, - "line_f1": 0.7307692307692306, - "timing_markitdownnet": 408, - "timing_pytesseract": 1755 - }, - { - "dataset": "SROIE2019", - "file": "X51005230605", - "cer_char": 0.03929273084479371, - "token_precision": 0.9574468085106383, - "token_recall": 0.967741935483871, - "token_f1": 0.9625668449197862, - "line_count_ref": 25, - "line_count_hyp": 25, - "line_f1": 0.92, - "timing_markitdownnet": 458, - "timing_pytesseract": 1849 - }, - { - "dataset": "SROIE2019", - "file": "X51005200931", - "cer_char": 0.03384279475982533, - "token_precision": 0.891566265060241, - "token_recall": 0.8862275449101796, - "token_f1": 0.888888888888889, - "line_count_ref": 41, - "line_count_hyp": 41, - "line_f1": 0.6585365853658537, - "timing_markitdownnet": 642, - "timing_pytesseract": 2056 + "timing_markitdownnet": 641, + "timing_pytesseract": 2055 }, { "dataset": "ICDAR", @@ -126,7 +74,7 @@ "line_count_ref": 51, "line_count_hyp": 44, "line_f1": 0, - "timing_markitdownnet": 3818, + "timing_markitdownnet": 3644, "timing_pytesseract": 5629 }, { @@ -139,22 +87,9 @@ "line_count_ref": 40, "line_count_hyp": 36, "line_f1": 0, - "timing_markitdownnet": 2935, + "timing_markitdownnet": 3278, "timing_pytesseract": 4447 }, - { - "dataset": "ICDAR", - "file": "cTDaR_t00080", - "cer_char": 0.7515723270440252, - "token_precision": 0.3641732283464567, - "token_recall": 0.3798767967145791, - "token_f1": 0.37185929648241206, - "line_count_ref": 50, - "line_count_hyp": 48, - "line_f1": 0, - "timing_markitdownnet": 3762, - "timing_pytesseract": 5772 - }, { "dataset": "ICDAR", "file": "cTDaR_t00015", @@ -165,21 +100,73 @@ "line_count_ref": 36, "line_count_hyp": 36, "line_f1": 0, - "timing_markitdownnet": 3387, + "timing_markitdownnet": 3335, "timing_pytesseract": 4978 }, + { + "dataset": "ICDAR", + "file": "cTDaR_t00080", + "cer_char": 0.7647798742138365, + "token_precision": 0.3570057581573896, + "token_recall": 0.38193018480492813, + "token_f1": 0.36904761904761907, + "line_count_ref": 50, + "line_count_hyp": 48, + "line_f1": 0, + "timing_markitdownnet": 2179, + "timing_pytesseract": 5772 + }, { "dataset": "PUBTABLES", - "file": "PMC1064078_table_0", - "cer_char": 0.03770739064856712, - "token_precision": 0.8282828282828283, - "token_recall": 0.8367346938775511, - "token_f1": 0.8324873096446701, - "line_count_ref": 10, - "line_count_hyp": 10, - "line_f1": 0.10000000000000002, - "timing_markitdownnet": 350, - "timing_pytesseract": 1813 + "file": "PMC1064078_table_5", + "cer_char": 0.16020025031289112, + "token_precision": 0.6770186335403726, + "token_recall": 0.6646341463414634, + "token_f1": 0.6707692307692308, + "line_count_ref": 16, + "line_count_hyp": 15, + "line_f1": 0.06451612903225808, + "timing_markitdownnet": 512, + "timing_pytesseract": 2071 + }, + { + "dataset": "PUBTABLES", + "file": "PMC1064082_table_1", + "cer_char": 0.06351183063511831, + "token_precision": 0.7819548872180451, + "token_recall": 0.7938931297709924, + "token_f1": 0.7878787878787878, + "line_count_ref": 26, + "line_count_hyp": 26, + "line_f1": 0.3076923076923077, + "timing_markitdownnet": 346, + "timing_pytesseract": 1721 + }, + { + "dataset": "PUBTABLES", + "file": "PMC1064078_table_6", + "cer_char": 0.3125, + "token_precision": 0.47058823529411764, + "token_recall": 0.4528301886792453, + "token_f1": 0.4615384615384615, + "line_count_ref": 7, + "line_count_hyp": 7, + "line_f1": 0.14285714285714285, + "timing_markitdownnet": 261, + "timing_pytesseract": 1864 + }, + { + "dataset": "PUBTABLES", + "file": "PMC1064082_table_0", + "cer_char": 0.6911764705882353, + "token_precision": 0.3076923076923077, + "token_recall": 0.14814814814814814, + "token_f1": 0.2, + "line_count_ref": 5, + "line_count_hyp": 3, + "line_f1": 0, + "timing_markitdownnet": 99, + "timing_pytesseract": 1539 }, { "dataset": "PUBTABLES", @@ -191,7 +178,7 @@ "line_count_ref": 14, "line_count_hyp": 13, "line_f1": 0, - "timing_markitdownnet": 331, + "timing_markitdownnet": 325, "timing_pytesseract": 1844 }, { @@ -204,34 +191,21 @@ "line_count_ref": 18, "line_count_hyp": 18, "line_f1": 0.05555555555555555, - "timing_markitdownnet": 390, + "timing_markitdownnet": 401, "timing_pytesseract": 1791 }, { "dataset": "PUBTABLES", - "file": "PMC1064078_table_5", - "cer_char": 0.16020025031289112, - "token_precision": 0.6770186335403726, - "token_recall": 0.6646341463414634, - "token_f1": 0.6707692307692308, - "line_count_ref": 16, - "line_count_hyp": 15, - "line_f1": 0.06451612903225808, - "timing_markitdownnet": 522, - "timing_pytesseract": 2071 - }, - { - "dataset": "PUBTABLES", - "file": "PMC1064082_table_0", - "cer_char": 0.6911764705882353, - "token_precision": 0.3076923076923077, - "token_recall": 0.14814814814814814, - "token_f1": 0.2, - "line_count_ref": 5, - "line_count_hyp": 3, - "line_f1": 0, - "timing_markitdownnet": 116, - "timing_pytesseract": 1539 + "file": "PMC1064078_table_0", + "cer_char": 0.03770739064856712, + "token_precision": 0.8282828282828283, + "token_recall": 0.8367346938775511, + "token_f1": 0.8324873096446701, + "line_count_ref": 10, + "line_count_hyp": 10, + "line_f1": 0.10000000000000002, + "timing_markitdownnet": 309, + "timing_pytesseract": 1813 }, { "dataset": "PUBTABLES", @@ -243,34 +217,34 @@ "line_count_ref": 20, "line_count_hyp": 21, "line_f1": 0.04878048780487805, - "timing_markitdownnet": 489, + "timing_markitdownnet": 421, "timing_pytesseract": 1842 }, { - "dataset": "PUBTABLES", - "file": "PMC1064078_table_6", - "cer_char": 0.3125, - "token_precision": 0.47058823529411764, - "token_recall": 0.4528301886792453, - "token_f1": 0.4615384615384615, - "line_count_ref": 7, - "line_count_hyp": 7, - "line_f1": 0.14285714285714285, - "timing_markitdownnet": 304, - "timing_pytesseract": 1864 + "dataset": "MARMOT", + "file": "10.1.1.1.2014_4", + "cer_char": 0.11923556294142086, + "token_precision": 0.8087557603686636, + "token_recall": 0.8013698630136986, + "token_f1": 0.805045871559633, + "line_count_ref": 37, + "line_count_hyp": 36, + "line_f1": 0.273972602739726, + "timing_markitdownnet": 924, + "timing_pytesseract": 2616 }, { - "dataset": "PUBTABLES", - "file": "PMC1064082_table_1", - "cer_char": 0.06351183063511831, - "token_precision": 0.7819548872180451, - "token_recall": 0.7938931297709924, - "token_f1": 0.7878787878787878, - "line_count_ref": 26, - "line_count_hyp": 26, - "line_f1": 0.3076923076923077, - "timing_markitdownnet": 385, - "timing_pytesseract": 1721 + "dataset": "MARMOT", + "file": "10.1.1.1.2013_64", + "cer_char": 0.11729323308270677, + "token_precision": 0.7592592592592593, + "token_recall": 0.7522935779816514, + "token_f1": 0.7557603686635944, + "line_count_ref": 21, + "line_count_hyp": 21, + "line_f1": 0.47619047619047616, + "timing_markitdownnet": 560, + "timing_pytesseract": 1912 }, { "dataset": "MARMOT", @@ -282,66 +256,86 @@ "line_count_ref": 38, "line_count_hyp": 38, "line_f1": 0.5789473684210527, - "timing_markitdownnet": 3631, + "timing_markitdownnet": 631, "timing_pytesseract": 2309 }, - { - "dataset": "MARMOT", - "file": "10.1.1.1.2013_64", - "cer_char": 0.11729323308270677, - "token_precision": 0.7592592592592593, - "token_recall": 0.7522935779816514, - "token_f1": 0.7557603686635944, - "line_count_ref": 21, - "line_count_hyp": 21, - "line_f1": 0.47619047619047616, - "timing_markitdownnet": 626, - "timing_pytesseract": 1912 - }, { "dataset": "MARMOT", "file": "10.1.1.1.2006_3", - "cer_char": 0.03347107438016529, - "token_precision": 0.8811013767209012, - "token_recall": 0.8833124215809285, - "token_f1": 0.8822055137844611, + "cer_char": 0.04483471074380165, + "token_precision": 0.8641975308641975, + "token_recall": 0.8782936010037641, + "token_f1": 0.8711885500933415, "line_count_ref": 60, - "line_count_hyp": 60, - "line_f1": 0.31666666666666665, - "timing_markitdownnet": 8684, + "line_count_hyp": 61, + "line_f1": 0.28099173553719003, + "timing_markitdownnet": 1561, "timing_pytesseract": 3309 }, { - "dataset": "MARMOT", - "file": "10.1.1.1.2014_4", - "cer_char": 0.11923556294142086, - "token_precision": 0.8087557603686636, - "token_recall": 0.8013698630136986, - "token_f1": 0.805045871559633, - "line_count_ref": 37, - "line_count_hyp": 36, - "line_f1": 0.273972602739726, - "timing_markitdownnet": 991, - "timing_pytesseract": 2616 + "dataset": "SROIE2019", + "file": "X51005200931", + "cer_char": 0.03384279475982533, + "token_precision": 0.891566265060241, + "token_recall": 0.8862275449101796, + "token_f1": 0.888888888888889, + "line_count_ref": 41, + "line_count_hyp": 41, + "line_f1": 0.6585365853658537, + "timing_markitdownnet": 524, + "timing_pytesseract": 2056 + }, + { + "dataset": "SROIE2019", + "file": "X00016469671", + "cer_char": 0.18394648829431437, + "token_precision": 0.7227722772277227, + "token_recall": 0.7604166666666666, + "token_f1": 0.7411167512690354, + "line_count_ref": 26, + "line_count_hyp": 26, + "line_f1": 0.6153846153846154, + "timing_markitdownnet": 520, + "timing_pytesseract": 1755 + }, + { + "dataset": "SROIE2019", + "file": "X51005230605", + "cer_char": 0.023575638506876228, + "token_precision": 0.978494623655914, + "token_recall": 0.978494623655914, + "token_f1": 0.978494623655914, + "line_count_ref": 25, + "line_count_hyp": 25, + "line_f1": 0.96, + "timing_markitdownnet": 391, + "timing_pytesseract": 1849 + }, + { + "dataset": "SROIE2019", + "file": "X00016469670", + "cer_char": 0.1848617176128093, + "token_precision": 0.7350427350427351, + "token_recall": 0.7413793103448276, + "token_f1": 0.7381974248927039, + "line_count_ref": 29, + "line_count_hyp": 30, + "line_f1": 0.5423728813559322, + "timing_markitdownnet": 589, + "timing_pytesseract": 1928 } ], "aggregate": { "by_dataset": { "FUNSD": { - "cer_avg": 0.014752644862229975, + "cer_avg": 0.014752644862229974, "token_f1_avg": 1, "line_f1_avg": 1, "n_files": 4 }, - "SROIE2019": { - "cer_avg": 0.06309604032001578, - "token_f1_avg": 0.8816470854794994, - "line_f1_avg": 0.7352211908758763, - "n_files": 4 - }, "ICDAR": { - "cer_avg": 0.7248124473780835, - "token_f1_avg": 0.36180094739698715, + "cer_avg": 0.7281143341705363, + "token_f1_avg": 0.3610980280382889, "line_f1_avg": 0, "n_files": 4 }, @@ -352,16 +346,22 @@ "n_files": 8 }, "MARMOT": { - "cer_avg": 0.07783383087611297, - "token_f1_avg": 0.8346188162139537, - "line_f1_avg": 0.4114442785044804, + "cer_avg": 0.08067473996702205, + "token_f1_avg": 0.8318645752911739, + "line_f1_avg": 0.4025255457221112, + "n_files": 4 + }, + "SROIE2019": { + "cer_avg": 0.1065566597934563, + "token_f1_avg": 0.8366744221766356, + "line_f1_avg": 0.6940735205266003, "n_files": 4 } }, "global": { - "cer_avg": 0.23638613783393658, - "token_f1_avg": 0.7035058050110908, - "line_f1_avg": 0.3877526458526486, + "cer_avg": 0.24465337372673693, + "token_f1_avg": 0.6954341677470338, + "line_f1_avg": 0.3794082453307079, "n_files": 24 } } diff --git a/artifacts/validation/OCR/summary-ocr.md b/artifacts/validation/OCR/summary-ocr.md index c42a76e..b69e86d 100644 --- a/artifacts/validation/OCR/summary-ocr.md +++ b/artifacts/validation/OCR/summary-ocr.md @@ -2,20 +2,20 @@ ## Global | scope | CER | Token-F1 | line_F1 | n_files | -| Global | 0.2364 | 0.7035 | 0.3878 | 24 | +| Global | 0.2447 | 0.6954 | 0.3794 | 24 | ## By dataset | scope | CER | Token-F1 | line_F1 | n_files | | FUNSD | 0.0148 | 1.0000 | 1.0000 | 4 | -| SROIE2019 | 0.0631 | 0.8816 | 0.7352 | 4 | -| ICDAR | 0.7248 | 0.3618 | 0.0000 | 4 | +| ICDAR | 0.7281 | 0.3611 | 0.0000 | 4 | | PUBTABLES | 0.2689 | 0.5715 | 0.0899 | 8 | -| MARMOT | 0.0778 | 0.8346 | 0.4114 | 4 | +| MARMOT | 0.0807 | 0.8319 | 0.4025 | 4 | +| SROIE2019 | 0.1066 | 0.8367 | 0.6941 | 4 | ## Top-5 worst files | dataset/file | cer_char | token_f1 | line_f1 | note | | ICDAR/cTDaR_t00015 | 0.7768 | 0.3080 | 0.0000 | | -| ICDAR/cTDaR_t00080 | 0.7516 | 0.3719 | 0.0000 | | +| ICDAR/cTDaR_t00080 | 0.7648 | 0.3690 | 0.0000 | | | ICDAR/cTDaR_t00016 | 0.7078 | 0.4268 | 0.0000 | | | PUBTABLES/PMC1064082_table_0 | 0.6912 | 0.2000 | 0.0000 | | | ICDAR/cTDaR_t00014 | 0.6631 | 0.3406 | 0.0000 | | diff --git a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt index 19d11c4..f1bdb7c 100644 --- a/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt +++ b/dataset/validation/_ocr/markitdownnet/ICDAR/cTDaR_t00080.txt @@ -1,50 +1,52 @@ -Ba otr alorrire, mn Cathe L ares ; Fg 4 es Page Le - -LE as 3 SORE ST Sera Syprr stack aite Heya, -= oe ee 9 iy A ah ; , 2A: -fy |Peoee 2 7 : : rinse a -Too i en = base GLE, : / -pth Aye — ssf -eae ON I % ia. eee -nfs] sp ak f- OY te OO, ee MPEP — —— a see. -ye WO en |e eaten | faa! ae 1 Tie -om GA f i oe ae ) i ' Bas 7 -AU | v AYN tL LIP ru oY 2 spl 2m g 7 pe Sofi Hob ’ -hod} mins i/ Bf. V4 Gdns ie M Al ff // : -# | Lietigtia a. é wr di iS Luh if / J <7} . ff vy 4 a -five | Lu Tap, Lom ling Mlseelors thn ym ) Faas H -na PEW tan | Peep lil, » eae -Te os > 44 ref 2 » WF Fe ile ae ; ( ots e A q i -0 seal Dp on Pkeanaby Sai. | -5 eat ey IE ADR | WY, 24 . @ -: : [= | * a . oe BE bd sar jee v pod A a -be Xh > f Oy r VLnMNALBY wr REEL. “le a -| Gogh. Wy Pre! AT Ae anv pam -ay 4 prov. heb a Yow kath s ge fv br : 7 U/ zs : ;, -cl Lv" | Aix wher. _ 4 sey of ) | | — -oes Pi 8 de oe ee —— 4a -Vig ee by’ 4 pages ES oat | iB SS : : -tc 9 9t AS Z it padlox voa- fire tin soe a Wy eh -TEBE : eae a | -ee 4 a re hy is Z bp if A. Di i -Fees at = ade Let ooh flor ft |B ae -f { f j Jie f- . e 4 -ipcion aig eter | ae ae . £L,. ey -4 sub. by. “LYS Px : Lt 7 : : ae : i -re ae eee) >, -| ee acs Pflomnn heed yt nto ae i — - a ‘4 -ut 6 Us 4 Z Py 4 3 ef -YP hi oy , bey sit a “Ay s6Uy Sack ase: & a | -eae ter flay ay 2be / A -De nh ; 90 —— ! a -fa Hafli Rabat Bint SyhegYouyinem Be isto? @ -4 SES eo, ale 2 bm «fe 7 ee, .. i +Si! eee Se ee +cig 77zZ Cache Ja 2a rocaala Igy ie hae by > A387. ; +5 ayy : al > 7 < . -ff : Ree. -b2 1G. a Aid v7 = Me -i eithe Aero ee | oe -if La 3° VA , of " : J OH 3 Rese -fe te pee [tig | ~ a Aha AP IF7 : y PLD) sa cf y ’ ¥ Bo gh Z P PES. -| spilt Bisping nt) | oe -Za ak ae Fas 4 FA EB “ age Lb, : ane ee oe =e +soy iene a Ny AOP i +Tuy é eee | ope Ke . SiN aay oy i +Tie 0 ‘ one s Ai : Pua¢t7sz2t , * g thy ad —<— FF +i. es 4 fi : BPCLLS q +rin Pose f | . 7 5 ce eee ome oe +Pov". aunaber ss A Gls Moa pi pice TN CHa fanf Vit. ee > oe +Uhm Bry. eas On / 7, é : ry Ss oe haisdd 4 , f SU rine Gy § +J Wi | v ALY sess ALAS 14 esasdit act gay, nl $f i} Y) fe ‘ gp | : ; +{wd | eh Es / il ping “4 VA a rarer / h| '/ é /, 5 i f +mas TiO» Ie Ce. , 7 — ‘it —~Mlawbei. than yr ) a i ' +its pil csobetip bs, é par Arig Balbo 4’ | VG Baa) | +tay { ene —— A Serssta ot ppg ya a ae i - VA y &* ‘. +pvr. / 7 Urng~ if ay i +eas By oe TE PW GY yo . +if lon MAMA PAO. WA ‘ t, 24 fr a +Fie Porshe y gilyorud~ a. +IG BN eee eee He +bicNh > (Og {$+ W aap haee . A 1 = +LW ~ | 8: sess / bi Ba Mbt Dorn a! } “6 . Ts EBE AS Py dt i +/; | PO 7 wh o/s oe f fh) Pf, y 7 } fh i | +66 Bon i, Whine eg) prey A Tle +Us Sane io 0 a Sa ne ce a = +ee PANG 4 iil ee ee +fee on ULROS . | tind. j i oe Hi 2 | +fee 9h a LN testy p “2 ae, . oA st ieee en q +la Ii f ees gh ost a : (Poor 4 tl ; +be 4 sg i) * y he. : s - ae -poe col (eee xe ee Cy ag x ae es \ No newline at end of file +EH bef. rare fbdais fF? he > an +eee LOO ff i| +F on Mason | Ghar ghee Wn gle | | Diy og I + +pit Tied pele ue isto +pas Mave PT I pat on~ Gi BE a | +er ee Bee ree 9, Mg a +Feet ne AL. fed G8 fo + +oe af 4 +{ ne AF, ae : : E | +$= Gollah Phase) Slaw 8) F, 7 : | +tls pe §> Cof Oh 9.8 77 fj Vis A Pr ae ‘i be oe x +fe gs 1 aye ae? a 4 / ‘ © 4 : A (7 ; Ps of 58 are ; Sy 1 +be, qd. We Hla i” by (Oe ) ike +: : th cnl GRA Pe pe «ft {/. » See +MS yg Set OS ae) a a +D: Ritcarlefer fod en MR ne +* hs gee flor tae YU Cc ae eens MORSE 5 Nope eg +Eee See co ee ae + A eee Se eee Se eee ee Sei i \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt index 6cfe12e..d13ffc4 100644 --- a/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt +++ b/dataset/validation/_ocr/markitdownnet/MARMOT/10.1.1.1.2006_3.txt @@ -1,17 +1,17 @@ 09 these consume 0,99 CPU. That is, task3- misses -deadlines and the inverted pendulum! falls down, This -an explains the fact that the cost function in Figure 2 goes +a isaaise deadlines and the inverted pendulum! falls down, This +og EF c explains the fact thatthe cost function in Figure 2 goes . to infinite. Note that under RM, the task is not Bod schedulable -a. + EDP: EDF is « dynamic alorihm in open bop -Which assigns priorities to tasks according to their - +i + EDP: EDF is « dynamic alorihm in open bop +§ Which assigns priorities to tasks according to their +i, Beavis dedine, Une BG’ holler te schedulabiity condition is given by U = 1. For our -wg simulations, since U <1, the task set is schedulable +4 simulations, since U <1, the task set is schedulable and the three pendulums can be controlled as it can be A seen in Figure 2: the accumulated cost reaches a finite -a a oe a! value, which means that the deviation caused by each -ii Ail sal aed perturbation thet affected each of the three pendulums +a a oe i value, which means that the deviation caused by each +sig cic al ad perturbation thet affected each of the three pendulums could be adequately comected. ‘The performance with the difference in performance due 10 the we of sehieved by EDF is also given in Figure 2 in tems of different scheduling polices, which is the objective of our tie coat BocHonsreadting a valus oF W2754 at tho diff --git a/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469670.txt b/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469670.txt index 23225d1..145b277 100644 --- a/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469670.txt +++ b/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469670.txt @@ -1,37 +1,38 @@ tan chay yee +ay y -88 COPY **# +*** COPY *** OJC MARKETING SDN BHD ROC NO: 538358-H NO 2 & 4, JALAN BAYU 4, BANDAR SERI ALAM, 81750 MASAI, JOHOR Tel:07-388 2218 Fax:07-388 8218 -Email: ng@ojegroup.com +Email: ng@ojcgroup.com TAX INVOICE -"jfvelee No” peGAOSoIEs -Date + 15/01/2019 11:05:16 AM +“Ynvoice No: PEGIV-1030765..~~CS~S~S +Date : 15/01/2019 11:05:16 AM Cashier : NG CHUAN MIN Sales Persor : FATIN Bill To : THE PEAK QUARRY WORKS -Address bs +Address es -Description. Oty. Price Amount -000000111 1 193,00 193.00 SR +_Description Qty. Price Amount +000000111 1 193.00 193.00 SR KINGS SAFETY SHOES KWD 805 -“OT Fetal Beclide tT T9560 +“@ty:1 ‘Total Exclude GST: ~—«193.00 Total GST @6%: 0.00 Total Inclusive GST: 193.00 -sippeaciaa Round Amtt 550.00... +sige ROUNE AMES 8-00 .. TOTAL: 193,00 - -9000000000004 31.8 -Approval Code:000 VJ a +camcacuecniae (Te +100000000004 3:18 (iu. +Approval Code:000 4 a) Goods Sold Are Not Returnable & Refundable -***4Thank You. Please Come Again.**** \ No newline at end of file +*+*+Thank You. Please Come Again.**** \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469671.txt b/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469671.txt index 4f6b624..f9a5838 100644 --- a/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469671.txt +++ b/dataset/validation/_ocr/markitdownnet/SROIE2019/X00016469671.txt @@ -11,23 +11,24 @@ Email: ng@ojegroup.com Cash Bill -“invoice No" PEGI 03085 +““Tavoice No”: PEGIV-1030531 =~: Date 02/01/2019 2:47:14 PM Cashier : RHYS TAN Sales Persor : FATIN -2 Deseription | Qty. Price Amount -000000111 1 170,00 = 170.00 -KINGS SAFETY ‘POO4 -SHOES KWD 805 +eOROGR, RY Fe Arne +000000111 1 170,00 170.00 +KINGS SAFETY Pood +SHOES KWO 805 -“Gey T Total Tem Biscount! G60 +Qty’ 1 Total Item Discount! =i.” Total Amount: 170.00 -cecseeseneeeeen ROUNd Amt. 0.00 +slags causes OO OD TOTAL: 170.00 VISA CARD 170.00 -2090000000043 18 -Approval Code:123 I30 U0 + +XXX KK KKK KK4 318 +Approval Code: 123 ey) U0 Goods Sold Are Not Returnable & Refundable -“Thank You. Please Come Again.**** \ No newline at end of file +****Thank You. Please Come Again.**** \ No newline at end of file diff --git a/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt b/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt index 844908e..4a00d8c 100644 --- a/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt +++ b/dataset/validation/_ocr/markitdownnet/SROIE2019/X51005230605.txt @@ -1,7 +1,7 @@ PETRON BKT LANJAN SB ALSERKAM ENTERPRISE Tel: 03-6156 8757 Co No: 001083069-M -KM 458.4 BKT LANJAN UTARA, +KM 456.4 BKT LANJAN UTARA, L/RAYA UTARA SELATAN,SG BULOH 47000 SUNGAI BUL @@ -32,4 +32,4 @@ Use 3000 Petron Miles points to pay for RM45 Fuel -* Boo bia aes H \ No newline at end of file +* F eee H \ No newline at end of file diff --git a/src/MarkItDownNet/MarkItDownConverter.cs b/src/MarkItDownNet/MarkItDownConverter.cs index 011cb2a..e13c758 100644 --- a/src/MarkItDownNet/MarkItDownConverter.cs +++ b/src/MarkItDownNet/MarkItDownConverter.cs @@ -84,15 +84,15 @@ private MarkItDownResult ProcessPdf(string path, CancellationToken ct) } } - // If there are not enough words, fall back to OCR - if (words.Count < _options.MinimumNativeWordThreshold) + // If forced or there are not enough words, fall back to OCR + if (_options.OcrForceRaster || words.Count < _options.MinimumNativeWordThreshold) { _logger.Information("Native text too small ({Count}), attempting OCR fallback", words.Count); return ProcessPdfWithOcr(path, ct); } var markdown = BuildMarkdown(lines); - return new MarkItDownResult(markdown, pages, lines, words); + return new MarkItDownResult(markdown, pages, lines, words, null); } private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) @@ -100,9 +100,10 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) var pages = new List(); var lines = new List(); var words = new List(); + OcrMetadata? meta = null; // Rasterize PDF into images using PDFtoImage - var renderOptions = new RenderOptions { Dpi = _options.PdfRasterDpi }; + var renderOptions = new RenderOptions { Dpi = _options.OcrUserDpi }; using var stream = File.OpenRead(path); foreach (var bitmap in Conversion.ToImages(stream, leaveOpen: false, password: null, renderOptions)) { @@ -110,26 +111,27 @@ private MarkItDownResult ProcessPdfWithOcr(string path, CancellationToken ct) using (bitmap) { pages.Add(new Page(pages.Count + 1, bitmap.Width, bitmap.Height)); - using var image = SKImage.FromBitmap(bitmap); - using var data = image.Encode(SKEncodedImageFormat.Png, 100); - using var pix = Pix.LoadFromMemory(data.ToArray()); - var result = ProcessPix(pix, pages.Count, ct); + var prep = PreparePix(bitmap); + meta ??= prep.meta; + var result = ProcessPix(prep.pix, pages.Count, ct); + prep.pix.Dispose(); lines.AddRange(result.lines); words.AddRange(result.words); } } var markdown = BuildMarkdown(lines); - return new MarkItDownResult(markdown, pages, lines, words); + return new MarkItDownResult(markdown, pages, lines, words, meta); } private MarkItDownResult ProcessImage(string path, CancellationToken ct) { - using var pix = Pix.LoadFromFile(path); - var (lines, words) = ProcessPix(pix, 1, ct); - var pages = new List { new Page(1, pix.Width, pix.Height) }; + var prep = PreparePix(path); + var pages = new List { new Page(1, prep.pix.Width, prep.pix.Height) }; + var (lines, words) = ProcessPix(prep.pix, 1, ct); + prep.pix.Dispose(); var markdown = BuildMarkdown(lines); - return new MarkItDownResult(markdown, pages, lines, words); + return new MarkItDownResult(markdown, pages, lines, words, prep.meta); } private (List lines, List words) ProcessPix(Pix pix, int pageNumber, CancellationToken ct) @@ -139,8 +141,9 @@ private MarkItDownResult ProcessImage(string path, CancellationToken ct) using var engine = new TesseractEngine( _options.OcrDataPath ?? string.Empty, _options.OcrLanguages, - EngineMode.LstmOnly); - engine.DefaultPageSegMode = _options.PageSegMode; + _options.OcrOem); + engine.DefaultPageSegMode = (PageSegMode)_options.OcrPsm; + engine.SetVariable("OMP_THREAD_LIMIT", _options.OcrThreads.ToString()); using var page = engine.Process(pix); using var iter = page.GetIterator(); iter.Begin(); @@ -171,6 +174,70 @@ private MarkItDownResult ProcessImage(string path, CancellationToken ct) return (lines, words); } + private (Pix pix, OcrMetadata meta) PreparePix(string path) + { + var src = Pix.LoadFromFile(path); + return PreparePix(src); + } + + private (Pix pix, OcrMetadata meta) PreparePix(SKBitmap bitmap) + { + using var image = SKImage.FromBitmap(bitmap); + using var data = image.Encode(SKEncodedImageFormat.Png, 100); + var pix = Pix.LoadFromMemory(data.ToArray()); + return PreparePix(pix); + } + + private (Pix pix, OcrMetadata meta) PreparePix(Pix pix) + { + var working = pix; + + if (working.Depth == 32) + { + var gray = working.ConvertRGBToGray(); + working.Dispose(); + working = gray; + } + else if (working.Depth != 8) + { + var eight = working.ConvertTo8(0); + working.Dispose(); + working = eight; + } + + var dpi = working.XRes > 0 ? working.XRes : _options.OcrUserDpi; + if (dpi < 220) + { + var scale = (float)_options.OcrUserDpi / dpi; + var scaled = working.Scale(scale, scale); + working.Dispose(); + working = scaled; + dpi = _options.OcrUserDpi; + } + + if (_options.OcrSetDpiMetadata) + { + working.XRes = _options.OcrUserDpi; + working.YRes = _options.OcrUserDpi; + } + + double? angle = null; + var deskewed = working.Deskew(out var scew); + if (Math.Abs(scew.Angle) >= _options.OcrDeskewMinAngleDeg) + { + working.Dispose(); + working = deskewed; + angle = scew.Angle; + } + else + { + deskewed.Dispose(); + } + + var meta = new OcrMetadata(dpi, _options.OcrColorDepth, angle, _options.OcrPsm, _options.OcrOem); + return (working, meta); + } + private static BoundingBox Normalize(Rect rect, int width, int height) { return new BoundingBox((double)rect.X1 / width, (double)rect.Y1 / height, (double)rect.Width / width, (double)rect.Height / height); diff --git a/src/MarkItDownNet/MarkItDownOptions.cs b/src/MarkItDownNet/MarkItDownOptions.cs index e1b86d7..048f6fc 100644 --- a/src/MarkItDownNet/MarkItDownOptions.cs +++ b/src/MarkItDownNet/MarkItDownOptions.cs @@ -11,11 +11,32 @@ public class MarkItDownOptions /// Languages for OCR, e.g. "eng" or "ita+eng". public string OcrLanguages { get; set; } = "eng"; + /// DPI hint for user-provided images. + public int OcrUserDpi { get; set; } = 300; + /// Page segmentation mode used by Tesseract. - public PageSegMode PageSegMode { get; set; } = PageSegMode.SingleBlock; + public int OcrPsm { get; set; } = 6; + + /// OCR Engine mode (OEM). + public EngineMode OcrOem { get; set; } = EngineMode.LstmOnly; + + /// Maximum number of OCR threads. + public int OcrThreads { get; set; } = 1; + + /// Force rasterization of PDFs for OCR. + public bool OcrForceRaster { get; set; } = true; + + /// Pre-binarize images before OCR. + public bool OcrPreBinarize { get; set; } = false; - /// DPI used when rasterizing PDFs for OCR fallback. - public int PdfRasterDpi { get; set; } = 300; + /// Minimum absolute angle (deg) to trigger deskew. + public double OcrDeskewMinAngleDeg { get; set; } = 2.0; + + /// Preferred color depth for OCR input. + public OcrColorDepth OcrColorDepth { get; set; } = OcrColorDepth.Grayscale8bpp; + + /// Set DPI metadata on images before OCR. + public bool OcrSetDpiMetadata { get; set; } = true; /// Minimum number of native words required before falling back to OCR. public int MinimumNativeWordThreshold { get; set; } = 1; @@ -39,3 +60,10 @@ public class MarkItDownOptions /// public bool MergeLines { get; set; } = true; } + +/// Supported color depths for OCR input. +public enum OcrColorDepth +{ + Grayscale8bpp, + Color32bpp +} diff --git a/src/MarkItDownNet/Models.cs b/src/MarkItDownNet/Models.cs index 0c75ac7..aa4555c 100644 --- a/src/MarkItDownNet/Models.cs +++ b/src/MarkItDownNet/Models.cs @@ -1,3 +1,5 @@ +using Tesseract; + namespace MarkItDownNet; /// Normalized bounding box. @@ -28,4 +30,8 @@ public record MarkItDownResult( string Markdown, IReadOnlyList Pages, IReadOnlyList Lines, - IReadOnlyList Words); + IReadOnlyList Words, + OcrMetadata? Ocr); + +/// Metadata about the OCR operation. +public record OcrMetadata(int Dpi, OcrColorDepth ColorDepth, double? DeskewAngleDeg, int Psm, EngineMode Oem); diff --git a/tests/MarkItDownNet.Tests/ConversionTests.cs b/tests/MarkItDownNet.Tests/ConversionTests.cs index 35e0bf8..be3bfe1 100644 --- a/tests/MarkItDownNet.Tests/ConversionTests.cs +++ b/tests/MarkItDownNet.Tests/ConversionTests.cs @@ -18,7 +18,7 @@ public async Task PdfWithDigitalTextProducesMarkdownAndWords() page.AddText("Hello world", 12, new PdfPoint(10, 150), font); await File.WriteAllBytesAsync(tmp, builder.Build()); - var converter = new MarkItDownConverter(new MarkItDownOptions { NormalizeMarkdown = false }); + var converter = new MarkItDownConverter(new MarkItDownOptions { NormalizeMarkdown = false, OcrForceRaster = false }); var result = await converter.ConvertAsync(tmp, "application/pdf"); Assert.False(string.IsNullOrWhiteSpace(result.Markdown)); diff --git a/tools/OcrBench/Program.cs b/tools/OcrBench/Program.cs index be3f5e2..9cc85a6 100644 --- a/tools/OcrBench/Program.cs +++ b/tools/OcrBench/Program.cs @@ -40,30 +40,49 @@ static void Extract(Dictionary o) var langs = o["--langs"]; var psm = o["--psm"]; var threads = o["--threads"]; - var python = o["--python-exe"]; + var python = o.ContainsKey("--python-exe") ? o["--python-exe"] : "python3"; + var refresh = o.ContainsKey("--refresh") ? o["--refresh"].Split(',') : new[] { "markitdownnet" }; + bool doMark = refresh.Contains("markitdownnet", StringComparer.OrdinalIgnoreCase); + bool doPy = refresh.Contains("pytesseract", StringComparer.OrdinalIgnoreCase); Environment.SetEnvironmentVariable("OMP_THREAD_LIMIT", threads); - if (Directory.Exists(outDir)) Directory.Delete(outDir, true); - Directory.CreateDirectory(Path.Combine(outDir, "markitdownnet")); - Directory.CreateDirectory(Path.Combine(outDir, "pytesseract")); + Directory.CreateDirectory(outDir); + var timingsPath = Path.Combine(outDir, "timings.json"); + var timings = File.Exists(timingsPath) + ? JsonSerializer.Deserialize>>(File.ReadAllText(timingsPath))! + : new Dictionary>(); + + if (doMark) + { + var dir = Path.Combine(outDir, "markitdownnet"); + if (Directory.Exists(dir)) Directory.Delete(dir, true); + Directory.CreateDirectory(dir); + } + if (doPy) + { + var dir = Path.Combine(outDir, "pytesseract"); + if (Directory.Exists(dir)) Directory.Delete(dir, true); + Directory.CreateDirectory(dir); + } var options = new MarkItDownOptions { OcrLanguages = langs, OcrDataPath = "/usr/share/tesseract-ocr/5/tessdata", - PageSegMode = (Tesseract.PageSegMode)6, + OcrPsm = int.Parse(psm), + OcrOem = Tesseract.EngineMode.LstmOnly, + OcrThreads = int.Parse(threads), NormalizeMarkdown = false, DetectBulletLists = false, MergeLines = false, MinimumNativeWordThreshold = int.MaxValue, - PdfRasterDpi = 300 + OcrUserDpi = 300 }; var converter = new MarkItDownConverter(options); var exts = new HashSet(new[] { ".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".pdf" }, StringComparer.OrdinalIgnoreCase); var datasets = Directory.GetDirectories(inputDir).Where(d => Path.GetFileName(d) != "_ocr"); - var timings = new Dictionary>(); long totalMark = 0, totalPy = 0; foreach (var datasetPath in datasets) @@ -77,33 +96,43 @@ static void Extract(Dictionary o) var rel = dataset + "/" + name; var images = GetImages(file).ToList(); - var sw = Stopwatch.StartNew(); - var textMark = OcrMark(converter, images); - sw.Stop(); - var tMark = sw.ElapsedMilliseconds; - var outMarkDir = Path.Combine(outDir, "markitdownnet", dataset); - Directory.CreateDirectory(outMarkDir); - File.WriteAllText(Path.Combine(outMarkDir, name), textMark); - totalMark += tMark; - Console.WriteLine($"{dataset} | {Path.GetFileName(file)} | markitdownnet | {tMark} ms"); - - sw.Restart(); - var textPy = OcrPy(images, python, langs, psm); - sw.Stop(); - var tPy = sw.ElapsedMilliseconds; - var outPyDir = Path.Combine(outDir, "pytesseract", dataset); - Directory.CreateDirectory(outPyDir); - File.WriteAllText(Path.Combine(outPyDir, name), textPy); - totalPy += tPy; - Console.WriteLine($"{dataset} | {Path.GetFileName(file)} | pytesseract | {tPy} ms"); - - timings[rel] = new Dictionary { { "markitdownnet", tMark }, { "pytesseract", tPy } }; + if (!timings.TryGetValue(rel, out var tdict)) + tdict = timings[rel] = new Dictionary(); + + if (doMark) + { + var sw = Stopwatch.StartNew(); + var (textMark, meta) = OcrMark(converter, images); + sw.Stop(); + var tMark = sw.ElapsedMilliseconds; + var outMarkDir = Path.Combine(outDir, "markitdownnet", dataset); + Directory.CreateDirectory(outMarkDir); + File.WriteAllText(Path.Combine(outMarkDir, name), textMark); + totalMark += tMark; + var angleStr = meta?.DeskewAngleDeg.HasValue == true ? meta.DeskewAngleDeg.Value.ToString("F2") : "skipped"; + Console.WriteLine($"{dataset}/{Path.GetFileName(file)} | dpi {meta?.Dpi} | {meta?.ColorDepth} | deskew {angleStr} | psm {meta?.Psm} | oem {(int)(meta?.Oem ?? 0)} | {tMark} ms"); + tdict["markitdownnet"] = tMark; + } + + if (doPy) + { + var sw = Stopwatch.StartNew(); + var textPy = OcrPy(images, python, langs, psm); + sw.Stop(); + var tPy = sw.ElapsedMilliseconds; + var outPyDir = Path.Combine(outDir, "pytesseract", dataset); + Directory.CreateDirectory(outPyDir); + File.WriteAllText(Path.Combine(outPyDir, name), textPy); + totalPy += tPy; + Console.WriteLine($"{dataset}/{Path.GetFileName(file)} | pytesseract | {tPy} ms"); + tdict["pytesseract"] = tPy; + } } } - File.WriteAllText(Path.Combine(outDir, "timings.json"), JsonSerializer.Serialize(timings, new JsonSerializerOptions { WriteIndented = true })); - Console.WriteLine($"TOTAL markitdownnet {totalMark} ms"); - Console.WriteLine($"TOTAL pytesseract {totalPy} ms"); + File.WriteAllText(timingsPath, JsonSerializer.Serialize(timings, new JsonSerializerOptions { WriteIndented = true })); + if (doMark) Console.WriteLine($"TOTAL markitdownnet {totalMark} ms"); + if (doPy) Console.WriteLine($"TOTAL pytesseract {totalPy} ms"); } static IEnumerable GetImages(string path) @@ -129,15 +158,17 @@ static IEnumerable GetImages(string path) } } -static string OcrMark(MarkItDownConverter conv, IEnumerable images) +static (string text, OcrMetadata? meta) OcrMark(MarkItDownConverter conv, IEnumerable images) { var sb = new StringBuilder(); + OcrMetadata? meta = null; foreach (var img in images) { var res = conv.ConvertAsync(img, GetMime(img)).Result; + meta ??= res.Ocr; sb.AppendLine(res.Markdown.Trim()); } - return sb.ToString().Trim(); + return (sb.ToString().Trim(), meta); } static string OcrPy(IEnumerable images, string py, string lang, string psm) @@ -281,6 +312,18 @@ static void Compare(Dictionary o) sb.AppendLine($"- {kv.Key}: {kv.Value}"); Directory.CreateDirectory(Path.GetDirectoryName(outMd)!); File.WriteAllText(outMd, sb.ToString()); + + Console.WriteLine($"GLOBAL Token-F1 {global.token_f1_avg:F4} | line_F1 {global.line_f1_avg:F4}"); + foreach (var kv in byDataset) + Console.WriteLine($"{kv.Key} Token-F1 {kv.Value.token_f1_avg:F4} | line_F1 {kv.Value.line_f1_avg:F4}"); + + bool fail = global.token_f1_avg < 0.80 || global.line_f1_avg < 0.50; + if (byDataset.TryGetValue("ICDAR", out var icdar)) + fail |= icdar.token_f1_avg < 0.80 || icdar.line_f1_avg < 0.50; + if (byDataset.TryGetValue("PUBTABLES", out var pubtables)) + fail |= pubtables.token_f1_avg < 0.80 || pubtables.line_f1_avg < 0.50; + + if (fail) Environment.Exit(1); } static string Normalize(string text)