From 8f653dba3dbec5b98d25cc58bd4bad0cf22d5aae Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 3 Apr 2025 14:03:49 +0200 Subject: [PATCH 1/3] PDF with a table with duplicate column names --- tests/data/duplicate_columns.pdf | Bin 0 -> 9044 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/data/duplicate_columns.pdf diff --git a/tests/data/duplicate_columns.pdf b/tests/data/duplicate_columns.pdf new file mode 100644 index 0000000000000000000000000000000000000000..50255a4e71c1ad282c1c270ecd6fd470b39f6c3f GIT binary patch literal 9044 zcma)iby$>J*S>;ucL*}1gupN}z|cL?9nv{8GdP5FC@Dw^h#;VZA|=w@CDI_Bf^j+#AaTH8^PHT&-}lu1VS=No$mEx(#MK;j8|Vjq z03hc}Fyv_xm22407~uuYktLe;T!ju%h#Mkd>8s3GHy$mlecp-M5t$Kffdv#+?q^APjq-4A9#0xTc>2rK9 zTd8j`@rgFF)rQ>Y#2Wo|&WCS2oU~2o^IWh$*m?m=bKo)LA7XbqrARmVgqNaZf3BYA z1TRnsSK?k}5mg2CZb`&ok0fDNRPGV2zbi4rha+tMNDjJolTZAcG~QGoK+sL^Z<*kK zC0po!N%kLL5C8}OMS#%1r+!7m$^Gt1()^pD-ej8>HoeK&kLc~J#PXR?OVIYXu6B~i zNXEhm0mb0<1l~#NiDVWo;>A)eLyUZf*Kzy!xsd5R-j(H-0!j`RT-5}oo>ly{D-XGB z9@9X#L{ySr4kI*Dx_z6Fu*$^YMdo#AWg}7deCosJ6+Nu){RE^kSVD5Cw{*Lpo(G{^~9P+3Wmum`0 z9VKsWUiUF34*q_y^IU-K4hrW*#5KmxBx{P)>A7JEt%jE>sW-z6=yQd>EPNzAj95SLyR66;5E-%f!TxuOohZH9vfQ zT<-<0hblElC^yRV$7e4SeSTFgzDrOH97abklhyK%x6L}w43lkS}1sxFr; zdNyyN`y_sQ8SyzOnt#aac4I4z)s<&_iG_O8fPJI*$VA6{t3>bPl%|MoOdHApT#Kz4 zt$@wU?!oKR1)K;f{R6$6g^}tRQLEmoU}Ldsfu;n?gy1t3?MN z`eH1EobsBj;JsUHIA5PPG_Gl0^)av3F}M8P@FsUDK_yE;&8KD?$eM>ZPL8@RVCL14pNU%0tMFbjk*F{||$$NOE|ZE}09av0-3kWbKU**`JR z58K$Rx}nYEve3K9j*=#*l?OSMJf+<_sN9DX6-=c1tk14 ztxELtRkpuuOwW5*E(_$UJML;a=8x?(hI-V}q})+3XIET0MB;_9nqDp_E6Zooehir> z$LBUl!|-oW_Nl&lwYl!l#CHC^Z6yBSV3JvhGH0DG3NAVsyv`8}cny`hzPv(oT%X0U zgmf)T#ZlKbl^Ao)dFyB{e!N_IbcgG-O5x+oLUB2N_nlvqX%gNTkvL|lc zCbbc*lIHV()#8_VjGm<(G*+Kkd@6Y9X{h`KIq)iyxh>{3lN^GX6`FLAG_%#x6|>Bh ziuPes>v5v3Q}d&*2#6wHtrZ(pka=QJ)iB-`d8b!soi`Lt9}lE{FxUS)|1>8@o27zJ zH>GxrY?lN?rKZN^prh#ZXhhqO(b+tbab(eR49ihviq01SVz;R6 z53hxNHrk4)a6(ZwwSLH`)G;=gdma4F{(VueDcwii*mF_h$#<1N$5-~+b^8w(bh?3- zft4Nt zVX$s~1NRbiYwE 
z{tn&fhV}A6+#9aA*^lzn>^5UMieZ|$9JQANwO{^a|`xUxJXiafG z3G2*jW3HW4^{Fc&5@2fUXF7oXmFZZnUI?P#%6H9l%Y8w-O zkhq%bQKYf9$#LwC!y-$OE^hdj@;x&NTW7IGOu|L64|HGmPY(5?CM`gOVXERo2j?xM zyHG@EAytBYgpp+m)l?C{NaoYie*Ba|pk3&A(cW{a{*{`|!cW8ayL@jmD_fs=vo6*@ zVx{1*^ei@xYwW_Ot4wfI?4tJS!=EXnqU1@p!p2ZFKEm+kCg|I3vCi})+?h`K!sL5m z;=K%~X75fm5_3S67R=t}b)ql$lAhEzKH>D5=MBv3`0Glbm|K|J8@%eYwAtGG$%4Y) zt60A>HctO+S_+g-PBc8@(>B=x89#ufl(v@Di<255cwjlBRJc^hqKuq91pdrs(Cy$S z5_`{al*O|SqzPC^d;s5rKj3(JXU$@6ihml$M0DzlH#43{f5TKF36TK!E(nr*1$E}L zALhO#;=|7CY}9x$5JM)Ul@%HK{Bh+(+UUS**aV@Plgb-U!5xpWVf<`=W}Lys@6+RR z0e0L^HciEFJg<-5&hTCw2Zxf3SNJV9_L8KGkEgh`{AzjL+d;Y9vG3!2zJ0*7xBLAh z-AY^LGYT2mull7I>&?PXYJyqX1v0fl9d>vpBVBk_`FwCWD$&??>U=*z$7E5{C0C#wQBc<1fw{ zAELqo&d$AB_wJP2p_&G~%TBhY8I@Ngf9{5vU)`5BTv?1YJqq+HuD=&>xitUHb$5Dw z6M|lWSwTx^qch#b(K4>frKsQ;i7! zMEfVL_|M8vrh;oijP$107S`X#kzNPkQr>t=H4i(Uo1$ zq_uAu-ePI?C?k|&mwps*Xu8Ho(H-af_GB+ zX22JxyVgv4@p#@*v$}e@6uikO##nPa1IpqFI+Vz^Cj)^{-FVD&7vg?it9CZ=YDgj! zlbOZF$>J0CGQG3~$pL@UW*P(Ulo^0`>cPNxYHdMZ75%gF)_wYQ^Q3`vBlrEI)7fvNxQoIlzjxTO^#>l{daE!u zTc}GIPM>Qm9mqD~wPh}y^4Bb|TyK}^QZ-$54qdA)?kgFgwn92eaJtn83v53tb1Tj- zyj0U}@FVT~mV%7nD2=*;dmeG8X(ng+ItwQwBE_pFhgZ!nj%tH08efXDk&uV$z2|YI zd|7kYk=yYoK|77uyy9F||Lv$YDXwAqdlTU5Q$reDJ^-J+jJ&5zHi3K?51(Q!;A;eK zw!4sqOt8X0iJuqQc1!MTvUu&MkL>QxoxlxY>UF58ghA9s|4)ADk>HuYSE8BosCA*e zpq1zp-=_LsiCtedeI1RmLhiQQrz#p3Np}f9GR=u|?>&M&^&OFoeyRo8$Ew%jgA9Jj zm8(~&yj9wqj-_r0aK0_C7kEHtdThNVxxu(u#Vx^6`K1_H?6@?%P`&kRX{Kbc(PP6n zL~8hw-+i0;YstG^=1k^?%Wn!8^z`6xVtyniHw(75an;8Otu4iHzYa~42S3|cy>& zGHpZAy?pTH5^?@w+LsB|JHnA9uS+Hicc?KDG-8cB5y#}uhA`{*FP@nBeVH*bcXK{S zVafUH1mVtF-F0+V9?OpEt{?^|8*U&|7i;MeE)gp+lR z^wsd=nX;3>hW0bHV{y|^k5?mA){M^;Gq=h%R4q6$?wOyvJXJDpr`aZ1bfSFq0^2p; z&6Q6I6K=_@FN8fBoZG3NjHIfwDRIUQc>}HDJ{w4{xuksZ;L3Ri&tE;{B#Yw&dz9e^ z(-oUXw7GBU6)pZ@is~=gZ+0Shz2ntbh?=3EMa;Q1*F=`LxCL6e%dV7pa>SUc*eANU z64nFzBF9_$=99%+3Auug{2a3b3mjh8bU;epBW>6fEFiQzXV#gQduL@Dh_u=&5ksB| zN#C_10^tl_Udok>v!$xW%O_GIekWwf15>*J;Ij=9TOWi~Ba=l#MWmOa^3Dg>8vHHs 
zUA~0|E4r9rNMnFEp86wM!j+ZY_p&S4a%9IkmX&d3RI-+D-FXUV+7Xv9$(@jjFrvR{pK&D6!r536y6>JcqEO1r<6bsv2oE=P^&RJ(EX z9J_PzE_?EauUun;&CrZ*1QTH`KrEc={Q>ifz>eL zm{<;}lL<)r`J@aRy6k0!s{#aACCLU>~|t6YDnL|EA|kOmXX4#N@1} zEe+vHJSFri*=y{Y{Zj72Ny_jt_f^Bup!tLvZMaWXMk#1q24O<_v>?EY4^p1B{=VCj zky%o_>foW$tzF`)f*is{zT(hA*vtzZU)=kh2vpFWo}<^t6q-01uIw}5OYC1hqcNFnvMCjI(liAH@)F}SCUJGOVseq3klMQ;Z8U}Yj6?UC3KS!FrU^+aGF ztaXK8Z)*OCz}}2Z693qOZN+LnZoyi^c%^Si!4-^6#0O9q{MZr-#Jf!-l(;ezMQ|ebN2-71DgiK3$$Td-DgKOsXLNH zLZq=fNXUeuL7LL!#f%?w~x=~kY^d{X6xz`$K<3xFxPM<<&Of0@x@YLi2n)JyG z4fe{8PwmAink1;Hvg1|d8PwTG@|Lo#=^l^mIX-sz{u)*aj;CdejdY3B1;szNS&Hll zYppEk`1ai2{vA(3K1G!pvG8GgxYftP2_}_}hv!z=7hSCRMeB0;!b2l z?l7g{^rt+_S(XFGNiU=5h*VuR`m~4~1Qi9V!8voDRrXJJj$~OYKWzs4H6$r!ys9&Z z6~>XhRE*Uz7axD>8ch~%ov(T)lp+E;YuZgcrY5ow+%7B5XjV z?pB`#6FMsf%S?av$k(ci1F4zrQIN!`ebtEio>HAMYqq|W#EN!m@RE_1r8=P_6L@}RJcxrxa#&IsapC+exi0>cU(%-^P-F>> zfq|`$RK%H-RBbf{nPKm=2pP(fXy>G4HbWfxm)}GYz9;WiiFpXJhqKHAlY7`oj?Rb~ zpo64C%HZfE2kG8MsjglI)y{ICoDWHnKkq%bX5*a=&^M=-zplJjDja$CGu!PQhvxks zVs2_}^e%X+*5av3!a3i+6Ge`GC>fuZn^H=TUP^5_@FZu|Vp*b-5q~-eRElF0QPfs- zB+O@t8M3?_W=W{Xi7#x?VSB@wB>fAz5tT!x36g?M=Zl z>sb9phfYKpo0575{F5)T+2i4&ZP3vXydPIZiYi%inZ4l=C;|Nj2+E2u3REz9d`tNd zoPTR1C$;<27)R=8v4&wOjRAwqm8E`9{W7pjO-!~TUD`{vcb)guF69B>%i(@$?&B4y z;^y}!FsTE?^P-w(LtGiQ_4@52MkbE^o&x;jin%q=po~F}E?Nf5vuKdZly&0kAEcgd zPVL6Y1ZN}kQol?UOP*6Bq>=R%#etgViBM;zZ+k}V%ljj`mK-KWdVaR1bpbs>?r zatKSOkjQk@>RT9|%Mqlr`<0h6kzq3jqp{L0Q_JnFNE(gkniQ|~cq_?TI6oQr= zXWbLxPcXVZt@s*gV3<|+VdtrHMZ&2q9UwF3H?ak%SUMcQ2`-X=h z5QKCiN+TBV#TF3f|>)|k_P z;p3Srs|^}Rk5*#B-Y^6H3`ud{pQ;P!h+OtRTQU>(A7n)?;4onv;GINHQ9Mbnz7CP} zMjp?7KiQvb>mqT}&}e{kyMD`f%+N|2uuc#H@wx*D>Xojz8Tu+C6)IV!`_W22E zC}L`MZFaV1%6Mj(cF?+6l(PJ%SpdO&@BLHjOO^VeQJOukCrctU)szd%(!m|b&7gJoFFx0-B3F>k zlcv7?)++C1dD!r}Dz8{pCH3N!7kkBpQ+%~>k#-M8=4c-C3s&yFms!d9AD{~r$@_8A zN4F|Y^6(loZHqq2SB$(C$tyIYU6{I<|2Y_n@jtLVH$2V_#UmsL2LGGx5&nzr(e-wP z^TUt`cX_y*HOj%&9f{`l&`otq7dXGH%+152>R<&&S-LwQ5jt>`gDw6I`Df<_5JFeb znzFJ;FJoRo2pGT%27&cCeN~*g3-iKzx1~H)}Y;9RL*(z~{el 
zcGKVm381aWS-L919qjDg(c{paI__{6eE<|aGU{Q-8i3M zPUSP4(4qz2H6s!H!G?7q8uD0!@NEZv{KDj@n$c&L0_KvXXC9hYxR zXM;8w?^@7F;)2#sz5rPFN?#37vKILIuDa^o(*_+eL)D=JA}YL6j^fA{^V^S2WJrf&K#$ovImzzx#1@%eR;dI*Obt{QNIn|}aF z0QlGPU;==f#s0Hj;BWot^`VK+-zxl?D5SLx+})TT&7Je>!oA#0|BlbUjrtuPO-nmC zzosSX2KA5|1N_=>H>3y38tw)VxKY;TSA*L)Sl(==8wY|Sf_&)hBmx!&0)?T%5C9Y+ zzz2nj2n&IQg+&BGP*Z+6^lZ@P+yFOJz4qVs#*O`(J&&gD|Lxli|GymK|K$-q;Qzzr zO?*`xY}||izbAatltX&lgye6|1^#w%;J0@M|DT4|?{!0cFyvfZ!^@Ih$=+ytGOb{LY|B!(p=s(GS$b^CaH5XwK@PCgL5ke>D zpJSmQA@tV#4;dKxHy`dOOZ3T)x&g6{gAW`XDt>Jw5{KMb z7z4zivO*yAzCr(iK!SqkY=MPg3JNfJpnyCKy^p{mlKB6($_=RHk=AndaBC+w4;KKx zJWNJdP((lk3Phhy0x%( Date: Thu, 3 Apr 2025 14:05:45 +0200 Subject: [PATCH 2/3] unit test failing on main --- tests/test_general.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_general.py b/tests/test_general.py index 132c827..27dc80b 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -20,6 +20,7 @@ PDF_SIMPLE_BYTES = PDF_SIMPLE.open("rb").read() PDF_TABLE = Path(__file__).parent / "data" / "table.pdf" PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf" +PDF_DUP_COL = Path(__file__).parent / "data" / "duplicate_columns.pdf" @pytest.fixture @@ -213,3 +214,15 @@ def test_serialize_roundtrip(path, nlp): table_before = before._.get(layout.attrs.span_data) table_after = after._.get(layout.attrs.span_data) assert_frame_equal(table_before, table_after) + + +@pytest.mark.parametrize("path", [PDF_DUP_COL]) +def test_duplicate_columns(path, nlp): + layout = spaCyLayout(nlp) + old_doc = layout(path) + old_table = old_doc._.tables[0]._.data + assert list(old_table.columns) == ['Index', 'Value', 'Value', 'Index', 'Value', 'Value'] + doc_bin = DocBin(docs=[old_doc], store_user_data=True) + new_doc = list(doc_bin.get_docs(nlp.vocab))[0] + new_table = new_doc._.tables[0]._.data + assert list(new_table.columns) == ['Index', 'Value', 'Value (2)', 'Index (2)', 'Value (3)', 'Value (4)'] From 
118432b78fa3ea9be6bd9cbc177e16d4f029e3e2 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 3 Apr 2025 14:06:24 +0200
Subject: [PATCH 3/3] unit test failing on main

---
 spacy_layout/util.py | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/spacy_layout/util.py b/spacy_layout/util.py
index 6b59ddb..65b7459 100644
--- a/spacy_layout/util.py
+++ b/spacy_layout/util.py
@@ -33,10 +33,43 @@ def decode_obj(obj: Any, chain: Callable | None = None) -> Any:
 def encode_df(obj: Any, chain: Callable | None = None) -> Any:
     """Convert pandas.DataFrame for serialization."""
     if isinstance(obj, DataFrame):
-        return {"data": obj.to_dict(), TYPE_ATTR: "DataFrame"}
+        # ensure unique column names, as data will be lost otherwise
+        df = _ensure_unique_columns(obj)
+        return {"data": df.to_dict(), TYPE_ATTR: "DataFrame"}
     return obj if chain is None else chain(obj)
 
 
+def _ensure_unique_columns(df: DataFrame) -> DataFrame:
+    """Return `df` with unique column labels, leaving the input untouched.
+
+    Repeated labels get a " (N)" suffix, with N counting occurrences from 2.
+    A new frame is returned for the renamed case so that serializing never
+    renames the columns of the caller's table as a side effect.
+    """
+    if df.columns.is_unique:
+        return df
+    # Track every label in use so a generated name cannot collide with a
+    # column that is literally called e.g. "Value (2)".
+    taken = set(df.columns)
+    counts: dict[Any, int] = {}
+    new_cols = []
+    for col_name in df.columns:
+        n = counts.get(col_name, 0) + 1
+        new_name = col_name
+        if n > 1:
+            new_name = f"{col_name} ({n})"
+            while new_name in taken:
+                n += 1
+                new_name = f"{col_name} ({n})"
+            taken.add(new_name)
+        counts[col_name] = n
+        new_cols.append(new_name)
+    # Shallow copy: no data is duplicated, and relabelling it leaves `df` as is.
+    unique_df = df.copy(deep=False)
+    unique_df.columns = new_cols
+    return unique_df
+
+
 def decode_df(obj: Any, chain: Callable | None = None) -> Any:
     """Load pandas.DataFrame from serialized data."""
     if isinstance(obj, dict) and obj.get(TYPE_ATTR) == "DataFrame":