From 4dfbb85991cff4cfd5153dc465d73cc51e7c8399 Mon Sep 17 00:00:00 2001 From: Jaakko Santala Date: Thu, 19 Sep 2024 12:55:39 +0300 Subject: [PATCH] fix header normalization for xlsx --- plugins/xlsx-extractor/ref/test-headers.xlsx | Bin 0 -> 5831 bytes .../src/header.normalization.spec.ts | 129 ++++++++++++++++++ plugins/xlsx-extractor/src/utils.ts | 8 +- utils/extractor/src/index.ts | 2 +- 4 files changed, 136 insertions(+), 3 deletions(-) create mode 100644 plugins/xlsx-extractor/ref/test-headers.xlsx create mode 100644 plugins/xlsx-extractor/src/header.normalization.spec.ts diff --git a/plugins/xlsx-extractor/ref/test-headers.xlsx b/plugins/xlsx-extractor/ref/test-headers.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..ef07060878181e420b9682887d3fed45b6c9f629 GIT binary patch literal 5831 zcmaJ_2UJtrwhc%T3B6b8qSVl&3aIp=5Nberq=nvll@3Z5ks@HE7wI5HN&x91HAwG@ z5C~o1=id9r`{KL)b4E@^cE+6N%(eGkYn}O6?HUdx006iSi1idUEXdR?MPhzQVFLh! z07`(VD-`A?!2j133qXqbVvq>0nAKmzOA}8sxCKc170%y*LK+SolM<5JwZ3Lj-7ed`gQavN3VQ* zSE_nzO_79clbHpH71l?^8MpeG44)mk!kko+){6f^2dgj6;s%gl`pT&=ub~TA+D0K8q zbGemv(^L2T9L4^w`T^;W45;J#B7u$VOZYS6_>Zr|1&4KlstKc=0Sf>i!^FkP$?~zQ zle3$ErIRa^-`fH9Z_#l`(6A4eBYn7YaU-L-&XEPj@E{FLjjdu<@c=n$o<#zeq6-S- zdjC=_L7~O>i_h$LNFdm1`ANl0lNP50I@!go;@(_N$t_XhZn_7N_deUBL-J^sgmxIc zXnkv=!ij@jSW6m*!uwa>J=ar9aEZu8!L38VW746Vqsi1nZ5?+Q6+QGdy}3K}jlDjG z%kdbxc8pB${j9n_y4M!gBGDT^DE~HP3tj1UqD45_-;WGz6-rQ$;=p}?x^`9kQ9_D9 z97g;k=0p0ccynjxKWWbz8obdVNQ$%Im&3L)2lpktecnhp%v^kw1e*z-ChCK1AsNX0 z0na-HZ;lZ+2SpKCxQ1s*>(o*Pzl!U8n_R!CoxM6jvn{F}+)JF)drdvdIO9RS&S#!u z3Ef7-tU*lR8slqC$fc-zU-Y@Rk5RUR=20X&%Wg&Lc7o*_|1!2#99iZ?5A z=1l3znspHL?n7@@q-*KR)a`GMjF#??eo_}Xob}4GMJ*r&V)^&2Sv{qfScF?tXTov3 zYAcBnY-qF_XKI4RN<+js~7kZT@W<+@8RtN70pUJ z;qb|fSI@X{cHv7wUF-8|P)$hbVz4|y`fWb%{zB;nqX92bh_@cn@M+^}Z0FMh!mDw@ z8Zcs>$Hb`!lM3kniPJyX;MZvVKbc_=lNm_EcYJPC9YH)Hs<#u}2%Z@u(OBUh>eOK> zHubVTXO3WHi|cRjbGqVcgmYf#ecN$)ob$5i`A0e3lw84vDfAoR-B!Owzfg}BgO(*T z!-N+*6t^=7tlEX&S(o+sF!E7>%wJ@Qt&P6#o!zwhI_FuK4m~N{hvI!nR^M_=?THGh(6QMQv)o4*@;@MlxkbXvENGH$oJtP3>e1cxf-f_qL zCsgh>PzR{M-&fF|mesbg0~jezdNF6Zh?>ltaIr`ZXLS!28OvOp0XnO;-QAv|*+JQc0n43U8tsx54sJA5i-xMP)s%GhpYn4(^lr{?mPslIH5 ze8`&Gc|OmW_<(9G1rofr_hR6?LDA0<7c0Q{!`6P+PjC1>JyfNn?4BXOJ-LxzFhEC) zhp(wh?wEew4L-HjO2Ki@#?4G|)djL~v)|xNbxBe~E^)jSKC8xp=+DTVH3%JC%Wum0 zLWM<<9#R$b==tuhzrn2z*PO^{kUvXSCT7KRnc6QqG?53!9MGuX2 z{4gZ$iy9!T1`J;8GmvmR4R}5O&7mJiu9ml=;o*``2J}^N=Zm|CBms}k7NJ}V{v}lT(PtE8&YtOMieyAz zvR}ckJ9M+OlnIb{p7I)%(X=^_9JShxiN$gxEc8%c9IH_8Y}fvgX_ZytVlQPz-pHWljN6d7O6zuxFK{F8mvDPybAbPG?__eW|I>kFvC z2eq5cF4S|ST~L)ihPAbdfd@1`1}Nvf5|nc$4JU)D1u&6TIqns0X9O)f2OfDo7ow0g zBC<=);!D>qy~VLIkV8=e*v5*$BEJ@`MX{XAilB`S=_-r{IYxoxEm%Ny7V7jC>i=+z zeMmA=(KjTBGaczw?LdAm>8`-`{B{G%#Bp3#E=quc}w=uM}nEQ-l?t>Xv_EM=$r}KPwwqso!q~-Q=P4;TBSQr7y z{Z&)_@BjeX|LZ^8Y@kqgH-W#u{#Y?mbv4{F1xZ`h^=;gHi$qNkF67Q!Ro~!aB7Ft6 zetKr0SAe{v%CVq%g$^;Xv5b^!HeU0sUvG7hKWRzDH4M`+)&U(T@^e1sWLq7-1DDNC zzxfVr=Sl7m?yoBm%oSDfM z)Dy2IW$u><$es1tytHi@>#we!QJJhAQV$0ruV0c@ zZLnj9T_+~Xk)~kCd;yC(fvRX z@dSRDyIsBgN`w*wcI;o#Eh+8A!sVU*(u`%Ta7kmte)4A2rc%!}kl2=wk!X(-Jod~_ zBJ0*vzl*PR#Z#VR#7c~Q4CM>+^&i(s>T8MzAh9#0kN0Co3VrNX#WUsVs3&^EM~phF zKFrJC742)#IVBrBgZpFo4GEX$C^!+*`W+>brV8$#G>8+^uZ1)@SU+}=od}t3`^*ch_53PVD)NaYrN#PGklK>4NZpz&(U7&=v6*S`w?mL6=wS5 zU7QlQ*=GcBV1IDA#Gb!8Fz&UC8ft{AEt}=ZPXC zj>m@`l=o}(n{`mHjZyFP-4ot5pz*5xayfx)j^`NFBV7)%XriH6yx#K#?nSks`(x%E z40(=xIC?||@xt>VgnivddSLOhlJ;`^CzjXb>ZoGHO>UILFRSKhCF%UIf2l058bk8| zR9*3AG>AX8Gcbp#e+OMifRIhqxX|eKJYprw29Hu))f}wLdwUqO&AX@o0HXiZ?Ct}D z{!#3>frArfZ)3Va^OG|>Wzz{wxmtx!bpqoB9-muZIg_d(6cL-ff#*N(laq_y={v<% zj^5t?dg3#*?(E_|nZhU&TAV=%A|AdLCpQ2^JS1B-C8;L4d(1BpYuB>0>xPw^2qN`} zqk-t>TrXNDt}9>yt)1CoiRUmKk=HMqrwwGQ5kGB_-g6281j0^^U7(C`WPOL~hpr z@G&yTK$pwxehhWYn4e7N{N!0^+-0)X_!uev%_bh9Nl2D`7lsn2JU60E2t3W^Hmw{9gqfK?+Gf%5IX=I~- zW8e0%u=O+UH}<;^REXTRw+h;WbM2{&?oHRXxvbktL})m+HsMHT@hA6_X-N=UivwS$ z*GtJcRGBwZ#QDce%X^E2I6s#=X2Th|sZfG%gKjdBj2^UNf%z5;CLaqVXr7rbwlNA_ zfS4k|cg;qQ{kksgR#0usCs-HaRzGF-)}OANxJm8(BnS&3V{sUtE`od6ZN^hh8x_*d z8+9?}b2-_<)exc-fe)J$%1{}iNZ*M;yH~DLPdKNH{pi9DpSRaA6|)`ORe)10OOwo6 z5*=Fq+(!?vX0lY~ytDp3`^JNqRou#QHS%e~`h%~__giTA-Y2k4&-HLwUjnY~hS}Me z{S?gZZ@?72zuu6H9ymF=Lmk~sb$pzmZcqM{0LoJTe(_c~I*;K--T}F5=~6vcuubRF z-o$_DokY7738Uzk_hP(J{DMNRQYD2h$Ilv}DfCJ#FUC$VjYlFOa+4)N?rDsRVcc>~ zx6&T@qgy2T-I`(X&Sf>r4epw4lp zG-G6sV`OT?QdbD!{ER9MQ&gIHM^9&$_6E!1vU%Qu&x>7@;G*R9Nfn#4qk#$Q&exAU zNFoi1nXJD00kNBl_Fh}J>(r4&74!}RzwQmm-%WLJbgi5LK_bFN-3>#y9MA~uw^?x+VReN-KQ-& zb^+BMX1ztZ#@VQPPSN((tdHmQUII%whmDf6<)O_qBQe=7p2zYL=N{hGK|F3B)9R@p zpXVaHew-Et6^7Nw=Jj<7JXXWPzK5B@f42-V!&i=948-4oS3R{~QM8zq@?Q?y@9?Xx z)UR+VOxpbq{C_>F-_ciHpkL9;81!$R(C^@@*2u45LY%+Be=tgZ=eg<|{K_MSaT72P z@wc>&!vCoFug>y!maFCYR~9|uzghmPSpUv&wb=d2pi1%&!+%QO@6fB6 g;a8{uIpE*&#A7vFOcKE~6PPm?GgqL$mLlMP0GnIl6951J literal 0 HcmV?d00001 diff --git a/plugins/xlsx-extractor/src/header.normalization.spec.ts b/plugins/xlsx-extractor/src/header.normalization.spec.ts new file mode 100644 index 000000000..edac974d1 --- /dev/null +++ b/plugins/xlsx-extractor/src/header.normalization.spec.ts @@ -0,0 +1,129 @@ +import api from '@flatfile/api' +import { + setupListener, + setupSpace, + getEnvironmentId, +} from '@flatfile/utils-testing' +import { ExcelExtractor } from '.' +import fs from 'fs' +import path from 'path' +import { FlatfileEvent } from '@flatfile/listener' + +describe('xlsx-extractor plugin', () => { + + const listener = setupListener() + let spaceId: string + + beforeAll(async () => { + const space = await setupSpace() + spaceId = space.id + }) + afterAll(async () => { + await api.spaces.delete(spaceId) + }) + + beforeEach(async () => { + listener.use(ExcelExtractor()) + }) + + it('Upload file with headers that require normalization', async () => { + + listener.on("**", async (event: FlatfileEvent) => { + console.log(event.topic) + }) + + await api.files.upload(fs.createReadStream(path.join(__dirname,'../ref/test-headers.xlsx')), { + environmentId: getEnvironmentId(), + spaceId, + }) + + const failure = async () => { + await listener.waitFor("job:failed", 1) + return false + } + const success = async () => { + await listener.waitFor("sheet:counts-updated", 3) + return true + } + + const ok = await Promise.race([failure(), success()]) + if(!ok) { + throw new Error("Job should not fail") + } else { + const { data: workbooks } = await api.workbooks.list({spaceId}) + expect(workbooks.length).toBe(1) + const { data: sheets } = await api.sheets.list({workbookId: workbooks[0].id}) + expect(sheets.length).toBe(1) + const EXPECTED_FIELDS = [{ + "description": "", + "key": "Code", + "label": "Code", + "type": "string", + }, + { + "description": "", + "key": "Amount_DOLLAR_", + "label": "Amount_DOLLAR_", + "type": "string", + }, + { + "description": "", + "key": "Amount_DOLLAR__1", + "label": "Amount_DOLLAR__1", + "type": "string", + }, + { + "description": "", + "key": "Rate_PERCENT_", + "label": "Rate_PERCENT_", + "type": "string", + }, + { + "description": "", + "key": "empty", + "label": "empty", + "type": "string", + }, + { + "description": "", + "key": "empty_1", + "label": "empty_1", + "type": "string", + }] + + expect(sheets[0].config.fields).toEqual(EXPECTED_FIELDS) + + const { data: { records } } = await api.records.get(sheets[0].id) + expect(records.length).toBe(2) + const data = records.map((record) => + EXPECTED_FIELDS.reduce((acc, field) => { + acc[field.key] = record.values[field.key].value + return acc + }, {}) + ) + expect(data).toEqual([ + { + "Amount_DOLLAR_": "100", + "Amount_DOLLAR__1": "300", + "Code": "ABC", + "Rate_PERCENT_": "5%", + "empty": undefined, + "empty_1": undefined, + }, + { + "Amount_DOLLAR_": "200", + "Amount_DOLLAR__1": "400", + "Code": "DEF", + "Rate_PERCENT_": "3%", + "empty": undefined, + "empty_1": undefined, + }, + ]) + + } + + }) + +}) + + diff --git a/plugins/xlsx-extractor/src/utils.ts b/plugins/xlsx-extractor/src/utils.ts index 1a25dda1e..7e9d4ecec 100644 --- a/plugins/xlsx-extractor/src/utils.ts +++ b/plugins/xlsx-extractor/src/utils.ts @@ -5,8 +5,8 @@ export function prependNonUniqueHeaderColumns( const result: Record = {} for (const [key, value] of Object.entries(record)) { - const newValue = value ? value : 'empty' - const cleanValue = newValue.replace('*', '') + const newValue = value || 'empty' + const cleanValue = normalizeKey(newValue.replace('*', '')) if (cleanValue && counts[cleanValue]) { result[key] = `${cleanValue}_${counts[cleanValue]}` @@ -19,3 +19,7 @@ export function prependNonUniqueHeaderColumns( return result } + +function normalizeKey(key: string): string { + return key.trim().replace(/%/g, '_PERCENT_').replace(/\$/g, '_DOLLAR_').replace(/[^a-zA-Z0-9]/g, "_") +} \ No newline at end of file diff --git a/utils/extractor/src/index.ts b/utils/extractor/src/index.ts index 979ad8528..1104301a3 100644 --- a/utils/extractor/src/index.ts +++ b/utils/extractor/src/index.ts @@ -211,7 +211,7 @@ function getSheetConfig( } function normalizeKey(key: string): string { - return key.trim().replace(/%/g, '_PERCENT_').replace(/\$/g, '_DOLLAR_') + return key.trim().replace(/%/g, '_PERCENT_').replace(/\$/g, '_DOLLAR_').replace(/[^a-zA-Z0-9]/g, "_") } function normalizeRecordKeys(record: Flatfile.RecordData): Flatfile.RecordData {