Skip to content

Commit 92e45e3

Browse files
Merge pull request microsoft#5 from Roopan-Microsoft/bug_fix_psl_7834
Bug fix psl 7834
2 parents (57dc21e + 9b313b1), commit 92e45e3

File tree

1 file changed

+18
-15
lines changed

1 file changed

+18
-15
lines changed

scripts/data_utils.py

Lines changed: 18 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -146,28 +146,31 @@ def find_imgs(string):
146146

147147
def split_text(self, text: str) -> List[str]:
148148
content_dict, masked_text = self.mask_urls_and_imgs(text)
149-
start_tag = self._table_tags["table_open"]
150-
end_tag = self._table_tags["table_close"]
151-
splits = masked_text.split(start_tag)
149+
start_tags = [self._table_tags["table_open"], "<tabular>"]
150+
end_tags = [self._table_tags["table_close"], "</tabular>"]
151+
splits = masked_text
152+
for start_tag in start_tags:
153+
splits = splits.split(start_tag)
152154

153155
final_chunks = self.chunk_rest(splits[0]) # the first split is before the first table tag so it is regular text
154156

155157
table_caption_prefix = ""
156158
if len(final_chunks)>0:
157159
table_caption_prefix += self.extract_caption(final_chunks[-1]) # extracted from the last chunk before the table
158160
for part in splits[1:]:
159-
table, rest = part.split(end_tag)
160-
table = start_tag + table + end_tag
161-
minitables = self.chunk_table(table, table_caption_prefix)
162-
final_chunks.extend(minitables)
163-
164-
if rest.strip()!="":
165-
text_minichunks = self.chunk_rest(rest)
166-
final_chunks.extend(text_minichunks)
167-
table_caption_prefix = self.extract_caption(text_minichunks[-1])
168-
else:
169-
table_caption_prefix = ""
170-
161+
for end_tag in part:
162+
if end_tag in part:
163+
table, rest = part.split(end_tag)
164+
table = start_tags[0] + table + end_tags[0]
165+
minitables = self.chunk_table(table, table_caption_prefix)
166+
final_chunks.extend(minitables)
167+
168+
if rest.strip()!="":
169+
text_minichunks = self.chunk_rest(rest)
170+
final_chunks.extend(text_minichunks)
171+
table_caption_prefix = self.extract_caption(text_minichunks[-1])
172+
else:
173+
table_caption_prefix = ""
171174

172175
final_final_chunks = [chunk for chunk, chunk_size in merge_chunks_serially(final_chunks, self._chunk_size, content_dict)]
173176

0 commit comments

Comments
 (0)