Skip to content

Commit 579c996

Browse files
fix bug #7834
1 parent deb656e commit 579c996

File tree

1 file changed

+12
-8
lines changed

1 file changed

+12
-8
lines changed

scripts/data_utils.py

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -146,28 +146,32 @@ def find_imgs(string):
146146

147147
def split_text(self, text: str) -> List[str]:
148148
content_dict, masked_text = self.mask_urls_and_imgs(text)
149-
start_tag = self._table_tags["table_open"]
150-
end_tag = self._table_tags["table_close"]
151-
splits = masked_text.split(start_tag)
149+
start_tags = [self._table_tags["table_open"] "<tabular>"]
150+
end_tags = [self._table_tags["table_close"] "</tabular>"]
151+
splits = masked_text
152+
for start_tag in start_tags:
153+
splits = splits.split(start_tag)
152154

153155
final_chunks = self.chunk_rest(splits[0]) # the first split is before the first table tag so it is regular text
154156

155157
table_caption_prefix = ""
156158
if len(final_chunks)>0:
157159
table_caption_prefix += self.extract_caption(final_chunks[-1]) # extracted from the last chunk before the table
158160
for part in splits[1:]:
159-
table, rest = part.split(end_tag)
160-
table = start_tag + table + end_tag
161-
minitables = self.chunk_table(table, table_caption_prefix)
162-
final_chunks.extend(minitables)
161+
for end_tag in part:
162+
if end_tag in part:
163+
table, rest = part.split(end_tag)
164+
table = start_tags[0] + table + end_tags[0]
165+
minitables = self.chunk_table(table, table_caption_prefix)
166+
final_chunks.extend(minitables)
163167

164168
if rest.strip()!="":
165169
text_minichunks = self.chunk_rest(rest)
166170
final_chunks.extend(text_minichunks)
167171
table_caption_prefix = self.extract_caption(text_minichunks[-1])
168172
else:
169173
table_caption_prefix = ""
170-
174+
break
171175

172176
final_final_chunks = [chunk for chunk, chunk_size in merge_chunks_serially(final_chunks, self._chunk_size, content_dict)]
173177

0 commit comments

Comments
 (0)