@@ -146,28 +146,31 @@ def find_imgs(string):
146146
147147 def split_text (self , text : str ) -> List [str ]:
148148 content_dict , masked_text = self .mask_urls_and_imgs (text )
149- start_tag = self ._table_tags ["table_open" ]
150- end_tag = self ._table_tags ["table_close" ]
151- splits = masked_text .split (start_tag )
149+ start_tags = [self ._table_tags ["table_open" ], "<tabular>" ]
150+ end_tags = [self ._table_tags ["table_close" ], "</tabular>" ]
151+ splits = masked_text
152+ for start_tag in start_tags :
153+ splits = splits .split (start_tag )
152154
153155 final_chunks = self .chunk_rest (splits [0 ]) # the first split is before the first table tag so it is regular text
154156
155157 table_caption_prefix = ""
156158 if len (final_chunks )> 0 :
157159 table_caption_prefix += self .extract_caption (final_chunks [- 1 ]) # extracted from the last chunk before the table
158160 for part in splits [1 :]:
159- table , rest = part .split (end_tag )
160- table = start_tag + table + end_tag
161- minitables = self .chunk_table (table , table_caption_prefix )
162- final_chunks .extend (minitables )
163-
164- if rest .strip ()!= "" :
165- text_minichunks = self .chunk_rest (rest )
166- final_chunks .extend (text_minichunks )
167- table_caption_prefix = self .extract_caption (text_minichunks [- 1 ])
168- else :
169- table_caption_prefix = ""
170-
161+ for end_tag in part :
162+ if end_tag in part :
163+ table , rest = part .split (end_tag )
164+ table = start_tags [0 ] + table + end_tags [0 ]
165+ minitables = self .chunk_table (table , table_caption_prefix )
166+ final_chunks .extend (minitables )
167+
168+ if rest .strip ()!= "" :
169+ text_minichunks = self .chunk_rest (rest )
170+ final_chunks .extend (text_minichunks )
171+ table_caption_prefix = self .extract_caption (text_minichunks [- 1 ])
172+ else :
173+ table_caption_prefix = ""
171174
172175 final_final_chunks = [chunk for chunk , chunk_size in merge_chunks_serially (final_chunks , self ._chunk_size , content_dict )]
173176
0 commit comments