@@ -146,28 +146,32 @@ def find_imgs(string):
146146
147147 def split_text (self , text : str ) -> List [str ]:
148148 content_dict , masked_text = self .mask_urls_and_imgs (text )
149- start_tag = self ._table_tags ["table_open" ]
150- end_tag = self ._table_tags ["table_close" ]
151- splits = masked_text .split (start_tag )
149+ start_tags = [self ._table_tags ["table_open" ] "<tabular>" ]
150+ end_tags = [self ._table_tags ["table_close" ] "</tabular>" ]
151+ splits = masked_text
152+ for start_tag in start_tags :
153+ splits = splits .split (start_tag )
152154
153155 final_chunks = self .chunk_rest (splits [0 ]) # the first split is before the first table tag so it is regular text
154156
155157 table_caption_prefix = ""
156158 if len (final_chunks )> 0 :
157159 table_caption_prefix += self .extract_caption (final_chunks [- 1 ]) # extracted from the last chunk before the table
158160 for part in splits [1 :]:
159- table , rest = part .split (end_tag )
160- table = start_tag + table + end_tag
161- minitables = self .chunk_table (table , table_caption_prefix )
162- final_chunks .extend (minitables )
161+ for end_tag in part :
162+ if end_tag in part :
163+ table , rest = part .split (end_tag )
164+ table = start_tags [0 ] + table + end_tags [0 ]
165+ minitables = self .chunk_table (table , table_caption_prefix )
166+ final_chunks .extend (minitables )
163167
164168 if rest .strip ()!= "" :
165169 text_minichunks = self .chunk_rest (rest )
166170 final_chunks .extend (text_minichunks )
167171 table_caption_prefix = self .extract_caption (text_minichunks [- 1 ])
168172 else :
169173 table_caption_prefix = ""
170-
174+ break
171175
172176 final_final_chunks = [chunk for chunk , chunk_size in merge_chunks_serially (final_chunks , self ._chunk_size , content_dict )]
173177
0 commit comments