@@ -6224,6 +6224,13 @@ def index(
62246224 self , doc : "DoclingDocument" , page_nrs : Optional [set [int ]] = None
62256225 ) -> None :
62266226
6227+ if page_nrs is not None and (
6228+ unavailable_page_nrs := page_nrs - set (doc .pages .keys ())
6229+ ):
6230+ raise ValueError (
6231+ f"The following page numbers are not present in the document: { unavailable_page_nrs } "
6232+ )
6233+
62276234 orig_ref_to_new_ref : dict [str , str ] = {}
62286235 page_delta = self ._max_page - min (doc .pages .keys ()) + 1 if doc .pages else 0
62296236
@@ -6265,7 +6272,29 @@ def index(
62656272
62666273 if item .parent :
62676274 # set item's parent
6268- new_parent_cref = orig_ref_to_new_ref [item .parent .cref ]
6275+ new_parent_cref = orig_ref_to_new_ref .get (item .parent .cref )
6276+ if new_parent_cref is None :
6277+
6278+ parent_ref = item .parent
6279+ while new_parent_cref is None and parent_ref is not None :
6280+ parent_ref = RefItem (
6281+ cref = parent_ref .resolve (doc ).parent .cref
6282+ )
6283+ new_parent_cref = orig_ref_to_new_ref .get (
6284+ parent_ref .cref
6285+ )
6286+
6287+ if new_parent_cref is not None :
6288+ warnings .warn (
6289+ f"Parent { item .parent .cref } not found in indexed nodes, "
6290+ f"using ancestor { new_parent_cref } instead"
6291+ )
6292+ else :
6293+ warnings .warn (
6294+ "No ancestor found in indexed nodes, using body as parent"
6295+ )
6296+ new_parent_cref = "#/body"
6297+
62696298 new_item .parent = RefItem (cref = new_parent_cref )
62706299
62716300 # add item to parent's children
@@ -6355,38 +6384,54 @@ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
63556384 res_doc ._update_from_index (doc_index )
63566385 return res_doc
63576386
6358- def _validate_rules (self ):
6387+ def _validate_rules (self , raise_on_error : bool = True ):
6388+
def _handle(error: Exception):
    """Report a validation failure according to ``raise_on_error``.

    When the enclosing method's ``raise_on_error`` flag is true the given
    exception is raised; otherwise its message is emitted through
    ``warnings.warn`` and validation carries on.

    Args:
        error: The exception describing the rule violation.

    Raises:
        Exception: ``error`` itself, when ``raise_on_error`` is true.
    """
    if not raise_on_error:
        # Lenient mode: downgrade the violation to a warning.
        warnings.warn(str(error))
        return
    raise error
63596394
def validate_furniture(doc: DoclingDocument):
    """Report an error if the deprecated furniture node has any children.

    The furniture node is read exactly once, while ``DeprecationWarning``
    is suppressed, and that reference is reused for both the check and the
    error message. Building the message therefore no longer touches
    ``doc.furniture`` outside the suppressed region.

    Args:
        doc: The document whose furniture node is checked.

    The violation is handed to ``_handle`` as a ``ValueError``; whether it
    is raised or only warned about is decided there.
    """
    with warnings.catch_warnings():
        # NOTE(review): presumably reading ``doc.furniture`` is what emits
        # the DeprecationWarning -- confirm against the field definition.
        warnings.simplefilter("ignore", category=DeprecationWarning)
        furniture = doc.furniture
        has_furniture_children = len(furniture.children) > 0
    if has_furniture_children:
        _handle(
            ValueError(
                f"Deprecated furniture node {furniture.self_ref} has children"
            ),
        )
63686405
def validate_list_group(doc: DoclingDocument, item: ListGroup):
    """Check that every child of a list group resolves to a ``ListItem``.

    Each child reference is resolved against ``doc``; any child that is
    not a ``ListItem`` is reported to ``_handle`` as a ``ValueError``.
    If ``_handle`` does not raise, the remaining children are still checked.

    Args:
        doc: The document used to resolve child references.
        item: The list group whose children are inspected.
    """
    for child_ref in item.children:
        # The name ``child`` is load-bearing: the ``{child.label=}``
        # specifier below embeds the expression text in the message.
        child = child_ref.resolve(doc)
        if isinstance(child, ListItem):
            continue
        _handle(
            ValueError(
                f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
            ),
        )
63766415
def validate_list_item(doc: DoclingDocument, item: ListItem):
    """Check that a list item is parented by a ``ListGroup``.

    Two violations are reported to ``_handle`` as ``ValueError``: the item
    has no parent at all, or its parent resolves to something other than
    a ``ListGroup``. At most one of them is reported per item.

    Args:
        doc: The document used to resolve the parent reference.
        item: The list item being validated.
    """
    parent_ref = item.parent
    if parent_ref is None:
        _handle(
            ValueError(f"ListItem {item.self_ref} has no parent"),
        )
        return
    if isinstance(parent_ref.resolve(doc), ListGroup):
        return
    _handle(
        ValueError(
            f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
        ),
    )
63846427
def validate_group(doc: DoclingDocument, item: GroupItem):
    """Check that a parented group is not empty.

    A group that has a parent but no children is reported to ``_handle``
    as a ``ValueError``. A group without a parent is accepted even when
    empty (tolerate empty body, but not other groups).

    Args:
        doc: The owning document (not consulted by this check).
        item: The group being validated.
    """
    # Same evaluation order as the direct form: parent first, then children.
    if not item.parent or item.children:
        return
    _handle(
        ValueError(f"Group {item.self_ref} has no children"),
    )
63906435
63916436 validate_furniture (self )
63926437
0 commit comments