From d5ed732815ec14406d648e97295571849737ac2a Mon Sep 17 00:00:00 2001 From: DaniilOr Date: Tue, 9 Dec 2025 14:40:58 +0400 Subject: [PATCH 1/2] update reclaim --- contest/extract.py | 3 +- src/reclaim/__init__.py | 66 ++++++++- src/reclaim/claim_level_prompts.py | 8 +- src/reclaim/decompose.py | 2 +- src/reclaim/extract_claims.py | 222 +++++++++++++++++++++++++++-- src/reclaim/prompts.py | 99 +++++++------ 6 files changed, 340 insertions(+), 60 deletions(-) diff --git a/contest/extract.py b/contest/extract.py index b4fafbd..306d66c 100644 --- a/contest/extract.py +++ b/contest/extract.py @@ -1,4 +1,5 @@ import json +from tqdm import tqdm from transformers import AutoTokenizer from dataclasses import asdict @@ -8,7 +9,7 @@ with open("Meta-Llama-3.1-70B-Instruct-Turbo.json", "r") as f: data = json.load(f) - for _id, row in data.items(): + for _id, row in tqdm(data.items()): claims = extract_and_align_claims( text=row["output"], tokens=row["greedy_tokens"], diff --git a/src/reclaim/__init__.py b/src/reclaim/__init__.py index ab037ff..fae5c0e 100644 --- a/src/reclaim/__init__.py +++ b/src/reclaim/__init__.py @@ -1,10 +1,15 @@ """ReClaim core package.""" from importlib import metadata -from typing import List +from typing import List, Optional from .decompose import doc2sentences -from .extract_claims import Claim, ClaimModel, ClaimsExtractor +from .extract_claims import ( + Claim, + ClaimModel, + ClaimPostprocessingConfig, + ClaimsExtractor, +) from .annotate_claims import ClaimsAnnotator from .openai_client import OpenAIChat @@ -20,15 +25,54 @@ "batch_extract_and_align_claims", "doc2sentences", "Claim", + "ClaimPostprocessingConfig", ] -def extract_claims(text: str, model: str = "gpt-4o") -> List[Claim]: +def _default_postprocess_config( + override: Optional[ClaimPostprocessingConfig], + enable_defaults: bool, +) -> Optional[ClaimPostprocessingConfig]: + """ + Resolve a postprocessing config: + - honor explicit override; + - if allowed, provide a sensible default bundle; + - otherwise, return None to disable all extras. + """ + if override is not None: + return override + if not enable_defaults: + return None + return ClaimPostprocessingConfig( + rewrite_pronouns=True, + sanitize_with_llm=True, + dedupe_with_encoder=True, + dedupe_with_cosine=True, + ) + + +def extract_claims( + text: str, + model: str = "gpt-4o", + postprocess_config: Optional[ClaimPostprocessingConfig] = None, + enable_default_postprocessing: bool = True, +) -> List[Claim]: """ Extract atomic claims from plain text. + + By default, enables post-processing (pronoun rewrite, LLM sanitization, + encoder + BoW dedupe). Pass a custom ClaimPostprocessingConfig or set + enable_default_postprocessing=False to skip these steps. """ result = doc2sentences(doc=text, mode="atomic_claims", model=model, schema=ClaimModel) claim_texts = result.claims if isinstance(result, ClaimModel) else result + chat = OpenAIChat(openai_model=model) + config = _default_postprocess_config(postprocess_config, enable_default_postprocessing) + extractor = ClaimsExtractor( + openai_chat=chat, + postprocess_config=config, + ) + claim_texts = extractor.postprocess_claims(claim_texts, text) return [ Claim( claim_text=claim_text, @@ -47,14 +91,22 @@ def extract_and_align_claims( openai_model: str = "gpt-4o", progress_bar: bool = True, n_threads: int = 1, + postprocess_config: Optional[ClaimPostprocessingConfig] = None, + enable_default_postprocessing: bool = True, ): """ Extract and align claims with token-level provenance from model output tokens. + + By default, enables post-processing (pronoun rewrite, LLM sanitization, + encoder + BoW dedupe). Pass a custom ClaimPostprocessingConfig or set + enable_default_postprocessing=False to skip these steps. """ + config = _default_postprocess_config(postprocess_config, enable_default_postprocessing) extractor = ClaimsExtractor( openai_chat=OpenAIChat(openai_model=openai_model), progress_bar=progress_bar, n_threads=n_threads, + postprocess_config=config, ) return extractor.claims_from_text(text, tokens, tokenizer) @@ -66,14 +118,22 @@ def batch_extract_and_align_claims( openai_model: str = "gpt-4o", progress_bar: bool = True, n_threads: int = 1, + postprocess_config: Optional[ClaimPostprocessingConfig] = None, + enable_default_postprocessing: bool = True, ) -> List[List[Claim]]: """ Batch extract and align claims with token-level provenance from model output tokens. + + By default, enables post-processing (pronoun rewrite, LLM sanitization, + encoder + BoW dedupe). Pass a custom ClaimPostprocessingConfig or set + enable_default_postprocessing=False to skip these steps. """ + config = _default_postprocess_config(postprocess_config, enable_default_postprocessing) extractor = ClaimsExtractor( openai_chat=OpenAIChat(openai_model=openai_model), progress_bar=progress_bar, n_threads=n_threads, + postprocess_config=config, ) return extractor.batch_claims_from_texts(texts, tokens, tokenizer) diff --git a/src/reclaim/claim_level_prompts.py b/src/reclaim/claim_level_prompts.py index ded0c74..dbe913e 100644 --- a/src/reclaim/claim_level_prompts.py +++ b/src/reclaim/claim_level_prompts.py @@ -2,7 +2,13 @@ # Lightweight prompt dictionaries keyed by language to mirror original structure. CLAIM_EXTRACTION_PROMPTS = { - "en": "List all atomic claims from the following sentence. Return each claim on a new line starting with '- '. Sentence: {sent}", + "en": ( + "List all atomic, decontextualized claims from the following sentence.\n" + "- One fact per claim (no conjunctions/enumerations).\n" + "- Replace pronouns or vague references with the specific entity so the claim stands alone.\n" + "Return JSON exactly as {\"claims\": [\"...\"]} and nothing else.\n" + "Sentence: {sent}" + ), } MATCHING_PROMPTS = { diff --git a/src/reclaim/decompose.py b/src/reclaim/decompose.py index d723b44..6e933cc 100644 --- a/src/reclaim/decompose.py +++ b/src/reclaim/decompose.py @@ -14,7 +14,7 @@ def doc2sentences( doc: str, mode: str = "independent_sentences", - model: str = "gpt-4o", + model: str = "gpt-4.1", system_role: str = "You are good at decomposing and decontextualizing text.", num_retries: int = 5, schema: Optional[BaseModel] = None, diff --git a/src/reclaim/extract_claims.py b/src/reclaim/extract_claims.py index 99831bc..71a24c9 100644 --- a/src/reclaim/extract_claims.py +++ b/src/reclaim/extract_claims.py @@ -1,9 +1,11 @@ import logging +import math import re import time +from collections import Counter from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional from pydantic import BaseModel from tqdm import tqdm @@ -11,7 +13,7 @@ from .claim_level_prompts import CLAIM_EXTRACTION_PROMPTS, MATCHING_PROMPTS from .decompose import doc2sentences from .openai_client import OpenAIChat -from .prompts import MATCHING_PROMPT +from .prompts import MATCHING_PROMPT, PRONOUN_REWRITE_PROMPT, SANITIZE_CLAIM_PROMPT log = logging.getLogger("lm_polygraph") @@ -37,6 +39,31 @@ class Claim: aligned_token_ids: List[int] +@dataclass +class ClaimPostprocessingConfig: + """ + Configuration for optional claim cleanup steps. + + Each flag can be toggled independently to ease testing. + """ + + rewrite_pronouns: bool = False + sanitize_with_llm: bool = False + dedupe_with_cosine: bool = False + dedupe_with_encoder: bool = False + encoder_model_name: str = "Qwen/Qwen3-Embedding-0.6B" + encoder_device: Optional[str] = None + similarity_threshold: float = 0.92 + + def enabled(self) -> bool: + return ( + self.rewrite_pronouns + or self.sanitize_with_llm + or self.dedupe_with_cosine + or self.dedupe_with_encoder + ) + + class ClaimModel(BaseModel): """Pydantic model describing a list of extracted claims.""" claims: List[str] @@ -76,6 +103,7 @@ def __init__( extraction_prompts: Dict[str, str] = CLAIM_EXTRACTION_PROMPTS, matching_prompts: Dict[str, str] = MATCHING_PROMPTS, n_threads: int = 1, + postprocess_config: Optional[ClaimPostprocessingConfig] = None, ): """ Initialize the extractor with prompt templates and chat backend. @@ -88,6 +116,7 @@ def __init__( extraction_prompts: Templates used for claim extraction. matching_prompts: Templates used for aligning claims to text spans. n_threads: Maximum worker threads for batch operations. + postprocess_config: Optional toggles for post-processing. """ self.language = language self.openai_chat = openai_chat @@ -96,6 +125,8 @@ def __init__( self.extraction_prompts = extraction_prompts self.matching_prompts = matching_prompts self.n_threads = n_threads + self.postprocess_config = postprocess_config + self._encoder_cache = {} def batch_claims_from_texts( self, @@ -164,6 +195,7 @@ def claims_from_text(self, text: str, tokens: List[int], tokenizer) -> List[Clai uniq_sentences: List[str] = [] claim_list = doc2sentences(doc=text, mode="claims", schema=ClaimModel).claims + claim_list = self.postprocess_claims(claim_list, text) for s in stupid_claims: if s in claim_list: @@ -271,16 +303,15 @@ def _claims_from_sentence( Returns: Claims aligned to spans inside the provided sentence. """ - extracted_claims = self.openai_chat.ask( - self.extraction_prompts[self.language].format(sent=sent) + extracted = self.openai_chat.ask( + self.extraction_prompts[self.language].format(sent=sent), + schema=ClaimModel, ) + claim_texts = extracted.claims if isinstance(extracted, ClaimModel) else [] claims = [] - for claim_text in extracted_claims.split("\n"): - if not claim_text.startswith("- "): - continue - if "there aren't any claims" in claim_text.lower(): + for claim_text in claim_texts: + if not claim_text: continue - claim_text = claim_text[2:].strip() # Ask the model to surface specific words in the sentence that back the claim. chat_ask = self.matching_prompts[self.language].format( sent=sent, @@ -430,6 +461,179 @@ def _align( sent_pos += 1 return aligned_token_ids + def postprocess_claims(self, claims: List[str], text: str) -> List[str]: + """ + Apply optional heuristic + LLM cleanup to extracted claims. + + Steps (controlled by ClaimPostprocessingConfig): + - Rewrite pronoun-heavy claims using the source text for grounding. + - Run an LLM-based sanitization pass to enforce atomicity and completeness. + - Drop near-duplicates using cosine similarity on bag-of-words vectors. + """ + config = self.postprocess_config + if config is None or not config.enabled(): + return claims + + processed: List[str] = [] + pronoun_pattern = re.compile( + r"\b(it|he|she|they|them|his|her|their|this|that|these|those|there)\b", + re.IGNORECASE, + ) + + for claim in claims: + cur_claim = claim.strip() + if not cur_claim: + continue + + if config.rewrite_pronouns and pronoun_pattern.search(cur_claim): + rewritten = self._llm_rewrite(cur_claim, text, PRONOUN_REWRITE_PROMPT) + if rewritten is None: + continue + cur_claim = rewritten + + if config.sanitize_with_llm: + sanitized = self._llm_rewrite(cur_claim, text, SANITIZE_CLAIM_PROMPT) + if sanitized is None: + continue + cur_claim = sanitized + + processed.append(cur_claim) + + if config.dedupe_with_encoder: + processed = self._dedupe_claims_encoder( + processed, + config.similarity_threshold, + model_name=config.encoder_model_name, + device=config.encoder_device, + ) + + if config.dedupe_with_cosine: + processed = self._dedupe_claims(processed, config.similarity_threshold) + + return processed + + def _llm_rewrite(self, claim: str, text: str, prompt: str) -> Optional[str]: + """ + Ask the LLM to rewrite a claim. Returns None if the claim should be dropped. + """ + reply = self.openai_chat.ask(prompt.format(text=text, claim=claim)).strip() + if not reply: + return claim + upper = reply.upper() + if upper == "DROP": + return None + return reply + + def _dedupe_claims(self, claims: List[str], threshold: float) -> List[str]: + """ + Drop near-duplicate claims using cosine similarity over bag-of-words vectors. + """ + kept: List[str] = [] + for claim in claims: + tokens = self._tokenize(claim) + if len(tokens) == 0: + continue + dup = False + for existing in kept: + sim = self._cosine_similarity(tokens, self._tokenize(existing)) + if sim >= threshold: + dup = True + break + if not dup: + kept.append(claim) + return kept + + @staticmethod + def _tokenize(claim: str) -> Counter: + """Lowercase word tokenization for cosine similarity.""" + words = re.findall(r"[\w']+", claim.lower()) + return Counter(words) + + @staticmethod + def _cosine_similarity(a: Counter, b: Counter) -> float: + """Cosine similarity between two sparse bag-of-words Counters.""" + if not a or not b: + return 0.0 + dot = sum(a[k] * b[k] for k in a.keys() & b.keys()) + norm = math.sqrt(sum(v * v for v in a.values())) * math.sqrt( + sum(v * v for v in b.values()) + ) + if norm == 0: + return 0.0 + return dot / norm + + def _dedupe_claims_encoder( + self, + claims: List[str], + threshold: float, + model_name: str, + device: Optional[str], + ) -> List[str]: + """ + Drop near-duplicates using a transformer encoder (e.g., LaBSE or MiniLM). + """ + if len(claims) <= 1: + return claims + + try: + tokenizer, model = self._load_encoder(model_name, device) + except Exception as exc: + log.warning("Falling back to cosine dedupe; failed to load encoder: %s", exc) + return self._dedupe_claims(claims, threshold) + + import torch + import torch.nn.functional as F + + encoded = tokenizer( + claims, + padding=True, + truncation=True, + return_tensors="pt", + ) + if device: + encoded = {k: v.to(device) for k, v in encoded.items()} + model.to(device) + + with torch.no_grad(): + outputs = model(**encoded) + token_embeddings = outputs.last_hidden_state + mask = encoded["attention_mask"].unsqueeze(-1) + summed = (token_embeddings * mask).sum(dim=1) + counts = mask.sum(dim=1).clamp(min=1) + embeddings = summed / counts + embeddings = F.normalize(embeddings, p=2, dim=1) + + kept: List[str] = [] + kept_vecs = [] + for idx, claim in enumerate(claims): + vec = embeddings[idx] + if not kept_vecs: + kept.append(claim) + kept_vecs.append(vec) + continue + sims = [float(torch.dot(vec, kv)) for kv in kept_vecs] + if max(sims) >= threshold: + continue + kept.append(claim) + kept_vecs.append(vec) + return kept + + def _load_encoder(self, model_name: str, device: Optional[str]): + """ + Lazily load and cache a HF encoder model/tokenizer. + """ + if model_name in self._encoder_cache: + return self._encoder_cache[model_name] + + from transformers import AutoModel, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModel.from_pretrained(model_name) + if device: + model.to(device) + self._encoder_cache[model_name] = (tokenizer, model) + return tokenizer, model + def match_claim(self, text: str, claim: str, max_parsed_words: int): """ Find the best matching sentence for a claim and construct a match mask. diff --git a/src/reclaim/prompts.py b/src/reclaim/prompts.py index 85731ab..af054ec 100644 --- a/src/reclaim/prompts.py +++ b/src/reclaim/prompts.py @@ -1,25 +1,19 @@ DOC_TO_ATOMIC_CLAIMS_PROMPT = """ -Your task is to extract atomic claims from a given text. Each claim must meet the following criteria: -1. Informative: Each claim must convey factual information about the subject matter, avoiding generic or irrelevant statements (e.g., "I will provide a balanced response" or "This type of network is simple"). -2. Context-independent: Each claim must be understandable and verifiable on its own, without requiring additional context (e.g., avoid claims like "This type of network is simple" without specifying the network type). -3. De-duplicated: Avoid repeating the same information in different wordings. -4. Precise: Focus on clear and specific information, avoiding vague or overly broad statements. -Define a function named decompose(input: str). The function should return only a Python list containing strings of atomic claims. Do not include any extra formatting, code blocks, or labels like "python" in your response. For example: -If the input text is: -"Mary is a five-year-old girl. She likes playing piano and doesn't like cookies." -The output should be: -["Mary is a five-year-old girl.", "Mary likes playing piano.", "Mary doesn't like cookies."] -Example Input: -"Linear Bus Topology involves connecting all network nodes to a single cable. This design makes it easy to install but difficult to troubleshoot, especially in large networks." -Example Output: -["Linear Bus Topology involves connecting all network nodes to a single cable.", "Linear Bus Topology is easy to install.", "Linear Bus Topology is difficult to troubleshoot.", "Linear Bus Topology is unsuitable for large networks."] - -Important Notes: -1. The output must be a valid Python list with no additional text, code blocks, or formatting. -2. The response must consist of only the list. - -Process the following text according to these rules: -decompose("{doc}") +You extract *atomic*, self-contained claims from the given text. + +Rules: +- A claim is one fact (single predicate); do not bundle multiple facts with "and", commas, or enumerations. +- Each claim must be decontextualized: replace pronouns (he/she/they/it/this/that/these/those/there) and vague references with the concrete entity from the text. +- Each claim must be informative and complete (subject + predicate + object/complement when needed). Drop fragments like "He began his career" or "is considered". +- Avoid duplicates or near-duplicates; keep the most specific phrasing. +- If no atomic claims exist, return an empty list. + +Return JSON exactly in the form: +{{"claims": ["claim 1", "claim 2", "..."]}} +No code fences, no extra keys, no prose. + +Text: +{doc} """ @@ -37,33 +31,19 @@ SENTENCES_TO_CLAIMS_PROMPT = """ -Your task is to decompose the text into atomic claims. -Claims should be a context-independent, fully atomic, representing one fact. Atomic claims are simple, indivisible facts that do not bundle multiple pieces of information together. - -### Guidelines for Decomposition: -1. **Atomicity**: Break down each statement into the smallest possible unit of factual information. Avoid grouping multiple facts in one claim. For example: - - Instead of: "Photosynthesis in plants converts sunlight, carbon dioxide, and water into glucose and oxygen." - - Output: ["Photosynthesis in plants converts sunlight into glucose.", "Photosynthesis in plants converts carbon dioxide into glucose.", "Photosynthesis in plants converts water into glucose.", "Photosynthesis in plants produces oxygen."] - - - Instead of: "The heart pumps blood through the body and regulates oxygen supply to tissues." - - Output: ["The heart pumps blood through the body.", "The heart regulates oxygen supply to tissues."] - - - Instead of: "Gravity causes objects to fall to the ground and keeps planets in orbit around the sun." - - Output: ["Gravity causes objects to fall to the ground.", "Gravity keeps planets in orbit around the sun."] +Decompose the text into *atomic, decontextualized* claims. -2. **Context-Independent**: Each claim must be understandable and verifiable on its own without requiring additional context or references to other claims. Avoid vague claims like "This process is important for life." +Requirements: +- One fact per claim; split conjunctions/enumerations into separate claims. +- Replace pronouns and vague references with the specific entity from the text so the claim stands alone. +- Keep claims informative and complete (subject + predicate + complement as needed). Drop non-informative fragments. +- Remove duplicates/near-duplicates; keep the most specific version. -3. **Precise and Unambiguous**: Ensure the claims are specific and avoid combining related ideas that can stand independently. +Return JSON exactly in the form: +{{"claims": ["claim 1", "claim 2", "..."]}} +No code fences or extra text. -4. **No Formatting**: The response must be a Python list of strings without any extra formatting, code blocks, or labels like "python". - -### Example: -If the input text is: -"Mary is a five-year-old girl. She likes playing piano and doesn't like cookies." -Extracted claims should be: -"Mary is a five-year-old girl.", "Mary likes playing piano.", "Mary doesn't like cookies." - -### Now, decompose the following text into atomic claims: +Text: {doc} """ @@ -80,6 +60,35 @@ process("{doc}") """ +PRONOUN_REWRITE_PROMPT = """ +Rewrite the claim so it is fully self-contained and does not rely on pronouns or vague references. Use the text to resolve the referents. If it cannot be rewritten, return DROP. + +Text: +{text} + +Claim: +{claim} + +Return only the rewritten claim, or DROP. +""" + +SANITIZE_CLAIM_PROMPT = """ +You are checking a claim for quality. The claim must be: +- one atomic fact (single predicate; no "and"/enumerations), +- decontextualized (explicit subject, no pronouns like he/she/they/it/this/that/these/those/there), +- informative and complete (no bare fragments like "was 13 years old" without who, where, when; no "is considered" without the thing). + +If the claim violates these rules, rewrite it into one valid atomic claim using the text for context. If it cannot be made valid, return DROP. + +Text: +{text} + +Claim: +{claim} + +Return only the rewritten claim, or DROP. +""" + MATCHING_PROMPT = """ Task: Analyze the given text and the claim (which was extracted from the text). For each sentence in the text: From 677f51ff6bf04f61423f5dbf8720a912bc5e9c44 Mon Sep 17 00:00:00 2001 From: DaniilOr Date: Tue, 9 Dec 2025 15:34:23 +0400 Subject: [PATCH 2/2] fix non-atomic --- src/reclaim/__init__.py | 9 +++++---- src/reclaim/extract_claims.py | 30 ++++++++++++++++++++++++++++-- src/reclaim/prompts.py | 17 +++++++++++++++++ 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/src/reclaim/__init__.py b/src/reclaim/__init__.py index fae5c0e..0cd4a32 100644 --- a/src/reclaim/__init__.py +++ b/src/reclaim/__init__.py @@ -46,6 +46,7 @@ def _default_postprocess_config( return ClaimPostprocessingConfig( rewrite_pronouns=True, sanitize_with_llm=True, + split_non_atomic=True, dedupe_with_encoder=True, dedupe_with_cosine=True, ) @@ -53,7 +54,7 @@ def _default_postprocess_config( def extract_claims( text: str, - model: str = "gpt-4o", + model: str = "gpt-4.1", postprocess_config: Optional[ClaimPostprocessingConfig] = None, enable_default_postprocessing: bool = True, ) -> List[Claim]: @@ -88,7 +89,7 @@ def extract_and_align_claims( text, tokens, tokenizer, - openai_model: str = "gpt-4o", + openai_model: str = "gpt-4.1", progress_bar: bool = True, n_threads: int = 1, postprocess_config: Optional[ClaimPostprocessingConfig] = None, @@ -115,7 +116,7 @@ def batch_extract_and_align_claims( texts: List[str], tokens: List[List[int]], tokenizer, - openai_model: str = "gpt-4o", + openai_model: str = "gpt-4.1", progress_bar: bool = True, n_threads: int = 1, postprocess_config: Optional[ClaimPostprocessingConfig] = None, @@ -142,7 +143,7 @@ def batch_extract_and_align_claims( def annotate_claims( claims: List[str], contexts: List[str], - openai_model: str = "gpt-4o", + openai_model: str = "gpt-4.1", progress_bar: bool = True, n_threads: int = 1, ): diff --git a/src/reclaim/extract_claims.py b/src/reclaim/extract_claims.py index 71a24c9..ce4f321 100644 --- a/src/reclaim/extract_claims.py +++ b/src/reclaim/extract_claims.py @@ -13,7 +13,12 @@ from .claim_level_prompts import CLAIM_EXTRACTION_PROMPTS, MATCHING_PROMPTS from .decompose import doc2sentences from .openai_client import OpenAIChat -from .prompts import MATCHING_PROMPT, PRONOUN_REWRITE_PROMPT, SANITIZE_CLAIM_PROMPT +from .prompts import ( + MATCHING_PROMPT, + PRONOUN_REWRITE_PROMPT, + SANITIZE_CLAIM_PROMPT, + SPLIT_NON_ATOMIC_PROMPT, +) log = logging.getLogger("lm_polygraph") @@ -49,6 +54,7 @@ class ClaimPostprocessingConfig: rewrite_pronouns: bool = False sanitize_with_llm: bool = False + split_non_atomic: bool = False dedupe_with_cosine: bool = False dedupe_with_encoder: bool = False encoder_model_name: str = "Qwen/Qwen3-Embedding-0.6B" @@ -59,6 +65,7 @@ def enabled(self) -> bool: return ( self.rewrite_pronouns or self.sanitize_with_llm + or self.split_non_atomic or self.dedupe_with_cosine or self.dedupe_with_encoder ) @@ -497,7 +504,11 @@ def postprocess_claims(self, claims: List[str], text: str) -> List[str]: continue cur_claim = sanitized - processed.append(cur_claim) + if config.split_non_atomic: + split_claims = self._split_non_atomic(cur_claim, text) + processed.extend(split_claims) + else: + processed.append(cur_claim) if config.dedupe_with_encoder: processed = self._dedupe_claims_encoder( @@ -524,6 +535,21 @@ def _llm_rewrite(self, claim: str, text: str, prompt: str) -> Optional[str]: return None return reply + def _split_non_atomic(self, claim: str, text: str) -> List[str]: + """ + Split a claim into atomic claims using an LLM. + """ + try: + res = self.openai_chat.ask( + SPLIT_NON_ATOMIC_PROMPT.format(text=text, claim=claim), + schema=ClaimModel, + ) + claims = res.claims if isinstance(res, ClaimModel) else [] + except Exception: + return [claim] + cleaned = [c.strip() for c in claims if c and c.strip()] + return cleaned if cleaned else [claim] + def _dedupe_claims(self, claims: List[str], threshold: float) -> List[str]: """ Drop near-duplicate claims using cosine similarity over bag-of-words vectors. diff --git a/src/reclaim/prompts.py b/src/reclaim/prompts.py index af054ec..3e57af7 100644 --- a/src/reclaim/prompts.py +++ b/src/reclaim/prompts.py @@ -89,6 +89,23 @@ Return only the rewritten claim, or DROP. """ +SPLIT_NON_ATOMIC_PROMPT = """ +Split the claim into atomic, decontextualized claims if it contains multiple facts. +- Each output claim must be one fact with a single predicate. +- Replace pronouns with explicit entities from the text so each claim stands alone. +- If the claim is already atomic, return it as a single-element list. +- If you cannot produce valid atomic claims, return an empty list. + +Return JSON exactly as: {{"claims": ["claim 1", "claim 2", "..."]}} +No code fences or extra text. + +Text: +{text} + +Claim: +{claim} +""" + MATCHING_PROMPT = """ Task: Analyze the given text and the claim (which was extracted from the text). For each sentence in the text: