diff --git a/run_yourbench.py b/run_yourbench.py index 600245fc..8b6cbe2e 100644 --- a/run_yourbench.py +++ b/run_yourbench.py @@ -14,6 +14,26 @@ ) logger = logging.getLogger(__name__) +# Default instructions injected into BOTH question-generation stages +# (single-shot + multi-hop) as `additional_instructions`. yourbench's default +# is "Generate questions to test an undergraduate student", which yields +# generic, self-contained comprehension questions that frontier models can +# answer WITHOUT the source — useless for measuring document-specific +# knowledge. This forces closed-book, fact-anchored questions instead. +# Override per-deployment with the QUESTION_GENERATION_INSTRUCTIONS env var. +DEFAULT_CLOSED_BOOK_INSTRUCTIONS = ( + "Generate CLOSED-BOOK, factual-recall questions that test whether a model has the " + "specific knowledge contained in this document. Every question MUST hinge on a concrete, " + "document-specific detail stated in the source — an exact name, number, value, configuration, " + "limit, default, sequence of steps, or described behavior. A knowledgeable expert who has NOT " + "read this document must be UNABLE to answer it from general domain knowledge or from the wording " + "of the question itself. Do NOT generate generic questions such as 'what are the benefits of X', " + "'how does X enhance/improve Y', or 'why is X important' — those are answerable by reasoning alone. " + "Keep each answer short, concrete, and verifiable against the source (a fact, value, name, or brief " + "enumeration), not an open-ended explanation. Prefer questions a generic model would plausibly get " + "wrong without this document." +) + def download_from_s3(bucket_name, object_key, local_path): """Download file from S3 bucket""" @@ -138,7 +158,22 @@ def main(): lighteval: citation_score_filtering: """ - create_config_file(config_content, config_path) + # Inject closed-book question-generation instructions into both generation + # stages. 
Without this, yourbench uses its generic default and produces + # questions answerable without the source document. + question_instructions = os.environ.get( + "QUESTION_GENERATION_INSTRUCTIONS", DEFAULT_CLOSED_BOOK_INSTRUCTIONS + ).strip() + config_dict = yaml.safe_load(config_content) + pipeline_cfg = config_dict.setdefault("pipeline", {}) + for stage in ("single_shot_question_generation", "multi_hop_question_generation"): + stage_cfg = pipeline_cfg.get(stage) or {} + stage_cfg["additional_instructions"] = question_instructions + pipeline_cfg[stage] = stage_cfg + logger.info(f"Creating config file at {config_path} (closed-book question instructions applied)") + os.makedirs(os.path.dirname(config_path), exist_ok=True) + with open(config_path, "w") as f: + yaml.dump(config_dict, f) # Step 4: Run yourbench run_yourbench(config_path) diff --git a/yourbench/utils/prompts.py b/yourbench/utils/prompts.py index bbfe87ea..ff2215ad 100644 --- a/yourbench/utils/prompts.py +++ b/yourbench/utils/prompts.py @@ -43,7 +43,7 @@ QUESTION_GENERATION_SYSTEM_PROMPT_HEADER = """## Your Role -You are an expert educational content creator specializing in crafting thoughtful, rich, and engaging questions based on provided textual information. Your goal is to produce meaningful, moderately challenging question-answer pairs that encourage reflection, insight, and nuanced understanding, tailored specifically according to provided instructions. +You are an expert benchmark author specializing in CLOSED-BOOK, factual-recall questions that test whether a model possesses the specific knowledge contained in a provided document. Your goal is to produce question-answer pairs that can only be answered correctly by someone who has actually read THIS specific text — never from general domain knowledge, common sense, or the wording of the question itself. 
## Input Structure @@ -67,7 +67,7 @@ ## Primary Objective -Your goal is to generate a thoughtful set of question-answer pairs from a single provided `<text_chunk>`. Aim for moderate complexity that encourages learners to deeply engage with the content, critically reflect on implications, and clearly demonstrate their understanding. +Your goal is to generate fact-anchored question-answer pairs from a single provided `<text_chunk>`. Each question must hinge on a concrete, document-specific detail stated in the chunk, so that ONLY a reader of this exact text can answer it correctly. Do not aim for "moderate complexity" or open-ended reflection — aim for specificity and verifiability. ### Context Fields: @@ -86,8 +86,8 @@ 2. **Concept Exploration** - Consider implicit assumptions, subtle details, underlying theories, and potential applications of the provided information. -3. **Strategic Complexity Calibration** - - Thoughtfully rate difficulty (1-10), ensuring moderate complexity aligned with the additional instructions provided. +3. **Specificity Calibration** + - Rate difficulty (1-10) by how strongly the question depends on this specific source: higher when only a reader of this chunk could answer, lower when a generalist could guess. Discard questions a model could answer without the document. 4. **Intentional Question Planning** - Plan how questions can invite deeper understanding, meaningful reflection, or critical engagement, ensuring each question is purposeful. @@ -112,28 +112,27 @@ ## Question Generation Guidelines -### Encouraged Question Characteristics: +### Required Question Characteristics: -- **Thoughtful Engagement**: Prioritize creating questions that inspire deeper thought and nuanced consideration. -- **Moderate Complexity**: Develop questions that challenge learners appropriately without overwhelming them, following the provided additional instructions. 
-- **Self-contained Clarity**: Questions and answers should contain sufficient context, clearly understandable independently of external references. -- **Educational Impact**: Ensure clear pedagogical value, reflecting meaningful objectives and genuine content comprehension. -- **Conversational Tone**: Formulate engaging, natural, and realistic questions appropriate to the instructional guidelines. +- **Source-Dependent Specificity**: Each question MUST hinge on a specific fact, name, number, value, configuration, limit, default, or step that is stated in the `<text_chunk>`. A knowledgeable expert who has NOT read this document must be unable to answer it from general knowledge or from the question's own wording. +- **Closed-Book, Not Reasoning**: Do NOT generate questions that can be answered by reasoning, inference, or domain familiarity alone. Avoid generic framings such as "what are the benefits of X", "how does X enhance/improve Y", or "why is X important" — a capable model answers those without the document. +- **Verifiable Answer**: The answer must be concrete and checkable against the chunk (a fact, value, name, or short enumeration), not an open-ended essay. +- **Discriminating**: Favor details that a generic model would plausibly get wrong or omit — exact terminology, specific steps/counts/sequences, named components, thresholds, or described behaviors unique to this text. +- **Natural Phrasing**: Phrase questions naturally; do not reference "the document" or "the text" explicitly (the question stands on its own grammatically), but its ANSWER must still require this specific source. 
-### Permitted Question Types: +### Question Types (in priority order): -- Analytical -- Application-based -- Clarification -- Counterfactual -- Conceptual -- True-False -- Factual -- Open-ended -- False-premise -- Edge-case +Strongly prefer: +- Factual (specific stated facts) +- Clarification (precise meaning of a specific term/value as defined here) +- Edge-case (specific limits, defaults, exceptions stated in the text) +- True-False (about a specific stated claim) +- False-premise (contradicting a specific stated detail) -(You do not need to use every question type, only those naturally fitting the content and instructions.)""" +Avoid (these are answerable without the document): +- Analytical, Conceptual, Application-based, Counterfactual, Open-ended + +(Use only the types that yield source-dependent, verifiable questions for this chunk.)""" QUESTION_GENERATION_SYSTEM_PROMPT_OUTPUT = """## Output Structure @@ -207,10 +206,10 @@ class MultipleChoiceQuestion(BaseModel): QUESTION_GENERATION_SYSTEM_PROMPT_FOOTER = """## Important Notes - Strive to generate questions that inspire genuine curiosity, reflection, and thoughtful engagement. - Maintain clear, direct, and accurate citations drawn verbatim from the provided text_chunk. -- Ensure complexity and depth reflect thoughtful moderation as guided by the additional instructions. -- Each "thought_process" should reflect careful consideration and reasoning behind your question selection. +- Prioritize source-dependent specificity over complexity: every question must require a concrete detail from the text_chunk to answer. +- Each "thought_process" should state WHICH specific fact from the chunk the question tests and why a model without the document could not answer it. - Ensure rigorous adherence to JSON formatting and the provided Pydantic validation model. -- When generating questions, NEVER include phrases like 'as per the text,' 'according to the document,' or any similar explicit references. 
Questions should inherently integrate content naturally and stand independently without explicit references to the source material +- Do not use meta-phrases like 'as per the text' or 'according to the document' in the question wording — phrase it naturally. This is about phrasing only: the question's ANSWER must still depend on this specific source, not on general knowledge. """ QUESTION_GENERATION_SYSTEM_PROMPT = ( @@ -243,7 +242,7 @@ class MultipleChoiceQuestion(BaseModel): MULTI_HOP_QUESTION_GENERATION_SYSTEM_HEADER = """## Your Role -You are an expert educational content creator specialized in generating insightful and thoughtfully designed multi-hop questions. Your task is to craft sophisticated, moderately challenging questions that inherently require careful, integrative reasoning over multiple chunks of textual information. Aim to provoke thoughtful reflection, nuanced understanding, and synthesis, particularly when the provided text allows for it. +You are an expert benchmark author specialized in CLOSED-BOOK, factual-recall multi-hop questions. Your task is to craft questions that require combining SPECIFIC facts stated across multiple chunks of THIS document, so that only a reader of these exact texts can answer them. Each question must depend on concrete, document-specific details — not on general domain knowledge, reasoning, or the wording of the question itself. ## Input Structure @@ -273,7 +272,7 @@ class MultipleChoiceQuestion(BaseModel): ## Primary Objective -Generate a thoughtful, educationally meaningful set of multi-hop question-answer pairs. Questions should ideally integrate concepts across multiple text chunks, challenging learners moderately and encouraging critical thinking and deeper understanding. +Generate multi-hop question-answer pairs that require combining specific, stated facts from across multiple text chunks. 
Each question must hinge on concrete details (names, numbers, values, steps, behaviors) found in the chunks, so that only a reader of these exact texts can answer correctly. Do not aim for "moderate challenge" or open-ended reflection — aim for source-dependent specificity and verifiable answers. ### Context Fields: - ``: Document context @@ -293,8 +292,8 @@ class MultipleChoiceQuestion(BaseModel): 2. **Reasoning Path Construction** - Construct potential pathways of multi-hop reasoning by connecting ideas, details, or implications found across text chunks. -3. **Complexity Calibration** - - Rate difficulty thoughtfully on a scale of 1-10, moderately challenging learners according to provided additional instructions. +3. **Specificity Calibration** + - Rate difficulty (1-10) by how strongly the question depends on combining specific facts from these chunks: higher when only a reader of these texts could answer, lower when a generalist could guess. Discard questions answerable without the document. 4. **Strategic Question Selection** - Choose questions that naturally emerge from the depth and complexity of the content provided, prioritizing integrative reasoning and genuine curiosity. @@ -302,24 +301,15 @@ class MultipleChoiceQuestion(BaseModel): ## Question Generation Guidelines ### Question Characteristics -- **Multi-Hop Integration**: Questions should naturally require integration across multiple chunks, demonstrating clear interconnected reasoning. -- **Thoughtfulness & Complexity**: Construct questions that stimulate critical thinking, reflection, or moderate challenge appropriate to the content. -- **Clarity & Precision**: Ensure each question and answer clearly and concisely communicates intent without ambiguity. -- **Educational Relevance**: Ensure each question has clear pedagogical purpose, enhancing understanding or critical reflection. -- **Authentic Language**: Use engaging, conversational language reflecting genuine human curiosity and inquiry. 
- -### Suggested Question Types -(Use naturally, as fitting to the content complexity) -- Analytical -- Application-based -- Clarification -- Counterfactual -- Conceptual -- True-False -- Factual -- Open-ended -- False-premise -- Edge-case +- **Multi-Hop Source Dependency**: Questions must require combining two or more SPECIFIC facts stated across the chunks. A model without these exact texts must be unable to answer from general knowledge. +- **Verifiable Answer**: The answer must be concrete and checkable against the chunks (facts, values, names, short enumerations), not an open-ended essay. +- **Closed-Book, Not Reasoning**: Avoid generic framings ("what are the benefits of…", "how does … improve …", "why is … important") that a capable model answers without the document. +- **Clarity & Precision**: Each question and answer communicates intent concisely and unambiguously. +- **Discriminating**: Favor combinations of specific terminology, steps, counts, components, or thresholds unique to these texts. + +### Question Types (in priority order) +Strongly prefer: Factual, Clarification, Edge-case, True-False, False-premise — each anchored to specific stated facts that must be combined across chunks. +Avoid (answerable without the document): Analytical, Conceptual, Application-based, Counterfactual, Open-ended. ## **Filtering Irrelevant Content**: @@ -335,13 +325,11 @@ class MultipleChoiceQuestion(BaseModel): MULTI_HOP_QUESTION_GENERATION_SYSTEM_FOOTER = """## Important Notes -- Prioritize depth and thoughtfulness in your reasoning paths. -- Allow natural complexity to guide question formulation, aiming for moderate challenge. -- Precisely cite verbatim excerpts from text chunks. -- Clearly communicate your thought process for integrative reasoning. +- Prioritize source-dependent specificity: each question must require combining concrete facts stated across the chunks. +- Precisely cite verbatim excerpts from the text chunks for every fact used. 
+- In "thought_process", state which specific facts from which chunks are combined and why a model without these texts could not answer. - Adhere strictly to JSON formatting and Pydantic validation requirements. -- Generate questions that genuinely inspire deeper reflection or meaningful exploration of the provided content. -- When generating questions, NEVER include phrases like 'as per the text,' 'according to the document,' or any similar explicit references. Questions should inherently integrate content naturally and stand independently without explicit references to the source material""" +- Do not use meta-phrases like 'as per the text' or 'according to the document' in the question wording — phrase it naturally. This is about phrasing only: the question's ANSWER must still depend on these specific sources, not on general knowledge.""" MULTI_HOP_QUESTION_GENERATION_SYSTEM_PROMPT = ( MULTI_HOP_QUESTION_GENERATION_SYSTEM_HEADER