From 8eb79cc4d1ef3ae1d2994ee76be83e59f08ac4b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jon=20Lei=C3=B1ena?=
Date: Thu, 11 Dec 2025 16:41:11 +0100
Subject: [PATCH] feat: Add code_x_glue code_to_code task

---
 README.md                                     |   9 +
 bigcode_eval/tasks/__init__.py                |   3 +-
 .../tasks/codexglue_code_to_code_trans.py     | 257 ++++++++++++++++++
 requirements-codebleu.txt                     |   1 +
 requirements.txt                              |   3 +
 5 files changed, 272 insertions(+), 1 deletion(-)
 create mode 100644 bigcode_eval/tasks/codexglue_code_to_code_trans.py
 create mode 100644 requirements-codebleu.txt

diff --git a/README.md b/README.md
index aa3bb89e3..dab65d7c9 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Below are the features and tasks of this framework:
 - [Recode](https://github.com/amazon-science/recode/tree/main) applied to the HumanEval benchmark. It evaluates the robustness of code-generation models.
 - [Pal](https://github.com/reasoning-machines/pal) Program-aided Language Models evaluation for grade school math problems : [GSM8K](https://huggingface.co/datasets/gsm8k) and [GSM-HARD](https://huggingface.co/datasets/reasoning-machines/gsm-hard). These problems are solved by generating reasoning chains of text and code.
 - Code to text task from [CodeXGLUE](https://huggingface.co/datasets/code_x_glue_ct_code_to_text) (zero-shot & fine-tuning) for 6 languages: **Python, Go, Ruby, Java, JavaScript and PHP.** Documentation translation task from [CodeXGLUE](https://huggingface.co/datasets/code_x_glue_tt_text_to_text).
+- [Code-to-Code Translation](https://huggingface.co/datasets/google/code_x_glue_cc_code_to_code_trans) task from CodeXGLUE for translating between **Java** and **C#**, evaluated using the [CodeBLEU](https://github.com/k4black/codebleu) metric.
 - [CoNaLa](https://huggingface.co/datasets/neulab/conala) for **Python** code generation (2-shot setting and evaluation with BLEU score).
 - [Concode](https://huggingface.co/datasets/code_x_glue_tc_text_to_code) for **Java** code generation (2-shot setting and evaluation with BLEU score).
 - 3 multilingual downstream classification tasks: [Java Complexity prediction](https://huggingface.co/datasets/codeparrot/codecomplex), [Java code equivalence prediction](https://huggingface.co/datasets/code_x_glue_cc_clone_detection_big_clone_bench), [C code defect prediction](https://huggingface.co/datasets/code_x_glue_cc_defect_detection).
@@ -51,6 +52,14 @@ Install [`torch`](https://pytorch.org/get-started/locally/) based on your device
 ```
 pip install -e .
 ```
+
+### CodeBLEU Installation (for Code-to-Code Translation)
+To run the `codexglue_code_to_code_trans` task, you need to install the [CodeBLEU](https://github.com/k4black/codebleu) metric. Due to a [dependency conflict](https://github.com/k4black/codebleu/issues/62) between `codebleu` and newer `tree-sitter` versions, first install `codebleu` without its dependencies, then install the remaining requirements, which include compatible `tree-sitter` packages:
+```bash
+pip install -r requirements-codebleu.txt --no-deps
+pip install -r requirements.txt
+```
+
 To run the `DS-1000` benchmark, additional constraints must be resolved.
 ```
 # python version must be 3.7.10
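As a quick sanity check that the two-step install above resolved correctly, one can score a trivial pair directly with `calc_codebleu`, the metric entry point this patch relies on. A minimal sketch, not part of the patch; the toy C# string is invented:

```python
# Illustrative smoke test: verify codebleu imports and scores after the
# two-step install (pip install -r requirements-codebleu.txt --no-deps,
# then pip install -r requirements.txt).
from codebleu import calc_codebleu

snippet = "public int Add(int a, int b) { return a + b; }"  # invented toy example
result = calc_codebleu(
    references=[snippet],
    predictions=[snippet],
    lang="c_sharp",
)
# Identical reference and prediction should score at or near 1.0.
print(result["codebleu"])
```
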
diff --git a/bigcode_eval/tasks/__init__.py b/bigcode_eval/tasks/__init__.py
index 8162a5f1a..a3c576aa7 100644
--- a/bigcode_eval/tasks/__init__.py
+++ b/bigcode_eval/tasks/__init__.py
@@ -1,7 +1,7 @@
 import inspect
 from pprint import pprint
 
-from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
+from . import (apps, codexglue_code_to_text, codexglue_text_to_text, codexglue_code_to_code_trans, conala,
                concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
                instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus,
                multiple, parity, python_bugs, quixbugs, recode, santacoder_fim,
@@ -11,6 +11,7 @@
     **apps.create_all_tasks(),
     **codexglue_code_to_text.create_all_tasks(),
     **codexglue_text_to_text.create_all_tasks(),
+    **codexglue_code_to_code_trans.create_all_tasks(),
     **multiple.create_all_tasks(),
     "codexglue_code_to_text-python-left": codexglue_code_to_text.LeftCodeToText,
     "conala": conala.Conala,
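The registration above is all the wiring the harness needs: `create_all_tasks()` is expanded into the task registry. A sketch (not part of the patch) of the two keys the new module contributes:

```python
# Illustrative: list the task names the new module registers.
from bigcode_eval.tasks import codexglue_code_to_code_trans

tasks = codexglue_code_to_code_trans.create_all_tasks()
print(sorted(tasks))
# ['codexglue_code_to_code_trans-cs_java', 'codexglue_code_to_code_trans-java_cs']
```
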
diff --git a/bigcode_eval/tasks/codexglue_code_to_code_trans.py b/bigcode_eval/tasks/codexglue_code_to_code_trans.py
new file mode 100644
index 000000000..d2f8369a9
--- /dev/null
+++ b/bigcode_eval/tasks/codexglue_code_to_code_trans.py
@@ -0,0 +1,257 @@
+"""CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation
+https://arxiv.org/abs/2102.04664
+
+Code-to-Code Translation task from CodeXGLUE:
+translating code between the Java and C# programming languages.
+The dataset is collected from several public repos, including Lucene, POI, JGit and Antlr.
+
+Dataset: https://huggingface.co/datasets/google/code_x_glue_cc_code_to_code_trans
+- 10,300 training samples
+- 500 validation samples
+- 1,000 test samples
+
+This is a two-shot task evaluated with the CodeBLEU score.
+CodeBLEU is a metric specifically designed for code generation that considers:
+- N-gram matching (like BLEU)
+- Weighted n-gram matching based on syntax
+- Syntax match using AST
+- Dataflow match for semantic similarity
+
+Reference: https://arxiv.org/abs/2009.10297
+"""
+import json
+
+from bigcode_eval.base import Task
+
+_CITATION = """
+@article{DBLP:journals/corr/abs-2102-04664,
+  author    = {Shuai Lu and
+               Daya Guo and
+               Shuo Ren and
+               Junjie Huang and
+               Alexey Svyatkovskiy and
+               Ambrosio Blanco and
+               Colin B. Clement and
+               Dawn Drain and
+               Daxin Jiang and
+               Duyu Tang and
+               Ge Li and
+               Lidong Zhou and
+               Linjun Shou and
+               Long Zhou and
+               Michele Tufano and
+               Ming Gong and
+               Ming Zhou and
+               Nan Duan and
+               Neel Sundaresan and
+               Shao Kun Deng and
+               Shengyu Fu and
+               Shujie Liu},
+  title     = {CodeXGLUE: {A} Machine Learning Benchmark Dataset for Code Understanding
+               and Generation},
+  journal   = {CoRR},
+  volume    = {abs/2102.04664},
+  year      = {2021}
+}
+"""
+
+# Translation directions supported
+# Note: codebleu_lang uses the language identifier expected by the codebleu package
+TRANSLATION_DIRECTIONS = {
+    "java_cs": {
+        "source": "java",
+        "target": "cs",
+        "source_name": "Java",
+        "target_name": "C#",
+        "codebleu_lang": "c_sharp",  # Target language for CodeBLEU evaluation
+    },
+    "cs_java": {
+        "source": "cs",
+        "target": "java",
+        "source_name": "C#",
+        "target_name": "Java",
+        "codebleu_lang": "java",  # Target language for CodeBLEU evaluation
+    },
+}
+
+
+def create_all_tasks():
+    """Creates a dictionary of tasks for both translation directions.
+
+    :return: {task_name: task}
+        e.g. {codexglue_code_to_code_trans-java_cs: Task, codexglue_code_to_code_trans-cs_java: Task}
+    """
+    return {
+        f"codexglue_code_to_code_trans-{direction}": create_task(direction)
+        for direction in TRANSLATION_DIRECTIONS
+    }
+
+
+def create_task(direction):
+    class CodeToCodeTransTask(CodeToCodeTrans):
+        def __init__(self, **kwargs):
+            super().__init__(direction, **kwargs)
+
+    return CodeToCodeTransTask
+
+
+class CodeToCodeTrans(Task):
+    """Code-to-Code Translation task for Java ↔ C# translation.
+
+    A task represents an entire benchmark including its dataset, problems,
+    answers, generation settings and evaluation methods.
+    """
+
+    DATASET_PATH = "google/code_x_glue_cc_code_to_code_trans"
+    DATASET_NAME = None
+
+    def __init__(self, direction):
+        """Initialize the code translation task.
+
+        :param direction: str
+            Translation direction, either 'java_cs' or 'cs_java'
+        """
+        self.direction = direction
+        self.direction_config = TRANSLATION_DIRECTIONS[direction]
+        super().__init__(
+            stop_words=["\n\n", "\n//", "\n/*", "\n#"],  # Stop at blank lines or comment markers
+            requires_execution=False,
+        )
+
+    def get_dataset(self):
+        """Returns the test split of the dataset, an iterable of samples that get_prompt can handle."""
+        return self.dataset["test"]
+
+    def fewshot_examples(self):
+        """Loads and returns the few-shot examples for the task if they exist."""
+        with open(
+            "bigcode_eval/tasks/few_shot_examples/codexglue_code_to_code_trans_few_shot_prompts.json",
+            "r",
+        ) as file:
+            examples = json.load(file)
+        return examples[self.direction]
+
+    @staticmethod
+    def two_shot_prompt(entry, source_code, examples, source_name, target_name):
+        """Two-shot prompt format with source and target code examples.
+
+        :param entry: str
+            Instruction prefix for the task
+        :param source_code: str
+            The source code to translate
+        :param examples: dict
+            Few-shot examples containing source1, target1, source2, target2
+        :param source_name: str
+            Name of the source language (e.g., 'Java')
+        :param target_name: str
+            Name of the target language (e.g., 'C#')
+        :return: str
+            The complete prompt
+        """
+        prompt = f"""{entry}
+{source_name}:
+{examples['source1']}
+{target_name}:
+{examples['target1']}
+
+{source_name}:
+{examples['source2']}
+{target_name}:
+{examples['target2']}
+
+{source_name}:
+{source_code}
+{target_name}:
+"""
+        return prompt
+
+    def get_prompt(self, doc):
+        """Builds the prompt for the LM to generate from.
+
+        :param doc: dict[str: str]
+            sample from the test dataset
+        :return: str
+        """
+        source_name = self.direction_config["source_name"]
+        target_name = self.direction_config["target_name"]
+        source_field = self.direction_config["source"]
+
+        source_code = doc[source_field].strip()
+        entry = f"Translate the following code from {source_name} to {target_name}:\n"
+        examples = self.fewshot_examples()
+        prompt = self.two_shot_prompt(entry, source_code, examples, source_name, target_name)
+        return prompt
+
+    def get_reference(self, doc):
+        """Builds the reference solution for the doc (sample from the test dataset).
+
+        :param doc: dict[str: str]
+            sample from the test dataset
+        :return: str
+        """
+        target_field = self.direction_config["target"]
+        return doc[target_field].strip()
+
+    def postprocess_generation(self, generation, idx):
+        """Defines the postprocessing for a LM generation.
+
+        :param generation: str
+            code generation from LM
+        :param idx: int
+            index of doc in the dataset to which the generation belongs
+            (not used for this task)
+        :return: str
+        """
+        target_name = self.direction_config["target_name"]
+        # Extract the generated code after the last target language marker
+        marker = f"{target_name}:\n"
+        if marker in generation:
+            output = generation.split(marker)[-1]
+        else:
+            output = generation
+
+        # Clean up the output - take the first complete function/method
+        output = output.strip()
+
+        # Stop at double newlines or comment markers that might indicate the end of the function
+        for stop in ["\n\n", "\n//", "\n/*"]:
+            if stop in output:
+                output = output.split(stop)[0]
+
+        return output.strip()
+
+    def process_results(self, generations, references):
+        """Takes the list of LM generations and evaluates them against ground truth references,
+        returning the CodeBLEU metric for the generations.
+
+        CodeBLEU combines:
+        - ngram_match_score: Standard n-gram matching (like BLEU)
+        - weighted_ngram_match_score: N-gram matching weighted by syntax
+        - syntax_match_score: AST-based syntax matching
+        - dataflow_match_score: Semantic dataflow matching
+        - codebleu: Combined score (weighted average of the above)
+
+        :param generations: list(list(str))
+            list of lists containing generations
+        :param references: list(str)
+            list of str containing references
+        :return: dict[str: float]
+        """
+        # Imported lazily so the harness does not require codebleu unless this
+        # task actually runs (codebleu is installed separately, see the README).
+        from codebleu import calc_codebleu
+
+        # Extract the first generation from each list
+        predictions = [gen[0] for gen in generations]
+
+        # Get the target language for CodeBLEU evaluation
+        lang = self.direction_config["codebleu_lang"]
+
+        # Compute the CodeBLEU score.
+        # calc_codebleu expects references and predictions as lists of strings
+        # (one per sample).
+        results = calc_codebleu(
+            references=references,
+            predictions=predictions,
+            lang=lang,
+        )
+
+        return results
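To make the evaluation flow concrete, here is an illustrative sketch (not part of the patch) of how a single generation passes through postprocessing and scoring. The toy `Add` snippets are invented; instantiating the task downloads the dataset from the Hub, and scoring requires codebleu to be installed:

```python
# Illustrative walkthrough of the java_cs task: postprocess one raw generation,
# then score it against a reference with process_results.
from bigcode_eval.tasks.codexglue_code_to_code_trans import create_task

task_cls = create_task("java_cs")
task = task_cls()  # note: loads the dataset from the Hugging Face Hub

# A raw generation typically echoes the prompt's target marker ("C#:\n") and
# may trail off into extra text; postprocessing strips both.
raw = "C#:\npublic int Add(int a, int b) { return a + b; }\n\n// trailing noise"
cleaned = task.postprocess_generation(raw, idx=0)
# cleaned == "public int Add(int a, int b) { return a + b; }"

reference = "public int Add(int a, int b) { return a + b; }"
scores = task.process_results([[cleaned]], [reference])
# scores is a dict with codebleu, ngram_match_score, weighted_ngram_match_score,
# syntax_match_score and dataflow_match_score.
print(scores["codebleu"])
```
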
diff --git a/requirements-codebleu.txt b/requirements-codebleu.txt
new file mode 100644
index 000000000..473605766
--- /dev/null
+++ b/requirements-codebleu.txt
@@ -0,0 +1 @@
+codebleu==0.7.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d286c5662..2c6770e1e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,6 @@ pyext==0.5
 mosestokenizer==1.0.0
 huggingface_hub>=0.11.1
 fsspec>=2023.12.2
+tree-sitter==0.25.2
+tree-sitter-c-sharp==0.23.1
+tree-sitter-java==0.23.5
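The three `tree-sitter` pins above exist solely to keep `codebleu==0.7.0`'s grammar bindings working around the dependency conflict noted in the README. A small check (illustrative, not part of the patch) that the installed versions match the pins:

```python
# Illustrative: confirm the installed distributions match the pinned versions
# from requirements.txt and requirements-codebleu.txt.
from importlib.metadata import version

for pkg, pin in [
    ("codebleu", "0.7.0"),
    ("tree-sitter", "0.25.2"),
    ("tree-sitter-c-sharp", "0.23.1"),
    ("tree-sitter-java", "0.23.5"),
]:
    installed = version(pkg)
    status = "OK" if installed == pin else f"MISMATCH (pinned {pin})"
    print(f"{pkg}=={installed}  {status}")
```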