You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
// RES-Q is a codebase editing benchmark consisting of 100 hand-crafted, compact natural language edit instructions. The task is to, given an edit instruction and a codebase, make an edit to the codebase that satisfies the instruction.
916
+
917
+
916
918
functioninitBenchmarkChart(){
917
-
// Dataset information
919
+
// Dataset information - full descriptions
918
920
constdatasetInfo={
919
921
// Code Generation & Editing
920
-
'HumanEval': 'Basic programming tasks for evaluating code generation capabilities across various algorithms and data structures.',
921
-
'SWE-Bench': 'GitHub pull request-style issue resolution in Python repositories, requiring understanding of complex codebases.',
922
-
'SWE-Bench-Java': 'Pull request resolution tasks for Java codebases, testing language-specific knowledge and framework understanding.',
923
-
'SWT-Bench': 'Testing-focused software engineering tasks that require writing unit tests and understanding test frameworks.',
924
-
'Canit-Edit': 'Targeted code editing to fix specific bugs without rewriting entire functions or classes.',
925
-
'Res-Q': 'Pull request response generation based on review comments, requiring communication skills.',
926
-
922
+
'HumanEval': 'A dataset of Python programming problems for evaluating code generation models.',
923
+
'SWE-Bench': 'A dataset of 2000 real-world GitHub issues and pull requests from Python repositories to evaluate language models on software engineering tasks.',
924
+
'SWE-Bench-Java': 'A dataset for evaluating language models on resolving real-world GitHub issues in Java codebases.',
925
+
'SWT-Bench': 'A benchmark for evaluating large language models on testing generation for real world software issues collected from GitHub.',
926
+
'Canit-Edit': 'A benchmark for evaluating LLMs on instructional code editing, the task of updating a program given a natural language instruction.',
927
+
'Res-Q': 'A benchmark for codebase editing tasks, consisting of 100 hand-crafted, compact natural language edit instructions.',
928
+
927
929
// Multimodal Code Generation
928
-
'Design2Code': 'Converting visual UI designs to HTML/CSS implementations, bridging visual and code modalities.',
929
-
'ChartMimic': 'Generating code to recreate data visualizations from images, requiring data science knowledge.',
930
-
'SWE-Bench-MM': 'Multimodal version of SWE-Bench with image inputs such as screenshots or diagrams.',
931
-
'DSBench': 'Data science and visualization code generation tasks based on dataset descriptions.',
930
+
'Design2Code': 'A dataset for converting visual UI designs to HTML/CSS implementations, bridging visual and code modalities.',
931
+
'ChartMimic': 'A dataset for generating code to recreate data visualizations from images, requiring data science knowledge.',
932
+
'SWE-Bench-MM': 'A multimodal version of SWE-Bench with image inputs such as screenshots or diagrams from JavaScript repositories.',
933
+
'DSBench': 'A benchmark for evaluating data science agents with realistic data analysis and data modeling tasks based on dataset descriptions.',
932
934
933
935
// Domain-Specific
934
-
'CTF': 'Cybersecurity challenges requiring exploit development and understanding of security concepts.',
935
-
'MiniCTX': 'Theorem proving tasks in the Lean programming language, requiring formal methods knowledge.',
936
-
'BIRD': 'Database query generation from natural language and schema, testing SQL knowledge.',
936
+
'CTF': 'A dataset of cybersecurity challenges requiring exploit development and understanding of security concepts.',
937
+
'MiniCTX': 'A dataset for theorem proving tasks in the Lean programming language, requiring formal proving capabilities.',
938
+
'BIRD': 'A dataset for database query generation from natural language and schema, testing SQL knowledge.',
937
939
938
940
// SWE Tasks
939
941
'VSCode': 'IDE configuration and extension tasks focused on developer workflows.',
940
-
'General-SWE': 'Project management, debugging, and profiling tasks common in software development.'
942
+
'General-SWE': 'Profiling, Refactoring, Debugging, and other tasks common in software development.'
0 commit comments