Skip to content

Commit 2e0a934

Browse files
committed
Update Website
1 parent 2df9b28 commit 2e0a934

File tree

6 files changed

+66
-1291
lines changed

6 files changed

+66
-1291
lines changed

docs/index.html

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ <h2 class="subtitle is-3 has-text-centered">Towards Generalist Software Engineer
341341
<p class="is-size-5">Carnegie Mellon University</p>
342342
<br>
343343
<p class="buttons is-centered">
344-
<a href="#" class="button is-dark is-rounded">
344+
<a href="static/pdfs/PwP_Arxiv_Submission.pdf" class="button is-dark is-rounded">
345345
<span class="icon"><i class="ai ai-arxiv"></i></span>
346346
<span>Paper</span>
347347
</a>
@@ -912,32 +912,72 @@ <h2 class="title is-3">Citation</h2>
912912
if (document.querySelector('#contrib-tab2').classList.contains('is-active')) {
913913
initBenchmarkChart();
914914
}
915-
915+
// RES-Q is a codebase-editing benchmark consisting of 100 hand-crafted, compact natural-language edit instructions. Given an edit instruction and a codebase, the task is to make an edit to the codebase that satisfies the instruction.
916+
917+
916918
function initBenchmarkChart() {
917-
// Dataset information
919+
// Dataset information - full descriptions
918920
const datasetInfo = {
919921
// Code Generation & Editing
920-
'HumanEval': 'Basic programming tasks for evaluating code generation capabilities across various algorithms and data structures.',
921-
'SWE-Bench': 'GitHub pull request-style issue resolution in Python repositories, requiring understanding of complex codebases.',
922-
'SWE-Bench-Java': 'Pull request resolution tasks for Java codebases, testing language-specific knowledge and framework understanding.',
923-
'SWT-Bench': 'Testing-focused software engineering tasks that require writing unit tests and understanding test frameworks.',
924-
'Canit-Edit': 'Targeted code editing to fix specific bugs without rewriting entire functions or classes.',
925-
'Res-Q': 'Pull request response generation based on review comments, requiring communication skills.',
926-
922+
'HumanEval': 'A dataset of Python programming problems for evaluating code generation models.',
923+
'SWE-Bench': 'A dataset of 2000 real-world GitHub issues and pull requests from Python repositories to evaluate language models on software engineering tasks.',
924+
'SWE-Bench-Java': 'A dataset for evaluating language models on resolving real-world GitHub issues in Java codebases.',
925+
'SWT-Bench': 'A benchmark for evaluating large language models on test generation for real-world software issues collected from GitHub.',
926+
'Canit-Edit': 'A benchmark for evaluating LLMs on instructional code editing, the task of updating a program given a natural language instruction.',
927+
'Res-Q': 'A benchmark for codebase editing tasks, consisting of 100 hand-crafted, compact natural language edit instructions.',
928+
927929
// Multimodal Code Generation
928-
'Design2Code': 'Converting visual UI designs to HTML/CSS implementations, bridging visual and code modalities.',
929-
'ChartMimic': 'Generating code to recreate data visualizations from images, requiring data science knowledge.',
930-
'SWE-Bench-MM': 'Multimodal version of SWE-Bench with image inputs such as screenshots or diagrams.',
931-
'DSBench': 'Data science and visualization code generation tasks based on dataset descriptions.',
930+
'Design2Code': 'A dataset for converting visual UI designs to HTML/CSS implementations, bridging visual and code modalities.',
931+
'ChartMimic': 'A dataset for generating code to recreate data visualizations from images, requiring data science knowledge.',
932+
'SWE-Bench-MM': 'A multimodal version of SWE-Bench with image inputs such as screenshots or diagrams from JavaScript repositories.',
933+
'DSBench': 'A benchmark for evaluating data science agents with realistic data analysis and data modeling tasks based on dataset descriptions.',
932934

933935
// Domain-Specific
934-
'CTF': 'Cybersecurity challenges requiring exploit development and understanding of security concepts.',
935-
'MiniCTX': 'Theorem proving tasks in the Lean programming language, requiring formal methods knowledge.',
936-
'BIRD': 'Database query generation from natural language and schema, testing SQL knowledge.',
936+
'CTF': 'A dataset of cybersecurity challenges requiring exploit development and understanding of security concepts.',
937+
'MiniCTX': 'A dataset for theorem proving tasks in the Lean programming language, requiring formal proving capabilities.',
938+
'BIRD': 'A dataset for database query generation from natural language and schema, testing SQL knowledge.',
937939

938940
// SWE Tasks
939941
'VSCode': 'IDE configuration and extension tasks focused on developer workflows.',
940-
'General-SWE': 'Project management, debugging, and profiling tasks common in software development.'
942+
'General-SWE': 'Profiling, Refactoring, Debugging, and other tasks common in software development.'
943+
};
944+
945+
// Dataset URLs
946+
const datasetUrls = {
947+
'HumanEval': 'https://huggingface.co/datasets/openai/openai_humaneval',
948+
'SWE-Bench': 'https://huggingface.co/datasets/princeton-nlp/SWE-bench',
949+
'SWE-Bench-Java': 'https://huggingface.co/datasets/Daoguang/Multi-SWE-bench',
950+
'SWT-Bench': 'https://github.com/logic-star-ai/swt-bench',
951+
'Canit-Edit': 'https://github.com/nuprl/CanItEdit',
952+
'Res-Q': 'https://huggingface.co/datasets/Qurrent/RES-Q',
953+
'Design2Code': 'https://huggingface.co/datasets/SALT-NLP/Design2Code',
954+
'ChartMimic': 'https://huggingface.co/datasets/ChartMimic/ChartMimic',
955+
'SWE-Bench-MM': 'https://huggingface.co/datasets/princeton-nlp/SWE-bench_Multimodal',
956+
'DSBench': 'https://github.com/LiqiangJing/DSBench',
957+
'CTF': 'https://intercode-benchmark.github.io/',
958+
'MiniCTX': 'https://github.com/cmu-l3/minictx-eval',
959+
'BIRD': 'https://bird-bench.github.io/',
960+
'VSCode': '#',
961+
'General-SWE': '#'
962+
};
963+
964+
// Simple descriptions for tooltips
965+
const tooltipInfo = {
966+
'HumanEval': 'Python Code Generation',
967+
'SWE-Bench': 'Python Issue Resolution',
968+
'SWE-Bench-Java': 'Java Issue Resolution',
969+
'SWT-Bench': 'Test Generation',
970+
'Canit-Edit': 'Targeted Code Editing',
971+
'Res-Q': 'Codebase Editing',
972+
'Design2Code': 'UI Generation',
973+
'ChartMimic': 'Visualization Code',
974+
'SWE-Bench-MM': 'Visual Issue Resolution',
975+
'DSBench': 'Data Science Tasks',
976+
'CTF': 'Cybersecurity',
977+
'MiniCTX': 'Theorem Proving',
978+
'BIRD': 'Text to SQL',
979+
'VSCode': 'IDE Configuration',
980+
'General-SWE': 'Development Tasks'
941981
};
942982

943983
// Chart data structure with category information
@@ -998,7 +1038,7 @@ <h2 class="title is-3">Citation</h2>
9981038
return context[0].label;
9991039
},
10001040
label: function(context) {
1001-
return datasetInfo[context.label];
1041+
return tooltipInfo[context.label];
10021042
}
10031043
}
10041044
}
@@ -1011,7 +1051,13 @@ <h2 class="title is-3">Citation</h2>
10111051
const index = elements[0].index;
10121052
const datasetLabel = data.labels[index];
10131053
document.getElementById('datasetTitle').textContent = datasetLabel;
1014-
document.getElementById('datasetDescription').textContent = datasetInfo[datasetLabel] || 'No description available';
1054+
document.getElementById('datasetDescription').innerHTML = `
1055+
<p>${datasetInfo[datasetLabel]}</p>
1056+
<p class="mt-3"><a href="${datasetUrls[datasetLabel]}" target="_blank" class="button is-small is-link">
1057+
<span class="icon"><i class="fas fa-external-link-alt"></i></span>
1058+
<span>Visit Dataset</span>
1059+
</a></p>
1060+
`;
10151061
} else {
10161062
document.body.style.cursor = 'default';
10171063
}

docs/index2.html

Lines changed: 0 additions & 233 deletions
This file was deleted.

0 commit comments

Comments
 (0)