
{
      "id": "http://arxiv.org/abs/2409.18952",
      "updated": "2024-09-27T00:00:00Z",
      "published": "2024-09-27T00:00:00Z",
      "title": "RepairBench: Leaderboard of Frontier Models for Program Repair",
      "summary": "  AI-driven program repair uses AI models to repair buggy software by producing patches.\nRapid advancements in AI surely impact state-of-the-art performance of program repair.\nYet, grasping this progress requires frequent and standardized evaluations.\nWe propose RepairBench, a novel leaderboard for AI-driven program repair.\nThe key characteristics of RepairBench are: 1) it is execution-based: all patches are compiled and executed against a test suite, 2) it assesses frontier models in a frequent and standardized way.\nRepairBench leverages two high-quality benchmarks, Defects4J and GitBug-Java, to evaluate frontier models against real-world program repair tasks.\nWe publicly release the evaluation framework of RepairBench.\nWe will update the leaderboard as new frontier models are released.",
      "author": [
            {
                  "name": "Silva, Andr\u00e9"
            },
            {
                  "name": "Monperrus, Martin"
            }
      ],
      "link": [
            "http://arxiv.org/abs/2409.18952",
            "https://arxiv.org/pdf/2409.18952"
      ],
      "primary_category": "unknown",
      "category": [],
      "journal_ref": "Proceedings of the International Workshop on Large Language Models for Code (LLM4Code), 2025"
}
