{"library":"human-eval","type":"library","category":null,"description":"HumanEval is a benchmark developed by OpenAI for assessing the code generation capabilities of Large Language Models (LLMs). It comprises 164 hand-written Python programming problems, each with a function signature, docstring, and comprehensive unit tests, designed to evaluate functional correctness. The library uses the `pass@k` metric for evaluation. The current version is 1.0.3, released on July 24, 2023. As a benchmark dataset and evaluation harness, it has an infrequent release cadence, with updates typically driven by new research or significant improvements to the benchmark itself.","language":"python","status":"active","version":"1.0.3","tags":["LLM evaluation","code generation","benchmark","Python","functional correctness","pass@k","AI","machine learning"],"last_verified":"Fri May 22","install":[{"cmd":"pip install human-eval","imports":["from human_eval.data import read_problems","from human_eval.data import write_jsonl","evaluate_functional_correctness samples.jsonl"]},{"cmd":"git clone https://github.com/openai/human-eval.git\ncd human-eval\npip install -e .","imports":[]}],"homepage":null,"github":null,"docs":null,"changelog":null,"pypi":"https://pypi.org/project/human-eval/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.9–3.10","success_rate":100,"avg_install_s":3.9,"avg_import_s":0.02,"wheel_type":"wheel"},"url":"https://checklist.day/v1/registry/human-eval/compatibility"}}