{"library":"genai-perf","type":"library","category":null,"description":"GenAI-Perf is a command-line interface (CLI) tool designed for measuring the throughput and latency of generative AI models (Large Language Models, Vision Language Models, Embedding Models, Ranking Models, and LoRA Adapters) served through an inference server. It generates load, measures key performance metrics such as output token throughput, time to first token, inter-token latency, and request throughput, and reports the results to the console and to CSV and JSON files. Its current version is 0.0.16; although it has been under rapid development, it is now being actively phased out in favor of `AIPerf`, NVIDIA's new tool for generative AI benchmarking.","language":"python","status":"deprecated","version":"0.0.16","tags":["AI","LLM","performance","profiling","benchmark","Triton","NVIDIA","CLI"],"last_verified":"Sun May 24","install":[{"cmd":"pip install genai-perf","imports":["from genai_perf.checkpoint.checkpoint import Checkpoint","from genai_perf.config.run.results import Results"]},{"cmd":"export RELEASE=\"YY.MM\" # e.g. export RELEASE=\"24.06\"\ndocker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk\ngenai-perf --help","imports":[]}],"homepage":null,"github":"https://github.com/triton-inference-server/perf_analyzer","docs":null,"changelog":null,"pypi":"https://pypi.org/project/genai-perf/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.10–3.9","success_rate":80,"avg_install_s":27.2,"avg_import_s":0.48,"wheel_type":"wheel"},"url":"https://checklist.day/v1/registry/genai-perf/compatibility"}}