{"library":"flashinfer-python","type":"library","category":null,"description":"FlashInfer is a high-performance kernel library for optimizing Large Language Model (LLM) inference on NVIDIA GPUs. It provides efficient CUDA kernels for operations like paged attention, prefill, and decode. Currently at version 0.6.7.post3, the library is under active development with frequent patch releases and nightly builds, indicating rapid evolution and potential API changes.","language":"python","status":"active","version":"0.6.7.post3","tags":["LLM","inference","CUDA","GPU","attention","AI","performance"],"install":[{"cmd":"pip install flashinfer-python","imports":["import flashinfer as fi","from flashinfer import BatchDecodeWithPagedKVCache","from flashinfer import BatchPrefillWithRaggedKVCache","from flashinfer.core import PagedKVCache"]},{"cmd":"pip install flashinfer-python --pre --extra-index-url https://flashinfer.ai/whl/cu121","imports":[]}],"homepage":"https://flashinfer.ai","github":"https://github.com/flashinfer-ai/flashinfer","docs":null,"changelog":null,"pypi":"https://pypi.org/project/flashinfer-python/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.10–3.9","success_rate":20,"avg_install_s":80.4,"avg_import_s":10.55,"wheel_type":"wheel"},"url":"https://checklist.day/v1/registry/flashinfer-python/compatibility"},"provenance":{"verified_status":"passing","verified_at":"Sun Jun 28","last_verified":"Sun Jun 28","next_check":"Tue Jul 28","install_tag":null}}