{"library":"vllm-flash-attn","type":"library","category":null,"description":"Forward-only flash-attention kernel optimized for vLLM inference. The latest version is 2.6.2; the package is a lightweight wrapper around the Flash Attention CUDA kernel that exposes a simplified forward-only API. Development is active alongside vLLM releases.","language":"python","status":"active","version":"2.6.2","tags":["flash-attention","vllm","inference","cuda"],"last_verified":"Fri May 01","install":[{"cmd":"pip install vllm-flash-attn","imports":["from vllm_flash_attn import flash_attn_func","from vllm_flash_attn import flash_attn_with_kvcache"]}],"homepage":null,"github":"https://github.com/vllm-project/flash-attention.git","docs":null,"changelog":null,"pypi":"https://pypi.org/project/vllm-flash-attn/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":null}