{"id":24813,"library":"vllm-flash-attn","title":"vLLM Flash Attention Wrapper","description":"Forward-only flash-attention kernel optimized for vLLM inference. Version 2.6.2 is the latest, released as a lightweight wrapper around the Flash Attention CUDA kernel with a simplified forward-only API. Development is active alongside vLLM releases.","status":"active","version":"2.6.2","language":"python","source_language":"en","source_url":"https://github.com/vllm-project/flash-attention.git","tags":["flash-attention","vllm","inference","cuda"],"install":[{"cmd":"pip install vllm-flash-attn","lang":"bash","label":"PyPI install"}],"dependencies":[{"reason":"Flash attention kernels depend on PyTorch tensors and CUDA.","package":"torch","optional":false},{"reason":"The underlying flash-attention library; vllm-flash-attn wraps it for forward-only use.","package":"flash-attn","optional":false}],"imports":[{"note":"flash_attn provides a full trian/test API; vllm_flash_attn exports only forward-only functions.","wrong":"from flash_attn import flash_attn_func","symbol":"flash_attn_func","correct":"from vllm_flash_attn import flash_attn_func"},{"note":"This function is specifically for inference with precomputed KV cache.","wrong":"from flash_attn import flash_attn_with_kvcache","symbol":"flash_attn_with_kvcache","correct":"from vllm_flash_attn import flash_attn_with_kvcache"}],"quickstart":{"code":"import torch\nfrom vllm_flash_attn import flash_attn_func\n\nq = torch.randn(1, 1, 8, 64, device='cuda', dtype=torch.float16)\nk = torch.randn(1, 1, 8, 64, device='cuda', dtype=torch.float16)\nv = torch.randn(1, 1, 8, 64, device='cuda', dtype=torch.float16)\nout = flash_attn_func(q, k, v, softmax_scale=1.0, causal=False)\nprint(out.shape)","lang":"python","description":"Basic usage of forward-only flash attention. Requires CUDA GPU."},"warnings":[{"fix":"Use the full flash-attn package (flash_attn) for training.","message":"This package is forward-only. It does not support backward pass gradients. Using it in training will silently produce wrong gradients or crash.","severity":"gotcha","affected_versions":"all"},{"fix":"Upgrade GPU to Volta or newer, or pin to vllm-flash-attn<2.6.0 if on older hardware.","message":"Support for compute capability < 8.0 (e.g., V100) was dropped in v2.6.0. 
Older versions may still work but are unmaintained.","severity":"deprecated","affected_versions":">=2.6.0"},{"fix":"Always pass softmax_scale=1.0 or your desired scaling factor.","message":"The function signature for flash_attn_func changed in v2.6.0: the `softmax_scale` parameter is no longer optional and must be passed explicitly.","severity":"gotcha","affected_versions":">=2.6.0"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"pip install vllm-flash-attn","cause":"Package not installed or installed with an older name (flash_attn instead of vllm_flash_attn).","error":"ImportError: No module named 'vllm_flash_attn'"},{"fix":"Update to a GPU with compute capability >= 8.0, or use a CPU implementation.","cause":"GPU older than Ampere (e.g., V100, GTX 1080).","error":"RuntimeError: FlashAttention only supports CUDA with compute capability >= 8.0"},{"fix":"Call .contiguous() on tensors before passing: q.contiguous(), k.contiguous(), v.contiguous()","cause":"Input tensors are not contiguous; flash attention requires contiguous memory layout.","error":"AssertionError: Input tensor must be contiguous in memory"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}
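
Note: the record exports flash_attn_with_kvcache for inference with a precomputed KV cache but ships no example for it. Below is a minimal decode-step sketch, assuming vllm_flash_attn mirrors the upstream flash_attn signature (q, k_cache, v_cache, k=, v=, cache_seqlens=, softmax_scale=, causal=); the tensor sizes and the cache length of 16 are illustrative only, not values from this record.

import torch
from vllm_flash_attn import flash_attn_with_kvcache

batch, nheads, headdim, max_seqlen = 1, 8, 64, 128

# Preallocated KV cache: (batch, max_seqlen, nheads, headdim)
k_cache = torch.zeros(batch, max_seqlen, nheads, headdim, device='cuda', dtype=torch.float16)
v_cache = torch.zeros_like(k_cache)

# One decode step: a single new query token plus its new key/value
q = torch.randn(batch, 1, nheads, headdim, device='cuda', dtype=torch.float16)
k_new = torch.randn_like(q)
v_new = torch.randn_like(q)

# Tokens already in the cache per sequence (int32, one entry per batch element);
# 16 is an arbitrary illustrative value
cache_seqlens = torch.tensor([16], dtype=torch.int32, device='cuda')

# Appends k_new/v_new into the cache at position cache_seqlens, then attends over
# the cached prefix plus the new token (assumed upstream-compatible behavior)
out = flash_attn_with_kvcache(
    q, k_cache, v_cache,
    k=k_new, v=v_new,
    cache_seqlens=cache_seqlens,
    softmax_scale=headdim ** -0.5,
    causal=True,
)
print(out.shape)  # (1, 1, 8, 64)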