{"library":"flash-attn","type":"library","category":null,"description":"Flash Attention is a fast and memory-efficient exact attention mechanism for deep learning models, particularly Transformers. It reorders the attention computation to reduce the number of memory accesses, making it significantly faster and less memory-intensive than standard attention. The library is currently stable at version 2.8.3, with an active beta development for version 4.0.0 which introduces new features and architectural changes. Its release cadence is driven by research advancements and performance optimizations.","language":"python","status":"active","version":"2.8.3","tags":["attention","transformer","cuda","gpu","deep-learning","pytorch","optimization","ai"],"last_verified":"Fri May 22","install":[{"cmd":"pip install flash-attn --no-build-isolation","imports":["from flash_attn import flash_attn_func","from flash_attn import flash_attn_qkvpacked_func","from flash_attn import flash_attn_varlen_func","from flash_attn.modules.mha import FlashAttention2"]},{"cmd":"pip install flash-attn --no-cuda-extensions","imports":[]}],"homepage":"https://flash-attention.github.io/flash-attention/","github":"https://github.com/Dao-AILab/flash-attention","docs":null,"changelog":null,"pypi":"https://pypi.org/project/flash-attn/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":{"summary":{"python_range":"3.10–3.9","success_rate":0,"avg_install_s":null,"avg_import_s":null,"wheel_type":null},"url":"https://checklist.day/v1/registry/flash-attn/compatibility"}}