{"library":"flash-attn-4","type":"library","category":null,"description":"Flash Attention 4 is the next-generation implementation of the Flash Attention algorithm, written with NVIDIA's CuTe DSL (the Python kernel-authoring DSL that ships with CUTLASS). It provides highly optimized fused attention kernels for modern GPUs, supporting head dimensions up to 256 and various data types including FP8. Version 4.0.0b12 is a beta release, and new releases are published frequently.","language":"python","status":"active","version":"4.0.0b12","tags":["flash attention","CUTE","CUDA","attention","transformer"],"last_verified":"Sat May 09","install":[{"cmd":"pip install flash-attn-4","imports":["from flash_attn_4 import flash_attn_func","from flash_attn_4 import flash_attn_varlen_func"]}],"homepage":null,"github":"https://github.com/Dao-AILab/flash-attention","docs":null,"changelog":null,"pypi":"https://pypi.org/project/flash-attn-4/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":null}