{"id":27661,"library":"colt5-attention","title":"CoLT5 Attention","description":"Conditionally Routed Attention, an implementation of the CoLT5 architecture for efficient long-context transformers. Current version 0.11.1, rapid release cadence.","status":"active","version":"0.11.1","language":"python","source_language":"en","source_url":"https://github.com/lucidrains/CoLT5-attention","tags":["attention","transformer","efficient","long-context","routing"],"install":[{"cmd":"pip install colt5-attention","lang":"bash","label":"PyPI"}],"dependencies":[{"reason":"Core dependency for tensor operations and neural network modules","package":"torch","optional":false},{"reason":"Used for tensor reshaping and attention operations","package":"einops","optional":false}],"imports":[{"note":"Standard import path","wrong":null,"symbol":"CoLT5Attention","correct":"from colt5_attention import CoLT5Attention"},{"note":"Exported from colt5_attention","wrong":null,"symbol":"ConditionalRouting","correct":"from colt5_attention import ConditionalRouting"}],"quickstart":{"code":"import torch\nfrom colt5_attention import CoLT5Attention\n\nattn = CoLT5Attention(\n    dim=512,\n    num_routed_queries=64,\n    num_routed_key_values=64,\n    num_heads=8,\n    dropout=0.1\n)\nx = torch.randn(2, 1024, 512)\nout = attn(x)\nprint(out.shape)  # (2, 1024, 512)","lang":"python","description":"Initialize CoLT5 attention with routed queries/key-values and run forward pass."},"warnings":[{"fix":"Explicitly set `num_routed_queries=128` if you need the old behavior.","message":"Version 0.11.0 changed default value of `num_routed_queries` from 128 to 64. Existing code relying on default may see different memory/performance.","severity":"breaking","affected_versions":">=0.11.0"},{"fix":"Convert boolean mask to float: `attn_mask = 0.0 * keep_mask + (-1e9) * (~keep_mask)`","message":"Attention masking must use a float mask (e.g., 0.0 for keep, -inf for mask). Boolean masks are not supported and may silently produce wrong outputs.","severity":"gotcha","affected_versions":"all"},{"fix":"Call `x = x.contiguous()` before passing to attention.","message":"Input tensor must be contiguous. Non-contiguous tensors can cause runtime error or incorrect gradient propagation.","severity":"gotcha","affected_versions":"all"}],"env_vars":null,"last_verified":"2026-05-09T00:00:00.000Z","next_check":"2026-08-07T00:00:00.000Z","problems":[{"fix":"Use `from colt5_attention import CoLT5Attention`","cause":"Wrong import path; user tried `import colt5_attention` then `colt5_attention.CoLT5Attention` but the class may not be top-level in older versions.","error":"AttributeError: module 'colt5_attention' has no attribute 'CoLT5Attention'"},{"fix":"Ensure all inputs are on the same device: `x = x.to('cuda')`, `mask = mask.to('cuda')`","cause":"Passing tensors on different devices (e.g., CPU and CUDA) to the attention module.","error":"RuntimeError: Expected all tensors to be on the same device, but found at least two devices"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}