{"id":28402,"library":"torchscale","title":"torchscale","description":"torchscale is a PyTorch library for building large-scale Transformer models, providing components like Multi-head Attention (MHA), Long Short-Term Memory (LSTM), and other scalable architectures. As of version 0.3.0, it supports Python >=3.8 and is maintained by Microsoft. Releases are infrequent.","status":"active","version":"0.3.0","language":"python","source_language":"en","source_url":"https://github.com/microsoft/torchscale","tags":["transformer","scale","pytorch","microsoft","deep-learning"],"install":[{"cmd":"pip install torchscale","lang":"bash","label":"PyPI install"}],"dependencies":[{"reason":"Core dependency for all tensor operations and models.","package":"torch","optional":false},{"reason":"Used for tensor rearrangement in attention modules.","package":"einops","optional":true},{"reason":"Optional for loading Hugging Face model weights.","package":"transformers","optional":true}],"imports":[{"note":"MHA is in the component submodule, not top-level.","wrong":"from torchscale import MHA","symbol":"MHA","correct":"from torchscale.component import MHA"},{"note":"","wrong":"","symbol":"AttentionLayer","correct":"from torchscale.component import AttentionLayer"},{"note":"","wrong":"","symbol":"IncrementalDecoder","correct":"from torchscale.model import IncrementalDecoder"}],"quickstart":{"code":"import torch\nfrom torchscale.component import MHA\nfrom torchscale.model import IncrementalDecoder\n\n# Example: Multi-head attention\nmha = MHA(embed_dim=512, num_heads=8)\nx = torch.randn(4, 10, 512)\noutput = mha(x, x, x)\nprint(output.shape)\n\n# Example: Decoder\ndecoder = IncrementalDecoder(\n    vocab_size=1000,\n    embed_dim=512,\n    num_heads=8,\n    num_layers=6,\n)\ntokens = torch.randint(0, 1000, (4, 20))\nlogits = decoder(tokens)\nprint(logits.shape)","lang":"python","description":"Initialize MHA and IncrementalDecoder with random input."},"warnings":[{"fix":"Replace `LongShortTerm` with `IncrementalDecoder` or `TemporalDecoder` depending on use case.","message":"The `torchscale.model.LongShortTerm` class is deprecated in 0.3.0; use `IncrementalDecoder` or `TemporalDecoder` instead.","severity":"deprecated","affected_versions":">=0.3.0"},{"fix":"Remove `kdim` and `vdim` from MHA constructor and ensure all dimensions match `embed_dim`.","message":"In version 0.3.0, the `MHA` class no longer accepts `kdim` and `vdim` arguments; use `embed_dim` for all.","severity":"breaking","affected_versions":">=0.3.0"},{"fix":"Ensure input tensors have shape (batch, sequence, features) or use .transpose() if needed.","message":"torchscale components expect batch-first tensors (batch, seq, dim), not sequence-first. Incorrect ordering may cause shape mismatches.","severity":"gotcha","affected_versions":"all"}],"env_vars":null,"last_verified":"2026-05-09T00:00:00.000Z","next_check":"2026-08-07T00:00:00.000Z","problems":[{"fix":"Run `pip install torchscale` from the correct Python environment.","cause":"torchscale is not installed or installed in a different environment.","error":"ModuleNotFoundError: No module named 'torchscale'"},{"fix":"Use `from torchscale.component import MHA`.","cause":"Importing from wrong path: top-level module does not contain MHA.","error":"AttributeError: module 'torchscale' has no attribute 'MHA'"},{"fix":"Ensure input shape is (batch, seq_len, embed_dim). 
Use x = x.transpose(0,1) if using (seq_len, batch, embed_dim).","cause":"Input tensor dimensions do not match MHA expectations, often due to sequence-first vs batch-first confusion.","error":"RuntimeError: The expanded size of the tensor must match the existing size at non-singleton dimension"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}