{"id":28442,"library":"vllm-tpu","title":"vLLM TPU","description":"vLLM TPU is a variant of vLLM that runs on Google Cloud TPUs (v5e/v5p). It provides a high-throughput and memory-efficient inference and serving engine for large language models, leveraging TPU-specific optimizations like Pallas kernels for attention and quantization. The current version is 0.19.0, following the main vLLM release cadence (monthly).","status":"active","version":"0.19.0","language":"python","source_language":"en","source_url":"https://github.com/vllm-project/vllm","tags":["vllm","tpu","inference","llm"],"install":[{"cmd":"pip install vllm-tpu","lang":"bash","label":"PyPI"}],"dependencies":[{"reason":"Required for tensor operations on TPU (via torch_xla).","package":"torch","optional":false},{"reason":"Required for TPU runtime integration.","package":"torch_xla","optional":false},{"reason":"Required for Pallas kernels and TPU-specific ops.","package":"jax","optional":false}],"imports":[{"note":"The entry point is the same as the main vLLM package; no separate import module.","wrong":"","symbol":"LLM","correct":"from vllm import LLM"},{"note":"","wrong":"","symbol":"SamplingParams","correct":"from vllm import SamplingParams"},{"note":"","wrong":"","symbol":"AsyncLLMEngine","correct":"from vllm import AsyncLLMEngine"}],"quickstart":{"code":"import os\nos.environ['VLLM_TPU'] = '1'  # Optional: explicitly enable TPU backend\nfrom vllm import LLM, SamplingParams\n\nprompts = [\"Hello, my name is\", \"The capital of France is\"]\nsampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n\nllm = LLM(model=\"Qwen/Qwen2.5-1.5B\", max_num_seqs=8)\noutputs = llm.generate(prompts, sampling_params)\nfor output in outputs:\n    print(output.outputs[0].text)","lang":"python","description":"Basic inference with a small model on TPU. Assumes a TPU VM (v5e/v5p) with torch_xla installed."},"warnings":[{"fix":"Verify model compatibility before use; refer to the vLLM TPU docs.","message":"vLLM TPU is experimental and does not support all features of the main vLLM (e.g., tensor parallelism, quantization). Check the official docs for supported model architectures and features.","severity":"gotcha","affected_versions":"<0.20"},{"fix":"Provision a TPU VM and install the TPU runtime: https://cloud.google.com/tpu/docs/users-guide-tpu-vm","message":"You must run on a TPU VM (v5e/v5p) with torch_xla installed. Installing vllm-tpu on CPU/GPU will fail.","severity":"gotcha","affected_versions":"all"},{"fix":"Uninstall vllm first: pip uninstall vllm; then pip install vllm-tpu.","message":"As of v0.19.0, the vllm-tpu package is a separate PyPI package from vllm. Mixing installations may cause conflicts.","severity":"breaking","affected_versions":">=0.19.0"}],"env_vars":null,"last_verified":"2026-05-09T00:00:00.000Z","next_check":"2026-08-07T00:00:00.000Z","problems":[{"fix":"Ensure vllm-tpu is installed in the current environment: pip list | grep vllm","cause":"The vllm-tpu package normally provides the top-level 'vllm' module; if this error occurs, the installation is likely broken or the import is running in a different Python environment than the one where vllm-tpu was installed.","error":"ModuleNotFoundError: No module named 'vllm'"},{"fix":"Run on a TPU VM. Alternatively, set VLLM_DEVICE='cpu' for a CPU fallback (note: this does not use the TPU).","cause":"Code running on a non-TPU machine (e.g., GPU or CPU).","error":"RuntimeError: TPU not found"},{"fix":"Use a model from the list of supported architectures: Llama, Mistral, Qwen2, etc.","cause":"Model not compatible with vLLM TPU (e.g., models requiring custom CUDA kernels).","error":"ValueError: Unsupported model architecture: ..."}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}