{"library":"vllm-tpu","type":"library","category":null,"description":"vLLM TPU is a variant of vLLM that runs on Google Cloud TPUs (v5e/v5p). It provides a high-throughput, memory-efficient inference and serving engine for large language models, leveraging TPU-specific optimizations such as Pallas kernels for attention and quantization. The current version is 0.19.0; releases follow the main vLLM release cadence (monthly).","language":"python","status":"active","version":"0.19.0","tags":["vllm","tpu","inference","llm"],"last_verified":"Sat May 09","install":[{"cmd":"pip install vllm-tpu","imports":["from vllm import LLM","from vllm import SamplingParams","from vllm import AsyncLLMEngine"]}],"homepage":null,"github":"https://github.com/vllm-project/vllm","docs":"https://docs.vllm.ai/en/latest/","changelog":null,"pypi":"https://pypi.org/project/vllm-tpu/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null,"compatibility":null}