{"id":23293,"library":"auto-gptq","title":"AutoGPTQ","description":"AutoGPTQ is an easy-to-use LLM quantization package based on the GPTQ algorithm. It provides user-friendly APIs for quantizing and running large language models with reduced memory usage. Current version 0.7.1 supports loading sharded quantized checkpoints and Gemma models. Release cadence is irregular, with major features in point releases.","status":"active","version":"0.7.1","language":"python","source_language":"en","source_url":"https://github.com/PanQiWei/AutoGPTQ","tags":["llm","quantization","gptq","cuda","exllama","inference"],"install":[{"cmd":"pip install auto-gptq","lang":"bash","label":"Install from PyPI"},{"cmd":"pip install auto-gptq[exllama]","lang":"bash","label":"Install with exllama kernels for inference"}],"dependencies":[{"reason":"Core dependency for tensor operations and GPU support. AutoGPTQ requires PyTorch >=2.0.","package":"torch","optional":false},{"reason":"Hugging Face Transformers for model architectures and tokenizers.","package":"transformers","optional":false},{"reason":"Used for device mapping and large model support.","package":"accelerate","optional":true},{"reason":"Used for loading calibration datasets during quantization.","package":"datasets","optional":true}],"imports":[{"note":"AutoGPTQForCausalLM is exposed at package level, not in a submodule.","wrong":"from auto_gptq.modeling import AutoGPTQForCausalLM","symbol":"AutoGPTQForCausalLM","correct":"from auto_gptq import AutoGPTQForCausalLM"},{"note":"","wrong":"","symbol":"BaseQuantizeConfig","correct":"from auto_gptq import BaseQuantizeConfig"},{"note":"","wrong":"","symbol":"exllama_set_max_input_length","correct":"from auto_gptq import exllama_set_max_input_length"}],"quickstart":{"code":"import torch\nfrom transformers import AutoTokenizer\nfrom auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig\n\nmodel_id = \"facebook/opt-125m\"\ntokenizer = 
AutoTokenizer.from_pretrained(model_id)\n\nquantize_config = BaseQuantizeConfig(\n    bits=4,\n    group_size=128,\n    desc_act=False,\n)\n\nmodel = AutoGPTQForCausalLM.from_pretrained(\n    model_id,\n    quantize_config=quantize_config,\n)\n\ntext = \"AutoGPTQ is\"\ninputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\noutputs = model.generate(**inputs, max_new_tokens=20)\nprint(tokenizer.decode(outputs[0], skip_special_tokens=True))","lang":"python","description":"Quickstart that loads a model with a quantization config and runs generation (the quantization step itself is not shown for simplicity, but the config is set)."},"warnings":[{"fix":"Install auto-gptq with exllama extra: `pip install auto-gptq[exllama]`","message":"The exllama kernel is the default backend for inference. If you do not install the optional exllama dependency, the package may fall back to a slower backend or raise an error. Install with `pip install auto-gptq[exllama]`.","severity":"gotcha","affected_versions":">=0.5.0"},{"fix":"Use `model_name_or_path` instead of `save_dir` when loading quantized models.","message":"In v0.3.2, the `save_dir` argument was removed from `from_quantized`. Now only `model_name_or_path` is supported.","severity":"breaking","affected_versions":"0.3.2+"},{"fix":"Remove `use_triton` argument. The backend is automatically selected (exllama preferred).","message":"The `use_triton` parameter in `from_quantized` is deprecated since v0.5.0. Triton backend is no longer supported; use exllama or cuda backends.","severity":"deprecated","affected_versions":">=0.5.0"},{"fix":"Call `from auto_gptq import exllama_set_max_input_length; model = exllama_set_max_input_length(model, 4096)` after loading.","message":"When using the exllama backend, the maximum input length defaults to 2048. For longer sequences, call `exllama_set_max_input_length` before generation. 
Otherwise, you may get an error like 'CUDA out of memory' or silent failures.","severity":"gotcha","affected_versions":"0.4.2 - 0.7.1"},{"fix":"Upgrade to v0.3.2 or later.","message":"Quantization that combines `desc_act=True` with a `group_size` setting may not be supported by all kernels. The cuda kernel before v0.3.2 had a bug when both were used together. Ensure you use v0.3.2+ if you need that combination.","severity":"gotcha","affected_versions":"<0.3.2"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Use `from auto_gptq import AutoGPTQForCausalLM`","cause":"Incorrect import path; trying to import from submodule instead of top-level.","error":"AttributeError: module 'auto_gptq' has no attribute 'AutoGPTQForCausalLM'"},{"fix":"Upgrade to auto-gptq>=0.4.2: `pip install --upgrade auto-gptq`","cause":"Using an older version that doesn't have this function (introduced in v0.4.2).","error":"ImportError: cannot import name 'exllama_set_max_input_length' from 'auto_gptq'"},{"fix":"Call `exllama_set_max_input_length(model, new_length)` after loading model and before generation.","cause":"Exllama kernel's default max input length is 2048; exceeding this causes memory issues.","error":"RuntimeError: CUDA error: out of memory"},{"fix":"Use `model_name_or_path` instead: e.g., `AutoGPTQForCausalLM.from_quantized('model_path_or_name')`","cause":"The `save_dir` argument was removed in v0.3.2.","error":"TypeError: from_quantized() got an unexpected keyword argument 'save_dir'"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}