How long did it take to quantize?
#6
by stepnoy - opened
I’m new to quantizing large models and recently tried converting GLM 4.7 to NVFP4. The process has been running for over 48 hours, and I’m wondering if this is typical. How long did it take in your case?
It seems to be stuck on the phase
awq_lite: Searching parameters...
for at least 24 hours now. Only one CPU core (out of 164 vCPUs) is active, pinned at 100%.
The GPUs are idle at the moment, although they were clearly being used somehow earlier.
Is this OK? Should I keep waiting, or is my approach wrong? RunPod is eating my money...
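If the awq_lite search itself turns out to be the slow part, I'm considering dropping the AWQ weight search and doing plain NVFP4 PTQ calibration instead. A minimal sketch of the one-line change, assuming mtq.NVFP4_DEFAULT_CFG is available in ModelOpt 0.40.0 (I haven't checked how much quality it costs compared to NVFP4_AWQ_FULL_CFG):

import modelopt.torch.quantization as mtq

# Assumed alternative: plain NVFP4 PTQ calibration, which should skip the
# awq_lite parameter search. `model` and `forward_loop` are the ones defined
# in the script further down this post.
model_q = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop=forward_loop)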
Here are my specs:
Hardware / environment
- RunPod instance
- GPUs: 7× NVIDIA H200
- PyTorch: 2.9.1+cu128
- ModelOpt: 0.40.0
Exact command I run
python -u /workspace/spark-config/scripts/quantize_glm47_prism_nvfp4.py \
--model-dir /workspace/models/Ex0bit/GLM-4.7-PRISM \
--output-dir /workspace/models/Ex0bit/GLM-4.7-PRISM-NVFP4 \
--num-calib 512 \
--max-seq-len 512 \
--streaming
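Before burning more hours, I'm also planning a quick smoke run with the same script but a much smaller calibration set, just to confirm the pipeline finishes end to end (the -NVFP4-smoke output dir is just a scratch name):

python -u /workspace/spark-config/scripts/quantize_glm47_prism_nvfp4.py \
--model-dir /workspace/models/Ex0bit/GLM-4.7-PRISM \
--output-dir /workspace/models/Ex0bit/GLM-4.7-PRISM-NVFP4-smoke \
--num-calib 16 \
--max-seq-len 256 \
--streaming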
Exact script I run
This is the full script (as-is):
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict, Iterator, List

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

import modelopt
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint


def _pick_text(ex: Dict[str, Any]) -> str:
    for k in ("text", "content", "prompt", "instruction", "input"):
        v = ex.get(k)
        if isinstance(v, str) and v.strip():
            return v
    # fallback: concat a few string fields
    parts: List[str] = []
    for v in ex.values():
        if isinstance(v, str) and v.strip():
            parts.append(v.strip())
        if len(parts) >= 3:
            break
    if parts:
        return "\n".join(parts)
    return json.dumps(ex, ensure_ascii=False)[:2000]


def _iter_texts(dataset_id: str, split: str, n: int, streaming: bool, name: str | None = None) -> Iterator[str]:
    ds = load_dataset(dataset_id, name=name, split=split, streaming=streaming)
    if streaming:
        i = 0
        for ex in ds:
            yield _pick_text(ex)
            i += 1
            if i >= n:
                break
    else:
        for i in range(min(n, len(ds))):
            yield _pick_text(ds[i])


def _write_hf_quant_config(out_dir: Path) -> None:
    cfg = {
        "producer": {"name": "modelopt", "version": getattr(modelopt, "__version__", "unknown")},
        "quantization": {
            "quant_algo": "NVFP4",
            "kv_cache_quant_algo": "FP8",
            "group_size": 16,
            "exclude_modules": ["lm_head"],
        },
    }
    (out_dir / "hf_quant_config.json").write_text(json.dumps(cfg, indent=4), encoding="utf-8")


def _verify_real(out_dir: Path) -> int:
    try:
        from safetensors import safe_open
    except Exception as e:
        print(f"WARN: cannot verify (no safetensors): {e}", file=sys.stderr)
        return 2
    shards = sorted(out_dir.glob("model-*.safetensors"))
    if not shards:
        print("WARN: no model-*.safetensors found", file=sys.stderr)
        return 2
    u8_w = 0
    fp8_s = 0
    with safe_open(str(shards[0]), framework="pt", device="cpu") as f:
        for k in f.keys():
            if k.endswith(".weight") and f.get_slice(k).get_dtype() == "U8":
                u8_w += 1
            if k.endswith("weight_scale") and f.get_slice(k).get_dtype().startswith("F8_"):
                fp8_s += 1
    print(f"verify: {shards[0].name} u8_weights={u8_w} fp8_scales={fp8_s}")
    return 0 if (u8_w > 50 and fp8_s > 50) else 2


def main() -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--model-dir", required=True)
    ap.add_argument("--output-dir", required=True)
    ap.add_argument("--num-calib", type=int, default=512)
    ap.add_argument("--max-seq-len", type=int, default=512)
    ap.add_argument("--streaming", action="store_true")
    args = ap.parse_args()
    model_dir = Path(args.model_dir).resolve()
    out_dir = Path(args.output_dir).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)
    print("modelopt", getattr(modelopt, "__version__", "unknown"))
    print("cuda", torch.cuda.is_available(), "gpus", torch.cuda.device_count())
    print("Loading tokenizer...", flush=True)
    tok = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True)
    print("Loading model (can take a while)...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        str(model_dir),
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model.eval()
    try:
        model.config.use_cache = False
    except Exception:
        pass
    print("Loading calibration datasets...", flush=True)
    # Salyut1 mix: cnn_dailymail + nemotron-post-training-dataset-v2
    n_a = args.num_calib // 2
    n_b = args.num_calib - n_a
    texts_a = list(_iter_texts("abisee/cnn_dailymail", "train", n_a, args.streaming, name="3.0.0"))
    # Nemotron dataset is gated; if not authenticated, fall back to an open calibration set.
    try:
        texts_b = list(_iter_texts("nvidia/Nemotron-Post-Training-Dataset-v2", "train", n_b, args.streaming))
    except Exception as e:
        print(
            f"WARN: cannot load nvidia/Nemotron-Post-Training-Dataset-v2 (gated?). Falling back to neuralmagic/calibration LLM. Error: {e}",
            flush=True,
        )
        texts_b = list(_iter_texts("neuralmagic/calibration", "train", n_b, args.streaming, name="LLM"))
    texts = texts_a + texts_b
    print("Tokenizing calibration set...", flush=True)
    calib = [tok(t, truncation=True, max_length=args.max_seq_len, return_tensors="pt") for t in texts]
    # Feed inputs to the first device in the HF device map
    input_dev = next(model.parameters()).device

    def forward_loop(m):
        with torch.inference_mode():
            for enc in calib:
                enc = {k: v.to(input_dev) for k, v in enc.items() if v is not None}
                _ = m(**enc)

    print("Quantizing: NVFP4_AWQ_FULL_CFG", flush=True)
    model_q = mtq.quantize(model, mtq.NVFP4_AWQ_FULL_CFG, forward_loop=forward_loop)
    print("Exporting REAL HF checkpoint (export_hf_checkpoint)", flush=True)
    export_hf_checkpoint(model_q, dtype=torch.bfloat16, export_dir=str(out_dir), save_modelopt_state=True)
    tok.save_pretrained(str(out_dir))
    _write_hf_quant_config(out_dir)
    rc = _verify_real(out_dir)
    print("DONE rc=", rc)
    return rc


if __name__ == "__main__":
    raise SystemExit(main())
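For the next attempt I'm thinking of adding coarse progress logging around the calibration loop and the quantize call, so I can see whether the hours go into the calibration forwards or into the awq_lite search. A rough sketch, reusing the calib / input_dev variables from the script above:

import time

def timed_forward_loop(m):
    # Same calibration loop as in the script, but with periodic progress prints.
    start = time.perf_counter()
    with torch.inference_mode():
        for i, enc in enumerate(calib):
            enc = {k: v.to(input_dev) for k, v in enc.items() if v is not None}
            _ = m(**enc)
            if (i + 1) % 32 == 0:
                print(f"calib {i + 1}/{len(calib)} after {time.perf_counter() - start:.1f}s", flush=True)

t0 = time.perf_counter()
model_q = mtq.quantize(model, mtq.NVFP4_AWQ_FULL_CFG, forward_loop=timed_forward_loop)
print(f"mtq.quantize took {time.perf_counter() - t0:.1f}s total", flush=True)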
It took 50+ hours, and as far as I can tell, it ruined the weights... the only output is "!!!!!....." (((
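To check whether the export itself produced real NVFP4 tensors (rather than something going wrong only at serving time), I plan to scan every shard instead of only the first one, using the same safetensors approach the script already uses:

from pathlib import Path
from safetensors import safe_open

out_dir = Path("/workspace/models/Ex0bit/GLM-4.7-PRISM-NVFP4")
u8_w = fp8_s = other_w = 0
for shard in sorted(out_dir.glob("model-*.safetensors")):
    with safe_open(str(shard), framework="pt", device="cpu") as f:
        for k in f.keys():
            dt = f.get_slice(k).get_dtype()
            if k.endswith(".weight"):
                # NVFP4 weights should be exported as packed U8 tensors.
                if dt == "U8":
                    u8_w += 1
                else:
                    other_w += 1
            elif k.endswith("weight_scale") and dt.startswith("F8_"):
                fp8_s += 1
print(f"u8 packed weights: {u8_w}, fp8 scales: {fp8_s}, non-U8 weights: {other_w}")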