How long did it take to quantize?
#6
by stepnoy - opened
I’m new to quantizing large models and recently tried converting GLM 4.7 to NVFP4. The process has been running for over 48 hours, and I’m wondering if this is typical. How long did it take in your case?
It seems to be stuck on the phase
awq_lite: Searching parameters...
for at least 24 hours now. Only one CPU core (out of 164 vCPUs) is active, pinned at 100%.
The GPUs are idle at the moment, although they were clearly being used somehow earlier.
Is this OK? Should I keep waiting, or is my approach wrong? RunPod is eating my money...
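If the awq_lite search itself turns out to be the slow part, I'm considering dropping the AWQ weight search and doing plain NVFP4 PTQ calibration instead. A minimal sketch of the one-line change, assuming mtq.NVFP4_DEFAULT_CFG is available in ModelOpt 0.40.0 (I haven't checked how much quality it costs compared to NVFP4_AWQ_FULL_CFG):

import modelopt.torch.quantization as mtq

# Assumed alternative: plain NVFP4 PTQ calibration, which should skip the
# awq_lite parameter search. `model` and `forward_loop` are the ones defined
# in the script further down this post.
model_q = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop=forward_loop)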
Here are my specs:
Hardware / environment
- RunPod instance
- GPUs: 7× NVIDIA H200
- PyTorch: 2.9.1+cu128
- ModelOpt: 0.40.0
Exact command I run
python -u /workspace/spark-config/scripts/quantize_glm47_prism_nvfp4.py \
--model-dir /workspace/models/Ex0bit/GLM-4.7-PRISM \
--output-dir /workspace/models/Ex0bit/GLM-4.7-PRISM-NVFP4 \
--num-calib 512 \
--max-seq-len 512 \
--streaming
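Before burning more hours, I'm also planning a quick smoke run with the same script but a much smaller calibration set, just to confirm the pipeline finishes end to end (the -NVFP4-smoke output dir is just a scratch name):

python -u /workspace/spark-config/scripts/quantize_glm47_prism_nvfp4.py \
--model-dir /workspace/models/Ex0bit/GLM-4.7-PRISM \
--output-dir /workspace/models/Ex0bit/GLM-4.7-PRISM-NVFP4-smoke \
--num-calib 16 \
--max-seq-len 256 \
--streaming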
Exact script I run
This is the full script (as-is):
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict, Iterator, List

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

import modelopt
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint


def _pick_text(ex: Dict[str, Any]) -> str:
    for k in ("text", "content", "prompt", "instruction", "input"):
        v = ex.get(k)
        if isinstance(v, str) and v.strip():
            return v
    # fallback: concat a few string fields
    parts: List[str] = []
    for v in ex.values():
        if isinstance(v, str) and v.strip():
            parts.append(v.strip())
        if len(parts) >= 3:
            break
    if parts:
        return "\n".join(parts)
    return json.dumps(ex, ensure_ascii=False)[:2000]


def _iter_texts(dataset_id: str, split: str, n: int, streaming: bool, name: str | None = None) -> Iterator[str]:
    ds = load_dataset(dataset_id, name=name, split=split, streaming=streaming)
    if streaming:
        i = 0
        for ex in ds:
            yield _pick_text(ex)
            i += 1
            if i >= n:
                break
    else:
        for i in range(min(n, len(ds))):
            yield _pick_text(ds[i])


def _write_hf_quant_config(out_dir: Path) -> None:
    cfg = {
        "producer": {"name": "modelopt", "version": getattr(modelopt, "__version__", "unknown")},
        "quantization": {
            "quant_algo": "NVFP4",
            "kv_cache_quant_algo": "FP8",
            "group_size": 16,
            "exclude_modules": ["lm_head"],
        },
    }
    (out_dir / "hf_quant_config.json").write_text(json.dumps(cfg, indent=4), encoding="utf-8")


def _verify_real(out_dir: Path) -> int:
    try:
        from safetensors import safe_open
    except Exception as e:
        print(f"WARN: cannot verify (no safetensors): {e}", file=sys.stderr)
        return 2
    shards = sorted(out_dir.glob("model-*.safetensors"))
    if not shards:
        print("WARN: no model-*.safetensors found", file=sys.stderr)
        return 2
    u8_w = 0
    fp8_s = 0
    with safe_open(str(shards[0]), framework="pt", device="cpu") as f:
        for k in f.keys():
            if k.endswith(".weight") and f.get_slice(k).get_dtype() == "U8":
                u8_w += 1
            if k.endswith("weight_scale") and f.get_slice(k).get_dtype().startswith("F8_"):
                fp8_s += 1
    print(f"verify: {shards[0].name} u8_weights={u8_w} fp8_scales={fp8_s}")
    return 0 if (u8_w > 50 and fp8_s > 50) else 2


def main() -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--model-dir", required=True)
    ap.add_argument("--output-dir", required=True)
    ap.add_argument("--num-calib", type=int, default=512)
    ap.add_argument("--max-seq-len", type=int, default=512)
    ap.add_argument("--streaming", action="store_true")
    args = ap.parse_args()
    model_dir = Path(args.model_dir).resolve()
    out_dir = Path(args.output_dir).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)
    print("modelopt", getattr(modelopt, "__version__", "unknown"))
    print("cuda", torch.cuda.is_available(), "gpus", torch.cuda.device_count())
    print("Loading tokenizer...", flush=True)
    tok = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True)
    print("Loading model (can take a while)...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        str(model_dir),
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model.eval()
    try:
        model.config.use_cache = False
    except Exception:
        pass
    print("Loading calibration datasets...", flush=True)
    # Salyut1 mix: cnn_dailymail + nemotron-post-training-dataset-v2
    n_a = args.num_calib // 2
    n_b = args.num_calib - n_a
    texts_a = list(_iter_texts("abisee/cnn_dailymail", "train", n_a, args.streaming, name="3.0.0"))
    # Nemotron dataset is gated; if not authenticated, fall back to an open calibration set.
    try:
        texts_b = list(_iter_texts("nvidia/Nemotron-Post-Training-Dataset-v2", "train", n_b, args.streaming))
    except Exception as e:
        print(
            f"WARN: cannot load nvidia/Nemotron-Post-Training-Dataset-v2 (gated?). Falling back to neuralmagic/calibration LLM. Error: {e}",
            flush=True,
        )
        texts_b = list(_iter_texts("neuralmagic/calibration", "train", n_b, args.streaming, name="LLM"))
    texts = texts_a + texts_b
    print("Tokenizing calibration set...", flush=True)
    calib = [tok(t, truncation=True, max_length=args.max_seq_len, return_tensors="pt") for t in texts]
    # Feed inputs to the first device in the HF device map
    input_dev = next(model.parameters()).device

    def forward_loop(m):
        with torch.inference_mode():
            for enc in calib:
                enc = {k: v.to(input_dev) for k, v in enc.items() if v is not None}
                _ = m(**enc)

    print("Quantizing: NVFP4_AWQ_FULL_CFG", flush=True)
    model_q = mtq.quantize(model, mtq.NVFP4_AWQ_FULL_CFG, forward_loop=forward_loop)
    print("Exporting REAL HF checkpoint (export_hf_checkpoint)", flush=True)
    export_hf_checkpoint(model_q, dtype=torch.bfloat16, export_dir=str(out_dir), save_modelopt_state=True)
    tok.save_pretrained(str(out_dir))
    _write_hf_quant_config(out_dir)
    rc = _verify_real(out_dir)
    print("DONE rc=", rc)
    return rc


if __name__ == "__main__":
    raise SystemExit(main())
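For the next attempt I'm thinking of adding coarse progress logging around the calibration loop and the quantize call, so I can see whether the hours go into the calibration forwards or into the awq_lite search. A rough sketch, reusing the calib / input_dev variables from the script above:

import time

def timed_forward_loop(m):
    # Same calibration loop as in the script, but with periodic progress prints.
    start = time.perf_counter()
    with torch.inference_mode():
        for i, enc in enumerate(calib):
            enc = {k: v.to(input_dev) for k, v in enc.items() if v is not None}
            _ = m(**enc)
            if (i + 1) % 32 == 0:
                print(f"calib {i + 1}/{len(calib)} after {time.perf_counter() - start:.1f}s", flush=True)

t0 = time.perf_counter()
model_q = mtq.quantize(model, mtq.NVFP4_AWQ_FULL_CFG, forward_loop=timed_forward_loop)
print(f"mtq.quantize took {time.perf_counter() - t0:.1f}s total", flush=True)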
It took 50+ hours, and as far as I can tell, it ruined the weights... the only output is "!!!!!....." (((
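To check whether the export itself produced real NVFP4 tensors (rather than something going wrong only at serving time), I plan to scan every shard instead of only the first one, using the same safetensors approach the script already uses:

from pathlib import Path
from safetensors import safe_open

out_dir = Path("/workspace/models/Ex0bit/GLM-4.7-PRISM-NVFP4")
u8_w = fp8_s = other_w = 0
for shard in sorted(out_dir.glob("model-*.safetensors")):
    with safe_open(str(shard), framework="pt", device="cpu") as f:
        for k in f.keys():
            dt = f.get_slice(k).get_dtype()
            if k.endswith(".weight"):
                # NVFP4 weights should be exported as packed U8 tensors.
                if dt == "U8":
                    u8_w += 1
                else:
                    other_w += 1
            elif k.endswith("weight_scale") and dt.startswith("F8_"):
                fp8_s += 1
print(f"u8 packed weights: {u8_w}, fp8 scales: {fp8_s}, non-U8 weights: {other_w}")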