It sounds like you're comparing your word-level labels against the processor's token-level labels. Those will never line up one-to-one, because any word the tokenizer splits into multiple subwords gets either duplicated labels or -100 fillers on the extra subword positions. Here's a minimal example that reproduces the mismatch and shows two correct ways to compare:
from transformers import LayoutLMv3Processor
from PIL import Image
# --- toy invoice words, one value likely splits into multiple subwords ---
words = ["Invoice", "No.", "12345", "Total", "USD", "1,234.56", "."]
boxes = [
    [ 50,  50, 200, 100],
    [210,  50, 260, 100],
    [270,  50, 380, 100],
    [ 50, 150, 140, 200],
    [150, 150, 220, 200],
    [230, 150, 380, 200],
    [390, 150, 405, 200],
]
# 0 = O, 1 = INVOICE_NO, 3 = AMOUNT (example)
word_labels = [0, 0, 1, 0, 0, 3, 0]
image = Image.new("RGB", (1000, 1000), "white")
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
# ------------------
# WRONG COMPARISON
# ------------------
# Make the tokenizer label *every* subword, so any split word duplicates its label.
processor.tokenizer.only_label_first_subword = False
enc_wrong = processor(
    images=image,
    text=words,
    boxes=boxes,
    word_labels=word_labels,
    truncation=True,
    padding="max_length",
    max_length=128,
    return_tensors="pt",
)
labels_tok_wrong = enc_wrong["labels"][0].tolist()
# Naively drop -100 (special tokens, padding, or ignored subtokens)
labels_wrong_naive = [l for l in labels_tok_wrong if l != -100]
print("WRONG: compare original vs processor labels after removing -100")
print("original:", word_labels)
print("encoded :", labels_wrong_naive[:len(word_labels)+10]) # show a slice
print("equal? ", word_labels == labels_wrong_naive)
# ------------------
# CORRECT COMPARISON (two valid options)
# ------------------
# Option A: Keep only first subword labels during encoding
processor.tokenizer.only_label_first_subword = True
enc_ok = processor(
    images=image,
    text=words,
    boxes=boxes,
    word_labels=word_labels,
    truncation=True,
    padding="max_length",
    max_length=128,
    return_tensors="pt",
)
labels_tok_ok = enc_ok["labels"][0].tolist()
labels_ok_naive = [l for l in labels_tok_ok if l != -100] # now this is 1:1 with words
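# (-100 is the default ignore_index of torch.nn.CrossEntropyLoss, which is why the
#  tokenizer uses it for special tokens, padding, and non-first subwords: those
#  positions are simply skipped by the loss during training.)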
print("\nCORRECT A: only_label_first_subword=True then drop -100")
print("original:", word_labels)
print("encoded :", labels_ok_naive)
print("equal? ", word_labels == labels_ok_naive)
# Option B: Collapse token-level labels back to word-level using word_ids()
word_ids = enc_wrong.word_ids(0) # from the earlier 'enc_wrong' with duplicated subword labels
recovered = []
seen = set()
for wid, lab in zip(word_ids, labels_tok_wrong):
    if wid is None or lab == -100:
        continue
    if wid not in seen:  # first subword of each word only
        recovered.append(lab)
        seen.add(wid)
print("\nCORRECT B: collapse tokens -> words via word_ids() on any encoding")
print("original:", word_labels)
print("recovered:", recovered)
print("equal? ", word_labels == recovered)
"""
WRONG: compare original vs processor labels after removing -100
original: [0, 0, 1, 0, 0, 3, 0]
encoded : [0, 0, 0, 0, 1, 1, 0, 0, 3, 3, 3, 3, 3, 0]
equal? False

CORRECT A: only_label_first_subword=True then drop -100
original: [0, 0, 1, 0, 0, 3, 0]
encoded : [0, 0, 1, 0, 0, 3, 0]
equal? True

CORRECT B: collapse tokens -> words via word_ids() on any encoding
original: [0, 0, 1, 0, 0, 3, 0]
recovered: [0, 0, 1, 0, 0, 3, 0]
equal? True
"""