stas committed
Commit a09f129 · 1 Parent(s): 2ab4c76

smaller tokenizer and model

Files changed (6):
  1. config.json +1 -1
  2. make-tiny-deberta.py +28 -45
  3. merges.txt +0 -0
  4. pytorch_model.bin +2 -2
  5. tokenizer.json +0 -0
  6. vocab.json +0 -0
config.json CHANGED
@@ -29,5 +29,5 @@
   "torch_dtype": "float16",
   "transformers_version": "4.9.0.dev0",
   "type_vocab_size": 0,
-  "vocab_size": 50265
+  "vocab_size": 5001
  }
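
For reference, the new vocab_size can be sanity-checked against the rebuilt checkpoint; a minimal sketch, assuming the updated files sit in a local tiny-deberta/ directory (the path is illustrative, not part of the commit):

    from transformers import DebertaConfig, DebertaModel

    # "tiny-deberta" is an assumed local directory holding the updated files
    config = DebertaConfig.from_pretrained("tiny-deberta")
    model = DebertaModel.from_pretrained("tiny-deberta")

    # the embedding matrix row count should match the vocab_size recorded in config.json
    assert config.vocab_size == 5001
    assert model.get_input_embeddings().weight.shape[0] == config.vocab_size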
make-tiny-deberta.py CHANGED
@@ -74,50 +74,33 @@ mname_tiny = "tiny-deberta"
 
  ### Tokenizer
 
-
- # XXX: can't figure out how to shrink this tokenizer's vocab! Help?
-
- # # Shrink the orig vocab to keep things small (just enough to tokenize any word, so letters+symbols)
- # # DebertaTokenizerFast is fully defined by a tokenizer.json, which contains the vocab and the ids, so we just need to truncate it wisely
- # import subprocess
- # tokenizer_fast = DebertaTokenizerFast.from_pretrained(mname_orig)
- # vocab_keep_items = 50265
- # tmp_dir = f"/tmp/{mname_tiny}"
- # tokenizer_fast.save_pretrained(tmp_dir)
-
- # # resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
- # # perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
- # closing_pat = "}}}"
- # cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/tokenizer.json").split()
- # result = subprocess.run(cmd, capture_output=True, text=True)
- # # reload with modified tokenizer
- # tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(tmp_dir)
- # # it seems that DebertaTokenizer is not needed and DebertaTokenizerFast does the job
-
-
- # # Shrink the orig vocab to keep things small (just enough to tokenize any word, so letters+symbols)
- # # ElectraTokenizerFast is fully defined by a tokenizer.json, which contains the vocab and the ids, so we just need to truncate it wisely
- # import subprocess
- # tokenizer_fast = DebertaTokenizerFast.from_pretrained(mname_orig)
- # vocab_keep_items = 5120
- # tmp_dir = f"/tmp/{mname_tiny}"
- # vocab_short_path = f"{tmp_dir}/vocab.json"
- # tokenizer_fast.save_pretrained(tmp_dir)
- # # resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
- # # perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
- # closing_pat = "}"
- # cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/vocab.json").split()
- # result = subprocess.run(cmd, capture_output=True, text=True)
- # # reload with modified tokenizer
- # #tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(tmp_dir, vocab_file=vocab_short_path)
- # # it seems that ElectraTokenizer is not needed and ElectraTokenizerFast does the job
-
-
- # using full tokenizer for now
- tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(mname_orig)
-
-
-
+ import json
+ from transformers import AutoTokenizer
+ from tokenizers import Tokenizer
+ vocab_keep_items = 5000
+ tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
+ assert tokenizer.is_fast, "This only works for fast tokenizers."
+ tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
+ vocab = tokenizer_json["model"]["vocab"]
+ if tokenizer_json["model"]["type"] == "BPE":
+     new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
+     merges = tokenizer_json["model"]["merges"]
+     new_merges = []
+     for i in range(len(merges)):
+         a, b = merges[i].split()
+         new_token = "".join((a, b))
+         if a in new_vocab and b in new_vocab and new_token in new_vocab:
+             new_merges.append(merges[i])
+     tokenizer_json["model"]["merges"] = new_merges
+ elif tokenizer_json["model"]["type"] == "Unigram":
+     new_vocab = vocab[:vocab_keep_items]
+ elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel":
+     new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
+ else:
+     raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
+ tokenizer_json["model"]["vocab"] = new_vocab
+ tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
+ tokenizer_fast_tiny = tokenizer
 
 
  ### Config
@@ -126,7 +109,7 @@ config_tiny = DebertaConfig.from_pretrained(mname_orig)
  print(config_tiny)
  # remember to update this to the actual config as each model is different and then shrink the numbers
  config_tiny.update(dict(
-     #vocab_size=vocab_keep_items,
+     vocab_size=vocab_keep_items,
      embedding_size=32,
      pooler_size=32,
      hidden_size=32,
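
To see what the shrinking buys, here is a quick check that can be run right after the tokenizer block above, reusing the script's tokenizer_fast_tiny and vocab_keep_items (a sketch, not part of the commit): the tokenizer is a GPT-2-style byte-level BPE, so arbitrary text should still encode, with every non-special id staying below vocab_keep_items.

    # sketch: run after the tokenizer-shrinking block in make-tiny-deberta.py
    text = "Shrinking the vocab keeps this test model tiny."
    ids = tokenizer_fast_tiny(text)["input_ids"]
    assert all(i < vocab_keep_items or i in tokenizer_fast_tiny.all_special_ids for i in ids)
    print(tokenizer_fast_tiny.decode(ids))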
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d3a7c1679f1892c270c43d7ee3e37245c7f4e619179379bdaa313bcd05f46210
- size 3395431
+ oid sha256:5f071626d5c3781b98f722d52a8e7f1ae7e0df341123f3e764fdf4798d8ca59f
+ size 408039
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff