stas committed
Commit a09f129 · 1 Parent(s): 2ab4c76

smaller tokenizer and model

Files changed (6):
  1. config.json +1 -1
  2. make-tiny-deberta.py +28 -45
  3. merges.txt +0 -0
  4. pytorch_model.bin +2 -2
  5. tokenizer.json +0 -0
  6. vocab.json +0 -0
config.json CHANGED
@@ -29,5 +29,5 @@
   "torch_dtype": "float16",
   "transformers_version": "4.9.0.dev0",
   "type_vocab_size": 0,
-  "vocab_size": 50265
+  "vocab_size": 5001
  }
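
For reference, the new vocab_size can be sanity-checked against the rebuilt checkpoint; a minimal sketch, assuming the updated files sit in a local tiny-deberta/ directory (the path is illustrative, not part of the commit):

    from transformers import DebertaConfig, DebertaModel

    # "tiny-deberta" is an assumed local directory holding the updated files
    config = DebertaConfig.from_pretrained("tiny-deberta")
    model = DebertaModel.from_pretrained("tiny-deberta")

    # the embedding matrix row count should match the vocab_size recorded in config.json
    assert config.vocab_size == 5001
    assert model.get_input_embeddings().weight.shape[0] == config.vocab_size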
make-tiny-deberta.py CHANGED
@@ -74,50 +74,33 @@ mname_tiny = "tiny-deberta"
 
  ### Tokenizer
 
-
- # XXX: can't figure out how to shrink this tokenizer's vocab! Help?
-
- # # Shrink the orig vocab to keep things small (just enough to tokenize any word, so letters+symbols)
- # # DebertaTokenizerFast is fully defined by a tokenizer.json, which contains the vocab and the ids, so we just need to truncate it wisely
- # import subprocess
- # tokenizer_fast = DebertaTokenizerFast.from_pretrained(mname_orig)
- # vocab_keep_items = 50265
- # tmp_dir = f"/tmp/{mname_tiny}"
- # tokenizer_fast.save_pretrained(tmp_dir)
-
- # # resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
- # # perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
- # closing_pat = "}}}"
- # cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/tokenizer.json").split()
- # result = subprocess.run(cmd, capture_output=True, text=True)
- # # reload with modified tokenizer
- # tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(tmp_dir)
- # # it seems that DebertaTokenizer is not needed and DebertaTokenizerFast does the job
-
-
- # # Shrink the orig vocab to keep things small (just enough to tokenize any word, so letters+symbols)
- # # ElectraTokenizerFast is fully defined by a tokenizer.json, which contains the vocab and the ids, so we just need to truncate it wisely
- # import subprocess
- # tokenizer_fast = DebertaTokenizerFast.from_pretrained(mname_orig)
- # vocab_keep_items = 5120
- # tmp_dir = f"/tmp/{mname_tiny}"
- # vocab_short_path = f"{tmp_dir}/vocab.json"
- # tokenizer_fast.save_pretrained(tmp_dir)
- # # resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
- # # perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
- # closing_pat = "}"
- # cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/vocab.json").split()
- # result = subprocess.run(cmd, capture_output=True, text=True)
- # # reload with modified tokenizer
- # #tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(tmp_dir, vocab_file=vocab_short_path)
- # # it seems that ElectraTokenizer is not needed and ElectraTokenizerFast does the job
-
-
- # using full tokenizer for now
- tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(mname_orig)
-
-
-
+ import json
+ from transformers import AutoTokenizer
+ from tokenizers import Tokenizer
+ vocab_keep_items = 5000
+ tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
+ assert tokenizer.is_fast, "This only works for fast tokenizers."
+ tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
+ vocab = tokenizer_json["model"]["vocab"]
+ if tokenizer_json["model"]["type"] == "BPE":
+     new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
+     merges = tokenizer_json["model"]["merges"]
+     new_merges = []
+     for i in range(len(merges)):
+         a, b = merges[i].split()
+         new_token = "".join((a, b))
+         if a in new_vocab and b in new_vocab and new_token in new_vocab:
+             new_merges.append(merges[i])
+     tokenizer_json["model"]["merges"] = new_merges
+ elif tokenizer_json["model"]["type"] == "Unigram":
+     new_vocab = vocab[:vocab_keep_items]
+ elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel":
+     new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
+ else:
+     raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
+ tokenizer_json["model"]["vocab"] = new_vocab
+ tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
+ tokenizer_fast_tiny = tokenizer
 
 
  ### Config
@@ -126,7 +109,7 @@ config_tiny = DebertaConfig.from_pretrained(mname_orig)
  print(config_tiny)
  # remember to update this to the actual config as each model is different and then shrink the numbers
  config_tiny.update(dict(
-     #vocab_size=vocab_keep_items,
+     vocab_size=vocab_keep_items,
      embedding_size=32,
      pooler_size=32,
      hidden_size=32,
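
To see what the shrinking buys, here is a quick check that can be run right after the tokenizer block above, reusing the script's tokenizer_fast_tiny and vocab_keep_items (a sketch, not part of the commit): the tokenizer is a GPT-2-style byte-level BPE, so arbitrary text should still encode, with every non-special id staying below vocab_keep_items.

    # sketch: run after the tokenizer-shrinking block in make-tiny-deberta.py
    text = "Shrinking the vocab keeps this test model tiny."
    ids = tokenizer_fast_tiny(text)["input_ids"]
    assert all(i < vocab_keep_items or i in tokenizer_fast_tiny.all_special_ids for i in ids)
    print(tokenizer_fast_tiny.decode(ids))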
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d3a7c1679f1892c270c43d7ee3e37245c7f4e619179379bdaa313bcd05f46210
- size 3395431
+ oid sha256:5f071626d5c3781b98f722d52a8e7f1ae7e0df341123f3e764fdf4798d8ca59f
+ size 408039
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff