Kaguya-19
committed on
Commit
·
1023e8b
1
Parent(s):
5d2fd73
fit for Sentence Transformer
Browse files
- README.md +92 -23
- config.json +1 -1
README.md
CHANGED
|
@@ -85,40 +85,53 @@ flash-attn>2.3.5
|
|
| 85 |
|
| 86 |
### 示例脚本 Demo
|
| 87 |
|
|
|
|
|
|
|
| 88 |
```python
|
| 89 |
-
from transformers import AutoModel,
|
| 90 |
import torch
|
| 91 |
import numpy as np
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
model_name = "openbmb/MiniCPM-Reranker"
|
| 94 |
-
tokenizer =
|
| 95 |
tokenizer.padding_side = "right"
|
|
|
|
| 96 |
model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True,attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
|
| 97 |
model.eval()
|
| 98 |
-
max_len_q, max_len_d = 512, 512
|
| 99 |
-
|
| 100 |
-
def tokenize_our(query,doc):
|
| 101 |
-
input_id_query = tokenizer.encode(query, add_special_tokens=False, max_length=max_len_q, truncation=True)
|
| 102 |
-
input_id_doc = tokenizer.encode(doc, add_special_tokens=False, max_length=max_len_d, truncation=True)
|
| 103 |
-
pad_input = {"input_ids": [tokenizer.bos_token_id] + input_id_query + [tokenizer.eos_token_id] + input_id_doc}
|
| 104 |
-
return tokenizer.pad(
|
| 105 |
-
pad_input,
|
| 106 |
-
padding="max_length",
|
| 107 |
-
max_length=max_len_q + max_len_d + 2,
|
| 108 |
-
return_tensors="pt",
|
| 109 |
-
)
|
| 110 |
|
| 111 |
@torch.no_grad()
|
| 112 |
def rerank(input_query, input_docs):
|
| 113 |
-
tokenized_inputs = [
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
for k in input_ids:
|
| 120 |
-
input_ids[k] = torch.stack(input_ids[k]).to("cuda")
|
| 121 |
-
outputs = model(**input_ids)
|
| 122 |
score = outputs.logits
|
| 123 |
return score.float().detach().cpu().numpy()
|
| 124 |
|
|
@@ -136,6 +149,62 @@ for i in range(len(queries)):
|
|
| 136 |
print(np.array(scores)) # [[[-4.7460938][-8.8515625]]]
|
| 137 |
```
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
## 实验结果 Evaluation Results
|
| 140 |
|
| 141 |
### 中文与英文重排序结果 CN/EN Re-ranking Results
|
|
|
|
| 85 |
|
| 86 |
### 示例脚本 Demo
|
| 87 |
|
| 88 |
+
#### Huggingface Transformers
|
| 89 |
+
|
| 90 |
```python
|
| 91 |
+
from transformers import AutoModel, LlamaTokenizer, AutoModelForSequenceClassification
|
| 92 |
import torch
|
| 93 |
import numpy as np
|
| 94 |
|
| 95 |
+
# from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
|
| 96 |
+
class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
|
| 97 |
+
def build_inputs_with_special_tokens(
|
| 98 |
+
self, token_ids_0, token_ids_1 = None
|
| 99 |
+
):
|
| 100 |
+
"""
|
| 101 |
+
- single sequence: `<s> X </s>`
|
| 102 |
+
- pair of sequences: `<s> A </s> B`
|
| 103 |
+
|
| 104 |
+
Args:
|
| 105 |
+
token_ids_0 (`List[int]`):
|
| 106 |
+
List of IDs to which the special tokens will be added.
|
| 107 |
+
token_ids_1 (`List[int]`, *optional*):
|
| 108 |
+
Optional second list of IDs for sequence pairs.
|
| 109 |
+
|
| 110 |
+
Returns:
|
| 111 |
+
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
| 112 |
+
"""
|
| 113 |
+
|
| 114 |
+
if token_ids_1 is None:
|
| 115 |
+
return super().build_inputs_with_special_tokens(token_ids_0)
|
| 116 |
+
bos = [self.bos_token_id]
|
| 117 |
+
sep = [self.eos_token_id]
|
| 118 |
+
return bos + token_ids_0 + sep + token_ids_1
|
| 119 |
+
|
| 120 |
model_name = "openbmb/MiniCPM-Reranker"
|
| 121 |
+
tokenizer = MiniCPMRerankerLLamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 122 |
tokenizer.padding_side = "right"
|
| 123 |
+
|
| 124 |
model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True,attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
|
| 125 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
@torch.no_grad()
|
| 128 |
def rerank(input_query, input_docs):
|
| 129 |
+
tokenized_inputs = tokenizer([[input_query, input_doc] for input_doc in input_docs], return_tensors="pt", padding=True, truncation=True, max_length=1024)
|
| 130 |
+
|
| 131 |
+
for k in tokenized_inputs:
|
| 132 |
+
tokenized_inputs[k] = tokenized_inputs[k].to("cuda")
|
| 133 |
+
|
| 134 |
+
outputs = model(**tokenized_inputs)
|
|
|
|
|
|
|
|
|
|
| 135 |
score = outputs.logits
|
| 136 |
return score.float().detach().cpu().numpy()
|
| 137 |
|
|
|
|
| 149 |
print(np.array(scores)) # [[[-4.7460938][-8.8515625]]]
|
| 150 |
```
|
| 151 |
|
| 152 |
+
#### Sentence Transformer
|
| 153 |
+
|
| 154 |
+
```python
|
| 155 |
+
from sentence_transformers import CrossEncoder
|
| 156 |
+
from transformers import LlamaTokenizer
|
| 157 |
+
import torch
|
| 158 |
+
|
| 159 |
+
# from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
|
| 160 |
+
class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
|
| 161 |
+
def build_inputs_with_special_tokens(
|
| 162 |
+
self, token_ids_0, token_ids_1 = None
|
| 163 |
+
):
|
| 164 |
+
"""
|
| 165 |
+
- single sequence: `<s> X </s>`
|
| 166 |
+
- pair of sequences: `<s> A </s> B`
|
| 167 |
+
|
| 168 |
+
Args:
|
| 169 |
+
token_ids_0 (`List[int]`):
|
| 170 |
+
List of IDs to which the special tokens will be added.
|
| 171 |
+
token_ids_1 (`List[int]`, *optional*):
|
| 172 |
+
Optional second list of IDs for sequence pairs.
|
| 173 |
+
|
| 174 |
+
Returns:
|
| 175 |
+
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
| 176 |
+
"""
|
| 177 |
+
|
| 178 |
+
if token_ids_1 is None:
|
| 179 |
+
return super().build_inputs_with_special_tokens(token_ids_0)
|
| 180 |
+
bos = [self.bos_token_id]
|
| 181 |
+
sep = [self.eos_token_id]
|
| 182 |
+
return bos + token_ids_0 + sep + token_ids_1
|
| 183 |
+
|
| 184 |
+
model_name = "openbmb/MiniCPM-Reranker"
|
| 185 |
+
model = CrossEncoder(model_name,max_length=1024,trust_remote_code=True, automodel_args={"attn_implementation":"flash_attention_2","torch_dtype": torch.float16})
|
| 186 |
+
model.tokenizer = MiniCPMRerankerLLamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 187 |
+
model.tokenizer.padding_side = "right"
|
| 188 |
+
|
| 189 |
+
query = "中国的首都是哪里?"
|
| 190 |
+
passages = ["beijing", "shanghai"]
|
| 191 |
+
|
| 192 |
+
INSTRUCTION = "Query: "
|
| 193 |
+
query = INSTRUCTION + query
|
| 194 |
+
|
| 195 |
+
sentence_pairs = [[query, doc] for doc in passages]
|
| 196 |
+
|
| 197 |
+
scores = model.predict(sentence_pairs, convert_to_tensor=True).tolist()
|
| 198 |
+
rankings = model.rank(query, passages, return_documents=True, convert_to_tensor=True)
|
| 199 |
+
|
| 200 |
+
print(scores) # [0.0087432861328125, 0.00020503997802734375]
|
| 201 |
+
for ranking in rankings:
|
| 202 |
+
print(f"Score: {ranking['score']:.4f}, Corpus: {ranking['text']}")
|
| 203 |
+
|
| 204 |
+
# Score: 0.0087, Corpus: beijing
|
| 205 |
+
# Score: 0.0002, Corpus: shanghai
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
## 实验结果 Evaluation Results
|
| 209 |
|
| 210 |
### 中文与英文重排序结果 CN/EN Re-ranking Results
|
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "openbmb/
|
| 3 |
"architectures": [
|
| 4 |
"MiniCPM"
|
| 5 |
],
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "openbmb/MiniCPM-Reranker",
|
| 3 |
"architectures": [
|
| 4 |
"MiniCPM"
|
| 5 |
],
|