Update README.md
Browse files
README.md
CHANGED
|
@@ -2,8 +2,6 @@
|
|
| 2 |
language: en
|
| 3 |
license: mit
|
| 4 |
---
|
| 5 |
-
# Under testing
|
| 6 |
-
|
| 7 |
# Kosmos-2.5
|
| 8 |
|
| 9 |
[Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
|
|
@@ -18,41 +16,32 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
|
|
| 18 |
Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CANNOT** guarantee the accuracy of all OCR/Markdown results in the images.
|
| 19 |
|
| 20 |
## Use with transformers:
|
| 21 |
-
```bash
|
| 22 |
-
pip install git+https://github.com/tic-top/transformers.git
|
| 23 |
-
```
|
| 24 |
```python
|
| 25 |
from PIL import Image
|
| 26 |
import requests
|
| 27 |
import torch
|
| 28 |
-
from transformers import AutoProcessor,
|
| 29 |
import re
|
| 30 |
-
|
| 31 |
-
repo = "kirp/kosmos2_5"
|
| 32 |
device = "cuda:0"
|
| 33 |
dtype = torch.bfloat16
|
| 34 |
-
model =
|
| 35 |
processor = AutoProcessor.from_pretrained(repo)
|
| 36 |
-
|
| 37 |
url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
|
| 38 |
image = Image.open(requests.get(url, stream=True).raw)
|
| 39 |
prompt = "<ocr>" # <md>
|
| 40 |
-
|
| 41 |
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
| 42 |
height, width = inputs.pop("height"), inputs.pop("width")
|
| 43 |
raw_width, raw_height = image.size
|
| 44 |
scale_height = raw_height / height
|
| 45 |
scale_width = raw_width / width
|
| 46 |
-
|
| 47 |
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
| 48 |
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
| 49 |
-
|
| 50 |
generated_ids = model.generate(
|
| 51 |
**inputs,
|
| 52 |
max_new_tokens=1024,
|
| 53 |
)
|
| 54 |
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
| 55 |
-
|
| 56 |
def postprocess(y, scale_height, scale_width):
|
| 57 |
y = y.replace(prompt, "")
|
| 58 |
if "<md>" in prompt:
|
|
@@ -73,7 +62,6 @@ def postprocess(y, scale_height, scale_width):
|
|
| 73 |
y1 = int(y1 * scale_height)
|
| 74 |
info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
|
| 75 |
return info
|
| 76 |
-
|
| 77 |
output_text = postprocess(generated_text[0], scale_height, scale_width)
|
| 78 |
print(output_text)
|
| 79 |
```
|
|
@@ -115,4 +103,3 @@ The content of this project itself is licensed under the [MIT](https://github.co
|
|
| 115 |
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
|
| 116 |
|
| 117 |
|
| 118 |
-
|
|
|
|
| 2 |
language: en
|
| 3 |
license: mit
|
| 4 |
---
|
|
|
|
|
|
|
| 5 |
# Kosmos-2.5
|
| 6 |
|
| 7 |
[Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
|
|
|
|
| 16 |
Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CANNOT** guarantee the accuracy of all OCR/Markdown results in the images.
|
| 17 |
|
| 18 |
## Use with transformers:
|
|
|
|
|
|
|
|
|
|
| 19 |
```python
|
| 20 |
from PIL import Image
|
| 21 |
import requests
|
| 22 |
import torch
|
| 23 |
+
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
|
| 24 |
import re
|
| 25 |
+
repo = "microsoft/kosmos-2.5"
|
|
|
|
| 26 |
device = "cuda:0"
|
| 27 |
dtype = torch.bfloat16
|
| 28 |
+
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
|
| 29 |
processor = AutoProcessor.from_pretrained(repo)
|
|
|
|
| 30 |
url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
|
| 31 |
image = Image.open(requests.get(url, stream=True).raw)
|
| 32 |
prompt = "<ocr>" # <md>
|
|
|
|
| 33 |
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
| 34 |
height, width = inputs.pop("height"), inputs.pop("width")
|
| 35 |
raw_width, raw_height = image.size
|
| 36 |
scale_height = raw_height / height
|
| 37 |
scale_width = raw_width / width
|
|
|
|
| 38 |
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
| 39 |
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
|
|
|
| 40 |
generated_ids = model.generate(
|
| 41 |
**inputs,
|
| 42 |
max_new_tokens=1024,
|
| 43 |
)
|
| 44 |
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
|
|
|
| 45 |
def postprocess(y, scale_height, scale_width):
|
| 46 |
y = y.replace(prompt, "")
|
| 47 |
if "<md>" in prompt:
|
|
|
|
| 62 |
y1 = int(y1 * scale_height)
|
| 63 |
info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
|
| 64 |
return info
|
|
|
|
| 65 |
output_text = postprocess(generated_text[0], scale_height, scale_width)
|
| 66 |
print(output_text)
|
| 67 |
```
|
|
|
|
| 103 |
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
|
| 104 |
|
| 105 |
|
|
|