microsoft
/

kosmos-2.5

@@ -2,8 +2,6 @@
 language: en
 license: mit
 ---
-# Under testing
 # Kosmos-2.5
 [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
@@ -18,41 +16,32 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
 Because this is a generative model, there is a risk of **hallucination** during generation, and it **CANNOT** guarantee the accuracy of all OCR/Markdown results for the images.
 ## Use with transformers:
-```bash
-pip install git+https://github.com/tic-top/transformers.git
-```
 ```python
 from PIL import Image
 import requests
 import torch
-from transformers import AutoProcessor, AutoModelForVision2Seq
 import re
-repo = "kirp/kosmos2_5"
 device = "cuda:0"
 dtype = torch.bfloat16
-model = AutoModelForVision2Seq.from_pretrained(repo, device_map=device, torch_dtype=dtype)
 processor = AutoProcessor.from_pretrained(repo)
 url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
 prompt = "<ocr>" # <md>
 inputs = processor(text=prompt, images=image, return_tensors="pt")
 height, width = inputs.pop("height"), inputs.pop("width")
 raw_width, raw_height = image.size
 scale_height = raw_height / height
 scale_width = raw_width / width
 inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
 inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
 generated_ids = model.generate(
     **inputs,
     max_new_tokens=1024,
 )
 generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
 def postprocess(y, scale_height, scale_width):
     y = y.replace(prompt, "")
     if "<md>" in prompt:
@@ -73,7 +62,6 @@ def postprocess(y, scale_height, scale_width):
             y1 = int(y1 * scale_height)
             info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
     return info
 output_text = postprocess(generated_text[0], scale_height, scale_width)
 print(output_text)
 ```
@@ -115,4 +103,3 @@ The content of this project itself is licensed under the [MIT](https://github.co
 [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)

 language: en
 license: mit
 ---
 # Kosmos-2.5
 [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
 Because this is a generative model, there is a risk of **hallucination** during generation, and it **CANNOT** guarantee the accuracy of all OCR/Markdown results for the images.
 ## Use with transformers:
 ```python
 from PIL import Image
 import requests
 import torch
+from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
 import re
+repo = "microsoft/kosmos-2.5"
 device = "cuda:0"
 dtype = torch.bfloat16
+model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
 processor = AutoProcessor.from_pretrained(repo)
 url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
 prompt = "<ocr>" # <md>
 inputs = processor(text=prompt, images=image, return_tensors="pt")
 height, width = inputs.pop("height"), inputs.pop("width")
 raw_width, raw_height = image.size
 scale_height = raw_height / height
 scale_width = raw_width / width
 inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
 inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
 generated_ids = model.generate(
     **inputs,
     max_new_tokens=1024,
 )
 generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
 def postprocess(y, scale_height, scale_width):
     y = y.replace(prompt, "")
     if "<md>" in prompt:
             y1 = int(y1 * scale_height)
             info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
     return info
 output_text = postprocess(generated_text[0], scale_height, scale_width)
 print(output_text)
 ```
 [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)