Spaces:

Dorjzodovsuren
/

MongolianLlama

Sleeping

App Files Files Community

MongolianLlama / app.py

Dorjzodovsuren

Update app.py

eca10fb verified 8 months ago

raw

history blame contribute delete

2.99 kB

	import os
	import torch
	import spaces
	import gradio as gr
	from threading import Thread
	from transformers import AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoTokenizer

	# Model configuration
	# Hub repository id; the tokenizer is loaded from this name.
	model_name: str = "Dorjzodovsuren/Mongolian_Llama3-v1.1"
	# Precision setting (torch.bfloat16 is the alternative if preferred).
	dtype: torch.dtype = torch.float16
	# Switch for 4-bit loading; enabling it relies on bitsandbytes.
	load_in_4bit: bool = False
	# Sequence-length setting for the model.
	max_seq_length: int = 1024

	# Repository ids: the base checkpoint and the adapter that is applied on top of it.
	model_id = "unsloth/llama-3.1-8b-bnb-4bit"
	peft_model_id = "Dorjzodovsuren/Mongolian_Llama3-v1.1"

	# The tokenizer is taken from the `model_name` repository.
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	# End-of-sequence token string as reported by the tokenizer.
	EOS_TOKEN = tokenizer.eos_token

	# An earlier variant loaded `model_name` directly with device_map="auto",
	# torch_dtype=dtype and load_in_4bit=load_in_4bit; it has been superseded by
	# loading the base checkpoint and attaching the adapter afterwards.
	model = AutoModelForCausalLM.from_pretrained(model_id)
	model.load_adapter(peft_model_id)

	# Prompt template with three positional `{}` slots, filled in order by
	# `str.format`: the instruction, the input, and the response.
	# NOTE(review): every character inside the literal, including any leading
	# whitespace on its lines, becomes part of the prompt sent to the model —
	# confirm the whitespace shown here matches what the model was trained on.
	alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

	### Instruction:
	{}

	### Input:
	{}

	### Response:
	{}"""

	# Inference device. This is a fixed value: no availability check is made
	# and there is no CPU fallback.
	device = "cuda"

	# Relocate the model to that device, rebinding `model` to the result.
	model = model.to(device)

	class StopOnTokens(StoppingCriteria):
	    """Stopping criterion that fires when the newest token is a stop id.

	    Only the first sequence in the batch is inspected.

	    Args:
	        stop_ids: Iterable of integer token ids that should end generation.
	            Defaults to ``(29, 0)``, the ids this script has always used, so
	            ``StopOnTokens()`` behaves as before.
	    """

	    # NOTE(review): these ids are hard-coded rather than looked up from the
	    # loaded tokenizer — confirm they correspond to tokens that should end
	    # generation for this model, or pass explicit ids to the constructor.
	    DEFAULT_STOP_IDS = (29, 0)

	    def __init__(self, stop_ids=DEFAULT_STOP_IDS):
	        super().__init__()
	        # A frozenset gives one O(1) membership test per generation step.
	        self.stop_ids = frozenset(int(token_id) for token_id in stop_ids)

	    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	        """Return True when the last token of the first sequence is a stop id.

	        Args:
	            input_ids: Token ids generated so far; row 0 is examined.
	            scores: Unused; accepted to match the stopping-criteria call signature.
	            **kwargs: Unused.

	        Returns:
	            True if generation should stop, otherwise False. An empty
	            sequence never triggers a stop.
	        """
	        sequence = input_ids[0]
	        if len(sequence) == 0:
	            return False
	        # Convert the 0-dim tensor to a plain int once instead of comparing
	        # tensors against every candidate id.
	        return int(sequence[-1]) in self.stop_ids

	# Conversation history is not used: each reply is generated from the latest
	# message alone.
	# Experimenting with different generation hyperparameters and comparing the
	# output quality is highly recommended.
	# Value handed to `spaces.GPU` as `duration`; taken from the GPU_TIMEOUT
	# environment variable, falling back to 60 when it is not set.
	gpu_timeout = int(os.environ.get("GPU_TIMEOUT", 60))

	@spaces.GPU(duration=gpu_timeout)
	def predict(message, history):
	    """Stream the model's reply to ``message`` as a progressively longer string.

	    Args:
	        message: Latest user message; placed in the instruction slot of
	            ``alpaca_prompt``.
	        history: Earlier chat turns supplied by the chat interface. Unused.

	    Yields:
	        The reply text accumulated so far, emitted again after every chunk
	        that is appended.
	    """
	    # Only the instruction slot is filled; input and response stay empty.
	    prompt = alpaca_prompt.format(message, "", "")
	    encoded = tokenizer([prompt], return_tensors="pt").to(device)

	    token_stream = TextIteratorStreamer(
	        tokenizer,
	        timeout=10,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )

	    # Start from the tokenizer output and layer the generation options on top.
	    generation_args = dict(encoded)
	    generation_args.update(
	        streamer=token_stream,
	        max_new_tokens=1024,
	        top_p=0.95,
	        temperature=0.001,
	        repetition_penalty=1.1,
	        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
	    )

	    # Generation runs on a separate thread so this generator can consume the
	    # streamer while tokens are still being produced.
	    worker = Thread(target=model.generate, kwargs=generation_args)
	    worker.start()

	    reply = ""
	    for chunk in token_stream:
	        # A chunk that is exactly '<' is dropped and nothing is emitted for it.
	        if chunk == '<':
	            continue
	        reply += chunk
	        yield reply

	# Sample prompts offered by the chat interface; each one is wrapped in its
	# own single-element list.
	examples = [
	    [sample]
	    for sample in (
	        "What's the capital of France?",
	        "What is meaning of life?",
	        "Хайр гэж юу вэ?",
	    )
	]

	# Build the chat interface around `predict` and start it right away.
	gr.ChatInterface(
	    predict,
	    examples=examples,
	).launch(
	    debug=True,
	    share=True,
	    show_api=True,
	)