Spaces:

BridgeEight
/

internlm-20B-chat-w4-turbomind

Runtime error

App Files Files Community

BridgeEight committed on Jan 22, 2024

Commit

5dedc73

verified ·

1 Parent(s): 8dac1b2

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -105

app.py CHANGED Viewed

@@ -1,128 +1,71 @@
-import os,random
-os.system('sh install_lmdeploy.sh')
-import gradio as gr
-from lmdeploy.serve.gradio.app import *
-os.system('sh download.sh')
-InterFace.async_engine = AsyncEngine(model_path='internlm2-chat-20b-4bits',
-                                        instance_num=2,
-                                        tp=1)
-async def reset_local_demo(instruction_txtbox: gr.Textbox,
-                           state_chatbot: gr.State, request: gr.Request):
-    """reset the session.
-    Args:
-        instruction_txtbox (str): user's prompt
-        state_chatbot (Sequence): the chatting history
-        request (gr.Request): the request from a user
-    """
-    state_chatbot = []
-    return (
-        state_chatbot,
-        state_chatbot,
-        gr.Textbox.update(value=''),
-    )
-async def cancel_local_demo(state_chatbot: gr.State, cancel_btn: gr.Button,
-                            reset_btn: gr.Button, request: gr.Request):
-    """stop the session.
-    Args:
-        instruction_txtbox (str): user's prompt
-        state_chatbot (Sequence): the chatting history
-        request (gr.Request): the request from a user
-    """
-    return (state_chatbot, disable_btn, disable_btn)
-async def chat_stream_demo(
-    instruction: str,
-    state_chatbot: Sequence,
-    cancel_btn: gr.Button,
-    reset_btn: gr.Button,
-    request: gr.Request,
-):
-    """Chat with AI assistant.
-    Args:
-        instruction (str): user's prompt
-        state_chatbot (Sequence): the chatting history
-        request (gr.Request): the request from a user
-    """
-    session_id = random.randint(0,100000)
-    bot_summarized_response = ''
-    state_chatbot = state_chatbot + [(instruction, None)]
-    messages = []
-    for item in state_chatbot:
-        messages.append(dict(role='user', content=item[0]))
-        if item[1] is not None:
-            messages.append(dict(role='assistant', content=item[1]))
-    yield (state_chatbot, state_chatbot, disable_btn, disable_btn,
-           f'{bot_summarized_response}'.strip())
-    async for outputs in InterFace.async_engine.generate(
-            messages,
-            session_id,
-            stream_response=True,
-            sequence_start=True,
-            sequence_end=True):
-        response = outputs.response
-        if outputs.finish_reason == 'length':
-            gr.Warning('WARNING: exceed session max length.'
-                       ' Please restart the session by reset button.')
-        if outputs.generate_token_len < 0:
-            gr.Warning('WARNING: running on the old session.'
-                       ' Please restart the session by reset button.')
-        if state_chatbot[-1][-1] is None:
-            state_chatbot[-1] = (state_chatbot[-1][0], response)
-        else:
-            state_chatbot[-1] = (state_chatbot[-1][0],
-                                 state_chatbot[-1][1] + response
-                                 )  # piece by piece
-        yield (state_chatbot, state_chatbot, disable_btn, disable_btn,
-               f'{bot_summarized_response}'.strip())
-    yield (state_chatbot, state_chatbot, disable_btn, disable_btn,
-           f'{bot_summarized_response}'.strip())
 with gr.Blocks(css=CSS, theme=THEME) as demo:
     state_chatbot = gr.State([])
     with gr.Column(elem_id='container'):
         gr.Markdown('## LMDeploy Playground')
         chatbot = gr.Chatbot(
             elem_id='chatbot',
-            label=InterFace.async_engine.tm_model.model_name)
         instruction_txtbox = gr.Textbox(
             placeholder='Please input the instruction',
             label='Instruction')
         with gr.Row():
-            cancel_btn = gr.Button(value='Cancel', interactive=False, visible=False)
-            reset_btn = gr.Button(value='Reset', interactive=False, visible=False)
-    send_event = instruction_txtbox.submit(
-        chat_stream_demo,
-        [instruction_txtbox, state_chatbot, cancel_btn, reset_btn],
-        [state_chatbot, chatbot, cancel_btn, reset_btn])
     instruction_txtbox.submit(
         lambda: gr.Textbox.update(value=''),
         [],
         [instruction_txtbox],
     )
-    cancel_btn.click(cancel_local_demo,
-                        [state_chatbot, cancel_btn, reset_btn],
-                        [state_chatbot, cancel_btn, reset_btn],
-                        cancels=[send_event])
-    reset_btn.click(reset_local_demo, [instruction_txtbox, state_chatbot],
                     [state_chatbot, chatbot, instruction_txtbox],
                     cancels=[send_event])
-# print(f'server is gonna mount on: http://{server_name}:{server_port}')
-    demo.queue(concurrency_count=4, max_size=100).launch()

+from lmdeploy.serve.gradio.turbomind_coupled import *
+from lmdeploy.messages import TurbomindEngineConfig
+backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
+model_path = 'internlm/internlm2-chat-20b-4bits'
+InterFace.async_engine = AsyncEngine(
+    model_path=model_path,
+    backend='turbomind',
+    backend_config=backend_config,
+    tp=1)
 with gr.Blocks(css=CSS, theme=THEME) as demo:
     state_chatbot = gr.State([])
+    state_session_id = gr.State(0)
     with gr.Column(elem_id='container'):
         gr.Markdown('## LMDeploy Playground')
         chatbot = gr.Chatbot(
             elem_id='chatbot',
+            label=InterFace.async_engine.engine.model_name)
         instruction_txtbox = gr.Textbox(
             placeholder='Please input the instruction',
             label='Instruction')
         with gr.Row():
+            cancel_btn = gr.Button(value='Cancel', interactive=False)
+            reset_btn = gr.Button(value='Reset')
+        with gr.Row():
+            request_output_len = gr.Slider(1,
+                                            2048,
+                                            value=512,
+                                            step=1,
+                                            label='Maximum new tokens')
+            top_p = gr.Slider(0.01, 1, value=0.8, step=0.01, label='Top_p')
+            temperature = gr.Slider(0.01,
+                                    1.5,
+                                    value=0.7,
+                                    step=0.01,
+                                    label='Temperature')
+    send_event = instruction_txtbox.submit(chat_stream_local, [
+        instruction_txtbox, state_chatbot, cancel_btn, reset_btn,
+        state_session_id, top_p, temperature, request_output_len
+    ], [state_chatbot, chatbot, cancel_btn, reset_btn])
     instruction_txtbox.submit(
         lambda: gr.Textbox.update(value=''),
         [],
         [instruction_txtbox],
     )
+    cancel_btn.click(
+        cancel_local_func,
+        [state_chatbot, cancel_btn, reset_btn, state_session_id],
+        [state_chatbot, cancel_btn, reset_btn],
+        cancels=[send_event])
+    reset_btn.click(reset_local_func,
+                    [instruction_txtbox, state_chatbot, state_session_id],
                     [state_chatbot, chatbot, instruction_txtbox],
                     cancels=[send_event])
+    def init():
+        """Return a fresh session id by bumping the shared counter.
+
+        Wired to ``demo.load`` so each page load stores its own id in
+        ``state_session_id``.
+        """
+        # Read the counter while still holding the lock. Reading it after
+        # the lock is released lets a concurrent page load increment it
+        # first, so two pages could be handed the same session id.
+        with InterFace.lock:
+            InterFace.global_session_id += 1
+            new_session_id = InterFace.global_session_id
+        return new_session_id
+    demo.load(init, inputs=None, outputs=[state_session_id])
+demo.queue(concurrency_count=InterFace.async_engine.instance_num,
+            max_size=100).launch()