def _test_client(port: int, print_logprobs: bool = False, test_vlm: bool = False):
    import requests
    import time
    import aiohttp
    from pprint import pprint
    from swift.llm import InferClient, InferRequest, RequestConfig

    infer_client = InferClient(port=port)

    # Poll until the deploy server is reachable.
    while True:
        try:
            models = infer_client.models
            print(f'models: {models}')
        except aiohttp.ClientConnectorError:
            time.sleep(5)
            continue
        break

    if test_vlm:
        query = '这是什么'  # 'What is this?'
        messages = [{
            'role': 'user',
            'content': [
                {
                    'type': 'text',
                    'text': '这是什么'
                },
                {
                    'type': 'image_url',
                    'image_url': {
                        'url': 'cat.png'
                    }
                },
            ]
        }]
    else:
        query = '123*234=?'
        messages = [{'role': 'user', 'content': query}]

    infer_request = InferRequest(messages=messages)
    request_config = RequestConfig(seed=42, max_tokens=256, temperature=0.8, logprobs=True, top_logprobs=5)

    # Non-streaming inference.
    resp = infer_client.infer([infer_request], request_config=request_config)[0]
    response = resp.choices[0].message.content
    print(f'query: {query}')
    print(f'response: {response}')
    if print_logprobs:
        pprint(resp.choices[0].logprobs)

    # Streaming inference with the same request.
    request_config = RequestConfig(
        stream=True, seed=42, max_tokens=256, temperature=0.8, top_k=20, top_p=0.8, logprobs=True, top_logprobs=5)
    gen_list = infer_client.infer([infer_request], request_config=request_config)
    print(f'query: {query}')
    print('response: ', end='')
    for chunk in gen_list[0]:
        print(chunk.choices[0].delta.content, end='', flush=True)
        if print_logprobs and chunk.choices[0].logprobs is not None:
            pprint(chunk.choices[0].logprobs)
    print()


def _test(infer_backend, test_vlm: bool = False):
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    import multiprocessing
    from swift.llm import DeployArguments, deploy_main
    mp = multiprocessing.get_context('spawn')
    model = 'Qwen/Qwen2-VL-7B-Instruct' if test_vlm else 'Qwen/Qwen2-7B-Instruct'
    args = DeployArguments(model=model, infer_backend=infer_backend, verbose=False)
    # Launch the deploy server in a child process, run the client test against it, then shut it down.
    process = mp.Process(target=deploy_main, args=(args, ))
    process.start()
    _test_client(args.port, print_logprobs=True, test_vlm=test_vlm)
    process.terminate()


def test_vllm_vlm():
    _test('vllm', test_vlm=True)


def test_vllm():
    _test('vllm')


def test_lmdeploy():
    _test('lmdeploy')


def test_pt():
    _test('pt')


def test_vllm_origin():
    import subprocess
    import sys
    from modelscope import snapshot_download
    model_dir = snapshot_download('Qwen/Qwen2-7B-Instruct')
    # Start a vanilla vLLM OpenAI-compatible server (default port 8000) and reuse the client test.
    args = [sys.executable, '-m', 'vllm.entrypoints.openai.api_server', '--model', model_dir]
    process = subprocess.Popen(args)
    _test_client(8000)
    process.terminate()


if __name__ == '__main__':
    # Swap in test_vllm(), test_lmdeploy(), test_pt() or test_vllm_origin() to exercise the other backends.
    test_vllm_vlm()