diff --git a/models/benchmark/benchmark_longbench.py b/models/benchmark/benchmark_longbench.py new file mode 100644 index 0000000000000000000000000000000000000000..a007ba31904b5da346c1a5102e34b28e5de6f89c --- /dev/null +++ b/models/benchmark/benchmark_longbench.py @@ -0,0 +1,448 @@ +import argparse +import dataclasses +import inspect +import json +import os +import random +import sys +import time +from pathlib import Path + +import numpy as np +import torch +from tqdm import tqdm +from vllm import LLM, SamplingParams + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent) + "/inference") +sys.path.append(str(Path(__file__).resolve().parent) + "/longbench") + +FILE_DIR = str(Path(__file__).resolve().parent) + +from metrics import ( + classification_score, + code_sim_score, + count_score, + qa_f1_score, + qa_f1_zh_score, + retrieval_score, + retrieval_zh_score, + rouge_score, + rouge_zh_score, +) +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +dataset2metric = { + "narrativeqa": qa_f1_score, + "qasper": qa_f1_score, + "multifieldqa_en": qa_f1_score, + "multifieldqa_zh": qa_f1_zh_score, + "hotpotqa": qa_f1_score, + "2wikimqa": qa_f1_score, + "musique": qa_f1_score, + "dureader": rouge_zh_score, + "gov_report": rouge_score, + "qmsum": rouge_score, + "multi_news": rouge_score, + "vcsum": rouge_zh_score, + "trec": classification_score, + "triviaqa": qa_f1_score, + "samsum": rouge_score, + "lsht": classification_score, + "passage_retrieval_en": retrieval_score, + "passage_count": count_score, + "passage_retrieval_zh": retrieval_zh_score, + "lcc": code_sim_score, + "repobench-p": code_sim_score, +} + + +def scorer_e(dataset, predictions, answers, lengths, all_classes): + scores = {"0-4k": [], "4-8k": [], "8k+": []} + for (prediction, ground_truths, length) in zip(predictions, answers, lengths): + score = 0.0 + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + for ground_truth in ground_truths: + score = max( + score, + dataset2metric[dataset]( + prediction, ground_truth, all_classes=all_classes + ), + ) + if length < 4000: + scores["0-4k"].append(score) + elif length < 8000: + scores["4-8k"].append(score) + else: + scores["8k+"].append(score) + for key in scores.keys(): + scores[key] = round(100 * np.mean(scores[key]), 2) + return scores + + +def scorer(dataset, predictions, answers, all_classes): + total_score = 0.0 + for (prediction, ground_truths) in zip(predictions, answers): + score = 0.0 + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + for ground_truth in ground_truths: + score = max( + score, + dataset2metric[dataset]( + prediction, ground_truth, all_classes=all_classes + ), + ) + total_score += score + return round(100 * total_score / len(predictions), 2) + + +def build_chat(tokenizer, prompt, model_name): + if "chatglm3" in model_name: + prompt = tokenizer.build_chat_input(prompt) + elif "chatglm" in model_name: + prompt = tokenizer.build_prompt(prompt) + elif "longchat" in model_name or "vicuna" in model_name: + from fastchat.model import get_conversation_template + + conv = get_conversation_template("vicuna") + conv.append_message(conv.roles[0], prompt) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + elif "xgen" in model_name: + header = ( + "A chat between a curious human and an artificial intelligence assistant. 
" + "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" + ) + prompt = header + f" ### Human: {prompt}\n###" + elif "internlm" in model_name: + prompt = f"<|User|>:{prompt}\n<|Bot|>:" + elif "falcon" in model_name: + prompt = f"User: {prompt}\nFalcon:" + else: + # we do not use default template... + if tokenizer.chat_template is not None: + return tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + tokenize=False, + add_generation_prompt=True, + ) + return prompt + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument("--e", action="store_true", help="Evaluate on LongBench-E") + parser.add_argument("--datapath", type=str, default=None) + parser.add_argument("--model-name", type=str, default=None) + parser.add_argument("--chat-template", type=str, default=None) + parser.add_argument( + "--skip-chat-template-check", default=False, action="store_true" + ) + parser.add_argument("--val-data-nums", type=int, default=-1) + parser.add_argument( + "--save-pred", action="store_true", help="Save the pred output to local files." + ) + parser.add_argument("--new-model-run", default=False, action="store_true") + parser.add_argument("--target", default=None, type=str, help="for CI TEST ONLY") + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + return args + + +def get_pred( + model, + tokenizer, + data, + max_length, + max_gen, + prompt_format, + dataset, + out_path, + save_pred, +): + preds = [] + prompts = [] + prompts_ids = [] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + )[1:] + ] + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + sampling_params["max_tokens"] = max_gen + sampling_params = SamplingParams(**sampling_params) + + prompts_ids = [] + for json_obj in tqdm(data): + prompt = prompt_format.format(**json_obj) + # truncate to fit max_length (we suggest truncate in the middle, since the left and right side may contain crucial instructions) + tokenized_prompt = tokenizer( + prompt, truncation=False, return_tensors="pt" + ).input_ids[0] + if "chatglm3" in model_name: + tokenized_prompt = tokenizer( + prompt, truncation=False, return_tensors="pt", add_special_tokens=False + ).input_ids[0] + if len(tokenized_prompt) > max_length: + half = int(max_length / 2) + prompt = tokenizer.decode( + tokenized_prompt[:half], skip_special_tokens=True + ) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) + if dataset not in [ + "trec", + "triviaqa", + "samsum", + "lsht", + "lcc", + "repobench-p", + ]: # chat models are better off without build prompts on these tasks + prompt = build_chat(tokenizer, prompt, model_name) + if "chatglm3" in model_name: + if dataset in ["trec", "triviaqa", "samsum", "lsht", "lcc", "repobench-p"]: + input = tokenizer(prompt, truncation=False, return_tensors="pt") + else: + input = prompt + else: + input = tokenizer(prompt, truncation=False, return_tensors="pt") + + prompt_token_id = input.input_ids.view(-1).tolist() + prompts_ids.append(prompt_token_id) + assert len(data) == len(prompts_ids) + outputs = model.generate( + sampling_params=sampling_params, prompt_token_ids=prompts_ids + ) + for i, output in enumerate(outputs): + pred = output.outputs[0].text + json_obj = data[i] + preds.append( + { + "pred": pred, + "answers": json_obj["answers"], + "all_classes": json_obj["all_classes"], 
+ "length": json_obj["length"], + } + ) + + if save_pred: + for pred in preds: + with open(out_path, "a", encoding="utf-8") as f: + json.dump(pred, f, ensure_ascii=False) + f.write("\n") + return preds + + +def seed_everything(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +def load_model_and_tokenizer(args): + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + model = LLM(**engine_params) + tokenizer = model.get_tokenizer() + load_chat_template(tokenizer, args.chat_template) + return model, tokenizer + + +if __name__ == "__main__": + seed_everything(42) + args = parse_args() + model_name = args.model_name + + model2maxlen = json.load( + open(os.path.join(FILE_DIR, "longbench/config/model2maxlen.json"), "r") + ) + try: + max_length = model2maxlen[model_name] + except: + raise ValueError( + "the model_name is not in model2maxlen.json, please check your inputs or add a new model!!!" + ) + # define model + model, tokenizer = load_model_and_tokenizer(args) + if ( + tokenizer.chat_template is None + and args.chat_template is None + and not args.skip_chat_template_check + ): + raise ValueError( + "tokenizer.chat_template is None, please pass --skip-chat-template-check if you do not pass --chat-template" + ) + + if args.e: + datasets = [ + "qasper", + "multifieldqa_en", + "hotpotqa", + "2wikimqa", + "gov_report", + "multi_news", + "trec", + "triviaqa", + "samsum", + "passage_count", + "passage_retrieval_en", + "lcc", + "repobench-p", + ] + else: + datasets = [ + "narrativeqa", + "qasper", + "multifieldqa_en", + "multifieldqa_zh", + "hotpotqa", + "2wikimqa", + "musique", + "dureader", + "gov_report", + "qmsum", + "multi_news", + "vcsum", + "trec", + "triviaqa", + "samsum", + "lsht", + "passage_count", + "passage_retrieval_en", + "passage_retrieval_zh", + "lcc", + "repobench-p", + ] + + # we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output + dataset2prompt = json.load( + open(os.path.join(FILE_DIR, "longbench/config/dataset2prompt.json"), "r") + ) + dataset2maxlen = json.load( + open(os.path.join(FILE_DIR, "longbench/config/dataset2maxlen.json"), "r") + ) + + val_data_num = args.val_data_nums + val_data_num = ( + min(val_data_num, len(datasets)) if val_data_num != -1 else len(datasets) + ) + index = list(range(len(datasets))) + random.seed(time.time()) + random.shuffle(index) + val_datasets = ( + datasets + if val_data_num == len(datasets) + else [datasets[i] for i in index[:val_data_num]] + ) + + # predict on each dataset + scores = dict() + for dataset in val_datasets: + data = [] + predictions, answers, lengths = [], [], [] + if args.e: + with open( + os.path.join(args.datapath, "{}.jsonl".format(dataset)), + "r", + encoding="utf-8", + ) as lines: + for line in lines: + data.append(json.loads(line)) + if not os.path.exists(f"pred_e/{model_name}"): + os.makedirs(f"pred_e/{model_name}") + out_path = f"pred_e/{model_name}/{dataset}.jsonl" + else: + with open( + os.path.join(args.datapath, "{}.jsonl".format(dataset)), + "r", + encoding="utf-8", + ) as lines: + for line in lines: + data.append(json.loads(line)) + if not os.path.exists(f"pred/{model_name}"): + os.makedirs(f"pred/{model_name}") + out_path = f"pred/{model_name}/{dataset}.jsonl" + prompt_format = dataset2prompt[dataset] + max_gen = 
dataset2maxlen[dataset] + + preds = get_pred( + model, + tokenizer, + data, + max_length, + max_gen, + prompt_format, + dataset, + out_path, + args.save_pred, + ) + + for line in preds: + predictions.append(line["pred"]) + answers.append(line["answers"]) + all_classes = line["all_classes"] + if "length" in line: + lengths.append(line["length"]) + if args.e: + score = scorer_e(dataset, predictions, answers, lengths, all_classes) + else: + score = scorer(dataset, predictions, answers, all_classes) + + scores[dataset] = score + if not args.new_model_run: + reference_file = ( + os.path.join(FILE_DIR, f"longbench/result_record/{model_name}.jsonl") + if args.target is None + else args.target + ) + mdoel_result_reference = json.load(open(reference_file, "r")) + reference_score = mdoel_result_reference[dataset] + if ( + score < reference_score + and (reference_score - score) / reference_score > 0.06 + ): + print( + f"{model_name} on dataset: {dataset}, target score: {reference_score}, val score: {score}, fail!" + ) + # exit(1) + else: + print( + f"{model_name} on dataset: {dataset}, target score: {reference_score}, val score: {score}, pass" + ) + if args.e: + out_path = f"pred_e/{model_name}/result.json" + else: + out_path = f"pred/{model_name}/result.json" + with open(out_path, "w") as f: + json.dump(scores, f, ensure_ascii=False, indent=4) + + # for ci test exit + try: + import os + import subprocess + + current_pid = os.getpid() + cmd = ( + """ps -ef | grep multiprocessing.spawn | grep {} | grep -v grep | """.format( + current_pid + ) + + "awk '{print $2}'" + ) + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + pid = result.returncode + if result.returncode == 0: + result = result.stdout.strip() + pids = result.split("\n") + for pid in pids: + if str.isdigit(pid): + cmd = "kill -9 {}".format(pid) + subprocess.run(cmd, shell=True, capture_output=True, text=True) + except: + assert False \ No newline at end of file diff --git a/models/benchmark/benchmark_serving.py b/models/benchmark/benchmark_serving.py new file mode 100644 index 0000000000000000000000000000000000000000..363cfa931c4e85f7d71776faa1f267af3faa8000 --- /dev/null +++ b/models/benchmark/benchmark_serving.py @@ -0,0 +1,549 @@ +import argparse +import asyncio +import json +import random +import sys +import time +import traceback +import warnings +from dataclasses import dataclass, field +from pathlib import Path +from typing import AsyncGenerator, List, Optional, Tuple + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm +from vllm import AsyncEngineArgs, SamplingParams +from vllm.transformers_utils.tokenizer import get_tokenizer + + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix):] + return text + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: List[Tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: List[Tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: List[Tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. 
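+ # For a single streamed request the metrics above relate roughly as follows
+ # (illustrative only; the exact computation lives in calculate_metrics()):
+ #   ttft = t_first_token - t_send
+ #   e2el = t_done_chunk  - t_send          (stored as `latency`)
+ #   tpot = (e2el - ttft) / (output_len - 1)
+ #   itl  = gap between consecutive streamed chunks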
+ mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: List[Tuple[float, float]] + + +@dataclass +class RequestFuncInput: + prompt: List[int] + api_url: str + prompt_len: int + output_len: int + model: str + best_of: int = 1 + logprobs: Optional[int] = None + multi_modal_content: Optional[dict] = None + ignore_eos: bool = False + + +@dataclass +class RequestFuncOutput: + generated_text: str = "" + success: bool = False + latency: float = 0.0 + ttft: float = 0.0 # Time to first token + itl: List[float] = field( + default_factory=list) # List of inter-token latencies + prompt_len: int = 0 + error: str = "" + + +def get_random_lens( + input_tokens, + output_tokens, + max_input_tokens, + max_output_tokens, + num_requests, + input_output_type, +): + assert 1 <= input_tokens < max_input_tokens + assert 1 <= output_tokens < max_output_tokens + min_input_tokens = input_tokens + min_output_tokens = output_tokens + input_mean = int((max_input_tokens + min_input_tokens) / 2) + input_std = int((max_input_tokens - input_mean) / 2) + output_mean = int((max_output_tokens + min_output_tokens) / 2) + output_std = int((max_output_tokens - output_mean) / 2) + + input_len_list = [] + output_len_list = [] + for _ in range(num_requests): + if input_output_type == "normal": + while True: + input_length = int(np.random.normal(input_mean, input_std)) + if min_input_tokens <= input_length <= max_input_tokens: + break + while True: + output_length = int(np.random.normal(output_mean, output_std)) + if min_output_tokens <= output_length <= max_output_tokens: + break + else: + input_length = int(np.random.uniform(min_input_tokens, max_input_tokens)) + output_length = int(np.random.uniform(min_output_tokens, max_output_tokens)) + + input_len_list.append([None, input_length]) + output_len_list.append(output_length) + + return input_len_list, output_len_list + + +def sample_requests( + num_requests: int, + input_tokens: int, + output_tokens: int, + max_input_tokens: int, + max_output_tokens: int, + input_output_type: str = "fix", + seed: int = 42, +) -> Tuple[List[Tuple[str, int]], List[int]]: + np.random.seed(seed) + random.seed(seed) + + if input_output_type == "fix": + input_len_list = [[None, input_tokens] for _ in range(num_requests)] + output_len_list = [output_tokens for _ in range(num_requests)] + elif input_output_type in ["normal", "uniform"]: + input_len_list, output_len_list = get_random_lens( + input_tokens, + output_tokens, + max_input_tokens, + max_output_tokens, + num_requests, + input_output_type, + ) + else: + raise NotImplementedError("You can modify this code according to your needs") + + for inputs in input_len_list: + assert len(inputs) == 2 + # [str, int] or [None ,int] + assert isinstance(inputs[0], str) or inputs[0] is None + assert isinstance(inputs[1], int) + for outptus in output_len_list: + assert isinstance(outptus, int) + + return input_len_list, output_len_list + + +async def get_request( + input_requests: List[RequestFuncInput], + time_interval: float, +): + input_requests = iter(input_requests) + for request in input_requests: + yield request + + if time_interval == 0: + continue + await asyncio.sleep(time_interval) + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + ("completions", "profile") + ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
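+ # The request is sent with "stream": True, so the server replies with
+ # Server-Sent Events ("data: {...}" chunks terminated by "data: [DONE]").
+ # TTFT is taken at the first non-empty text chunk, ITL between subsequent
+ # chunks, and the total latency when "[DONE]" arrives.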
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "ignore_eos": request_func_input.ignore_eos, + } + headers = { + # "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + "Authorization": "EMPTY" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + output.output_len = request_func_input.output_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), + "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if data["choices"][0]["text"]: + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += data["choices"][0]["text"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def benchmark( + input_requests: List[RequestFuncInput], + time_interval: float, +) -> None: + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, time_interval): + task = asyncio.create_task(async_request_openai_completions(request)) + tasks.append(task) + outputs = await asyncio.gather(*tasks) + return outputs + + +def calculate_metrics( + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], +) -> Tuple[BenchmarkMetrics, List[int]]: + actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] + e2els: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + try: + assert output_len == outputs[i].output_len + except: + pass + # We use ignore_eos = True, so we can assert actual output len is what we want. + # if assert false, mostly because the tokenizer and detokenizer process. 
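+ # The mismatch is swallowed on purpose: with ignore_eos=True the requested
+ # output_len is treated as the actual length, and re-tokenizing the
+ # detokenized text is not guaranteed to round-trip to the same token count.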
+ + actual_output_lens.append(outputs[i].output_len) + total_input += outputs[i].prompt_len + if output_len > 1: + tpots.append( + (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], + mean_e2el_ms=np.median(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.mean(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], + ) + + return metrics, actual_output_lens + + +async def main(args): + api_url = f"http://{args.host}:{args.port}/v1/completions" + + input_tokens = args.input_tokens + output_tokens = args.output_tokens + + input_list, output_list = sample_requests( + args.num_prompts, + input_tokens, + output_tokens, + args.max_input_tokens, + args.max_output_tokens, + args.input_output_type, + args.seed, + ) + + prompts = [] + for i in range(args.num_prompts): + if input_list[i][0] is None: + prompts.append( + RequestFuncInput( + np.random.randint(6023, 6024, input_list[i][1]).tolist(), + api_url, + input_list[i][1], + output_list[i], + args.model, + 1, + None, + None, + True, + ) + ) + else: + # this pass is used for test str input. 
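+ # sample_requests() currently only emits [None, length] pairs, so this branch
+ # is reached only if it is extended to return real prompt strings.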
+ prompts.append( + RequestFuncInput( + input_list[i][0], + api_url, + input_list[i][1], + output_list[i], + args.model, + 1, + None, + None, + True, + ) + ) + # warm up + await benchmark(prompts[:1], args.time_interval) + + # benchmark + benchmark_start_time = time.perf_counter() + outputs = await benchmark(prompts, args.time_interval) + benchmark_end_time = time.perf_counter() + benchmark_duration = benchmark_end_time - benchmark_start_time + + tokenizer = get_tokenizer(args.model, trust_remote_code=True) + selected_percentile_metrics = args.percentile_metrics.split(",") + selected_percentiles = [float(p) for p in args.metric_percentiles.split(",")] + metrics, actual_output_lens = calculate_metrics( + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. 
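+ # Only metrics named in --percentile-metrics are printed and added to
+ # `result`; any other metric is skipped silently.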
+ if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + if args.target is not None: + target_qps = args.target + if metrics.output_throughput < target_qps: + print( + "actual qps: {} < target qps: {} , fail!!".format( + metrics.output_throughput, target_qps + ) + ) + exit(1) + else: + print( + "actual qps: {} > target qps: {} , pass!!".format( + metrics.output_throughput, target_qps + ) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=None) + parser.add_argument("--num-prompts", type=int, default=128) + parser.add_argument("--input-tokens", type=int, default=128) + parser.add_argument( + "--max-input-tokens", + type=int, + default=-1, + help="Use for generate random length of input, limit min input length", + ) + parser.add_argument("--output-tokens", type=int, default=128) + parser.add_argument( + "--max-output-tokens", + type=int, + default=-1, + help="Use for generate random length of output, limit max output length", + ) + parser.add_argument("--input-output-type", type=str, default="fix",choices=['fix','normal','uniform']) + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=12345) + parser.add_argument("--time-interval", type=float, default=0.0) + parser.add_argument("--target", type=int, default=None) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " + "Default value is \"ttft,tpot,itl\".") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". 
" + "Use \"--percentile-metrics\" to select metrics.", + ) + + args = parser.parse_args() + asyncio.run(main(args)) + +# 测试num-prompts 1 input-tokens 256 --output-tokens 256 作为最基础的基准 +# 测试num-prompts 16 input-tokens 2048 --output-tokens 1024 作为一般性能测试 +# 测试num-prompts 100 input-tokens 512 max-input-tokens 16384 output-tokens 512 max-output-tokens 16384 input-output-type uniform 监测算子对于不同长度的性能 \ No newline at end of file diff --git a/models/benchmark/benchmark_serving_multimodal.py b/models/benchmark/benchmark_serving_multimodal.py new file mode 100644 index 0000000000000000000000000000000000000000..407dd548aab81734b1b265926947571bd02594e0 --- /dev/null +++ b/models/benchmark/benchmark_serving_multimodal.py @@ -0,0 +1,495 @@ +import argparse +import asyncio +import json +import random +import sys +import time +import traceback +import warnings +from dataclasses import dataclass, field +from pathlib import Path +from typing import AsyncGenerator, List, Optional, Tuple + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm +from vllm import AsyncEngineArgs, SamplingParams +from vllm.transformers_utils.tokenizer import get_tokenizer +import base64 +from PIL import Image +import io + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix):] + return text + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: List[Tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: List[Tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: List[Tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. 
+ mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: List[Tuple[float, float]] + + +@dataclass +class RequestFuncInput: + content: dict + api_url: str + prompt_len: int + output_len: int + model: str + best_of: int = 1 + logprobs: Optional[int] = None + multi_modal_content: Optional[dict] = None + ignore_eos: bool = False + + +@dataclass +class RequestFuncOutput: + generated_text: str = "" + success: bool = False + latency: float = 0.0 + ttft: float = 0.0 # Time to first token + itl: List[float] = field( + default_factory=list) # List of inter-token latencies + prompt_len: int = 0 + error: str = "" + + +def sample_requests( + num_requests: int, + output_tokens: int, + seed: int = 42, +) -> Tuple[List[Tuple[str, int]], List[int]]: + np.random.seed(seed) + random.seed(seed) + + input_len_list = [["What's in this image?", None] for _ in range(num_requests)] + output_len_list = [output_tokens for _ in range(num_requests)] + + for inputs in input_len_list: + assert len(inputs) == 2 + # [str, int] or [None ,int] + assert isinstance(inputs[0], str) or inputs[0] is None + assert isinstance(inputs[1], int) or inputs[1] is None + for outptus in output_len_list: + assert isinstance(outptus, int) + + return input_len_list, output_len_list + + +async def get_request( + input_requests: List[RequestFuncInput], + time_interval: float, +): + input_requests = iter(input_requests) + for request in input_requests: + yield request + + if time_interval == 0: + continue + await asyncio.sleep(time_interval) + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + ("completions", "profile") + ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
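+ # Note: this benchmark posts to /v1/chat/completions; the assert above still
+ # passes because that URL also ends in "completions". The streamed response
+ # is parsed from the chat "delta" field below rather than "text".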
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", + "content": request_func_input.content + }, + ], + "temperature": 0.0, + "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "ignore_eos": request_func_input.ignore_eos, + } + headers = { + "Content-Type": "application/json", + # "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + "Authorization": "EMPTY" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + output.output_len = request_func_input.output_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), + "data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + timestamp = time.perf_counter() + data = json.loads(chunk) + + delta = data["choices"][0]["delta"] + if delta.get("content", None): + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + generated_text += delta["content"] + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def benchmark( + input_requests: List[RequestFuncInput], + time_interval: float, +) -> None: + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, time_interval): + task = asyncio.create_task(async_request_openai_completions(request)) + tasks.append(task) + outputs = await asyncio.gather(*tasks) + return outputs + + +def calculate_metrics( + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], +) -> Tuple[BenchmarkMetrics, List[int]]: + actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] + e2els: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + try: + assert output_len == outputs[i].output_len + except: + pass + # We use ignore_eos = True, so we can assert actual output len is what we want. + # if assert false, mostly because the tokenizer and detokenizer process. 
+ + actual_output_lens.append(outputs[i].output_len) + total_input += outputs[i].prompt_len + if output_len > 1: + tpots.append( + (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], + mean_e2el_ms=np.median(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.mean(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], + ) + + return metrics, actual_output_lens + + +async def main(args): + api_url = f"http://{args.host}:{args.port}/v1/chat/completions" + + output_tokens = args.output_tokens + if args.chat_template is not None: + tokenizer = get_tokenizer(args.model, trust_remote_code=args.trust_remote_code, chat_template=args.chat_template) + else: + tokenizer = get_tokenizer(args.model, trust_remote_code=args.trust_remote_code) + input_list, output_list = sample_requests( + args.num_prompts, + output_tokens, + args.seed, + ) + + prompts = [] + image_size = args.image_size.split(",") + image_size = [int(x) for x in image_size] + image = Image.open(args.image_path) + image = image.resize(image_size) + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + for i in range(args.num_prompts): + messages = [{'role': 'user', 'content': f"{input_list[i][0]}"}] + content = [ + { + "type": "text", + "text": input_list[i][0] + } + ] + prompt_ids = tokenizer.apply_chat_template(messages,tokenize=True) + assert isinstance(prompt_ids, list) and isinstance(prompt_ids[0], int) + + content.append( + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + ) + + prompts.append( + RequestFuncInput( + content, + api_url, + len(prompt_ids), + output_list[i], + args.model, + 1, + None, + None, + True, + ) + ) + # warm up + outputs = await benchmark(prompts[:16], args.time_interval) + + # benchmark + benchmark_start_time = time.perf_counter() + outputs = await benchmark(prompts, args.time_interval) + benchmark_end_time = time.perf_counter() + benchmark_duration = benchmark_end_time - benchmark_start_time + + selected_percentile_metrics = 
args.percentile_metrics.split(",") + selected_percentiles = [float(p) for p in args.metric_percentiles.split(",")] + metrics, actual_output_lens = calculate_metrics( + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + } + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 
1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + if args.target is not None: + target_qps = args.target + if metrics.output_throughput < target_qps: + print( + "actual qps: {} < target qps: {} , fail!!".format( + metrics.output_throughput, target_qps + ) + ) + exit(1) + else: + print( + "actual qps: {} > target qps: {} , pass!!".format( + metrics.output_throughput, target_qps + ) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=None) + parser.add_argument("--output-tokens", type=int, default=128) + # this paramater could be unnecessary, because most multimodal's preprocessor will resize image before using it. + parser.add_argument("--image-path", type=str, default="test.jpg") + parser.add_argument("--image-size", type=str, default="512,512") + parser.add_argument("--num-prompts", type=int, default=128) + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=12345) + parser.add_argument("--time-interval", type=float, default=0.0) + parser.add_argument("--target", type=int, default=None) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument('--trust-remote-code', type=bool, default=False) + parser.add_argument('--chat-template',type=str,default=None) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " + "Default value is \"ttft,tpot,itl\".") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". " + "Use \"--percentile-metrics\" to select metrics.", + ) + + args = parser.parse_args() + asyncio.run(main(args)) diff --git a/models/benchmark/internvl2/README.md b/models/benchmark/internvl2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..84f30b1007b78d9fc6b2389be19ee43e1eb9afdf --- /dev/null +++ b/models/benchmark/internvl2/README.md @@ -0,0 +1,12 @@ +# InternVL2-26B + +* 下载数据 + + 数据链接: [InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B) + +* 测试 InternVL2-26B + +```bash +cd .. +bash test_performance_server_multimodal.sh --model /path/to/model -tp 4 --max-num-seqs 32 --max-num-batched-tokens 8192 --max-model-len 8192 --host 127.0.0.1 --port 12345 --trust-remote-code,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 1 --output-tokens 128 --image-path test.jpg --trust-remote-code true --image-size "512,512" +``` diff --git a/models/benchmark/llama3.1/README.md b/models/benchmark/llama3.1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f2c2213087d5140bba3cc2db50d0359505b8be8 --- /dev/null +++ b/models/benchmark/llama3.1/README.md @@ -0,0 +1,45 @@ +# Llama3.1 测试 + +* 下载权重和数据集 + + FP16权重链接:[meta-llama/Llama-3.1-70B-Instruct · Hugging Face](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) + 数据集:[HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) + +* 模型权重量化 + +```bash +cd .. 
+python3 quantize_w8a8.py \ +--model /path/to/model \ +--dataset-path /home/data/nlp/ultrachat_200k \ +--num-samples 32 \ +--model-type llama + +# w8a8权重保存名称以 -W8A8-Dynamic-Per-Token 结尾 +``` + +* 测试 Llama3.1-70B-instruct W8A8 + +```bash +cd .. +bash test_performance_server.sh --model /path/to/model -tp 4 --host 127.0.0.1 --port 12345 --max-num-seqs 10 --max-num-batched-tokens 20480 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +``` + +======================================================================================== +* 下载权重 Llama3.1-8B-instruct: + + FP16权重链接: meta-llama/Llama-3.1-8B-Instruct · Hugging Face + + +* 数据集下载并解压: https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip + + +* 测试 Llama3.1-8B-instruct bf16/fp16 + +```bash +cd .. +bash test_vllm_longbench.sh --model-name llama3.1-8b-chat --model /path/to/model --datapath /path/to/longbench/data -tp 1 --max-model-len 32768 --dtype float16 --max-num-seqs 16 --val-data-nums 1 --temperature 0.0 --max-num-batched-tokens 32768 --trust-remote-code + +bash test_vllm_longbench.sh --model-name llama3.1-8b-chat --model /path/to/model --datapath /path/to/longbench/data -tp 1 --max-model-len 32768 --dtype bfloat16 --max-num-seqs 16 --val-data-nums 1 --temperature 0.0 --max-num-batched-tokens 32768 --trust-remote-code + +``` \ No newline at end of file diff --git a/models/benchmark/llava.jinja b/models/benchmark/llava.jinja new file mode 100644 index 0000000000000000000000000000000000000000..af8d0b8e5413ead17f600519514d6cd4e0269274 --- /dev/null +++ b/models/benchmark/llava.jinja @@ -0,0 +1,23 @@ +{%- if messages[0]['role'] == 'system' -%} + {%- set system_message = messages[0]['content'] -%} + {%- set messages = messages[1:] -%} +{%- else -%} + {% set system_message = '' -%} +{%- endif -%} + +{{ bos_token + system_message }} +{%- for message in messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif -%} + + {%- if message['role'] == 'user' -%} + {{ 'USER: ' + message['content'] + '\n' }} + {%- elif message['role'] == 'assistant' -%} + {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{ 'ASSISTANT:' }} +{% endif %} \ No newline at end of file diff --git a/models/benchmark/longbench/config/dataset2maxlen.json b/models/benchmark/longbench/config/dataset2maxlen.json new file mode 100644 index 0000000000000000000000000000000000000000..4f8272a0ca789a946dcfcbeaded3d51fa8d71e45 --- /dev/null +++ b/models/benchmark/longbench/config/dataset2maxlen.json @@ -0,0 +1,23 @@ +{ + "narrativeqa": 128, + "qasper": 128, + "multifieldqa_en": 64, + "multifieldqa_zh": 64, + "hotpotqa": 32, + "2wikimqa": 32, + "musique": 32, + "dureader": 128, + "gov_report": 512, + "qmsum": 512, + "multi_news": 512, + "vcsum": 512, + "trec": 64, + "triviaqa": 32, + "samsum": 128, + "lsht": 64, + "passage_count": 32, + "passage_retrieval_en": 32, + "passage_retrieval_zh": 32, + "lcc": 64, + "repobench-p": 64 +} diff --git a/models/benchmark/longbench/config/dataset2prompt.json b/models/benchmark/longbench/config/dataset2prompt.json new file mode 100644 index 0000000000000000000000000000000000000000..588c35f0067ce4d822c6890f4b58b0a767bab4a6 --- /dev/null +++ b/models/benchmark/longbench/config/dataset2prompt.json @@ -0,0 +1,23 @@ +{ + "narrativeqa": "You are given a story, 
which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", + "qasper": "You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", + "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:", + "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:", + "gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:", + "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:", + "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:", + "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:", + "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}", + "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}", + "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}", + "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}", + "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ", + "passage_retrieval_en": "Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like \"Paragraph 1\", \"Paragraph 2\", etc.\n\nThe answer is: ", + "passage_retrieval_zh": "以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是\"段落1\",\"段落2\"等格式\n\n答案是:", + "lcc": "Please complete the code given below. \n{context}Next line of code:\n", + "repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n" +} diff --git a/models/benchmark/longbench/config/model2maxlen.json b/models/benchmark/longbench/config/model2maxlen.json new file mode 100644 index 0000000000000000000000000000000000000000..ba319e9c115e7e2774525ff8ed134c66112d2e28 --- /dev/null +++ b/models/benchmark/longbench/config/model2maxlen.json @@ -0,0 +1,11 @@ +{ + "llama2-7b-chat": 3584, + "llama2-13b-chat": 3584, + "llama3-8b-chat": 7680, + "llama3-70b-chat": 7680, + "baichuan2-13b-chat": 3584, + "qwen2-72b-chat-awq": 32256, + "llama3.1-8b-chat": 32256, + "qwen2.5-14b-chat": 32256, + "qwen2-vl-chat": 32256 +} diff --git a/models/benchmark/longbench/config/model2path.json b/models/benchmark/longbench/config/model2path.json new file mode 100644 index 0000000000000000000000000000000000000000..0c3f73f91c661f1cbe081513c984b4e8a83f82e5 --- /dev/null +++ b/models/benchmark/longbench/config/model2path.json @@ -0,0 +1,7 @@ +{ + "llama2-7b-chat-4k": "meta-llama/Llama-2-7b-chat-hf", + "chatglm2-6b": "THUDM/chatglm2-6b", + "chatglm2-6b-32k": "THUDM/chatglm2-6b-32k", + "vicuna-v1.5-7b-16k": "lmsys/vicuna-7b-v1.5-16k", + "llama-2-7B-32K-Instruct": "togethercomputer/Llama-2-7B-32K-Instruct" +} diff --git a/models/benchmark/longbench/metrics.py b/models/benchmark/longbench/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..5ce1137d65dbc887ff4ced94aa4e254e2086392f --- /dev/null +++ b/models/benchmark/longbench/metrics.py @@ -0,0 +1,163 @@ +import difflib +import re +import string +from collections import Counter +from typing import List + +import jieba +from fuzzywuzzy import fuzz +from rouge import Rouge + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def normalize_zh_answer(s): + """Lower text and 
remove punctuation, extra whitespace.""" + + def white_space_fix(text): + return "".join(text.split()) + + def remove_punc(text): + cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." + all_punctuation = set(string.punctuation + cn_punctuation) + return "".join(ch for ch in text if ch not in all_punctuation) + + def lower(text): + return text.lower() + + return white_space_fix(remove_punc(lower(s))) + + +def count_score(prediction, ground_truth, **kwargs): + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def retrieval_score(prediction, ground_truth, **kwargs): + pattern = r"Paragraph (\d+)" + matches = re.findall(pattern, ground_truth) + ground_truth_id = matches[0] + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth_id): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def retrieval_zh_score(prediction, ground_truth, **kwargs): + pattern = r"段落(\d+)" + matches = re.findall(pattern, ground_truth) + ground_truth_id = matches[0] + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth_id): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def code_sim_score(prediction, ground_truth, **kwargs): + all_lines = prediction.lstrip("\n").split("\n") + prediction = "" + for line in all_lines: + if ("`" not in line) and ("#" not in line) and ("//" not in line): + prediction = line + break + return fuzz.ratio(prediction, ground_truth) / 100 + + +def classification_score(prediction, ground_truth, **kwargs): + em_match_list = [] + all_classes = kwargs["all_classes"] + for class_name in all_classes: + if class_name in prediction: + em_match_list.append(class_name) + for match_term in em_match_list: + if match_term in ground_truth and match_term != ground_truth: + em_match_list.remove(match_term) + if em_match_list != 0: + if ground_truth in em_match_list: + score = 1.0 / len(em_match_list) + else: + score = 0.0 + else: + best_match = None + highest_similarity = 0 + for string in all_classes: + similarity = difflib.SequenceMatcher(None, string, prediction).ratio() + if similarity > highest_similarity: + highest_similarity = similarity + best_match = string + score = float(best_match == ground_truth) + return score + + +def rouge_score(prediction, ground_truth, **kwargs): + rouge = Rouge() + try: + scores = rouge.get_scores([prediction], [ground_truth], avg=True) + except: + return 0.0 + return scores["rouge-l"]["f"] + + +def rouge_zh_score(prediction, ground_truth, **kwargs): + prediction = " ".join(list(jieba.cut(prediction, cut_all=False))) + ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) + score = rouge_score(prediction, ground_truth) + return score + + +def f1_score(prediction, ground_truth, **kwargs): + common = Counter(prediction) & Counter(ground_truth) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction) + recall = 1.0 * num_same / len(ground_truth) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def qa_f1_score(prediction, ground_truth, **kwargs): + normalized_prediction = 
normalize_answer(prediction) + normalized_ground_truth = normalize_answer(ground_truth) + + prediction_tokens = normalized_prediction.split() + ground_truth_tokens = normalized_ground_truth.split() + return f1_score(prediction_tokens, ground_truth_tokens) + + +def qa_f1_zh_score(prediction, ground_truth, **kwargs): + prediction_tokens = list(jieba.cut(prediction, cut_all=False)) + ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False)) + prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens] + ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens] + prediction_tokens = [token for token in prediction_tokens if len(token) > 0] + ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0] + return f1_score(prediction_tokens, ground_truth_tokens) diff --git a/models/benchmark/longbench/result_record/llama3.1-8b-chat.jsonl b/models/benchmark/longbench/result_record/llama3.1-8b-chat.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f076a2e2916a2228ce6031650a627f3fa1f902c --- /dev/null +++ b/models/benchmark/longbench/result_record/llama3.1-8b-chat.jsonl @@ -0,0 +1,23 @@ +{ + "narrativeqa": 27.7, + "qasper": 45.17, + "multifieldqa_en": 55.0, + "multifieldqa_zh": 62.36, + "hotpotqa": 55.3, + "2wikimqa": 43.68, + "musique": 30.76, + "dureader": 33.84, + "gov_report": 35.17, + "qmsum": 25.3, + "multi_news": 27.41, + "vcsum": 17.06, + "trec": 72.5, + "triviaqa": 91.65, + "samsum": 43.7, + "lsht": 46.5, + "passage_count": 7.12, + "passage_retrieval_en": 99.5, + "passage_retrieval_zh": 96.96, + "lcc": 63.08, + "repobench-p": 57.02 +} \ No newline at end of file diff --git a/models/benchmark/longbench/result_record/qwen2-vl-chat.jsonl b/models/benchmark/longbench/result_record/qwen2-vl-chat.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07e02fd46460093adb22c3cab2fcc07415db4769 --- /dev/null +++ b/models/benchmark/longbench/result_record/qwen2-vl-chat.jsonl @@ -0,0 +1,23 @@ +{ + "narrativeqa": 25.02, + "qasper": 31.85, + "multifieldqa_en": 51.41, + "multifieldqa_zh": 59.48, + "hotpotqa": 55.05, + "2wikimqa": 48.26, + "musique": 31.67, + "dureader": 33.51, + "gov_report": 31.03, + "qmsum": 25.22, + "multi_news": 26.47, + "vcsum": 18.46, + "trec": 80.0, + "triviaqa": 81.89, + "samsum": 43.91, + "lsht": 39.0, + "passage_count": 4.0, + "passage_retrieval_en": 74.5, + "passage_retrieval_zh": 74.0, + "lcc": 53.89, + "repobench-p": 39.81 +} \ No newline at end of file diff --git a/models/benchmark/longbench/result_record/qwen2.5-14b-chat.jsonl b/models/benchmark/longbench/result_record/qwen2.5-14b-chat.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65cdf9faa33d45e207c3f79502c4a2ad62a72bb1 --- /dev/null +++ b/models/benchmark/longbench/result_record/qwen2.5-14b-chat.jsonl @@ -0,0 +1,23 @@ +{ + "narrativeqa": 28.33, + "qasper": 45.5, + "multifieldqa_en": 54.52, + "multifieldqa_zh": 62.73, + "hotpotqa": 62.55, + "2wikimqa": 58.52, + "musique": 37.71, + "dureader": 32.12, + "gov_report": 32.54, + "qmsum": 24.95, + "multi_news": 24.62, + "vcsum": 16.11, + "trec": 77.5, + "triviaqa": 90.06, + "samsum": 47.39, + "lsht": 49.17, + "passage_count": 12.88, + "passage_retrieval_en": 98.75, + "passage_retrieval_zh": 98.58, + "lcc": 65.23, + "repobench-p": 52.61 +} \ No newline at end of file diff --git a/models/benchmark/longbench/utils.py b/models/benchmark/longbench/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..1708c9f284d71cd320055d78e81b4c8b0de746ce --- /dev/null +++ b/models/benchmark/longbench/utils.py @@ -0,0 +1,210 @@ +import argparse +import codecs +import logging + +""" +The following arguments can not be add in args... +early_stopping: Union[bool, str] = False, +early_stopping: Controls the stopping condition for beam search. It + accepts the following values: `True`, where the generation stops as + soon as there are `best_of` complete candidates; `False`, where an + heuristic is applied and the generation stops when is it very + unlikely to find better candidates; `"never"`, where the beam search + procedure only stops when there cannot be better candidates + (canonical beam search algorithm). +stop: Optional[Union[str, List[str]]] = None, +stop_token_ids: Optional[List[int]] = None, +logits_processors: Optional[List[LogitsProcessor]] = None, +logits_processors: List of functions that modify logits based on + previously generated tokens, and optionally prompt tokens as + a first argument. +truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, +truncate_prompt_tokens: If set to an integer k, will use only the last k + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). + """ + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + "--n", + type=int, + default=1, + help="Number of output sequences to return for the given prompt.", + ) + args.add_argument( + "--best-of", + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.", + ) + args.add_argument( + "--presence-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.", + ) + args.add_argument( + "--frequency-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.", + ) + args.add_argument( + "--repetition-penalty", + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.", + ) + args.add_argument( + "--temperature", + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.", + ) + args.add_argument( + "--top-p", + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.", + ) + args.add_argument( + "--top-k", + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. 
Set " + "to -1 to consider all tokens.", + ) + args.add_argument( + "--min-p", + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.", + ) + args.add_argument( + "--use-beam-search", + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.", + ) + args.add_argument( + "--length-penalty", + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.", + ) + args.add_argument( + "--stop", + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.", + ) + args.add_argument( + "--stop-token-ids", + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.", + ) + args.add_argument( + "--include-stop-str-in-output", + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.", + ) + args.add_argument( + "--ignore-eos", + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.", + ) + args.add_argument( + "--max-tokens", + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.", + ) + args.add_argument( + "--min-tokens", + type=int, + default=0, + help="Minimum number of tokens to generate per output sequence " + "before EOS or stop_token_ids can be generated", + ) + args.add_argument( + "--logprobs", + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.", + ) + args.add_argument( + "--prompt-logprobs", + type=int, + default=None, + help="Number of log probabilities to return per prompt token.", + ) + args.add_argument( + "--detokenize", + type=bool, + default=True, + help="Whether to detokenize the output. Defaults to True.", + ) + args.add_argument( + "--skip-special-tokens", + default=True, + action="store_false", + help="Whether to skip special tokens in the output.", + ) + args.add_argument( + "--spaces-between-special-tokens", + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.", + ) + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode(chat_template, "unicode_escape") + + logging.info(f"Using supplied chat template:\n{tokenizer.chat_template}") + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." 
+ ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) diff --git a/models/benchmark/mixtral/README.md b/models/benchmark/mixtral/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20e096097ea11ba155697a1faa2232b79636908c --- /dev/null +++ b/models/benchmark/mixtral/README.md @@ -0,0 +1,12 @@ +# Mixtral 测试 + +* 下载数据 + + 联系您的应用工程师获取 + +* 测试 Mixtral-8x22B-W8A8 + +```bash +cd .. +bash test_performance_server.sh --model /path/to/model -tp 8 --host 127.0.0.1 --port 12345 --enable-chunked-prefill=False --max-num-seqs 10 --max-num-batched-tokens 20480 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +``` diff --git a/models/benchmark/quantize_w8a8.py b/models/benchmark/quantize_w8a8.py new file mode 100644 index 0000000000000000000000000000000000000000..bf1ffcd77500612f2825a524ec5ea7acfe69db66 --- /dev/null +++ b/models/benchmark/quantize_w8a8.py @@ -0,0 +1,171 @@ +import os +import shutil +import json +import argparse +from transformers import AutoTokenizer +from datasets import load_from_disk +from functools import partial + +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot + +parser = argparse.ArgumentParser() +parser.add_argument("--dataset-path", required=True, type=str, help="dataset-path") +parser.add_argument("--model", required=True, type=str, help="model-path") +parser.add_argument("--num-samples", type=int, default=512, help="The number of samples used for calibration") +parser.add_argument("--max-length", type=int, default=2048, help="max sequence length") +parser.add_argument("--model-type", type=str, required=True, choices=['llama', 'chatglm', "baichuan2", "qwen", "gpt-neox"], help="model type") +parser.add_argument("--smoothquant", type=bool, default=True, help="enable smoothquant") + + +args = parser.parse_args() + + +CHATGLM_SMOOTHQUANT_MAPPINGS = [ + [["re:.*query_key_value"], "re:.*input_layernorm"], + [["re:.*dense_h_to_4h"], "re:.*post_attention_layernorm"], +] + +BAICHUAN2_SMOOTHQUANT_MAPPINGS = [ + [["re:.*W_pack"], "re:.*input_layernorm"], + [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"], +] + +LLAMA_SMOOTHQUANT_MAPPINGS = [ + [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"], + [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"], +] + +GPT_NEOX_SMOOTHQUANT_MAPPINGS = [ + [["re:.*query_key_value"], "re:.*input_layernorm"], + [["re:.*dense_h_to_4h"], "re:.*post_attention_layernorm"], +] + +smooth_quant_mappings = { + 'chatglm': CHATGLM_SMOOTHQUANT_MAPPINGS, + "baichuan2": BAICHUAN2_SMOOTHQUANT_MAPPINGS, + 'llama': LLAMA_SMOOTHQUANT_MAPPINGS, + 'qwen': LLAMA_SMOOTHQUANT_MAPPINGS, + 'gpt-neox': GPT_NEOX_SMOOTHQUANT_MAPPINGS, +} + +ignore_layer = { + "chatglm": ['transformer.output_layer'], + "baichuan2": ['lm_head'], + "llama": ['lm_head'], + "qwen": ['lm_head'], + "gpt-neox": ['embed_out'] +} + + +def preprocess(example, tokenizer, chat_template=None): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + chat_template=chat_template, + tokenize=False, + ) + } + +# Tokenize inputs. 
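+# Calibration samples are tokenized without padding and truncated to --max-length tokens;
+# add_special_tokens=False avoids stacking an extra BOS/EOS on top of the chat-template text.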
+def tokenize(sample, tokenizer, max_length): + return tokenizer( + sample["text"], + padding=False, + max_length=max_length, + truncation=True, + add_special_tokens=False, + ) + +def save_tokenizer(model_path, save_path): + tokenizer_files = [ + "special_tokens_map.json", + "tokenization_chatglm.py", + "tokenizer_config.json", + "tokenizer.model", + "tokenizer_config_default.json", + "tokenizer.json", + "vocab.json", + "merges.txt", + ] + for file_name in tokenizer_files: + src_file = os.path.join(model_path, file_name) + des_file = os.path.join(save_path, file_name) + if os.path.isfile(src_file): + shutil.copy(src_file, des_file) + +def preprocess_for_llama(model_path): + generation_config_file = os.path.join(model_path, "generation_config.json") + if os.path.isfile(generation_config_file): + # add do_sample + shutil.copy(generation_config_file, generation_config_file+"_bk") + with open(generation_config_file,'r') as f: + config = json.load(f) + if "do_sample" not in config: + config['do_sample'] = True + with open(generation_config_file,'w') as f: + json.dump(config, f) + +def main(args): + if args.model_type == "llama": + preprocess_for_llama(args.model) + + tokenizer = AutoTokenizer.from_pretrained(args.model,trust_remote_code=True) + + dataset = load_from_disk(args.dataset_path)['train_sft'] + dataset = dataset.shuffle(seed=42).select(range(args.num_samples)) + if tokenizer.chat_template is None: + chat_template = ( + "{% for message in messages %}" + "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" + "{% endfor %}" + "{% if add_generation_prompt %}" + "{{ '<|im_start|>assistant\n' }}" + "{% endif %}" + ) + else: + chat_template = None + + dataset = dataset.map(partial(preprocess, tokenizer=tokenizer, chat_template=chat_template)) + dataset = dataset.map(partial(tokenize, tokenizer=tokenizer, max_length=args.max_length), remove_columns=dataset.column_names) + + if args.smoothquant: + smooth_modifier = SmoothQuantModifier(smoothing_strength=0.8) + smooth_modifier.mappings = smooth_quant_mappings[args.model_type] + recipe = [ + smooth_modifier, + QuantizationModifier(targets="Linear", scheme="W8A8", ignore=ignore_layer[args.model_type]), + ] + else: + recipe = [ + QuantizationModifier(targets="Linear", scheme="W8A8", ignore=ignore_layer[args.model_type]), + ] + + model = SparseAutoModelForCausalLM.from_pretrained( + args.model, + device_map="auto", + torch_dtype="auto", + trust_remote_code=True, + ) + + # Apply algorithms. + oneshot( + model=model, + tokenizer=tokenizer, + dataset=dataset, + recipe=recipe, + max_seq_length=args.max_length, + num_calibration_samples=args.num_samples, + ) + + # Save to disk compressed. + SAVE_DIR = args.model + "-W8A8-Dynamic-Per-Token" + model.save_pretrained(SAVE_DIR, save_compressed=True) + if args.model_type == "chatglm" or args.model_type == "gpt-neox": + save_tokenizer(args.model, SAVE_DIR) + else: + tokenizer.save_pretrained(SAVE_DIR) + +if __name__ == "__main__": + main(args) diff --git a/models/benchmark/qwen2.5/README.md b/models/benchmark/qwen2.5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1e9f94c3b2029bc7e5d559c1e1298d51ab5e8ad2 --- /dev/null +++ b/models/benchmark/qwen2.5/README.md @@ -0,0 +1,95 @@ +# Qwen2.5 测试 + +* 下载数据 + + 数据链接: [Qwen](https://huggingface.co/Qwen) + 数据集:[HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) + +* 测试 Qwen2.5-72B-Instruct + +```bash +cd .. 
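+# Note: test_performance_server.sh splits its arguments at the comma; the part before it is
+# passed to the vLLM OpenAI API server, the part after it to benchmark_serving.py.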
+bash test_performance_server.sh --model /path/to/model -tp 8 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +``` + +* 测试 Qwen2.5-72B-instruct W8A8 + +```bash +## 量化 +python3 quantize_w8a8.py \ +--model /path/to/model \ +--dataset-path /home/data/nlp/ultrachat_200k \ +--num-samples 32 \ +--model-type qwen + +# w8a8权重保存名称以 -W8A8-Dynamic-Per-Token 结尾 + +## 运行 Qwen2.5-72B-instruct W8A8 +cd .. +bash test_performance_server.sh --model /path/to/model -tp 4 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +``` + +* 测试 Qwen2.5-72B-Instruct-GPTQ-Int4 + +```bash +cd .. +# tp 2 +VLLM_RPC_TIMEOUT=100000 bash test_performance_server.sh --model /path/to/model --quantization gptq -tp 2 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +# tp 4 +VLLM_RPC_TIMEOUT=100000 bash test_performance_server.sh --model /path/to/model --quantization gptq -tp 4 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +``` + +* 测试 Qwen2.5-72B-Instruct-AWQ-Int4 + +```bash +cd .. +# tp 2 +VLLM_RPC_TIMEOUT=100000 bash test_performance_server.sh --model /path/to/model --quantization awq -tp 2 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +# tp 4 +VLLM_RPC_TIMEOUT=100000 bash test_performance_server.sh --model /path/to/model --quantization awq -tp 4 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +``` + +* 测试 Qwen2.5-14B-Instruct性能 + +```bash +cd .. +bash test_performance_server.sh --model /path/to/model -tp 2 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10 --max-model-len 20480,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +``` + +* 测试 Qwen2.5-14B-Instruct精度 + +* 数据集下载并解压:https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip + +* 测试 Qwen2.5-14B-Instruct bf16/fp16 + +```bash +cd .. +bash test_vllm_longbench.sh --model-name qwen2.5-14b-chat --model /path/to/model --datapath /path/to/longbench/data -tp 1 --max-model-len 32768 --dtype float16 --max-num-seqs 8 --val-data-nums 1 --temperature 0.0 --max-num-batched-tokens 32768 --trust-remote-code + +bash test_vllm_longbench.sh --model-name qwen2.5-14b-chat --model /path/to/model --datapath /path/to/longbench/data -tp 1 --max-model-len 32768 --dtype bfloat16 --max-num-seqs 8 --val-data-nums 1 --temperature 0.0 --max-num-batched-tokens 32768 --trust-remote-code + +``` + + +* 测试Qwen2-VL-7B 性能 + +```bash +# if "Flash Attention implemented by IXDNN requires last dimension of inputs to be divisible by 32, but get head_dim=80. 
Optional ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING env can be setted to pad along the head dimension." happened. +# set ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +cd .. +bash test_performance_server_multimodal.sh --model /path/to/model -tp 1 --max-num-seqs 32 --max-num-batched-tokens 8192 --max-model-len 8192 --host 127.0.0.1 --port 12345,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 1 --output-tokens 128 --image-path test.jpg --image-size "512,512" +``` + +* 测试Qwen2-VL-7B 精度 + + +* 数据集下载并解压:https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip + + +* 测试 Qwen2-VL-7B bf16 + +```bash +cd .. +bash test_vllm_longbench.sh --model-name qwen2-vl-chat --model /path/to/model --datapath /path/to/longbench/data -tp 1 --max-model-len 32768 --dtype bfloat16 --max-num-seqs 8 --val-data-nums 1 --temperature 0.0 --max-num-batched-tokens 32768 --trust-remote-code + +``` diff --git a/models/benchmark/requirements.txt b/models/benchmark/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9aebe9f80e98c7f34ce4c954b52391a4abe0d7e7 --- /dev/null +++ b/models/benchmark/requirements.txt @@ -0,0 +1,34 @@ +psutil +sentencepiece # Required for LLaMA tokenizer. +numpy < 2.0.0 +requests >= 2.26.0 +tqdm +py-cpuinfo +transformers >= 4.45.0 # Required for Llama 3.2. +tokenizers >= 0.19.1 # Required for Llama 3. +fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' +fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' +aiohttp +openai >= 1.40.0 # Ensure modern openai package (ensure types module present) +uvicorn[standard] +pydantic >= 2.9 # Required for fastapi >= 0.113.0 +pillow # Required for image processing +prometheus_client >= 0.18.0 +prometheus-fastapi-instrumentator >= 7.0.0 +tiktoken >= 0.6.0 # Required for DBRX tokenizer +lm-format-enforcer == 0.10.6 +outlines >= 0.0.43, < 0.1 +typing_extensions >= 4.10 +filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +partial-json-parser # used for parsing partial JSON outputs +pyzmq +msgspec +gguf == 0.10.0 +importlib_metadata +mistral_common[opencv] >= 1.4.4 +pyyaml +six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 +setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 +einops # Required for Qwen2-VL. +compressed-tensors==0.5.0 # required for w8a8 +llmcompressor==0.1.0 # required for w8a8 diff --git a/models/benchmark/set_environment.sh b/models/benchmark/set_environment.sh new file mode 100644 index 0000000000000000000000000000000000000000..29e1f359c43950e74f44746781d6707acd9bc8d4 --- /dev/null +++ b/models/benchmark/set_environment.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
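+# Installs the benchmark dependencies from requirements.txt (plus mpi4py) using the Tsinghua PyPI mirror.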
+set -e + +pip3 install -r "requirements.txt" -i https://pypi.tuna.tsinghua.edu.cn/simple +pip3 install -U mpi4py -i https://pypi.tuna.tsinghua.edu.cn/simple + +exit $? \ No newline at end of file diff --git a/models/benchmark/test.jpg b/models/benchmark/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e131e8ecdf32c3f751ab0f7b2e5f002683babda2 Binary files /dev/null and b/models/benchmark/test.jpg differ diff --git a/models/benchmark/test_performance_server.sh b/models/benchmark/test_performance_server.sh new file mode 100644 index 0000000000000000000000000000000000000000..0f799a8cae356eaff612f8fdc8105ae97b860ed9 --- /dev/null +++ b/models/benchmark/test_performance_server.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +IFS="," +read -r server_args client_args <<< "$@" +IFS=" " + +python3 -m vllm.entrypoints.openai.api_server $server_args & + +index=0 +port="12345" +IFS=' ' read -r args <<< "$server_args" +arguments=($args) +for argument in $args +do + index=`expr $index + 1` + case $argument in + --port) port=${arguments[index]};; + esac +done + +cleanup() { + serve_pid=$(ps -ef | grep "vllm.entrypoints.openai.api_server" | grep "$port" | awk '{print $2}' | grep -v "grep") + # first try kill -15 to serve pid + # kill -15 "$serve_pid" + while read pid; do + kill -15 $pid + sleep 5 + done <<< $serve_pid + sleep 20 + main_work_pid=$(ps -ef | grep "$serve_pid" | awk '{print $2}' | grep -v "grep" || echo "") + while read pid; do + if [ -n "$( ixsmi | grep "$pid" )" ];then + kill -9 $pid + sleep 5 + fi + done <<< $main_work_pid +} + +trap cleanup EXIT + +sleep_time=20 +status=1 +duration_times=0 +while [ $status -ne 0 ]; do + sleep $sleep_time + duration_times=$(expr $duration_times + 1) + if ! timeout 3 bash -c "/dev/null 2>&1; then + status=1 + if [ $duration_times -gt 500 ]; then + echo "connection time out, the port may can not be used, closing ..." + exit 1 + fi + else + status=0 + fi +done + +python3 "benchmark_serving.py" $client_args + +exit $? \ No newline at end of file diff --git a/models/benchmark/test_performance_server_multimodal.sh b/models/benchmark/test_performance_server_multimodal.sh new file mode 100644 index 0000000000000000000000000000000000000000..5fa8944e64ce461ac818426468f1754ab2809fea --- /dev/null +++ b/models/benchmark/test_performance_server_multimodal.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +IFS="," +read -r server_args client_args <<< "$@" +IFS=" " + +python3 -m vllm.entrypoints.openai.api_server $server_args & + +index=0 +port="12345" +IFS=' ' read -r args <<< "$server_args" +arguments=($args) +for argument in $args +do + index=`expr $index + 1` + case $argument in + --port) port=${arguments[index]};; + esac +done + +cleanup() { + serve_pid=$(ps -ef | grep "vllm.entrypoints.openai.api_server" | grep "$port" | awk '{print $2}' | grep -v "grep") + # first try kill -15 to serve pid + kill -15 "$serve_pid" + sleep 20 + main_work_pid=$(ps -ef | grep "$serve_pid" | awk '{print $2}' | grep -v "grep" || echo "") + while read pid; do + if [ -n "$( ixsmi | grep "$pid" )" ];then + kill -9 $pid + sleep 5 + fi + done <<< $main_work_pid +} + +trap cleanup EXIT + +sleep_time=20 +status=1 +duration_times=0 +while [ $status -ne 0 ]; do + sleep $sleep_time + duration_times=$(expr $duration_times + 1) + if ! timeout 3 bash -c "/dev/null 2>&1; then + status=1 + if [ $duration_times -gt 500 ]; then + echo "connection time out, the port may can not be used, closing ..." + exit 1 + fi + else + status=0 + fi +done + +python3 "benchmark_serving_multimodal.py" $client_args + +exit $? \ No newline at end of file diff --git a/models/benchmark/test_vllm_longbench.sh b/models/benchmark/test_vllm_longbench.sh new file mode 100644 index 0000000000000000000000000000000000000000..33a117478e6452bc045c5c3e0249bd4bac440c78 --- /dev/null +++ b/models/benchmark/test_vllm_longbench.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +bash set_environment.sh + +python3 benchmark_longbench.py $@ + +exit $? \ No newline at end of file diff --git a/models/benchmark/yi1.5/README.md b/models/benchmark/yi1.5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e8e98ca58b6b40ad4f2bbfca8765cda64dfe1fd4 --- /dev/null +++ b/models/benchmark/yi1.5/README.md @@ -0,0 +1,12 @@ +# Yi1.5 测试 + +* 下载数据 + + 数据链接: [Yi](https://huggingface.co/01-ai/Yi-1.5-34B-Chat) + +* 测试 Yi-1.5-34B-chat + +```bash +cd .. +bash test_performance_server.sh --model /path/to/model -tp 4 --host 127.0.0.1 --port 12345 --max-num-batched-tokens 20480 --max-num-seqs 10,--model /path/to/model --host 127.0.0.1 --port 12345 --num-prompts 10 --input-tokens 2048 --output-tokens 1024 +```
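For a quick sanity check of the LongBench metrics added in `models/benchmark/longbench/metrics.py`, the sketch below (illustrative only, not part of the patch) exercises three of them directly; it assumes `models/benchmark/longbench` is the working directory and that `jieba`, `fuzzywuzzy`, and `rouge` are installed.

```python
# Minimal, standalone exercise of the LongBench metric functions.
from metrics import classification_score, count_score, qa_f1_score

# Token-level F1 after lowercasing and stripping punctuation/articles:
# "answer is paris" vs. "paris" -> precision 1/3, recall 1.0, F1 = 0.5.
print(qa_f1_score("The answer is Paris.", "Paris"))

# A single detected class name that equals the reference scores 1.0.
print(classification_score("This looks like sports news.", "sports",
                           all_classes=["sports", "finance"]))

# Fraction of numbers in the prediction that match the reference count.
print(count_score("I count 7 unique paragraphs.", "7"))
```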