# HELP vllm:cache_config_info Information of the LLMEngine CacheConfig
# TYPE vllm:cache_config_info gauge
vllm:cache_config_info{block_size="16",cache_dtype="auto",cpu_offload_gb="0",enable_prefix_caching="False",gpu_memory_utilization="0.7",is_attention_free="False",num_cpu_blocks="512",num_gpu_blocks="1094",num_gpu_blocks_override="None",sliding_window="None",swap_space_bytes="4294967296"} 1.0
# HELP vllm:num_preemptions_total Cumulative number of preemption from the engine.
# TYPE vllm:num_preemptions_total counter
vllm:num_preemptions_total{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:prompt_tokens_total Number of prefill tokens processed.
# TYPE vllm:prompt_tokens_total counter
vllm:prompt_tokens_total{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:generation_tokens_total Number of generation tokens processed.
# TYPE vllm:generation_tokens_total counter
vllm:generation_tokens_total{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:num_requests_running Number of requests currently running on GPU.
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:num_requests_swapped Number of requests swapped to CPU.
# TYPE vllm:num_requests_swapped gauge
vllm:num_requests_swapped{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:gpu_cache_usage_perc gauge
vllm:gpu_cache_usage_perc{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:cpu_cache_usage_perc gauge
vllm:cpu_cache_usage_perc{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate.
# TYPE vllm:cpu_prefix_cache_hit_rate gauge
vllm:cpu_prefix_cache_hit_rate{model_name="taide/TAIDE-LX-7B-Chat"} -1.0
# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate.
# TYPE vllm:gpu_prefix_cache_hit_rate gauge
vllm:gpu_prefix_cache_hit_rate{model_name="taide/TAIDE-LX-7B-Chat"} -1.0
# HELP vllm:avg_prompt_throughput_toks_per_s Average prefill throughput in tokens/s.
# TYPE vllm:avg_prompt_throughput_toks_per_s gauge
vllm:avg_prompt_throughput_toks_per_s{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
# HELP vllm:avg_generation_throughput_toks_per_s Average generation throughput in tokens/s.
# TYPE vllm:avg_generation_throughput_toks_per_s gauge
vllm:avg_generation_throughput_toks_per_s{model_name="taide/TAIDE-LX-7B-Chat"} 0.0
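
The listing above is the plain-text Prometheus exposition format returned by vLLM's `/metrics` endpoint. As a rough illustration of how such output can be consumed programmatically, the minimal sketch below fetches and parses it into metric families. The host and port (`http://localhost:8000`) and the use of the `requests` and `prometheus_client` packages are assumptions for this example, not something shown in the output above.

```python
# Minimal sketch: scrape vLLM's /metrics endpoint and parse the exposition text.
# Assumptions: the server is reachable at http://localhost:8000 and the
# `requests` and `prometheus_client` packages are installed.
import requests
from prometheus_client.parser import text_string_to_metric_families

# Fetch the raw Prometheus text (the same format shown above).
raw = requests.get("http://localhost:8000/metrics", timeout=5).text

# Walk every metric family and print each sample's name, labels, and value,
# e.g. vllm:num_requests_running {'model_name': 'taide/TAIDE-LX-7B-Chat'} 0.0
for family in text_string_to_metric_families(raw):
    for sample in family.samples:
        print(sample.name, sample.labels, sample.value)
```

In a real deployment these metrics would more typically be scraped by a Prometheus server and visualized in Grafana rather than parsed by hand; the snippet is only meant to show that the output is standard Prometheus text that any compatible tooling can ingest.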