vllm.envs
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE module-attribute ¶
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = (
False
)
VLLM_ALLOW_INSECURE_SERIALIZATION module-attribute ¶
VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING module-attribute ¶
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
VLLM_FLASHINFER_FORCE_TENSOR_CORES module-attribute ¶
VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
VLLM_LOGITS_PROCESSOR_THREADS module-attribute ¶
VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE module-attribute ¶
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16 module-attribute ¶
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB module-attribute ¶
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION module-attribute ¶
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = 'NONE'
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL module-attribute ¶
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS module-attribute ¶
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_TORCH_PROFILER_RECORD_SHAPES module-attribute ¶
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY module-attribute ¶
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE module-attribute ¶
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = 'auto'
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM module-attribute ¶
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_V1_USE_PREFILL_DECODE_ATTENTION module-attribute ¶
VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
VLLM_XLA_CACHE_PATH module-attribute ¶
VLLM_XLA_CACHE_PATH: str = join(
VLLM_CACHE_ROOT, "xla_cache"
)
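The attribute values listed above are not frozen at import time; each one is resolved on access from the corresponding entry in the environment_variables table below. A minimal usage sketch (variable names are taken from this page; the override mechanism is plain os.environ, and the printed results assume no other overrides are set):

```python
import os

# Overrides must be in the environment before the value is first read,
# e.g. before the attribute access below.
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
os.environ["VLLM_TORCH_PROFILER_RECORD_SHAPES"] = "1"

import vllm.envs as envs

# Each attribute access evaluates the matching lambda, so the overrides
# above are what get returned here.
print(envs.VLLM_LOGGING_LEVEL)                 # "DEBUG"
print(envs.VLLM_TORCH_PROFILER_RECORD_SHAPES)  # True
```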
environment_variables module-attribute ¶
environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TARGET_DEVICE": lambda: getenv(
"VLLM_TARGET_DEVICE", "cuda"
),
"MAX_JOBS": lambda: getenv("MAX_JOBS", None),
"NVCC_THREADS": lambda: getenv("NVCC_THREADS", None),
"VLLM_USE_PRECOMPILED": lambda: bool(
get("VLLM_USE_PRECOMPILED")
)
or bool(get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
int(
getenv(
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL",
"0",
)
)
),
"CMAKE_BUILD_TYPE": lambda: getenv("CMAKE_BUILD_TYPE"),
"VERBOSE": lambda: bool(int(getenv("VERBOSE", "0"))),
"VLLM_CONFIG_ROOT": lambda: expanduser(
getenv(
"VLLM_CONFIG_ROOT",
join(get_default_config_root(), "vllm"),
)
),
"VLLM_CACHE_ROOT": lambda: expanduser(
getenv(
"VLLM_CACHE_ROOT",
join(get_default_cache_root(), "vllm"),
)
),
"VLLM_HOST_IP": lambda: getenv("VLLM_HOST_IP", ""),
"VLLM_PORT": get_vllm_port,
"VLLM_RPC_BASE_PATH": lambda: getenv(
"VLLM_RPC_BASE_PATH", gettempdir()
),
"VLLM_USE_MODELSCOPE": lambda: lower() == "true",
"VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
),
"CUDA_HOME": lambda: get("CUDA_HOME", None),
"VLLM_NCCL_SO_PATH": lambda: get(
"VLLM_NCCL_SO_PATH", None
),
"LD_LIBRARY_PATH": lambda: get("LD_LIBRARY_PATH", None),
"VLLM_USE_TRITON_FLASH_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: lower()
in ("true", "1"),
"VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
get("VLLM_FLASH_ATTN_VERSION", None)
),
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool(
get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
!= "0"
),
"VLLM_USE_STANDALONE_COMPILE": lambda: get(
"VLLM_USE_STANDALONE_COMPILE", "1"
)
== "1",
"LOCAL_RANK": lambda: int(get("LOCAL_RANK", "0")),
"CUDA_VISIBLE_DEVICES": lambda: get(
"CUDA_VISIBLE_DEVICES", None
),
"VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
),
"VLLM_API_KEY": lambda: get("VLLM_API_KEY", None),
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: lower()
== "true",
"S3_ACCESS_KEY_ID": lambda: get(
"S3_ACCESS_KEY_ID", None
),
"S3_SECRET_ACCESS_KEY": lambda: get(
"S3_SECRET_ACCESS_KEY", None
),
"S3_ENDPOINT_URL": lambda: get("S3_ENDPOINT_URL", None),
"VLLM_USAGE_STATS_SERVER": lambda: get(
"VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
),
"VLLM_NO_USAGE_STATS": lambda: get(
"VLLM_NO_USAGE_STATS", "0"
)
== "1",
"VLLM_DO_NOT_TRACK": lambda: (
get("VLLM_DO_NOT_TRACK", None)
or get("DO_NOT_TRACK", None)
or "0"
)
== "1",
"VLLM_USAGE_SOURCE": lambda: get(
"VLLM_USAGE_SOURCE", "production"
),
"VLLM_CONFIGURE_LOGGING": lambda: int(
getenv("VLLM_CONFIGURE_LOGGING", "1")
),
"VLLM_LOGGING_CONFIG_PATH": lambda: getenv(
"VLLM_LOGGING_CONFIG_PATH"
),
"VLLM_LOGGING_LEVEL": lambda: upper(),
"VLLM_LOGGING_PREFIX": lambda: getenv(
"VLLM_LOGGING_PREFIX", ""
),
"VLLM_LOGITS_PROCESSOR_THREADS": lambda: int(
getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")
)
if "VLLM_LOGITS_PROCESSOR_THREADS" in environ
else None,
"VLLM_TRACE_FUNCTION": lambda: int(
getenv("VLLM_TRACE_FUNCTION", "0")
),
"VLLM_ATTENTION_BACKEND": lambda: getenv(
"VLLM_ATTENTION_BACKEND", None
),
"VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
int(environ["VLLM_USE_FLASHINFER_SAMPLER"])
)
if "VLLM_USE_FLASHINFER_SAMPLER" in environ
else None,
"VLLM_FLASHINFER_FORCE_TENSOR_CORES": lambda: bool(
int(
getenv(
"VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"
)
)
),
"VLLM_PP_LAYER_PARTITION": lambda: getenv(
"VLLM_PP_LAYER_PARTITION", None
),
"VLLM_CPU_KVCACHE_SPACE": lambda: int(
getenv("VLLM_CPU_KVCACHE_SPACE", "0")
)
if "VLLM_CPU_KVCACHE_SPACE" in environ
else None,
"VLLM_CPU_OMP_THREADS_BIND": lambda: getenv(
"VLLM_CPU_OMP_THREADS_BIND", "auto"
),
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
)
if "VLLM_CPU_NUM_OF_RESERVED_CPU" in environ
else None,
"VLLM_CPU_MOE_PREPACK": lambda: bool(
int(getenv("VLLM_CPU_MOE_PREPACK", "1"))
),
"VLLM_CPU_SGL_KERNEL": lambda: bool(
int(getenv("VLLM_CPU_SGL_KERNEL", "0"))
),
"VLLM_USE_RAY_SPMD_WORKER": lambda: bool(
int(getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG": lambda: bool(
int(getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": lambda: getenv(
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"
),
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
int(
getenv(
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM",
"0",
)
)
),
"VLLM_WORKER_MULTIPROC_METHOD": lambda: getenv(
"VLLM_WORKER_MULTIPROC_METHOD", "fork"
),
"VLLM_ASSETS_CACHE": lambda: expanduser(
getenv(
"VLLM_ASSETS_CACHE",
join(
get_default_cache_root(), "vllm", "assets"
),
)
),
"VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")
),
"VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
),
"VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
),
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
),
"VLLM_VIDEO_LOADER_BACKEND": lambda: getenv(
"VLLM_VIDEO_LOADER_BACKEND", "opencv"
),
"VLLM_MM_INPUT_CACHE_GIB": lambda: int(
getenv("VLLM_MM_INPUT_CACHE_GIB", "4")
),
"VLLM_XLA_CACHE_PATH": lambda: expanduser(
getenv(
"VLLM_XLA_CACHE_PATH",
join(
get_default_cache_root(),
"vllm",
"xla_cache",
),
)
),
"VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
int(getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
),
"VLLM_XLA_USE_SPMD": lambda: bool(
int(getenv("VLLM_XLA_USE_SPMD", "0"))
),
"VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
),
"VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool(
int(
getenv(
"VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING",
"1",
)
)
),
"VLLM_NO_DEPRECATION_WARNING": lambda: bool(
int(getenv("VLLM_NO_DEPRECATION_WARNING", "0"))
),
"VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
),
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_FP8_MARLIN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_LOAD_FORMAT": lambda: getenv(
"VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
),
"VLLM_RPC_TIMEOUT": lambda: int(
getenv("VLLM_RPC_TIMEOUT", "10000")
),
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
),
"VLLM_PLUGINS": lambda: None
if "VLLM_PLUGINS" not in environ
else split(","),
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: getenv(
"VLLM_LORA_RESOLVER_CACHE_DIR", None
),
"VLLM_TORCH_PROFILER_DIR": lambda: None
if getenv("VLLM_TORCH_PROFILER_DIR", None) is None
else expanduser(getenv("VLLM_TORCH_PROFILER_DIR", ".")),
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0")
!= "0"
),
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
getenv(
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0"
)
!= "0"
),
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
),
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
),
"VLLM_USE_TRITON_AWQ": lambda: bool(
int(getenv("VLLM_USE_TRITON_AWQ", "0"))
),
"VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: lower()
in ("1", "true"),
"VLLM_SKIP_P2P_CHECK": lambda: getenv(
"VLLM_SKIP_P2P_CHECK", "1"
)
== "1",
"VLLM_DISABLED_KERNELS": lambda: []
if "VLLM_DISABLED_KERNELS" not in environ
else split(","),
"VLLM_USE_V1": lambda: bool(
int(getenv("VLLM_USE_V1", "1"))
),
"VLLM_ROCM_USE_AITER": lambda: lower() in ("true", "1"),
"VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_LINEAR": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MOE": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_RMSNORM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MLA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MHA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_SKINNY_GEMM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_FP8_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_FP8_PADDING", "1"))
),
"VLLM_ROCM_MOE_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_MOE_PADDING", "1"))
),
"VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": lambda: upper(),
"VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": lambda: maybe_convert_int(
get(
"VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None
)
),
"Q_SCALE_CONSTANT": lambda: int(
getenv("Q_SCALE_CONSTANT", "200")
),
"K_SCALE_CONSTANT": lambda: int(
getenv("K_SCALE_CONSTANT", "200")
),
"V_SCALE_CONSTANT": lambda: int(
getenv("V_SCALE_CONSTANT", "100")
),
"VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
int(getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
),
"VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
),
"VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
int(getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
),
"VLLM_SERVER_DEV_MODE": lambda: bool(
int(getenv("VLLM_SERVER_DEV_MODE", "0"))
),
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
),
"VLLM_MLA_DISABLE": lambda: bool(
int(getenv("VLLM_MLA_DISABLE", "0"))
),
"VLLM_RAY_PER_WORKER_GPUS": lambda: float(
getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
),
"VLLM_RAY_BUNDLE_INDICES": lambda: getenv(
"VLLM_RAY_BUNDLE_INDICES", ""
),
"VLLM_CUDART_SO_PATH": lambda: getenv(
"VLLM_CUDART_SO_PATH", None
),
"VLLM_DP_RANK": lambda: int(
getenv("VLLM_DP_RANK", "0")
),
"VLLM_DP_RANK_LOCAL": lambda: int(
getenv("VLLM_DP_RANK_LOCAL", VLLM_DP_RANK)
),
"VLLM_DP_SIZE": lambda: int(
getenv("VLLM_DP_SIZE", "1")
),
"VLLM_DP_MASTER_IP": lambda: getenv(
"VLLM_DP_MASTER_IP", "127.0.0.1"
),
"VLLM_DP_MASTER_PORT": lambda: int(
getenv("VLLM_DP_MASTER_PORT", "0")
),
"VLLM_MOE_DP_CHUNK_SIZE": lambda: int(
getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")
),
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: get(
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
)
== "1",
"VLLM_CI_USE_S3": lambda: get("VLLM_CI_USE_S3", "0")
== "1",
"VLLM_MODEL_REDIRECT_PATH": lambda: get(
"VLLM_MODEL_REDIRECT_PATH", None
),
"VLLM_MARLIN_USE_ATOMIC_ADD": lambda: get(
"VLLM_MARLIN_USE_ATOMIC_ADD", "0"
)
== "1",
"VLLM_V0_USE_OUTLINES_CACHE": lambda: get(
"VLLM_V0_USE_OUTLINES_CACHE", "0"
)
== "1",
"VLLM_V1_USE_OUTLINES_CACHE": lambda: get(
"VLLM_V1_USE_OUTLINES_CACHE", "0"
)
== "1",
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
environ["VLLM_TPU_BUCKET_PADDING_GAP"]
)
if "VLLM_TPU_BUCKET_PADDING_GAP" in environ
else 0,
"VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
get("VLLM_TPU_MOST_MODEL_LEN", None)
),
"VLLM_TPU_USING_PATHWAYS": lambda: bool(
"proxy" in lower()
),
"VLLM_USE_DEEP_GEMM": lambda: bool(
int(getenv("VLLM_USE_DEEP_GEMM", "0"))
),
"VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
int(getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
),
"VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
int(getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
),
"VLLM_XGRAMMAR_CACHE_MB": lambda: int(
getenv("VLLM_XGRAMMAR_CACHE_MB", "512")
),
"VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
),
"VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
int(
getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
)
),
"VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: getenv(
"VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
),
"VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")
),
"VLLM_ALL2ALL_BACKEND": lambda: getenv(
"VLLM_ALL2ALL_BACKEND", "naive"
),
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
getenv(
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840"
)
),
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
),
"VLLM_SLEEP_WHEN_IDLE": lambda: bool(
int(getenv("VLLM_SLEEP_WHEN_IDLE", "0"))
),
"VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
),
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": lambda: int(
getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")
),
"VLLM_KV_CACHE_LAYOUT": lambda: getenv(
"VLLM_KV_CACHE_LAYOUT", None
),
"VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
int(getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
),
"VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
int(getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
),
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")
),
"VLLM_USE_CUDNN_PREFILL": lambda: bool(
int(getenv("VLLM_USE_CUDNN_PREFILL", "0"))
),
"VLLM_USE_TRTLLM_DECODE_ATTENTION": lambda: getenv(
"VLLM_USE_TRTLLM_DECODE_ATTENTION", None
),
"VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(
int(getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))
),
"VLLM_LOOPBACK_IP": lambda: getenv(
"VLLM_LOOPBACK_IP", ""
),
"VLLM_PROCESS_NAME_PREFIX": lambda: getenv(
"VLLM_PROCESS_NAME_PREFIX", "VLLM"
),
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
int(
getenv(
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE",
"0",
)
)
),
}
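Roughly how this table is consumed: the module exposes every key as a lazily evaluated attribute. A condensed sketch of that lookup follows (not the verbatim source; see vllm/envs.py for the real definitions):

```python
from typing import Any

def __getattr__(name: str) -> Any:
    # Called by Python for module attributes that are not found normally;
    # evaluating the lambda here is what makes every access re-read os.environ.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

def __dir__() -> list[str]:
    # Lets dir(vllm.envs) and tab completion list the supported variables.
    return list(environment_variables.keys())
```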
__dir__ ¶
compute_hash ¶
compute_hash() -> str
Warning: whenever a new key is added to the environment variables, make sure to include it in the factors list if it affects the computation graph. For example, different values of VLLM_PP_LAYER_PARTITION generate different computation graphs, so it is included in the factors list. Environment variables that affect the choice of kernels or attention backends should also be included in the factors list.
Source code in vllm/envs.py
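Illustrative only: a hypothetical reduction of the rule above to code, hashing a handful of graph-affecting variables. The helper name and the factor list here are assumptions; the authoritative implementation and factor list are in vllm/envs.py.

```python
import hashlib

def compute_hash_sketch() -> str:
    # Hypothetical: fold only values that can change the compiled graph
    # or the kernel/attention-backend selection into the hash.
    factors = [
        environment_variables["VLLM_PP_LAYER_PARTITION"](),
        environment_variables["VLLM_ATTENTION_BACKEND"](),
        environment_variables["VLLM_USE_TRITON_FLASH_ATTN"](),
    ]
    return hashlib.md5(str(factors).encode()).hexdigest()
```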
get_default_cache_root ¶
get_default_config_root ¶
get_vllm_port ¶
Get the port from the VLLM_PORT environment variable.
Returns:
| Type | Description |
|---|---|
| Optional[int] | The port number as an integer if VLLM_PORT is set, otherwise None. |
Raises:
| Type | Description |
|---|---|
| ValueError | If VLLM_PORT is a URI; check for a k8s service discovery issue. |
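A minimal sketch of consuming the parsed port through the module attribute (assumes VLLM_PORT is either unset or a plain integer such as 8000; the tcp:// value in the comment is the Kubernetes-injected form the ValueError warns about):

```python
import vllm.envs as envs

try:
    port = envs.VLLM_PORT  # None when VLLM_PORT is unset
except ValueError:
    # Raised for URI-style values such as "tcp://10.0.0.1:5570", which
    # Kubernetes service discovery can inject when a Service named
    # "vllm" exists in the namespace.
    raise

if port is not None:
    print(f"serving on port {port}")
```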