choose_scaled_mm_linear_kernel(
config: _KernelConfigT,
possible_kernels: dict[
PlatformEnum, list[type[_KernelT]]
],
compute_capability: int | None = None,
force_kernel: type[_KernelT] | None = None,
) -> type[_KernelT]
Choose a _KernelT that can implement the given config for the given compute capability. Attempts to choose the best kernel in terms of performance.
Parameters:
| Name | Type | Description | Default |
config | _KernelConfigT | Description of the linear layer to be implemented. | required |
possible_kernels | dict[PlatformEnum, list[type[_KernelT]]] | A dictionary mapping each platform to its list of possible kernel classes. | required |
compute_capability | Optional[int] | The compute capability of the target device. If None, current_platform is used to get the compute capability. Defaults to None. | None |
force_kernel | Optional[type[_KernelT]] | An Optional forced kernel to override the possible_kernels if it can be implemented. If None, it will only try the possible kernels. | None |
Raises:
| Type | Description |
ValueError | If no kernel can implement the given config. |
Returns:
| Name | Type | Description |
_KernelT | type[_KernelT] | Chosen kernel. |
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
def choose_scaled_mm_linear_kernel(
    config: _KernelConfigT,
    possible_kernels: dict[PlatformEnum, list[type[_KernelT]]],
    compute_capability: int | None = None,
    force_kernel: type[_KernelT] | None = None,
) -> type[_KernelT]:
    """
    Choose a _KernelT that can implement the given config for the
    given compute capability.

    If `force_kernel` is given it is tried first. Otherwise (or if the forced
    kernel cannot be implemented) the kernels registered for the current
    platform are tried in list order and the first one that passes
    `is_supported_and_can_implement_kernel` is returned; the list is
    presumably ordered by expected performance.

    Args:
        config (_KernelConfigT): Description of the linear layer
            to be implemented.
        possible_kernels (dict[PlatformEnum, list[type[_KernelT]]]): A
            dictionary of platforms and their list of possible kernels.
        compute_capability (Optional[int], optional): The compute capability
            of the target device. It is forwarded unchanged to
            `is_supported_and_can_implement_kernel`. Defaults to None.
        force_kernel (Optional[type[_KernelT]]): An optional forced kernel to
            override the possible_kernels if it can be implemented. If None,
            only the possible kernels are tried.

    Raises:
        ValueError: If no kernel can implement the given config, including
            the case where no kernels are registered for the current
            platform. The message lists every collected failure reason.

    Returns:
        type[_KernelT]: Chosen kernel class.
    """
    failure_reason_list: list[str] = []

    if force_kernel is not None:
        can_implement, failure_reason = is_supported_and_can_implement_kernel(
            force_kernel, config, compute_capability
        )
        if can_implement:
            return force_kernel

        # Surface *why* the forced kernel was rejected; previously the reason
        # was silently dropped, leaving the user with no hint.
        logger.info_once(
            "Tried to force %s, but the kernel couldn't be implemented: %s",
            force_kernel.__name__,
            failure_reason,
            scope="global",
        )
        if failure_reason is not None:
            failure_reason_list.append(failure_reason)

    platform = current_platform._enum
    platform_kernels = possible_kernels.get(platform)
    if platform_kernels is None:
        # Report a missing platform entry through the documented ValueError
        # below instead of leaking a bare KeyError to the caller.
        failure_reason_list.append(
            f"No kernels are registered for platform {platform}."
        )
        platform_kernels = []

    for kernel in platform_kernels:
        if kernel is force_kernel:
            # Already evaluated above with the same arguments; skipping avoids
            # a redundant check and a duplicated failure reason.
            continue

        is_supported_and_can_implement, failure_reason = (
            is_supported_and_can_implement_kernel(kernel, config, compute_capability)
        )
        if is_supported_and_can_implement:
            return kernel
        failure_reason_list.append(failure_reason)

    raise ValueError(
        "Failed to find a kernel that can implement the "
        "ScaledMM linear layer. Reasons: \n" + "\n".join(failure_reason_list)
    )
|