vllm.model_executor.layers.fused_moe.router.fused_moe_router ¶

FusedMoERouter ¶

Bases: ABC

FusedMoERouter is an abstract class that provides a 'select_experts' method that is used for routing hidden states based on router logits.

Source code in vllm/model_executor/layers/fused_moe/router/fused_moe_router.py

class FusedMoERouter(ABC):
    """
    Abstract base class for fused-MoE routers.

    Subclasses supply the routing decision through `_select_experts` and
    report their routing method via `routing_method_type`; callers use the
    public `select_experts` entry point, which routes hidden states based
    on router logits.
    """

    def __init__(self):
        # Optional capture buffer. While it is None nothing is recorded;
        # once a tensor is assigned, every `select_experts` call copies the
        # selected expert ids (cast to int16) into its leading rows.
        self._routing_replay_out: torch.Tensor | None = None

    @property
    @abstractmethod
    def routing_method_type(self) -> RoutingMethodType:
        """Routing method type of this router (subclasses must define it)."""
        raise NotImplementedError

    @abstractmethod
    def _select_experts(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        *,
        input_ids: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Subclass hook that performs the actual expert selection.

        `select_experts` forwards its arguments here unchanged and unpacks
        the result as `(topk_weights, topk_ids)`.
        """
        raise NotImplementedError

    def select_experts(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        *,
        input_ids: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Route the input hidden states to the top-k experts based on the
        router logits.

        Args:
            hidden_states: Forwarded unchanged to `_select_experts`.
            router_logits: Forwarded unchanged to `_select_experts`.
            input_ids: Optional, keyword-only; forwarded unchanged to
                `_select_experts`.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: `(topk_weights, topk_ids)`,
                the weights and expert ids computation result.

        Note:
            **Compatibility**: When EPLB is not enabled, the returned ids
            are equivalent to global logical ids, so should be compatible
            with plain MoE implementations without redundant experts.
        """
        weights, expert_ids = self._select_experts(
            hidden_states, router_logits, input_ids=input_ids
        )

        replay_buffer = self._routing_replay_out
        if replay_buffer is None:
            # No capture buffer bound: nothing to record.
            return weights, expert_ids

        # Write routing data for non-monolithic path (Triton, etc.).
        # The buffer is set by bind_routing_capture_to_model during capturer
        # init; only its first `expert_ids.shape[0]` rows are overwritten.
        num_rows = expert_ids.shape[0]
        replay_buffer[:num_rows].copy_(expert_ids.to(torch.int16))
        return weights, expert_ids

select_experts ¶

select_experts(
    hidden_states: Tensor,
    router_logits: Tensor,
    *,
    input_ids: Tensor | None = None,
) -> tuple[Tensor, Tensor]

Route the input hidden states to the top-k experts based on the router logits.

Returns:

Type	Description
`tuple[Tensor, Tensor]`	(topk_weights, topk_ids): The weights and expert ids computation result. Compatibility: When EPLB is not enabled, the returned ids are equivalent to global logical ids, so should be compatible with plain MoE implementations without redundant experts.

Source code in vllm/model_executor/layers/fused_moe/router/fused_moe_router.py

def select_experts(
    self,
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    *,
    input_ids: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Route the input hidden states to the top-k experts based on the
    router logits.

    Returns:
        (topk_weights, topk_ids)
        (tuple[torch.Tensor, torch.Tensor]):
        The weights and expert ids computation result.

        **Compatibility**: When EPLB is not enabled, the returned ids are
        equivalent to global logical ids, so should be compatible with
        plain MoE implementations without redundant experts.
    """

    topk_weights, topk_ids = self._select_experts(
        hidden_states,
        router_logits,
        input_ids=input_ids,
    )

    # Write routing data for non-monolithic path (Triton, etc.)
    # (set by bind_routing_capture_to_model during capturer init)
    if self._routing_replay_out is not None:
        self._routing_replay_out[: topk_ids.shape[0]].copy_(
            topk_ids.to(torch.int16)
        )

    return topk_weights, topk_ids