vllm.model_executor.models.ovis

PyTorch Ovis model.

OvisImagePatchInputs

Bases: TensorSchema

Dimensions
  • bnp: Batch size * number of images * number of patches
  • h: Height of each patch
  • w: Width of each patch
  • patch_indicators: Batch size * (number of patches + 1)
  • bn: Batch size * number of images
Source code in vllm/model_executor/models/ovis.py
class OvisImagePatchInputs(TensorSchema):
    """
    Dimensions:
        - bnp: Batch size * number of images * number of patches
        - h: Height of each patch
        - w: Width of each patch
        - patch_indicators: Batch size * (number of patches + 1)
        - bn: Batch size * number of images
    """

    type: Literal["image_patches"]
    flat_data: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")]
    indicator_tokens: Annotated[torch.Tensor, TensorShape("patch_indicators")]
    patches_per_image: Annotated[list[int], TensorShape("bn")]

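As a rough illustration of these shapes (a minimal sketch with made-up sizes, not real vLLM inputs), the fields for a batch of two images split into 4 and 2 patches could look like:

import torch

# Hypothetical sizes: 2 images split into 4 and 2 patches (bn = 2, bnp = 6).
patches_per_image = [4, 2]
bnp = sum(patches_per_image)
h = w = 448  # illustrative patch resolution

flat_data = torch.randn(bnp, 3, h, w)  # ("bnp", 3, "h", "w")
# one indicator per patch plus one extra per image -> "patch_indicators"
indicator_tokens = torch.zeros(bnp + len(patches_per_image), dtype=torch.long)

print(flat_data.shape, indicator_tokens.shape, patches_per_image)
# torch.Size([6, 3, 448, 448]) torch.Size([8]) [4, 2]
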
OvisMultiModalProcessor

Bases: BaseMultiModalProcessor[OvisProcessingInfo]

Source code in vllm/model_executor/models/ovis.py
class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
    def image_indicators_to_visual_tokens(
        self,
        image_indicators: list[int],
    ) -> list[int]:
        """
        Filter out image-atom placeholders from the image indicators and
        convert the remaining indicators to the corresponding tokens in
        the visual tokenizer vocabulary.
        For example, [-301, -300, -302, -300, -303, -300, -304, -300, -305]
        should return [vocab_size-1, vocab_size-2, ..., vocab_size-5]
        """
        hf_config = self.info.get_hf_config()
        vte_vocab_size = hf_config.visual_tokenizer_config.vocab_size
        # -300 is image_atom token, filter them out
        return [vte_vocab_size + x + 300 for x in image_indicators if x < -300]

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        if not mm_data:
            # Avoid warning from HF logger for text-only input
            tokenizer = self.info.get_tokenizer()
            prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")

        processed_outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
            tok_kwargs=tok_kwargs,
        )

        hf_processor = self.info.get_hf_processor()
        image_indicators = [
            hf_processor.construct_image_indicators(grid)
            for grid in processed_outputs["grids"]
        ]
        indicator_tokens = [
            self.image_indicators_to_visual_tokens(indicator)
            for indicator in image_indicators
        ]
        processed_outputs["indicator_tokens"] = torch.tensor(indicator_tokens)
        return processed_outputs

    def _apply_hf_processor_tokens_only(
        self,
        prompt_tokens: list[int],
    ) -> list[int]:
        return prompt_tokens

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(
            pixel_values=MultiModalFieldConfig.batched("image"),
            grids=MultiModalFieldConfig.batched("image"),
            indicator_tokens=MultiModalFieldConfig.batched("image"),
        )

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> list[PromptReplacement]:
        def get_replacement_ovis(item_idx: int):
            out_item = out_mm_kwargs["image"][item_idx]
            grid = out_item["grids"].data

            hf_processor = self.info.get_hf_processor()
            return hf_processor.construct_image_placeholders(grid)

        return [
            PromptReplacement(
                modality="image",
                target=IMAGE_TOKEN,
                replacement=get_replacement_ovis,
            ),
        ]

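The replacement above is resolved per image item by vLLM's prompt-update machinery. As a conceptual, plain-Python sketch (hypothetical token ids and placeholder lists, not vLLM internals), the effect on a tokenized prompt is:

IMAGE_TOKEN_ID = 151655  # hypothetical id for IMAGE_TOKEN
placeholders_per_image = [  # hypothetical construct_image_placeholders() outputs
    [-301, -300, -302, -300, -303],
    [-301, -300, -302],
]

def expand_image_tokens(prompt_ids: list[int]) -> list[int]:
    expanded: list[int] = []
    image_idx = 0
    for tok in prompt_ids:
        if tok == IMAGE_TOKEN_ID:
            # splice in this image's placeholder sequence
            expanded.extend(placeholders_per_image[image_idx])
            image_idx += 1
        else:
            expanded.append(tok)
    return expanded

print(expand_image_tokens([1, 2, IMAGE_TOKEN_ID, 3, IMAGE_TOKEN_ID, 4]))
# [1, 2, -301, -300, -302, -300, -303, 3, -301, -300, -302, 4]
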
image_indicators_to_visual_tokens

image_indicators_to_visual_tokens(
    image_indicators: list[int],
) -> list[int]

Filter out image-atom placeholders from the image indicators and convert the remaining indicators to the corresponding tokens in the visual tokenizer vocabulary. For example, [-301, -300, -302, -300, -303, -300, -304, -300, -305] should return [vocab_size-1, vocab_size-2, ..., vocab_size-5]

Source code in vllm/model_executor/models/ovis.py
def image_indicators_to_visual_tokens(
    self,
    image_indicators: list[int],
) -> list[int]:
    """
    Filter out image-atom placeholders from the image indicators and
    convert the remaining indicators to the corresponding tokens in
    the visual tokenizer vocabulary.
    For example, [-301, -300, -302, -300, -303, -300, -304, -300, -305]
    should return [vocab_size-1, vocab_size-2, ..., vocab_size-5]
    """
    hf_config = self.info.get_hf_config()
    vte_vocab_size = hf_config.visual_tokenizer_config.vocab_size
    # -300 is image_atom token, filter them out
    return [vte_vocab_size + x + 300 for x in image_indicators if x < -300]
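
The mapping can be verified standalone with the docstring's example and an illustrative vocabulary size (the function below simply mirrors the list comprehension above):

def indicators_to_visual_tokens(image_indicators: list[int], vte_vocab_size: int) -> list[int]:
    # drop -300 (image_atom) entries; map -301, -302, ... onto the last
    # entries of the visual tokenizer vocabulary
    return [vte_vocab_size + x + 300 for x in image_indicators if x < -300]

vocab_size = 16384  # illustrative visual tokenizer vocab size
indicators = [-301, -300, -302, -300, -303, -300, -304, -300, -305]
print(indicators_to_visual_tokens(indicators, vocab_size))
# [16383, 16382, 16381, 16380, 16379] i.e. [vocab_size-1, ..., vocab_size-5]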

VisualTokenizer

Bases: Module

Source code in vllm/model_executor/models/ovis.py
class VisualTokenizer(torch.nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        self.backbone = self._init_backbone(
            config=config,
            quant_config=quant_config,
            prefix=f"{prefix}.backbone",
        )
        # reserved tokens for IMAGE_INDICATORS
        head_dim = config.vocab_size - len(IMAGE_INDICATOR_IDS)
        self.head = torch.nn.Sequential(
            ReplicatedLinear(
                config.backbone_config.hidden_size
                * config.hidden_stride
                * config.hidden_stride,
                head_dim,
                bias=False,
                return_bias=False,
            ),
            torch.nn.LayerNorm(head_dim),
        )

    def _init_backbone(
        self,
        config: PretrainedConfig,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ) -> nn.Module:
        model_type = config.backbone_config.model_type
        if model_type == "aimv2":
            # No post rms_norm in Ovis2's AIMv2 ViT.
            return AIMv2Model(
                config=config.backbone_config,
                quant_config=quant_config,
                require_post_norm=False,
                prefix=prefix,
            )
        elif model_type == "siglip_vision_model":
            return SiglipVisionModel(
                config=config.backbone_config,
                quant_config=quant_config,
                prefix=prefix,
            )
        raise ValueError(f"Unsupported visual tokenizer model_type: {model_type}")

    @property
    def dtype(self) -> torch.dtype:
        return next(self.head.parameters()).dtype

    @property
    def device(self) -> torch.device:
        return next(self.head.parameters()).device

    def tokenize(self, logits: torch.Tensor) -> torch.Tensor:
        if self.config.tokenize_function == "softmax":
            tokens = softmax(logits, dim=-1)
        elif self.config.tokenize_function == "gumbel_argmax":
            tokens = gumbel_softmax(logits, tau=self.config.tau, hard=True)
        elif self.config.tokenize_function == "st_argmax":
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                "Invalid `max_type`, expected softmax or gumbel_argmax "
                f"or st_argmax, but got {self.config.tokenize_function}"
            )
        return tokens

    def encode(self, pixel_values: torch.Tensor) -> torch.Tensor:
        features = self.backbone(pixel_values)
        if self.config.drop_cls_token:
            features = features[:, 1:, :]

        # merge number of `hidden_stride * hidden_stride` hidden states together
        # to reduce token sequence length
        # e.g., for hidden_stride=2, this leads to a token length reduction:
        # 1024 -> 256 for aimv2
        if self.config.hidden_stride > 1:
            # `d` here is the backbone feature dimension
            n, L, d = features.shape
            sqrt_l = int(L**0.5)
            assert sqrt_l**2 == L, (
                "The token sequence length should be a perfect square."
            )
            features = features.reshape(n, sqrt_l, sqrt_l, d)
            pl = (
                self.config.hidden_stride - (sqrt_l % self.config.hidden_stride)
            ) % self.config.hidden_stride
            features = pad(features, (0, 0, 0, pl, 0, pl), "constant", 0)
            sqrt_l += pl
            features = features.reshape(
                n,
                sqrt_l // self.config.hidden_stride,
                self.config.hidden_stride,
                sqrt_l // self.config.hidden_stride,
                self.config.hidden_stride,
                d,
            )
            # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d]
            features = features.permute(0, 1, 3, 2, 4, 5)
            # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d]
            features = features.flatten(3)
            # [n, sqrt_l/hs*sqrt_l/hs, hs*hs*d]
            features = features.reshape(
                n, -1, self.config.hidden_stride * self.config.hidden_stride * d
            )

        return features

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]"""
        features = self.encode(pixel_values)
        logits = self.head(features)
        tokens = self.tokenize(logits)
        # tokens has shape [BatchSize, #Token, VocabSize - 5]; pad the last
        # dim with len(IMAGE_INDICATOR_IDS) zeros so the final shape becomes
        # [BatchSize, #Token, VocabSize]
        tokens = torch.nn.functional.pad(
            tokens,
            (0, len(IMAGE_INDICATOR_IDS)),
            mode="constant",
            value=0,
        )
        return tokens

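The shape bookkeeping in encode can be checked in isolation. A standalone sketch of the hidden_stride=2 merge on dummy features (sizes are illustrative) shows the 1024 -> 256 token reduction, with the feature dimension growing by hidden_stride**2:

import torch

n, L, d, hs = 1, 1024, 1536, 2  # dummy sizes: 1024 tokens from a 32x32 grid
features = torch.randn(n, L, d)

sqrt_l = int(L**0.5)  # 32
assert sqrt_l**2 == L
features = features.reshape(n, sqrt_l, sqrt_l, d)
# 32 is divisible by hs, so no padding is needed in this example
features = features.reshape(n, sqrt_l // hs, hs, sqrt_l // hs, hs, d)
features = features.permute(0, 1, 3, 2, 4, 5)  # group each hs x hs neighborhood
features = features.flatten(3)                 # [n, 16, 16, hs*hs*d]
features = features.reshape(n, -1, hs * hs * d)

print(features.shape)  # torch.Size([1, 256, 6144])
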
forward

forward(pixel_values: Tensor) -> Tensor

[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]

Source code in vllm/model_executor/models/ovis.py
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
    """[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]"""
    features = self.encode(pixel_values)
    logits = self.head(features)
    tokens = self.tokenize(logits)
    # tokens has shape [BatchSize, #Token, VocabSize - 5]; pad the last
    # dim with len(IMAGE_INDICATOR_IDS) zeros so the final shape becomes
    # [BatchSize, #Token, VocabSize]
    tokens = torch.nn.functional.pad(
        tokens,
        (0, len(IMAGE_INDICATOR_IDS)),
        mode="constant",
        value=0,
    )
    return tokens
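
The final padding step can likewise be sanity-checked on dummy tensors (assuming, per the comment above, that len(IMAGE_INDICATOR_IDS) == 5; all sizes are illustrative):

import torch

NUM_INDICATORS = 5  # assumed len(IMAGE_INDICATOR_IDS)
tokens = torch.rand(1, 256, 16384 - NUM_INDICATORS)  # [BatchSize, #Token, VocabSize - 5]
tokens = torch.nn.functional.pad(tokens, (0, NUM_INDICATORS), mode="constant", value=0)
print(tokens.shape)  # torch.Size([1, 256, 16384])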