# Source code for maxframe.learn.contrib.llm.multi_modal

# Copyright 1999-2026 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, Optional

import numpy as np

from maxframe import dataframe as md
from maxframe.learn.contrib.llm.core import (  # noqa: F401
    IMAGE_BASE64_CONTENT,
    IMAGE_BINARY_CONTENT,
    IMAGE_CONTENT_KEYS,
    IMAGE_CONTENT_PART,
    IMAGE_MIME_TYPE,
    IMAGE_URL_CONTENT,
    LLM,
    TEXT_CONTENT_PART,
    ContentPart,
    ImageContentType,
    LLMTaskOperator,
    LLMTextGenOperator,
    validate_llm_input_data,
)
from maxframe.serialization.serializables import BoolField
from maxframe.serialization.serializables.field import AnyField


class LLMMultiModalGenerationOp(LLMTextGenOperator):
    """
    Operator type for multimodal generation tasks.

    Adds no fields or methods of its own; everything is inherited from
    ``LLMTextGenOperator``.
    """


class MultiModalEmbeddingOp(LLMTaskOperator):
    """
    Operator type for multimodal embedding tasks.

    Declares two serializable fields on top of ``LLMTaskOperator``:
    ``input`` (any value, defaults to ``None``) and ``simple_output``
    (boolean, defaults to ``False``).
    """

    # Multimodal embedding request template.
    input = AnyField("input", default=None)
    simple_output = BoolField("simple_output", default=False)

    def get_output_dtypes(self) -> Dict[str, np.dtype]:
        """
        Return the dtype of each output column, keyed by column name.

        Returns
        -------
        dict
            ``"response"`` mapped to ``md.dtype("string")`` and ``"success"``
            mapped to the NumPy boolean dtype.
        """
        response_dtype = md.dtype("string")
        success_dtype = np.dtype("bool")
        return {"response": response_dtype, "success": success_dtype}


class MultiModalModel(LLM):
    """
    Common base class for the multimodal model types in this module.

    Adds nothing to ``LLM``; it serves as the shared parent of
    ``MultiModalGenLLM`` and ``MultiModalEmbeddingModel``.
    """


class MultiModalGenLLM(MultiModalModel):
    """
    Base class for multimodal models exposing a ``generate`` method.

    The module-level ``generate`` function only accepts instances of this
    class and delegates to their ``generate`` method.
    """

    def generate(
        self,
        data,
        messages=None,
        prompt_template=None,
        simple_output: bool = False,
        params: Optional[Dict[str, Any]] = None,
        **kw,
    ):
        """
        Placeholder generation entry point.

        This base implementation ignores all of its arguments and always
        raises; presumably concrete model classes override it.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()


class MultiModalEmbeddingModel(MultiModalModel):
    """
    Base class for multimodal models exposing an ``embed`` method.

    The module-level ``embed`` function only accepts instances of this class
    and delegates to their ``embed`` method.
    """

    def embed(
        self,
        data,
        input,
        simple_output: bool = False,
        params: Optional[Dict[str, Any]] = None,
        **kw,
    ):
        """
        Placeholder embedding entry point.

        This base implementation ignores all of its arguments and always
        raises; presumably concrete model classes override it.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()



# [docs]
def generate(
    data,
    model: MultiModalGenLLM,
    messages=None,
    prompt_template=None,
    simple_output: bool = False,
    params: Optional[Dict[str, Any]] = None,
    **kw,
):
    """
    Run multimodal text generation over MaxFrame data.

    The input data and the model type are checked first, ``params`` is
    validated by the model, and the request is then delegated to
    ``model.generate``.

    Parameters
    ----------
    data : DataFrame or Series
        Source rows; one request is rendered per row. Placeholders in the
        template refer to columns by name, e.g. ``"{image_url}"``.
    model : MultiModalGenLLM
        The multimodal generation model to call.
    messages : list of dict, optional
        Chat messages template. Alias of ``prompt_template``; when both are
        given, ``messages`` takes precedence.
    prompt_template : str or list of dict, optional
        Template from which each request is built. A list is expected to be
        shaped like chat messages, whose ``content`` may hold plain text,
        provider-compatible dicts, or ``ContentPart`` values.
    simple_output : bool, default False
        If True and the model executor supports it, return only the
        generated text rather than the raw provider response.
    params : dict, optional
        Extra generation parameters (temperature, max tokens, ...). ``None``
        is replaced by an empty dict before validation.
    **kw
        Any further keyword arguments, forwarded unchanged to
        ``model.generate``.

    Returns
    -------
    DataFrame
        A DataFrame holding ``response`` and ``success`` columns. For failed
        requests, ``response`` contains the error message.

    Raises
    ------
    ValueError
        If ``model`` is not a ``MultiModalGenLLM`` instance.

    Notes
    -----
    ``ContentPart.image`` supports ``IMAGE_URL``, ``BINARY`` and ``BASE64``
    image content. ``BINARY`` and ``BASE64`` inputs must include ``mime_type``.

    Examples
    --------
    Chat messages combining a text part with an image URL column:

    >>> from maxframe.learn.contrib.llm import ContentPart, ImageContentType
    >>> from maxframe.learn.contrib.llm.models.dashscope import DashScopeMultiModalLLM
    >>> import maxframe.dataframe as md
    >>>
    >>> df = md.DataFrame({"image_url": ["https://example.com/cat.png"]})
    >>> model = DashScopeMultiModalLLM(
    ...     name="qwen-vl-max",
    ...     api_key_resource="<api-key-resource-name>",
    ... )
    >>> messages = [
    ...     {
    ...         "role": "user",
    ...         "content": [
    ...             ContentPart.text("Analyze this image."),
    ...             ContentPart.image(
    ...                 data=df.image_url,
    ...                 type=ImageContentType.IMAGE_URL,
    ...             ),
    ...         ],
    ...     }
    ... ]
    >>> result = model.generate(df, messages=messages)

    For an image that is not publicly reachable, pass an OSS object URL
    together with explicit storage options:

    >>> storage_options = {
    ...     "access_key_id": "<access-key-id>",
    ...     "access_key_secret": "<access-key-secret>",
    ... }
    >>> df = md.DataFrame({"image_url": ["oss://endpoint/bucket/path/cat.png"]})
    >>> messages = [
    ...     {
    ...         "role": "user",
    ...         "content": [
    ...             ContentPart.text("Analyze this OSS image."),
    ...             ContentPart.image(
    ...                 data=df.image_url,
    ...                 type=ImageContentType.IMAGE_URL,
    ...                 storage_options=storage_options,
    ...             ),
    ...         ],
    ...     }
    ... ]
    >>> result = model.generate(df, messages=messages)
    """
    # Order matters: data is validated before the model type is checked.
    validate_llm_input_data(data)
    if not isinstance(model, MultiModalGenLLM):
        raise ValueError("model must be a MultiModalGenLLM object")

    if params is None:
        params = {}
    model.validate_params(params)

    # Explicit arguments first, caller extras afterwards, as in a direct call.
    named_args = {
        "messages": messages,
        "prompt_template": prompt_template,
        "simple_output": simple_output,
        "params": params,
    }
    return model.generate(data, **named_args, **kw)




# [docs]
def embed(
    data,
    model: MultiModalEmbeddingModel,
    input,
    simple_output: bool = False,
    params: Optional[Dict[str, Any]] = None,
    **kw,
):
    """
    Compute multimodal embeddings over MaxFrame data.

    The input data and the model type are checked first, ``params`` is
    validated by the model, and the request is then delegated to
    ``model.embed``.

    Parameters
    ----------
    data : DataFrame or Series
        Source rows; one embedding request is rendered per row.
    model : MultiModalEmbeddingModel
        The multimodal embedding model to call.
    input : list or ContentPart
        Multimodal input template. Its values may contain placeholders
        referring to columns of ``data``. The template is rendered once per
        row and submitted as a single multimodal embedding input for that
        row.
    simple_output : bool, default False
        If True and the model executor supports it, return embedding vectors
        directly rather than the raw provider response.
    params : dict, optional
        Extra embedding parameters. ``None`` is replaced by an empty dict
        before validation.
    **kw
        Any further keyword arguments, forwarded unchanged to
        ``model.embed``.

    Returns
    -------
    DataFrame
        A DataFrame holding ``response`` and ``success`` columns. For failed
        requests, ``response`` contains the error message.

    Raises
    ------
    ValueError
        If ``model`` is not a ``MultiModalEmbeddingModel`` instance.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm import ContentPart, ImageContentType
    >>> input = [
    ...     ContentPart.text("Represent this product image."),
    ...     ContentPart.image(
    ...         data=df.image_url,
    ...         type=ImageContentType.IMAGE_URL,
    ...     ),
    ... ]
    >>> result = model.embed(df, input=input, simple_output=True)
    """
    # Order matters: data is validated before the model type is checked.
    validate_llm_input_data(data)
    if not isinstance(model, MultiModalEmbeddingModel):
        raise ValueError("model must be a MultiModalEmbeddingModel object")

    if params is None:
        params = {}
    model.validate_params(params)

    # Explicit arguments first, caller extras afterwards, as in a direct call.
    named_args = {
        "input": input,
        "simple_output": simple_output,
        "params": params,
    }
    return model.embed(data, **named_args, **kw)