# Copyright 1999-2026 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from maxframe import dataframe as md
from maxframe import opcodes
from maxframe.dataframe.core import DataFrame, Series
from maxframe.learn.contrib.llm.core import LLM, LLMTaskOperator
from maxframe.serialization.serializables import (
DictField,
FieldTypes,
ListField,
StringField,
)
class TextLLMSummarizeOp(LLMTaskOperator):
    """Operator for the LLM text summarization task.

    Declares the result columns of the task: a ``summary`` string column and
    a ``success`` boolean column.
    """

    _op_type_ = opcodes.LLM_TEXT_SUMMARIZE_TASK
    _legacy_name = "TextLLMSummarizeOperator"  # since v2.3.0

    def get_output_dtypes(self) -> Dict[str, np.dtype]:
        """Return the mapping of output column name to dtype."""
        dtypes: Dict[str, np.dtype] = {}
        dtypes["summary"] = md.dtype("string")
        dtypes["success"] = np.dtype("bool")
        return dtypes
class TextLLMTranslateOp(LLMTaskOperator):
    """Operator for the LLM text translation task.

    Carries the source/target languages plus an optional task description and
    optional examples, and declares an ``output`` string column and a
    ``success`` boolean column as the result columns.
    """

    _op_type_ = opcodes.LLM_TEXT_TRANSLATE_TASK
    _legacy_name = "TextLLMTranslateOperator"  # since v2.3.0

    # Languages of the translation task (no defaults are declared).
    source_language = StringField("source_language")
    target_language = StringField("target_language")
    # Optional hints for the task; both default to None.
    description = StringField("description", default=None)
    examples = ListField("examples", FieldTypes.dict, default=None)

    def get_output_dtypes(self) -> Dict[str, np.dtype]:
        """Return the mapping of output column name to dtype."""
        dtypes: Dict[str, np.dtype] = {}
        dtypes["output"] = md.dtype("string")
        dtypes["success"] = np.dtype("bool")
        return dtypes
class TextLLMClassifyOp(LLMTaskOperator):
    """Operator for the LLM text classification task.

    Carries the candidate labels plus an optional task description and
    optional examples, and declares ``label`` and ``reason`` string columns
    and a ``success`` boolean column as the result columns.
    """

    _op_type_ = opcodes.LLM_TEXT_CLASSIFY_TASK
    _legacy_name = "TextLLMClassifyOperator"  # since v2.3.0

    # Candidate labels (no default is declared).
    labels = ListField("labels")
    # Optional hints for the task; both default to None.
    description = StringField("description", default=None)
    examples = ListField("examples", FieldTypes.dict, default=None)

    def get_output_dtypes(self) -> Dict[str, np.dtype]:
        """Return the mapping of output column name to dtype."""
        dtypes: Dict[str, np.dtype] = {}
        dtypes["label"] = md.dtype("string")
        dtypes["reason"] = md.dtype("string")
        dtypes["success"] = np.dtype("bool")
        return dtypes
class TextLLMExtractOp(LLMTaskOperator):
    """Operator for the LLM text extraction task.

    Carries the extraction schema plus an optional task description and
    optional examples, and declares an ``output`` string column and a
    ``success`` boolean column as the result columns.
    """

    _op_type_ = opcodes.LLM_TEXT_EXTRACT_TASK
    _legacy_name = "TextLLMExtractOperator"  # since v2.3.0

    # All three fields are optional and default to None.
    schema = DictField("schema", FieldTypes.string, FieldTypes.any, default=None)
    description = StringField("description", default=None)
    # Uses ``default=None`` like the ``examples`` fields of the translate and
    # classify operators, so an operator created without examples still
    # exposes ``examples`` as None (``default_factory=None`` declared no
    # default at all).
    examples = ListField("examples", FieldTypes.dict, default=None)

    def get_output_dtypes(self) -> Dict[str, np.dtype]:
        """Return the mapping of output column name to dtype."""
        return {
            "output": md.dtype("string"),
            "success": np.dtype("bool"),
        }
class TextGenLLM(LLM):
    """Base class of text generation models.

    ``generate`` is not implemented here and raises ``NotImplementedError``.
    The task helpers (``summarize``, ``translate``, ``classify`` and
    ``extract``) each build the matching task operator with this model and
    call it on the input series.
    """

    _legacy_name = "TextLLM"  # since v2.3.0

    def generate(
        self,
        data,
        prompt_template: List[Dict[str, str]],
        params: Optional[Dict[str, Any]] = None,
    ):
        """Generate text for ``data`` with the given prompt template.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError

    def summarize(self, series, index=None, **kw):
        """Build a summarize operator and apply it to ``series``.

        Extra keyword arguments are forwarded to ``TextLLMSummarizeOp``.
        """
        return TextLLMSummarizeOp(model=self, task="summarize", **kw)(series, index)

    def translate(
        self,
        series,
        target_language: str,
        source_language: Optional[str] = None,
        description: Optional[str] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        index=None,
        **kw
    ):
        """Build a translate operator and apply it to ``series``.

        Extra keyword arguments are forwarded to ``TextLLMTranslateOp``.
        """
        return TextLLMTranslateOp(
            model=self,
            task="translate",
            source_language=source_language,
            target_language=target_language,
            description=description,
            examples=examples,
            **kw
        )(series, index)

    def classify(
        self,
        series,
        labels: List[str],
        description: Optional[str] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        index=None,
        **kw
    ):
        """Build a classify operator and apply it to ``series``.

        Extra keyword arguments are forwarded to ``TextLLMClassifyOp``.
        """
        return TextLLMClassifyOp(
            model=self,
            labels=labels,
            task="classify",
            description=description,
            examples=examples,
            **kw
        )(series, index)

    def extract(
        self,
        series,
        schema: Any,
        description: Optional[str] = None,
        examples: Optional[List[Tuple[str, str]]] = None,
        index=None,
        **kw
    ):
        """Build an extract operator and apply it to ``series``.

        Parameters
        ----------
        series : Series
            Input passed to the operator call.
        schema : Any
            Extraction schema. A pydantic ``BaseModel`` subclass is converted
            with ``model_json_schema()``; any other value is handed to the
            operator unchanged.
        description : str, optional
            Task description stored on the operator.
        examples : list, optional
            Task examples stored on the operator.
            NOTE(review): the annotation says tuples while
            ``TextLLMExtractOp.examples`` is declared with
            ``FieldTypes.dict`` -- confirm the expected element type.
        index : optional
            Passed as the second argument of the operator call.
        **kw
            Extra keyword arguments forwarded to ``TextLLMExtractOp``.
        """
        import inspect

        try:
            from pydantic import BaseModel
        except ImportError:
            # pydantic is only needed to recognise model classes. Without it
            # the schema cannot be a pydantic model, so it is used as given
            # instead of failing the whole call.
            BaseModel = None

        if (
            BaseModel is not None
            and inspect.isclass(schema)
            and issubclass(schema, BaseModel)
        ):
            schema = schema.model_json_schema()
        return TextLLMExtractOp(
            model=self,
            schema=schema,
            task="extract",
            description=description,
            examples=examples,
            **kw
        )(series, index)
# Alias binding the legacy name ``TextLLM`` (see ``TextGenLLM._legacy_name``)
# to the very same class object as ``TextGenLLM``.
TextLLM = TextGenLLM # for old client compatibility
class TextEmbeddingModel(LLM):
    """Base class of text embedding models.

    ``embed`` is not implemented here and raises ``NotImplementedError``.
    """

    def embed(
        self,
        data: Series,
        dimensions: Optional[int] = None,
        encoding_format: Optional[str] = None,
        simple_output: bool = False,
        params: Optional[Dict[str, Any]] = None,
        **kw
    ):
        """Embed the text in ``data``.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError()
# [docs]  (Sphinx source-link marker left over from HTML extraction; not part of the module)
def generate(
    data,
    model: TextGenLLM,
    prompt_template: List[Dict[str, Any]],
    params: Optional[Dict[str, Any]] = None,
):
    """
    Generate text with a text generation model from input data and a prompt template.

    Parameters
    ----------
    data : DataFrame or Series
        Input data used for generation; a maxframe DataFrame or Series holding
        the text to be processed.
    model : TextGenLLM
        Language model instance used for text generation.
    prompt_template : List[Dict[str, Any]]
        List of conversation messages forming the template. Use ``{col_name}``
        as a placeholder to reference column data from the input.
        Usually in the form ``[{"role": "user", "content": "{query}"}]``, the
        same as the OpenAI API schema.
    params : Dict[str, Any], optional
        Additional generation settings such as temperature or max_tokens.
        ``None`` (the default) is treated as an empty dict.

    Returns
    -------
    DataFrame
        Raw generated response and success status. When success is False the
        generated text holds the error message instead.

    Raises
    ------
    ValueError
        If ``data`` is neither a maxframe DataFrame nor a Series.
    TypeError
        If ``model`` is not a ``TextGenLLM`` instance.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
    >>> import maxframe.dataframe as md
    >>>
    >>> # Initialize the model
    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
    >>>
    >>> # Prepare prompt template
    >>> messages = [
    ...     {
    ...         "role": "user",
    ...         "content": "Help answer following question: {query}",
    ...     },
    ... ]
    >>> # Create sample data
    >>> df = md.DataFrame({"query": ["What is machine learning?"]})
    >>>
    >>> # Generate response
    >>> result = generate(df, llm, prompt_template=messages)
    >>> result.execute()
    """
    if not isinstance(data, (DataFrame, Series)):
        raise ValueError("data must be a maxframe dataframe or series object")
    if not isinstance(model, TextGenLLM):
        raise TypeError("model must be a TextLLM object")

    # Normalise the optional settings before handing them to the model.
    if params is None:
        params = {}
    model.validate_params(params)
    return model.generate(data, prompt_template=prompt_template, params=params)
def summary(series, model: TextGenLLM, index=None):
    """
    Summarize every text entry of a series with a language model.

    Parameters
    ----------
    series : Series
        A maxframe Series of text to summarize; each element should be a
        text string.
    model : TextGenLLM
        Language model instance used for text summarization.
    index : array-like, optional
        Index for the output series, by default None, will generate new index.

    Returns
    -------
    DataFrame
        Generated summaries and success status, in columns 'summary'
        (generated summary text) and 'success' (boolean status).
        If 'success' is False, the 'summary' column holds error information
        instead of the expected output.

    Raises
    ------
    ValueError
        If ``series`` is not a maxframe Series, or its dtype check against
        ``np.str_`` fails.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
    >>> import maxframe.dataframe as md
    >>>
    >>> # Initialize the model
    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
    >>>
    >>> # Create sample data
    >>> texts = md.Series([
    ...     "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
    ...     "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
    ... ])
    >>>
    >>> # Generate summaries
    >>> result = summary(texts, llm)
    >>> result.execute()

    Notes
    -----
    **Preview:** This API is in preview state and may be unstable.
    The interface may change in future releases.
    """
    if not isinstance(series, Series):
        raise ValueError("series must be a maxframe series object")

    input_dtype = series.dtype
    if input_dtype != np.str_:
        raise ValueError("summary input must be a string series")

    summarized = model.summarize(series, index=index)
    return summarized
# [docs]  (Sphinx source-link marker left over from HTML extraction; not part of the module)
def translate(
    series, model: TextGenLLM, source_language: str, target_language: str, index=None
):
    """
    Translate every text entry of a series from a source language to a target language.

    Parameters
    ----------
    series : Series
        A maxframe Series of text to translate; each element should be a
        text string.
    model : TextGenLLM
        Language model instance used for text translation.
    source_language : str
        Source language of the text (e.g., 'en', 'zh', 'ja').
    target_language : str
        Target language for translation (e.g., 'en', 'zh', 'ja').
    index : array-like, optional
        Index for the output series, by default None, will generate new index.

    Returns
    -------
    DataFrame
        Generated translations and success status, in columns 'output'
        (translated text) and 'success' (boolean status).
        If 'success' is False, the 'output' column holds error information
        instead of the expected output.

    Raises
    ------
    ValueError
        If ``series`` is not a maxframe Series, or its dtype check against
        ``np.str_`` fails.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
    >>> import maxframe.dataframe as md
    >>>
    >>> # Initialize the model
    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
    >>>
    >>> # Create sample data
    >>> texts = md.Series([
    ...     "Hello, how are you?",
    ...     "Machine learning is fascinating."
    ... ])
    >>>
    >>> # Translate from English to Chinese
    >>> result = translate(texts, llm, source_language="en", target_language="zh")
    >>> result.execute()

    Notes
    -----
    **Preview:** This API is in preview state and may be unstable.
    The interface may change in future releases.
    """
    if not isinstance(series, Series):
        raise ValueError("series must be a maxframe series object")

    input_dtype = series.dtype
    if input_dtype != np.str_:
        raise ValueError("translate input must be a string series")

    # Languages are passed by keyword because the model method orders them
    # differently (target before source).
    language_kwargs = {
        "source_language": source_language,
        "target_language": target_language,
    }
    return model.translate(series, index=index, **language_kwargs)
def classify(
    series,
    model: TextGenLLM,
    labels: List[str],
    description: Optional[str] = None,
    examples: Optional[List[Dict[str, str]]] = None,
    index=None,
):
    """
    Assign one of the given labels to every text entry of a series with a language model.

    Parameters
    ----------
    series : Series
        A maxframe Series of text to classify; each element should be a
        text string.
    model : TextGenLLM
        Language model instance used for text classification.
    labels : List[str]
        Non-empty list of labels to classify the text into.
    description : str, optional
        Description of the classification task to help the model understand
        the context.
    examples : List[Dict[str, str]], optional
        Examples of the classification task, like
        [{"text": "text...", "label": "A", "reason": "reason..."}],
        to help the LLM better understand your classification rules.
    index : array-like, optional
        Index for the output series, by default None, will generate new index.

    Returns
    -------
    DataFrame
        Classification results and success status, in columns 'label'
        (predicted label), 'reason' (reasoning), and 'success' (boolean status).
        If 'success' is False, the 'label' and 'reason' columns hold error
        information instead of the expected output.

    Raises
    ------
    ValueError
        If ``series`` is not a maxframe Series, its dtype check against
        ``np.str_`` fails, or ``labels`` is empty.
    TypeError
        If ``labels`` is not a list.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
    >>> import maxframe.dataframe as md
    >>>
    >>> # Initialize the model
    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
    >>>
    >>> # Create sample data
    >>> texts = md.Series([
    ...     "I love this product! It's amazing!",
    ...     "This is terrible, worst purchase ever.",
    ...     "It's okay, nothing special."
    ... ])
    >>>
    >>> # Classify sentiment
    >>> labels = ["positive", "negative", "neutral"]
    >>> description = "Classify the sentiment of customer reviews"
    >>> examples = [
    ...     {"text": "Great product!", "label": "positive", "reason": "Expresses satisfaction"},
    ...     {"text": "Poor quality", "label": "negative", "reason": "Expresses dissatisfaction"}
    ... ]
    >>> result = classify(texts, llm, labels=labels, description=description, examples=examples)
    >>> result.execute()

    Notes
    -----
    **Preview:** This API is in preview state and may be unstable.
    The interface may change in future releases.
    """
    # Input series checks come first, then the label checks.
    if not isinstance(series, Series):
        raise ValueError("series must be a maxframe series object")

    input_dtype = series.dtype
    if input_dtype != np.str_:
        raise ValueError("classify input must be a string series")

    if not isinstance(labels, list):
        raise TypeError("labels must be a list")
    if not labels:
        raise ValueError("labels must not be empty")

    task_kwargs = {
        "labels": labels,
        "description": description,
        "examples": examples,
        "index": index,
    }
    return model.classify(series, **task_kwargs)
def embed(
    series,
    model: TextEmbeddingModel,
    dimensions: Optional[int] = None,
    encoding_format: Optional[str] = None,
    simple_output: bool = False,
    params: Optional[Dict[str, Any]] = None,
    index=None,
):
    """
    Compute embeddings for every text entry of a series with a text embedding model.

    Parameters
    ----------
    series : Series
        A maxframe Series of text to embed; each element should be a
        text string.
    model : TextEmbeddingModel
        Text embedding model instance used for generating embeddings.
    dimensions : int, optional
        Dimensions of the embedding vectors. If not specified, uses model default.
    encoding_format : str, optional
        Encoding format of the embedding (e.g., 'float', 'base64').
        If not specified, uses model default.
    simple_output : bool, optional
        Whether to return the embedding data directly without additional
        metadata, by default False.
    params : Dict[str, Any], optional
        Additional parameters for embedding configuration, by default None.
        Can include model-specific settings.
    index : array-like, optional
        Index for the output series, by default None, will generate new index.

    Returns
    -------
    DataFrame
        Generated embeddings and success status, in columns 'response'
        (embedding vectors) and 'success' (boolean status).
        If 'success' is False, the 'response' column holds error information
        instead of the expected output.

    Raises
    ------
    ValueError
        If ``series`` is not a maxframe Series, or its dtype check against
        ``np.str_`` fails.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextEmbeddingModel
    >>> import maxframe.dataframe as md
    >>>
    >>> # Initialize the embedding model
    >>> embedding_model = ManagedTextEmbeddingModel(name="text-embedding-ada-002")
    >>>
    >>> # Create sample data
    >>> texts = md.Series([
    ...     "Machine learning is a powerful technology.",
    ...     "Natural language processing enables computers to understand text.",
    ...     "Deep learning uses neural networks for pattern recognition."
    ... ])
    >>>
    >>> # Generate embeddings
    >>> result = embed(texts, embedding_model, simple_output=True)
    >>> result.execute()

    Notes
    -----
    **Preview:** This API is in preview state and may be unstable.
    The interface may change in future releases.
    """
    if not isinstance(series, Series):
        raise ValueError("series must be a maxframe series object")

    input_dtype = series.dtype
    if input_dtype != np.str_:
        raise ValueError("embed input must be a string series")

    # Every option is forwarded by keyword, unchanged, to the model.
    embed_kwargs = {
        "dimensions": dimensions,
        "encoding_format": encoding_format,
        "simple_output": simple_output,
        "params": params,
        "index": index,
    }
    return model.embed(series, **embed_kwargs)
# Aliases binding the legacy ``*Operator`` names (the values of each class's
# ``_legacy_name``) to the current ``*Op`` classes, so the old names still
# resolve to the same class objects.
TextLLMExtractOperator = TextLLMExtractOp
TextLLMSummarizeOperator = TextLLMSummarizeOp
TextLLMTranslateOperator = TextLLMTranslateOp
TextLLMClassifyOperator = TextLLMClassifyOp