feat: add `bigframes.bigquery.ai.generate_embedding` #2343

sycai · 2026-02-03T18:09:49Z

Does it make sense to expose this function as bbq.ai.generate_embedding too?

Good catch! Done.

-Original file line number
+Diff line change
@@ Expand Up / @@ -19,7 +19,7 @@ @@
     from __future__ import annotations
     import json
-    from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
+    from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union
     import pandas as pd
@@ Expand All / @@ -28,6 +28,7 @@ @@
     from bigframes import series, session
     from bigframes.core import convert
     from bigframes.core.logging import log_adapter
+    import bigframes.core.sql.literals
     from bigframes.ml import core as ml_core
     from bigframes.operations import ai_ops, output_schemas
@@ Expand Down Expand Up / @@ -388,6 +389,113 @@ def generate_double( @@
         return series_list[0]._apply_nary_op(operator, series_list[1:])
@log_adapter.method_logger(custom_base_name="bigquery_ai")
def generate_embedding(
    model_name: str,
    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
    *,
    output_dimensionality: Optional[int] = None,
    task_type: Optional[str] = None,
    start_second: Optional[float] = None,
    end_second: Optional[float] = None,
    interval_seconds: Optional[float] = None,
    trial_id: Optional[int] = None,
) -> dataframe.DataFrame:
    """
    Creates embeddings that describe an entity—for example, a piece of text or an image.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
        >>> bbq.ai.generate_embedding(
        ...     "project.dataset.model_name",
        ...     df
        ... ) # doctest: +SKIP

    Args:
        model_name (str):
            The name of a remote model from Vertex AI, such as the
            multimodalembedding@001 model.
        data (bigframes.pandas.DataFrame, bigframes.pandas.Series, pandas.DataFrame, or pandas.Series):
            The data to generate embeddings for. pandas objects are first
            loaded with ``bpd.read_pandas``. If a Series is provided, it is
            treated as the 'content' column.  If a DataFrame is provided, it
            must contain a 'content' column, or you must rename the column you
            wish to embed to 'content'.
        output_dimensionality (int, optional):
            An INT64 value that specifies the number of dimensions to use when
            generating embeddings. For example, if you specify 256 AS
            output_dimensionality, then the embedding output column contains a
            256-dimensional embedding for each input value. To find the
            supported range of output dimensions, read about the available
            `Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
        task_type (str, optional):
            A STRING literal that specifies the intended downstream application to
            help the model produce better quality embeddings. For a list of
            supported task types and how to choose which one to use, see `Choose an
            embeddings task type <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
        start_second (float, optional):
            The second in the video at which to start the embedding. The default value is 0.
        end_second (float, optional):
            The second in the video at which to end the embedding. The default value is 120.
        interval_seconds (float, optional):
            The interval to use when creating embeddings. The default value is 16.
        trial_id (int, optional):
            An INT64 value that identifies the hyperparameter tuning trial that
            you want the function to evaluate. The function uses the optimal
            trial by default. Only specify this argument if you ran
            hyperparameter tuning when creating the model.

    Returns:
        bigframes.pandas.DataFrame:
            A new DataFrame with the generated embeddings. See the `SQL
            reference for AI.GENERATE_EMBEDDING
            <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
            for details.

    Raises:
        ValueError:
            If ``data`` is not a BigFrames or pandas DataFrame/Series.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = bpd.read_pandas(data)

    if isinstance(data, series.Series):
        # Rename a copy so the caller's Series keeps its original name.
        data = data.copy()
        data.name = "content"
        data_df = data.to_frame()
    elif isinstance(data, dataframe.DataFrame):
        data_df = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")

    # We need to get the SQL for the input data to pass as a subquery to the TVF
    source_sql = data_df.sql

    # Only forward the options the caller actually set, in a stable order.
    requested_options = {
        "OUTPUT_DIMENSIONALITY": output_dimensionality,
        "TASK_TYPE": task_type,
        "START_SECOND": start_second,
        "END_SECOND": end_second,
        "INTERVAL_SECONDS": interval_seconds,
        "TRIAL_ID": trial_id,
    }
    struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {
        name: value for name, value in requested_options.items() if value is not None
    }

    # Construct the TVF query. The single closing parenthesis on its own line
    # terminates AI.GENERATE_EMBEDDING(...); the struct literal must not be
    # followed by an additional ")" or the statement is unbalanced.
    query = f"""
        SELECT *
        FROM AI.GENERATE_EMBEDDING(
            MODEL `{model_name}`,
            ({source_sql}),
            {bigframes.core.sql.literals.struct_literal(struct_fields)}
        )
    """

    return data_df._session.read_gbq(query)
     @log_adapter.method_logger(custom_base_name="bigquery_ai")
     def if_(
         prompt: PROMPT_TYPE,
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -22,6 +22,7 @@ @@
         generate,
         generate_bool,
         generate_double,
+        generate_embedding,
         generate_int,
         if_,
         score,
@@ Expand All / @@ -33,6 +34,7 @@ @@
         "generate",
         "generate_bool",
         "generate_double",
+        "generate_embedding",
         "generate_int",
         "if_",
         "score",
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -0,0 +1,58 @@
+    # Copyright 2026 Google LLC
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    from __future__ import annotations
+    import collections.abc
+    import json
+    from typing import Any, List, Mapping, Union
+    import bigframes.core.sql
# Value types accepted for a single STRUCT field. The two Mapping entries
# overlap; both are kept so existing annotations that reference this alias
# stay valid.
STRUCT_VALUES = Union[
    str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]
]
# Field name -> value mapping rendered by ``struct_literal``.
STRUCT_TYPE = Mapping[str, STRUCT_VALUES]


def _json_literal(value: Any) -> str:
    """Render ``value`` as a GoogleSQL ``JSON'...'`` literal."""
    json_str = json.dumps(value)
    # GoogleSQL string literals use backslash escapes. Escape backslashes
    # first so the JSON escapes emitted by json.dumps (\", \n, \uXXXX, ...)
    # survive SQL parsing, then escape single quotes with a backslash;
    # doubling the quote ('') is not a valid escape in GoogleSQL.
    sql_json_str = json_str.replace("\\", "\\\\").replace("'", "\\'")
    return f"JSON'{sql_json_str}'"


def struct_literal(struct_options: STRUCT_TYPE) -> str:
    """Render a mapping of options as a GoogleSQL ``STRUCT(...)`` literal.

    Each entry becomes ``<value> AS <name>``, in iteration order of
    ``struct_options``. Values are rendered as follows:

    * the entry named ``model_params`` is serialized with ``json.dumps`` and
      emitted as a ``JSON'...'`` literal;
    * mappings become a nested ``STRUCT(...)``;
    * lists become an array literal ``[...]``;
    * booleans become ``true`` / ``false``;
    * everything else is delegated to ``bigframes.core.sql.simple_literal``.

    Args:
        struct_options:
            Field names mapped to their values. Field names are inserted into
            the SQL text as-is.

    Returns:
        The SQL text of the struct literal; ``STRUCT()`` for an empty mapping.
    """
    rendered_options = []
    for option_name, option_value in struct_options.items():
        if option_name == "model_params":
            rendered_val = _json_literal(option_value)
        elif isinstance(option_value, collections.abc.Mapping):
            struct_body = ", ".join(
                f"{bigframes.core.sql.simple_literal(v)} AS {k}"
                for k, v in option_value.items()
            )
            rendered_val = f"STRUCT({struct_body})"
        elif isinstance(option_value, list):
            array_body = ", ".join(
                bigframes.core.sql.simple_literal(v) for v in option_value
            )
            rendered_val = f"[{array_body}]"
        elif isinstance(option_value, bool):
            # Handled explicitly so booleans are always lowercase SQL keywords.
            rendered_val = str(option_value).lower()
        else:
            rendered_val = bigframes.core.sql.simple_literal(option_value)
        rendered_options.append(f"{rendered_val} AS {option_name}")
    return f"STRUCT({', '.join(rendered_options)})"

-Original file line number
+Diff line change
@@ Expand Up / @@ -14,12 +14,11 @@ @@
     from __future__ import annotations
-    import collections.abc
-    import json
     from typing import Any, Dict, List, Mapping, Optional, Union
     import bigframes.core.compile.googlesql as googlesql
     import bigframes.core.sql
+    import bigframes.core.sql.literals
     def create_model_ddl(
@@ Expand Down Expand Up / @@ -109,36 +108,7 @@ def _build_struct_sql( @@
     ) -> str:
         if not struct_options:
             return ""
-        rendered_options = []
-        for option_name, option_value in struct_options.items():
-            if option_name == "model_params":
-                json_str = json.dumps(option_value)
-                # Escape single quotes for SQL string literal
-                sql_json_str = json_str.replace("'", "''")
-                rendered_val = f"JSON'{sql_json_str}'"
-            elif isinstance(option_value, collections.abc.Mapping):
-                struct_body = ", ".join(
-                    [
-                        f"{bigframes.core.sql.simple_literal(v)} AS {k}"
-                        for k, v in option_value.items()
-                    ]
-                )
-                rendered_val = f"STRUCT({struct_body})"
-            elif isinstance(option_value, list):
-                rendered_val = (
-                    "["
-                    + ", ".join(
-                        [bigframes.core.sql.simple_literal(v) for v in option_value]
-                    )
-                    + "]"
-                )
-            elif isinstance(option_value, bool):
-                rendered_val = str(option_value).lower()
-            else:
-                rendered_val = bigframes.core.sql.simple_literal(option_value)
-            rendered_options.append(f"{rendered_val} AS {option_name}")
-        return f", STRUCT({', '.join(rendered_options)})"
+        return f", {bigframes.core.sql.literals.struct_literal(struct_options)}"
     def evaluate(
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -0,0 +1,134 @@
+    # Copyright 2025 Google LLC
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    from unittest import mock
+    import pandas as pd
+    import pytest
+    import bigframes.bigquery as bbq
+    import bigframes.dataframe
+    import bigframes.series
+    import bigframes.session
@pytest.fixture
def mock_session():
    """Provide an autospec'd stand-in for a BigFrames session."""
    session_double = mock.create_autospec(spec=bigframes.session.Session)
    return session_double
@pytest.fixture
def mock_dataframe(mock_session):
    """Provide an autospec'd DataFrame bound to ``mock_session`` with fixed SQL."""
    frame = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    frame.sql = "SELECT * FROM my_table"
    frame._session = mock_session
    return frame
@pytest.fixture
def mock_series(mock_session):
    """Provide an autospec'd Series whose ``to_frame()`` yields a mock DataFrame."""
    # The frame that the series converts into; it carries the SQL under test.
    frame = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    frame.sql = "SELECT my_col AS content FROM my_table"
    frame._session = mock_session

    series_double = mock.create_autospec(spec=bigframes.series.Series)
    series_double._session = mock_session
    # copy() hands back the same mock so the rename and to_frame() calls are
    # made on an object this fixture controls.
    series_double.copy.return_value = series_double
    series_double.to_frame.return_value = frame
    return series_double
def test_generate_embedding_with_dataframe(mock_dataframe, mock_session):
    """A DataFrame input is embedded as a subquery with the requested option."""
    model_name = "project.dataset.model"

    bbq.ai.generate_embedding(
        model_name,
        mock_dataframe,
        output_dimensionality=256,
    )

    mock_session.read_gbq.assert_called_once()
    positional_args = mock_session.read_gbq.call_args[0]
    # Normalize whitespace for comparison
    normalized_query = " ".join(positional_args[0].split())

    expected_fragments = (
        "SELECT * FROM AI.GENERATE_EMBEDDING(",
        f"MODEL `{model_name}`,",
        "(SELECT * FROM my_table),",
        "STRUCT(256 AS OUTPUT_DIMENSIONALITY)",
    )
    for fragment in expected_fragments:
        assert fragment in normalized_query
def test_generate_embedding_with_series(mock_series, mock_session):
    """A Series input uses the frame from ``to_frame()`` and forwards video options."""
    model_name = "project.dataset.model"
    video_options = {
        "start_second": 0.0,
        "end_second": 10.0,
        "interval_seconds": 5.0,
    }

    bbq.ai.generate_embedding(model_name, mock_series, **video_options)

    mock_session.read_gbq.assert_called_once()
    positional_args = mock_session.read_gbq.call_args[0]
    normalized_query = " ".join(positional_args[0].split())

    expected_struct = (
        "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)"
    )
    assert f"MODEL `{model_name}`" in normalized_query
    assert "(SELECT my_col AS content FROM my_table)" in normalized_query
    assert expected_struct in normalized_query
def test_generate_embedding_defaults(mock_dataframe, mock_session):
    """With no options set, the query carries an empty ``STRUCT()``."""
    model_name = "project.dataset.model"

    bbq.ai.generate_embedding(model_name, mock_dataframe)

    mock_session.read_gbq.assert_called_once()
    positional_args = mock_session.read_gbq.call_args[0]
    normalized_query = " ".join(positional_args[0].split())

    for fragment in (f"MODEL `{model_name}`", "STRUCT()"):
        assert fragment in normalized_query
@mock.patch("bigframes.pandas.read_pandas")
def test_generate_embedding_with_pandas_dataframe(
    read_pandas_mock, mock_dataframe, mock_session
):
    """A pandas DataFrame is loaded through ``read_pandas`` before querying."""
    model_name = "project.dataset.model"
    pandas_df = pd.DataFrame({"content": ["test"]})
    # read_pandas is patched to hand back a BigFrames DataFrame double.
    read_pandas_mock.return_value = mock_dataframe

    bbq.ai.generate_embedding(model_name, pandas_df)

    read_pandas_mock.assert_called_once()
    # The very same pandas object must be what was passed to read_pandas.
    first_positional = read_pandas_mock.call_args[0][0]
    assert first_positional is pandas_df
    mock_session.read_gbq.assert_called_once()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: add `bigframes.bigquery.ai.generate_embedding` #2343

Diff view

Diff view

There are no files selected for viewing

sycai Feb 3, 2026

Uh oh!

tswast Feb 3, 2026

Uh oh!

Uh oh!

feat: add bigframes.bigquery.ai.generate_embedding #2343

Are you sure you want to change the base?

feat: add bigframes.bigquery.ai.generate_embedding #2343

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

sycai Feb 3, 2026

Choose a reason for hiding this comment

Uh oh!

tswast Feb 3, 2026

Choose a reason for hiding this comment

Uh oh!

Uh oh!

feat: add `bigframes.bigquery.ai.generate_embedding` #2343

feat: add `bigframes.bigquery.ai.generate_embedding` #2343