Skip to content
Open
110 changes: 109 additions & 1 deletion bigframes/bigquery/_operations/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from __future__ import annotations

import json
from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union

import pandas as pd

Expand All @@ -28,6 +28,7 @@
from bigframes import series, session
from bigframes.core import convert
from bigframes.core.logging import log_adapter
import bigframes.core.sql.literals
from bigframes.ml import core as ml_core
from bigframes.operations import ai_ops, output_schemas

Expand Down Expand Up @@ -388,6 +389,113 @@ def generate_double(
return series_list[0]._apply_nary_op(operator, series_list[1:])


# Review thread on this function (from the pull request):
#   Contributor: Does it make sense to expose this function as
#     bbq.ai.generate_embedding too?
#   Collaborator: Good catch! Done.
@log_adapter.method_logger(custom_base_name="bigquery_ai")
def generate_embedding(
    model_name: str,
    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
    *,
    output_dimensionality: Optional[int] = None,
    task_type: Optional[str] = None,
    start_second: Optional[float] = None,
    end_second: Optional[float] = None,
    interval_seconds: Optional[float] = None,
    trial_id: Optional[int] = None,
) -> dataframe.DataFrame:
    """
    Creates embeddings that describe an entity—for example, a piece of text or an image.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
        >>> bbq.ai.generate_embedding(
        ...     "project.dataset.model_name",
        ...     df
        ... ) # doctest: +SKIP

    Args:
        model_name (str):
            The name of a remote model from Vertex AI, such as the
            multimodalembedding@001 model.
        data (bigframes.pandas.DataFrame, bigframes.pandas.Series, pandas.DataFrame or pandas.Series):
            The data to generate embeddings for. If a Series is provided, it is
            treated as the 'content' column. If a DataFrame is provided, it
            must contain a 'content' column, or you must rename the column you
            wish to embed to 'content'. pandas objects are first loaded with
            ``bpd.read_pandas``.
        output_dimensionality (int, optional):
            An INT64 value that specifies the number of dimensions to use when
            generating embeddings. For example, if you specify 256 AS
            output_dimensionality, then the embedding output column contains a
            256-dimensional embedding for each input value. To find the
            supported range of output dimensions, read about the available
            `Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
        task_type (str, optional):
            A STRING literal that specifies the intended downstream application to
            help the model produce better quality embeddings. For a list of
            supported task types and how to choose which one to use, see `Choose an
            embeddings task type <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
        start_second (float, optional):
            The second in the video at which to start the embedding. The default value is 0.
        end_second (float, optional):
            The second in the video at which to end the embedding. The default value is 120.
        interval_seconds (float, optional):
            The interval to use when creating embeddings. The default value is 16.
        trial_id (int, optional):
            An INT64 value that identifies the hyperparameter tuning trial that
            you want the function to evaluate. The function uses the optimal
            trial by default. Only specify this argument if you ran
            hyperparameter tuning when creating the model.

    Returns:
        bigframes.pandas.DataFrame:
            A new DataFrame with the generated embeddings. See the `SQL
            reference for AI.GENERATE_EMBEDDING
            <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
            for details.

    Raises:
        ValueError:
            If ``data`` is not a BigQuery DataFrames or pandas DataFrame/Series.
    """
    # Local pandas objects are uploaded first so that they have a SQL form.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = bpd.read_pandas(data)

    if isinstance(data, series.Series):
        # Work on a copy so the caller's Series keeps its original name.
        data = data.copy()
        data.name = "content"
        data_df = data.to_frame()
    elif isinstance(data, dataframe.DataFrame):
        data_df = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")

    # We need to get the SQL for the input data to pass as a subquery to the TVF
    source_sql = data_df.sql

    # Only arguments the caller actually set are forwarded; insertion order
    # here determines the field order in the rendered STRUCT.
    candidate_fields = {
        "OUTPUT_DIMENSIONALITY": output_dimensionality,
        "TASK_TYPE": task_type,
        "START_SECOND": start_second,
        "END_SECOND": end_second,
        "INTERVAL_SECONDS": interval_seconds,
        "TRIAL_ID": trial_id,
    }
    struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {
        field_name: field_value
        for field_name, field_value in candidate_fields.items()
        if field_value is not None
    }
    options_sql = bigframes.core.sql.literals.struct_literal(struct_fields)

    # Construct the TVF query. The struct literal is already a complete
    # `STRUCT(...)` expression, so exactly one `)` closes the function call.
    query = f"""
        SELECT *
        FROM AI.GENERATE_EMBEDDING(
            MODEL `{model_name}`,
            ({source_sql}),
            {options_sql}
        )
    """

    return data_df._session.read_gbq(query)


@log_adapter.method_logger(custom_base_name="bigquery_ai")
def if_(
prompt: PROMPT_TYPE,
Expand Down
2 changes: 2 additions & 0 deletions bigframes/bigquery/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
generate,
generate_bool,
generate_double,
generate_embedding,
generate_int,
if_,
score,
Expand All @@ -33,6 +34,7 @@
"generate",
"generate_bool",
"generate_double",
"generate_embedding",
"generate_int",
"if_",
"score",
Expand Down
58 changes: 58 additions & 0 deletions bigframes/core/sql/literals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import collections.abc
import json
from typing import Any, List, Mapping, Union

import bigframes.core.sql

# Value types accepted for a single field passed to `struct_literal`:
# scalars, a list, or a mapping.
STRUCT_VALUES = Union[
str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]
]
# Mapping of struct field name -> field value, i.e. the argument type of
# `struct_literal`.
STRUCT_TYPE = Mapping[str, STRUCT_VALUES]


def struct_literal(struct_options: STRUCT_TYPE) -> str:
    """Render a mapping of named options as a SQL ``STRUCT(...)`` literal.

    Each entry becomes ``<value> AS <name>`` in iteration order. Values are
    rendered as follows:

    * the option named ``"model_params"`` is serialized with ``json.dumps``
      and emitted as a ``JSON'...'`` literal;
    * a mapping becomes a nested ``STRUCT(<value> AS <key>, ...)``;
    * a list becomes an array literal ``[<value>, ...]``;
    * a bool becomes ``true`` / ``false``;
    * anything else is rendered by ``bigframes.core.sql.simple_literal``.

    Args:
        struct_options:
            Mapping from field name to field value.

    Returns:
        The SQL text of the struct literal. An empty mapping renders as
        ``STRUCT()``.
    """
    rendered_options = []
    for option_name, option_value in struct_options.items():
        if option_name == "model_params":
            json_str = json.dumps(option_value)
            # GoogleSQL string literals use backslash escapes (doubling a
            # quote is not an escape), and the backslashes json.dumps emits
            # (e.g. \" or \n) must survive SQL unescaping to stay valid JSON.
            # Escape backslashes first so the quote escapes are not doubled.
            sql_json_str = json_str.replace("\\", "\\\\").replace("'", "\\'")
            rendered_val = f"JSON'{sql_json_str}'"
        elif isinstance(option_value, collections.abc.Mapping):
            struct_body = ", ".join(
                [
                    f"{bigframes.core.sql.simple_literal(v)} AS {k}"
                    for k, v in option_value.items()
                ]
            )
            rendered_val = f"STRUCT({struct_body})"
        elif isinstance(option_value, list):
            rendered_val = (
                "["
                + ", ".join(
                    [bigframes.core.sql.simple_literal(v) for v in option_value]
                )
                + "]"
            )
        elif isinstance(option_value, bool):
            # Handled here so booleans render as lowercase SQL keywords.
            rendered_val = str(option_value).lower()
        else:
            rendered_val = bigframes.core.sql.simple_literal(option_value)
        rendered_options.append(f"{rendered_val} AS {option_name}")
    return f"STRUCT({', '.join(rendered_options)})"
34 changes: 2 additions & 32 deletions bigframes/core/sql/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,11 @@

from __future__ import annotations

import collections.abc
import json
from typing import Any, Dict, List, Mapping, Optional, Union

import bigframes.core.compile.googlesql as googlesql
import bigframes.core.sql
import bigframes.core.sql.literals


def create_model_ddl(
Expand Down Expand Up @@ -109,36 +108,7 @@ def _build_struct_sql(
) -> str:
if not struct_options:
return ""

rendered_options = []
for option_name, option_value in struct_options.items():
if option_name == "model_params":
json_str = json.dumps(option_value)
# Escape single quotes for SQL string literal
sql_json_str = json_str.replace("'", "''")
rendered_val = f"JSON'{sql_json_str}'"
elif isinstance(option_value, collections.abc.Mapping):
struct_body = ", ".join(
[
f"{bigframes.core.sql.simple_literal(v)} AS {k}"
for k, v in option_value.items()
]
)
rendered_val = f"STRUCT({struct_body})"
elif isinstance(option_value, list):
rendered_val = (
"["
+ ", ".join(
[bigframes.core.sql.simple_literal(v) for v in option_value]
)
+ "]"
)
elif isinstance(option_value, bool):
rendered_val = str(option_value).lower()
else:
rendered_val = bigframes.core.sql.simple_literal(option_value)
rendered_options.append(f"{rendered_val} AS {option_name}")
return f", STRUCT({', '.join(rendered_options)})"
return f", {bigframes.core.sql.literals.struct_literal(struct_options)}"


def evaluate(
Expand Down
134 changes: 134 additions & 0 deletions tests/unit/bigquery/test_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest import mock

import pandas as pd
import pytest

import bigframes.bigquery as bbq
import bigframes.dataframe
import bigframes.series
import bigframes.session


@pytest.fixture
def mock_session():
    """Provide an autospec'd stand-in for ``bigframes.session.Session``."""
    session_double = mock.create_autospec(spec=bigframes.session.Session)
    return session_double


@pytest.fixture
def mock_dataframe(mock_session):
    """Provide an autospec'd DataFrame bound to the mock session.

    Its ``sql`` attribute is a fixed query string so tests can look for it
    in the generated SQL.
    """
    frame_double = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    frame_double.sql = "SELECT * FROM my_table"
    frame_double._session = mock_session
    return frame_double


@pytest.fixture
def mock_series(mock_session):
    """Provide an autospec'd Series whose ``to_frame()`` yields a mock DataFrame.

    ``copy()`` returns the same mock so attribute assignments made on the
    copy stay observable on the fixture object.
    """
    # The frame handed back by to_frame(); its SQL is what tests look for.
    frame_double = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    frame_double.sql = "SELECT my_col AS content FROM my_table"
    frame_double._session = mock_session

    series_double = mock.create_autospec(spec=bigframes.series.Series)
    series_double._session = mock_session
    series_double.to_frame.return_value = frame_double
    series_double.copy.return_value = series_double
    return series_double


def test_generate_embedding_with_dataframe(mock_dataframe, mock_session):
    """A DataFrame input is embedded as a subquery with the requested options."""
    model_name = "project.dataset.model"

    bbq.ai.generate_embedding(
        model_name, mock_dataframe, output_dimensionality=256
    )

    mock_session.read_gbq.assert_called_once()
    positional_args = mock_session.read_gbq.call_args[0]
    # Collapse all whitespace runs so layout of the SQL does not matter.
    normalized_sql = " ".join(positional_args[0].split())

    expected_fragments = (
        "SELECT * FROM AI.GENERATE_EMBEDDING(",
        f"MODEL `{model_name}`,",
        "(SELECT * FROM my_table),",
        "STRUCT(256 AS OUTPUT_DIMENSIONALITY)",
    )
    for fragment in expected_fragments:
        assert fragment in normalized_sql


def test_generate_embedding_with_series(mock_series, mock_session):
    """A Series input goes through ``to_frame()`` and video options are rendered."""
    model_name = "project.dataset.model"
    video_options = {
        "start_second": 0.0,
        "end_second": 10.0,
        "interval_seconds": 5.0,
    }

    bbq.ai.generate_embedding(model_name, mock_series, **video_options)

    mock_session.read_gbq.assert_called_once()
    positional_args = mock_session.read_gbq.call_args[0]
    normalized_sql = " ".join(positional_args[0].split())

    expected_struct = (
        "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)"
    )
    assert f"MODEL `{model_name}`" in normalized_sql
    assert "(SELECT my_col AS content FROM my_table)" in normalized_sql
    assert expected_struct in normalized_sql


def test_generate_embedding_defaults(mock_dataframe, mock_session):
    """With no optional arguments, an empty ``STRUCT()`` is passed to the TVF."""
    model_name = "project.dataset.model"

    bbq.ai.generate_embedding(model_name, mock_dataframe)

    mock_session.read_gbq.assert_called_once()
    positional_args = mock_session.read_gbq.call_args[0]
    normalized_sql = " ".join(positional_args[0].split())

    for fragment in (f"MODEL `{model_name}`", "STRUCT()"):
        assert fragment in normalized_sql


@mock.patch("bigframes.pandas.read_pandas")
def test_generate_embedding_with_pandas_dataframe(
    read_pandas_mock, mock_dataframe, mock_session
):
    """A pandas DataFrame is routed through ``read_pandas`` before querying."""
    model_name = "project.dataset.model"
    local_frame = pd.DataFrame({"content": ["test"]})

    # read_pandas hands back a BigFrames DataFrame stand-in.
    read_pandas_mock.return_value = mock_dataframe

    bbq.ai.generate_embedding(model_name, local_frame)

    # The very same pandas object must have been forwarded to read_pandas.
    read_pandas_mock.assert_called_once()
    forwarded_args = read_pandas_mock.call_args[0]
    assert forwarded_args[0] is local_frame

    mock_session.read_gbq.assert_called_once()
Loading