diff --git a/packages/bigframes/bigframes/bigquery/__init__.py b/packages/bigframes/bigframes/bigquery/__init__.py index d643b986ac8e..d3fb8701df20 100644 --- a/packages/bigframes/bigframes/bigquery/__init__.py +++ b/packages/bigframes/bigframes/bigquery/__init__.py @@ -49,11 +49,7 @@ from bigframes.bigquery import aead, ai, ml, obj from bigframes.bigquery._operations.approx_agg import approx_top_count -from bigframes.bigquery._operations.array import ( - array_agg, - array_length, - array_to_string, -) +from bigframes.bigquery._operations.array import array_agg from bigframes.bigquery._operations.datetime import ( unix_micros, unix_millis, @@ -72,11 +68,6 @@ st_regionstats, st_simplify, ) -from bigframes.bigquery._operations.global_namespace.aead_encryption import ( - deterministic_decrypt_bytes, - deterministic_decrypt_string, - deterministic_encrypt, -) from bigframes.bigquery._operations.io import load_data from bigframes.bigquery._operations.json import ( json_extract, @@ -102,14 +93,47 @@ from bigframes.bigquery._operations.struct import struct from bigframes.bigquery._operations.table import create_external_table from bigframes.core.logging import log_adapter +from bigframes.operations.googlesql.global_namespace.aead_encryption import ( + deterministic_decrypt_bytes, + deterministic_decrypt_string, + deterministic_encrypt, +) +from bigframes.operations.googlesql.global_namespace.array import ( + array_concat, + array_first, + array_first_n, + array_includes, + array_includes_all, + array_includes_any, + array_is_distinct, + array_last, + array_length, + array_reverse, + array_slice, + array_to_string, + flatten, + generate_array, +) _functions = [ # approximate aggregate ops approx_top_count, # array ops array_agg, + array_concat, + array_first, + array_first_n, + array_includes, + array_includes_all, + array_includes_any, + array_is_distinct, + array_last, array_length, + array_reverse, + array_slice, array_to_string, + flatten, + generate_array, # datetime ops unix_micros, unix_millis, @@ -170,8 +194,20 @@ "approx_top_count", # array ops "array_agg", + "array_concat", + "array_first", + "array_first_n", + "array_includes", + "array_includes_all", + "array_includes_any", + "array_is_distinct", + "array_last", "array_length", + "array_reverse", + "array_slice", "array_to_string", + "flatten", + "generate_array", # datetime ops "unix_micros", "unix_millis", diff --git a/packages/bigframes/bigframes/bigquery/_operations/array.py b/packages/bigframes/bigframes/bigquery/_operations/array.py index 9eb83665daa9..7cc414879a87 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/array.py +++ b/packages/bigframes/bigframes/bigquery/_operations/array.py @@ -32,40 +32,6 @@ import bigframes.dataframe as dataframe -def array_length(series: series.Series) -> series.Series: - """Compute the length of each array element in the Series. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - - >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) - >>> bbq.array_length(s) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - You can also apply this function directly to Series. - - >>> s.apply(bbq.array_length, by_row=False) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - Args: - series (bigframes.series.Series): A Series with array columns. - - Returns: - bigframes.series.Series: A Series of integer values indicating - the length of each element in the Series. - - """ - return series._apply_unary_op(ops.len_op) - - def array_agg( obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, ) -> series.Series | dataframe.DataFrame: @@ -115,31 +81,3 @@ def array_agg( raise ValueError( f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}" ) - - -def array_to_string(series: series.Series, delimiter: str) -> series.Series: - """Converts array elements within a Series into delimited strings. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - - >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) - >>> bbq.array_to_string(s, delimiter=", ") - 0 H, i, ! - 1 Hello, World - 2 - 3 - 4 Hi - dtype: string - - Args: - series (bigframes.series.Series): A Series containing arrays. - delimiter (str): The string used to separate array elements. - - Returns: - bigframes.series.Series: A Series containing delimited strings. - - """ - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) diff --git a/packages/bigframes/bigframes/bigquery/aead.py b/packages/bigframes/bigframes/bigquery/aead.py index f18e12bc5cf5..c4243a5c010c 100644 --- a/packages/bigframes/bigframes/bigquery/aead.py +++ b/packages/bigframes/bigframes/bigquery/aead.py @@ -16,7 +16,7 @@ from __future__ import annotations -from bigframes.bigquery._operations.aead import decrypt_bytes, decrypt_string, encrypt +from bigframes.operations.googlesql.aead import decrypt_bytes, decrypt_string, encrypt __all__ = [ "decrypt_bytes", diff --git a/packages/bigframes/bigframes/bigquery/_googlesql.py b/packages/bigframes/bigframes/core/googlesql.py similarity index 100% rename from packages/bigframes/bigframes/bigquery/_googlesql.py rename to packages/bigframes/bigframes/core/googlesql.py diff --git a/packages/bigframes/bigframes/operations/googlesql.py b/packages/bigframes/bigframes/operations/googlesql/__init__.py similarity index 100% rename from packages/bigframes/bigframes/operations/googlesql.py rename to packages/bigframes/bigframes/operations/googlesql/__init__.py diff --git a/packages/bigframes/bigframes/bigquery/_operations/aead.py b/packages/bigframes/bigframes/operations/googlesql/aead.py similarity index 85% rename from packages/bigframes/bigframes/bigquery/_operations/aead.py rename to packages/bigframes/bigframes/operations/googlesql/aead.py index 6bfb65360384..8c461b3ff04b 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/aead.py +++ b/packages/bigframes/bigframes/operations/googlesql/aead.py @@ -19,19 +19,18 @@ from __future__ import annotations import datetime +import decimal from typing import Any, Literal, Optional, TypeVar, Union -import bigframes.bigquery._googlesql import bigframes.core.col import bigframes.core.expression as ex +import bigframes.core.googlesql import bigframes.core.sentinels as sentinels import bigframes.operations as ops import bigframes.series as series from bigframes import dtypes from bigframes.operations import googlesql -T = TypeVar("T", series.Series, bigframes.core.col.Expression) - _DECRYPT_BYTES_OP = googlesql.GoogleSqlScalarOp( "AEAD.DECRYPT_BYTES", args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), @@ -51,77 +50,77 @@ def decrypt_bytes( keyset: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict], ], ciphertext: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], -) -> T: +) -> Union[series.Series, bigframes.core.col.Expression]: """Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. Returns an error if decryption or verification fails.""" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + return bigframes.core.googlesql.apply_googlesql_scalar_op( _DECRYPT_BYTES_OP, keyset, ciphertext, additional_data, - ) # type: ignore + ) def decrypt_string( keyset: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict], ], ciphertext: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str], ], -) -> T: +) -> Union[series.Series, bigframes.core.col.Expression]: """Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING.""" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + return bigframes.core.googlesql.apply_googlesql_scalar_op( _DECRYPT_STRING_OP, keyset, ciphertext, additional_data, - ) # type: ignore + ) def encrypt( keyset: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict], ], plaintext: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], additional_data: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], -) -> T: +) -> Union[series.Series, bigframes.core.col.Expression]: """Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. Returns NULL if any input is NULL.""" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + return bigframes.core.googlesql.apply_googlesql_scalar_op( _ENCRYPT_OP, keyset, plaintext, additional_data, - ) # type: ignore + ) diff --git a/packages/bigframes/bigframes/operations/googlesql/global_namespace/__init__.py b/packages/bigframes/bigframes/operations/googlesql/global_namespace/__init__.py new file mode 100644 index 000000000000..58d482ea3866 --- /dev/null +++ b/packages/bigframes/bigframes/operations/googlesql/global_namespace/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/packages/bigframes/bigframes/bigquery/_operations/global_namespace/aead_encryption.py b/packages/bigframes/bigframes/operations/googlesql/global_namespace/aead_encryption.py similarity index 86% rename from packages/bigframes/bigframes/bigquery/_operations/global_namespace/aead_encryption.py rename to packages/bigframes/bigframes/operations/googlesql/global_namespace/aead_encryption.py index b1522a70d73e..dd67027287a8 100644 --- a/packages/bigframes/bigframes/bigquery/_operations/global_namespace/aead_encryption.py +++ b/packages/bigframes/bigframes/operations/googlesql/global_namespace/aead_encryption.py @@ -19,19 +19,18 @@ from __future__ import annotations import datetime +import decimal from typing import Any, Literal, Optional, TypeVar, Union -import bigframes.bigquery._googlesql import bigframes.core.col import bigframes.core.expression as ex +import bigframes.core.googlesql import bigframes.core.sentinels as sentinels import bigframes.operations as ops import bigframes.series as series from bigframes import dtypes from bigframes.operations import googlesql -T = TypeVar("T", series.Series, bigframes.core.col.Expression) - _DETERMINISTIC_DECRYPT_BYTES_OP = googlesql.GoogleSqlScalarOp( "DETERMINISTIC_DECRYPT_BYTES", args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), @@ -51,77 +50,77 @@ def deterministic_decrypt_bytes( keyset: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict], ], ciphertext: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], -) -> T: +) -> Union[series.Series, bigframes.core.col.Expression]: """Uses the matching key from `keyset` to decrypt `ciphertext` and verifies the integrity of the data using `additional_data`. Returns an error if decryption fails.""" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + return bigframes.core.googlesql.apply_googlesql_scalar_op( _DETERMINISTIC_DECRYPT_BYTES_OP, keyset, ciphertext, additional_data, - ) # type: ignore + ) def deterministic_decrypt_string( keyset: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict], ], ciphertext: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str], ], -) -> T: +) -> Union[series.Series, bigframes.core.col.Expression]: """Like `DETERMINISTIC_DECRYPT_BYTES`, but where plaintext is of type STRING.""" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + return bigframes.core.googlesql.apply_googlesql_scalar_op( _DETERMINISTIC_DECRYPT_STRING_OP, keyset, ciphertext, additional_data, - ) # type: ignore + ) def deterministic_encrypt( keyset: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, dict], ], plaintext: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], additional_data: Union[ - T, + series.Series, bigframes.core.col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], -) -> T: +) -> Union[series.Series, bigframes.core.col.Expression]: """Encrypts `plaintext` using the primary cryptographic key in `keyset` using deterministic AEAD. The algorithm of the primary key must be `DETERMINISTIC_AEAD_AES_SIV_CMAC_256`. Binds the ciphertext to the context defined by `additional_data`. Returns `NULL` if any input is `NULL`.""" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( + return bigframes.core.googlesql.apply_googlesql_scalar_op( _DETERMINISTIC_ENCRYPT_OP, keyset, plaintext, additional_data, - ) # type: ignore + ) diff --git a/packages/bigframes/bigframes/operations/googlesql/global_namespace/array.py b/packages/bigframes/bigframes/operations/googlesql/global_namespace/array.py new file mode 100644 index 000000000000..3b0f93656442 --- /dev/null +++ b/packages/bigframes/bigframes/operations/googlesql/global_namespace/array.py @@ -0,0 +1,858 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# DO NOT MODIFY THIS FILE DIRECTLY. +# This file was generated from: scripts/data/sql-functions/global_namespace/array.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +from __future__ import annotations + +import datetime +import decimal +from typing import Any, Literal, Optional, TypeVar, Union + +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.core.googlesql +import bigframes.core.sentinels as sentinels +import bigframes.operations as ops +import bigframes.series as series +from bigframes import dtypes +from bigframes.operations import googlesql + + +def _ARRAY_CONCAT_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (2 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok and args[1] is not None: + if not dtypes.is_array_like(args[1]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[1]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok: + if any1_val is not None: + return dtypes.list_type(any1_val) + else: + return None + + raise TypeError( + f"Could not find matching signature for array_concat with argument types: {[str(t) for t in args]}" + ) + + +_ARRAY_CONCAT_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_CONCAT", + args=(googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=_ARRAY_CONCAT_SIG, +) + + +def _ARRAY_FIRST_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (1 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok: + return any1_val + + raise TypeError( + f"Could not find matching signature for array_first with argument types: {[str(t) for t in args]}" + ) + + +_ARRAY_FIRST_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_FIRST", + args=(googlesql.ArgSpec(),), + signature=_ARRAY_FIRST_SIG, +) + + +def _ARRAY_FIRST_N_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (2 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok and args[1] is not None: + try: + if dtypes.coerce_to_common(args[1], dtypes.INT_DTYPE) != dtypes.INT_DTYPE: + match_ok = False + except TypeError: + match_ok = False + if match_ok: + if any1_val is not None: + return dtypes.list_type(any1_val) + else: + return None + + raise TypeError( + f"Could not find matching signature for array_first_n with argument types: {[str(t) for t in args]}" + ) + + +_ARRAY_FIRST_N_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_FIRST_N", + args=(googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=_ARRAY_FIRST_N_SIG, +) +_ARRAY_INCLUDES_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_INCLUDES", + args=(googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.BOOL_DTYPE, +) +_ARRAY_INCLUDES_ALL_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_INCLUDES_ALL", + args=(googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.BOOL_DTYPE, +) +_ARRAY_INCLUDES_ANY_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_INCLUDES_ANY", + args=(googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=lambda *args: dtypes.BOOL_DTYPE, +) +_ARRAY_IS_DISTINCT_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_IS_DISTINCT", + args=(googlesql.ArgSpec(),), + signature=lambda *args: dtypes.BOOL_DTYPE, +) + + +def _ARRAY_LAST_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (1 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok: + return any1_val + + raise TypeError( + f"Could not find matching signature for array_last with argument types: {[str(t) for t in args]}" + ) + + +_ARRAY_LAST_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_LAST", + args=(googlesql.ArgSpec(),), + signature=_ARRAY_LAST_SIG, +) +_ARRAY_LENGTH_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_LENGTH", + args=(googlesql.ArgSpec(),), + signature=lambda *args: dtypes.INT_DTYPE, +) + + +def _ARRAY_REVERSE_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (1 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok: + if any1_val is not None: + return dtypes.list_type(any1_val) + else: + return None + + raise TypeError( + f"Could not find matching signature for array_reverse with argument types: {[str(t) for t in args]}" + ) + + +_ARRAY_REVERSE_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_REVERSE", + args=(googlesql.ArgSpec(),), + signature=_ARRAY_REVERSE_SIG, +) + + +def _ARRAY_SLICE_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (3 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok and args[1] is not None: + try: + if dtypes.coerce_to_common(args[1], dtypes.INT_DTYPE) != dtypes.INT_DTYPE: + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[2] is not None: + try: + if dtypes.coerce_to_common(args[2], dtypes.INT_DTYPE) != dtypes.INT_DTYPE: + match_ok = False + except TypeError: + match_ok = False + if match_ok: + if any1_val is not None: + return dtypes.list_type(any1_val) + else: + return None + + raise TypeError( + f"Could not find matching signature for array_slice with argument types: {[str(t) for t in args]}" + ) + + +_ARRAY_SLICE_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_SLICE", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec()), + signature=_ARRAY_SLICE_SIG, +) + + +def _ARRAY_TO_STRING_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (3 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + try: + if ( + dtypes.coerce_to_common(inner, dtypes.STRING_DTYPE) + != dtypes.STRING_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[1] is not None: + try: + if ( + dtypes.coerce_to_common(args[1], dtypes.STRING_DTYPE) + != dtypes.STRING_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[2] is not None: + try: + if ( + dtypes.coerce_to_common(args[2], dtypes.STRING_DTYPE) + != dtypes.STRING_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok: + return dtypes.STRING_DTYPE + + # Try matching impl 1 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + try: + if ( + dtypes.coerce_to_common(inner, dtypes.BYTES_DTYPE) + != dtypes.BYTES_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[1] is not None: + try: + if ( + dtypes.coerce_to_common(args[1], dtypes.BYTES_DTYPE) + != dtypes.BYTES_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[2] is not None: + try: + if ( + dtypes.coerce_to_common(args[2], dtypes.BYTES_DTYPE) + != dtypes.BYTES_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok: + return dtypes.BYTES_DTYPE + + raise TypeError( + f"Could not find matching signature for array_to_string with argument types: {[str(t) for t in args]}" + ) + + +_ARRAY_TO_STRING_OP = googlesql.GoogleSqlScalarOp( + "ARRAY_TO_STRING", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec(optional=True)), + signature=_ARRAY_TO_STRING_SIG, +) + + +def _FLATTEN_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (2 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + if not dtypes.is_array_like(args[0]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[0]) + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + if match_ok and args[1] is not None: + try: + if dtypes.coerce_to_common(args[1], dtypes.INT_DTYPE) != dtypes.INT_DTYPE: + match_ok = False + except TypeError: + match_ok = False + if match_ok: + if any1_val is not None: + return dtypes.list_type(any1_val) + else: + return None + + raise TypeError( + f"Could not find matching signature for flatten with argument types: {[str(t) for t in args]}" + ) + + +_FLATTEN_OP = googlesql.GoogleSqlScalarOp( + "FLATTEN", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(arg_name="depth", optional=True)), + signature=_FLATTEN_SIG, +) + + +def _GENERATE_ARRAY_SIG(*args): + # Pad args with None to match max expected args + args = args + (None,) * (3 - len(args)) + # Try matching impl 0 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + try: + if dtypes.coerce_to_common(args[0], dtypes.INT_DTYPE) != dtypes.INT_DTYPE: + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[1] is not None: + try: + if dtypes.coerce_to_common(args[1], dtypes.INT_DTYPE) != dtypes.INT_DTYPE: + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[2] is not None: + try: + if dtypes.coerce_to_common(args[2], dtypes.INT_DTYPE) != dtypes.INT_DTYPE: + match_ok = False + except TypeError: + match_ok = False + if match_ok: + return dtypes.list_type(dtypes.INT_DTYPE) + + # Try matching impl 1 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + try: + if ( + dtypes.coerce_to_common(args[0], dtypes.NUMERIC_DTYPE) + != dtypes.NUMERIC_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[1] is not None: + try: + if ( + dtypes.coerce_to_common(args[1], dtypes.NUMERIC_DTYPE) + != dtypes.NUMERIC_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[2] is not None: + try: + if ( + dtypes.coerce_to_common(args[2], dtypes.NUMERIC_DTYPE) + != dtypes.NUMERIC_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok: + return dtypes.list_type(dtypes.NUMERIC_DTYPE) + + # Try matching impl 2 + any1_val = None + match_ok = True + if match_ok and args[0] is not None: + try: + if ( + dtypes.coerce_to_common(args[0], dtypes.FLOAT_DTYPE) + != dtypes.FLOAT_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[1] is not None: + try: + if ( + dtypes.coerce_to_common(args[1], dtypes.FLOAT_DTYPE) + != dtypes.FLOAT_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok and args[2] is not None: + try: + if ( + dtypes.coerce_to_common(args[2], dtypes.FLOAT_DTYPE) + != dtypes.FLOAT_DTYPE + ): + match_ok = False + except TypeError: + match_ok = False + if match_ok: + return dtypes.list_type(dtypes.FLOAT_DTYPE) + + raise TypeError( + f"Could not find matching signature for generate_array with argument types: {[str(t) for t in args]}" + ) + + +_GENERATE_ARRAY_OP = googlesql.GoogleSqlScalarOp( + "GENERATE_ARRAY", + args=(googlesql.ArgSpec(), googlesql.ArgSpec(), googlesql.ArgSpec(optional=True)), + signature=_GENERATE_ARRAY_SIG, +) + + +def array_concat( + array_expression_1: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + array_expression_2: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Concatenates one or more arrays with the same element type into a single array.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_CONCAT_OP, + array_expression_1, + array_expression_2, + ) + + +def array_first( + array_expression: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Takes an array and returns the first element in the array.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_FIRST_OP, + array_expression, + ) + + +def array_first_n( + input_array: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + n: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Returns a prefix of `input_array` consisting of the first `n` elements.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_FIRST_N_OP, + input_array, + n, + ) + + +def array_includes( + array_to_search: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + search_value: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Takes an array and returns `TRUE` if there is an element in the array that is equal to the search_value.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_INCLUDES_OP, + array_to_search, + search_value, + ) + + +def array_includes_all( + array_to_search: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + search_values: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Takes an array to search and an array of search values. Returns `TRUE` if all search values are in the array to search, otherwise returns `FALSE`.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_INCLUDES_ALL_OP, + array_to_search, + search_values, + ) + + +def array_includes_any( + array_to_search: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + search_values: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Takes an array to search and an array of search values. Returns `TRUE` if any search values are in the array to search, otherwise returns `FALSE`.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_INCLUDES_ANY_OP, + array_to_search, + search_values, + ) + + +def array_is_distinct( + array_expression: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Returns `TRUE` if the array contains no repeated elements, using the same equality comparison logic as `SELECT DISTINCT`.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_IS_DISTINCT_OP, + array_expression, + ) + + +def array_last( + array_expression: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Takes an array and returns the last element in the array.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_LAST_OP, + array_expression, + ) + + +def array_length( + series: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Compute the length of each array element in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) + >>> bbq.array_length(s) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + You can also apply this function directly to Series. + + >>> s.apply(bbq.array_length, by_row=False) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + Args: + series (bigframes.series.Series): A Series with array columns. + + Returns: + bigframes.series.Series: A Series of integer values indicating + the length of each element in the Series. + """ + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_LENGTH_OP, + series, + ) + + +def array_reverse( + value: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Returns the input `ARRAY` with elements in reverse order.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_REVERSE_OP, + value, + ) + + +def array_slice( + array_to_slice: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + start_offset: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], + ], + end_offset: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], + ], +) -> Union[series.Series, bigframes.core.col.Expression]: + """Returns an array containing zero or more consecutive elements from the input array.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_SLICE_OP, + array_to_slice, + start_offset, + end_offset, + ) + + +def array_to_string( + series: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + delimiter: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], + ], + null_text: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], + ] = sentinels.Sentinel.ARGUMENT_DEFAULT, +) -> Union[series.Series, bigframes.core.col.Expression]: + """Converts array elements within a Series into delimited strings. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + + >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) + >>> bbq.array_to_string(s, delimiter=", ") + 0 H, i, ! + 1 Hello, World + 2 + 3 + 4 Hi + dtype: string + + Args: + series (bigframes.series.Series): A Series containing arrays. + delimiter (str): The string used to separate array elements. + null_text (str, optional): The string to replace any NULL values in the array with. + + Returns: + bigframes.series.Series: A Series containing delimited strings. + """ + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _ARRAY_TO_STRING_OP, + series, + delimiter, + null_text, + ) + + +def flatten( + array_to_flatten: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], + ], + depth: Union[ + series.Series, + bigframes.core.col.Expression, + Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], + ] = sentinels.Sentinel.ARGUMENT_DEFAULT, +) -> Union[series.Series, bigframes.core.col.Expression]: + """Takes an array of nested data and flattens a specific part of it into a single, flat array with the [array elements field access operator][array-el-field-operator]. Returns `NULL` if the input value is `NULL`.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _FLATTEN_OP, + array_to_flatten, + depth, + ) + + +def generate_array( + start_expression: Union[ + series.Series, + bigframes.core.col.Expression, + Union[ + Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], decimal.Decimal, float, int + ], + ], + end_expression: Union[ + series.Series, + bigframes.core.col.Expression, + Union[ + Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], decimal.Decimal, float, int + ], + ], + step_expression: Union[ + series.Series, + bigframes.core.col.Expression, + Union[ + Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], decimal.Decimal, float, int + ], + ] = sentinels.Sentinel.ARGUMENT_DEFAULT, +) -> Union[series.Series, bigframes.core.col.Expression]: + """Returns an array of values. The `start_expression` and `end_expression` parameters determine the inclusive start and end of the array.""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( + _GENERATE_ARRAY_OP, + start_expression, + end_expression, + step_expression, + ) diff --git a/packages/bigframes/bigframes/operations/semantics.py b/packages/bigframes/bigframes/operations/semantics.py index 4a6543fdad61..3cad3258e035 100644 --- a/packages/bigframes/bigframes/operations/semantics.py +++ b/packages/bigframes/bigframes/operations/semantics.py @@ -16,7 +16,7 @@ import re import typing import warnings -from typing import List, Optional +from typing import List, Optional, cast import numpy as np @@ -201,13 +201,19 @@ def agg( agg_df[cluster_column] = agg_df[cluster_column].list[0] # Skip if the aggregated group only has a single item - single_row_df: bigframes.series.Series = bbq.array_to_string( - agg_df[agg_df[group_row_index].list.len() <= 1][column], - delimiter="", + single_row_df: bigframes.series.Series = cast( + bigframes.series.Series, + bbq.array_to_string( + agg_df[agg_df[group_row_index].list.len() <= 1][column], + delimiter="", + ), ) - prompt_s: bigframes.series.Series = bbq.array_to_string( - agg_df[agg_df[group_row_index].list.len() > 1][llm_prompt], - delimiter="", + prompt_s: bigframes.series.Series = cast( + bigframes.series.Series, + bbq.array_to_string( + agg_df[agg_df[group_row_index].list.len() > 1][llm_prompt], + delimiter="", + ), ) prompt_s = output_instruction + prompt_s # type:ignore diff --git a/packages/bigframes/bigframes/series.py b/packages/bigframes/bigframes/series.py index 1065744f1716..a28c2f14cc9d 100644 --- a/packages/bigframes/bigframes/series.py +++ b/packages/bigframes/bigframes/series.py @@ -51,6 +51,7 @@ import bigframes.core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.col import bigframes.core.expression as ex import bigframes.core.identifiers as ids import bigframes.core.indexers @@ -2710,7 +2711,7 @@ def _apply_binary_aggregation( assert isinstance(right, ex.DerefOp) return block.get_binary_stat(left.id.name, right.id.name, stat) - AlignedExprT = Union[ex.ScalarConstantExpression, ex.DerefOp] + AlignedExprT = Union[ex.ScalarConstantExpression, ex.DerefOp, ex.OmittedArg] @typing.overload def _align( @@ -2764,16 +2765,20 @@ def _align3( def _align_n( self, - others: typing.Sequence[typing.Union[Series, scalars.Scalar]], + others: typing.Sequence[ + typing.Union[Series, bigframes.core.col.Expression, scalars.Scalar] + ], how="outer", ignore_self=False, cast_scalars: bool = False, ) -> tuple[ - typing.Sequence[Union[ex.ScalarConstantExpression, ex.DerefOp]], + typing.Sequence[Union[ex.ScalarConstantExpression, ex.DerefOp, ex.OmittedArg]], blocks.Block, ]: if ignore_self: - value_ids: List[Union[ex.ScalarConstantExpression, ex.DerefOp]] = [] + value_ids: List[ + Union[ex.ScalarConstantExpression, ex.DerefOp, ex.OmittedArg] + ] = [] else: value_ids = [ex.deref(self._value_column)] @@ -2798,6 +2803,17 @@ def _align_n( *remapped_value_ids, # type: ignore ex.deref(get_column_right[other._value_column]), ] + elif isinstance(other, bigframes.core.col.Expression): + if isinstance(other._value, ex.OmittedArg): + value_ids = [*value_ids, other._value] + continue + + label_to_col_ref = { + label: ex.deref(id) for id, label in block.col_id_to_label.items() + } + resolved_expr = other._value.bind_variables(label_to_col_ref) + block = block.project_block_exprs([resolved_expr], labels=[None]) + value_ids = [*value_ids, ex.deref(block.value_columns[-1])] else: # Will throw if can't interpret as scalar. dtype = typing.cast(bigframes.dtypes.Dtype, self._dtype) diff --git a/packages/bigframes/scripts/data/sql-functions/global_namespace/array.yaml b/packages/bigframes/scripts/data/sql-functions/global_namespace/array.yaml new file mode 100644 index 000000000000..a7d01a9143ce --- /dev/null +++ b/packages/bigframes/scripts/data/sql-functions/global_namespace/array.yaml @@ -0,0 +1,286 @@ +urn: extension:google:bq_scalar_functions +scalar_functions: + - name: "array_concat" + description: "Concatenates one or more arrays with the same element type into a single array." + impls: + # Signature: array_concat:list_list + - args: + - name: "array_expression_1" + value: list + optional: false + keyword_only: false + - name: "array_expression_2" + value: list + optional: false + keyword_only: false + return: list + - name: "array_first" + description: "Takes an array and returns the first element in the array." + impls: + # Signature: array_first:list + - args: + - name: "array_expression" + value: list + optional: false + keyword_only: false + return: any1 + - name: "array_first_n" + description: "Returns a prefix of `input_array` consisting of the first `n` elements." + impls: + # Signature: array_first_n:list_i64 + - args: + - name: "input_array" + value: list + optional: false + keyword_only: false + - name: "n" + value: i64 + optional: false + keyword_only: false + return: list + - name: "array_includes" + description: "Takes an array and returns `TRUE` if there is an element in the array that is equal to the search_value." + impls: + # Signature: array_includes:list_any + - args: + - name: "array_to_search" + value: list + optional: false + keyword_only: false + - name: "search_value" + value: any1 + optional: false + keyword_only: false + return: boolean + - name: "array_includes_all" + description: "Takes an array to search and an array of search values. Returns `TRUE` if all search values are in the array to search, otherwise returns `FALSE`." + impls: + # Signature: array_includes_all:list_list + - args: + - name: "array_to_search" + value: list + optional: false + keyword_only: false + - name: "search_values" + value: list + optional: false + keyword_only: false + return: boolean + - name: "array_includes_any" + description: "Takes an array to search and an array of search values. Returns `TRUE` if any search values are in the array to search, otherwise returns `FALSE`." + impls: + # Signature: array_includes_any:list_list + - args: + - name: "array_to_search" + value: list + optional: false + keyword_only: false + - name: "search_values" + value: list + optional: false + keyword_only: false + return: boolean + - name: "array_is_distinct" + description: "Returns `TRUE` if the array contains no repeated elements, using the same equality comparison logic as `SELECT DISTINCT`." + impls: + # Signature: array_is_distinct:list + - args: + - name: "array_expression" + value: list + optional: false + keyword_only: false + return: boolean + - name: "array_last" + description: "Takes an array and returns the last element in the array." + impls: + # Signature: array_last:list + - args: + - name: "array_expression" + value: list + optional: false + keyword_only: false + return: any1 + - name: "array_length" + description: | + Compute the length of each array element in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) + >>> bbq.array_length(s) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + You can also apply this function directly to Series. + + >>> s.apply(bbq.array_length, by_row=False) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + Args: + series (bigframes.series.Series): A Series with array columns. + + Returns: + bigframes.series.Series: A Series of integer values indicating + the length of each element in the Series. + impls: + # Signature: array_length:list + - args: + - name: "series" + value: list + optional: false + keyword_only: false + return: i64 + - name: "array_reverse" + description: "Returns the input `ARRAY` with elements in reverse order." + impls: + # Signature: array_reverse:list + - args: + - name: "value" + value: list + optional: false + keyword_only: false + return: list + - name: "array_slice" + description: "Returns an array containing zero or more consecutive elements from the input array." + impls: + # Signature: array_slice:list_i64_i64 + - args: + - name: "array_to_slice" + value: list + optional: false + keyword_only: false + - name: "start_offset" + value: i64 + optional: false + keyword_only: false + - name: "end_offset" + value: i64 + optional: false + keyword_only: false + return: list + - name: "array_to_string" + description: | + Converts array elements within a Series into delimited strings. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + + >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) + >>> bbq.array_to_string(s, delimiter=", ") + 0 H, i, ! + 1 Hello, World + 2 + 3 + 4 Hi + dtype: string + + Args: + series (bigframes.series.Series): A Series containing arrays. + delimiter (str): The string used to separate array elements. + null_text (str, optional): The string to replace any NULL values in the array with. + + Returns: + bigframes.series.Series: A Series containing delimited strings. + impls: + # Signature: array_to_string:list_str_str + - args: + - name: "series" + value: list + optional: false + keyword_only: false + - name: "delimiter" + value: string + optional: false + keyword_only: false + - name: "null_text" + value: string + optional: true + keyword_only: false + return: string + # Signature: array_to_string:list_vbin_vbin + - args: + - name: "series" + value: list + optional: false + keyword_only: false + - name: "delimiter" + value: binary + optional: false + keyword_only: false + - name: "null_text" + value: binary + optional: true + keyword_only: false + return: binary + - name: "flatten" + description: "Takes an array of nested data and flattens a specific part of it into a single, flat array with the [array elements field access operator][array-el-field-operator]. Returns `NULL` if the input value is `NULL`." + impls: + # Signature: flatten:list_i64 + - args: + - name: "array_to_flatten" + value: list + optional: false + keyword_only: false + - name: "depth" + value: i64 + optional: true + keyword_only: true + return: list + - name: "generate_array" + description: "Returns an array of values. The `start_expression` and `end_expression` parameters determine the inclusive start and end of the array." + impls: + # Signature: generate_array:i64_i64_i64 + - args: + - name: "start_expression" + value: i64 + optional: false + keyword_only: false + - name: "end_expression" + value: i64 + optional: false + keyword_only: false + - name: "step_expression" + value: i64 + optional: true + keyword_only: false + return: list + # Signature: generate_array:dec_dec_dec + - args: + - name: "start_expression" + value: decimal<38,9> + optional: false + keyword_only: false + - name: "end_expression" + value: decimal<38,9> + optional: false + keyword_only: false + - name: "step_expression" + value: decimal<38,9> + optional: true + keyword_only: false + return: list> + # Signature: generate_array:fp64_fp64_fp64 + - args: + - name: "start_expression" + value: fp64 + optional: false + keyword_only: false + - name: "end_expression" + value: fp64 + optional: false + keyword_only: false + - name: "step_expression" + value: fp64 + optional: true + keyword_only: false + return: list diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index 4cd98bddbf21..afdda2a5f98b 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -1,4 +1,4 @@ -#!/usr/bin/env -S uv run --script +#!/usr/bin/env -S uv run --active --script # # /// script # dependencies = [ @@ -32,42 +32,56 @@ # Directory containing the YAML files DATA_DIR = pathlib.Path("scripts/data/sql-functions") # Directory where the generated Python files will be placed -OUTPUT_DIR = pathlib.Path("bigframes/bigquery/_operations") +OUTPUT_DIR = pathlib.Path("bigframes/operations/googlesql") # Directory where the generated test files will be placed -TEST_OUTPUT_DIR = pathlib.Path("tests/unit/bigquery/_operations") +TEST_OUTPUT_DIR = pathlib.Path("tests/unit/bigquery/generated") # Directory containing the Jinja2 templates TEMPLATE_DIR = pathlib.Path("scripts/templates") -RUFF_ARGS = [ +RUFF_COMMON_ARGS = [ + "--target-version=py310", + "--line-length=88", +] +RUFF_CHECK_ARGS = [ "ruff", "check", "--select", "I", "--fix", - "--target-version=py310", - "--line-length=88", -] +] + RUFF_COMMON_ARGS +RUFF_FORMAT_ARGS = [ + "ruff", + "format", +] + RUFF_COMMON_ARGS + DTYPE_MAP = { "binary": "dtypes.BYTES_DTYPE", "string": "dtypes.STRING_DTYPE", "int64": "dtypes.INT_DTYPE", + "i64": "dtypes.INT_DTYPE", "float64": "dtypes.FLOAT_DTYPE", + "fp64": "dtypes.FLOAT_DTYPE", "bool": "dtypes.BOOL_DTYPE", + "boolean": "dtypes.BOOL_DTYPE", "geography": "dtypes.GEO_DTYPE", "json": "dtypes.JSON_DTYPE", "date": "dtypes.DATE_DTYPE", "time": "dtypes.TIME_DTYPE", "datetime": "dtypes.DATETIME_DTYPE", "timestamp": "dtypes.TIMESTAMP_DTYPE", + "decimal<38,9>": "dtypes.NUMERIC_DTYPE", } PY_TYPE_MAP = { "binary": "bytes", "string": "str", "int64": "int", + "i64": "int", "float64": "float", + "fp64": "float", "bool": "bool", + "boolean": "bool", "geography": "Any", "json": "Any", "date": "datetime.date", @@ -75,19 +89,24 @@ "datetime": "datetime.datetime", "timestamp": "datetime.datetime", "struct": "dict", + "decimal<38,9>": "decimal.Decimal", } YAML_TYPE_TO_COL = { "binary": "bytes_col", "string": "string_col", "int64": "int64_col", + "i64": "int64_col", "float64": "float64_col", + "fp64": "float64_col", "bool": "bool_col", + "boolean": "bool_col", "geography": "geography_col", "date": "date_col", "time": "time_col", "datetime": "datetime_col", "timestamp": "timestamp_col", + "decimal<38,9>": "numeric_col", } @@ -107,7 +126,12 @@ def load_templates(): trim_blocks=True, lstrip_blocks=True, ) - return env.get_template("operation.py.j2"), env.get_template("test_operation.py.j2") + return { + "operation": env.get_template("operation.py.j2"), + "test_operation": env.get_template("test_operation.py.j2"), + "license": env.get_template("license.py.j2"), + "signature_def": env.get_template("signature_def.py.j2"), + } def _collect_args(impls): @@ -141,14 +165,77 @@ def _build_arg_specs(args_by_name, arg_order): return arg_specs -def _get_return_signature(impls): +def _is_concrete_type(yaml_type): + if yaml_type in DTYPE_MAP: + return True + if yaml_type.startswith("list<") and yaml_type.endswith(">"): + inner = yaml_type[5:-1] + return inner in DTYPE_MAP + return False + + +def _get_concrete_type_expr(yaml_type): + if yaml_type in DTYPE_MAP: + return DTYPE_MAP[yaml_type] + if yaml_type.startswith("list<") and yaml_type.endswith(">"): + inner = yaml_type[5:-1] + return f"dtypes.list_type({DTYPE_MAP[inner]})" + raise ValueError(f"Not a concrete type: {yaml_type}") + + +def _validate_types(impls): + for impl in impls: + for arg in impl["args"]: + val = arg["value"] + if val == "any1": + continue + if val.startswith("list<") and val.endswith(">"): + inner = val[5:-1] + if inner != "any1" and inner not in DTYPE_MAP: + raise ValueError(f"Unsupported inner type: {inner}") + continue + if val == "struct": + continue + if val not in DTYPE_MAP: + raise ValueError(f"Unsupported type: {val}") + + ret = impl["return"] + if ret == "any1": + continue + if ret.startswith("list<") and ret.endswith(">"): + inner = ret[5:-1] + if inner != "any1" and inner not in DTYPE_MAP: + raise ValueError(f"Unsupported inner type: {inner}") + continue + if ret not in DTYPE_MAP: + raise ValueError(f"Unsupported type: {ret}") + + +def _generate_signature_def(python_name, impls, sql_name, template): return_types = {impl["return"] for impl in impls} + + # Optimization: if all impls return the same concrete type, + # we can skip type checking and just return that type. if len(return_types) == 1: - ret_type = sorted(return_types)[0] - return f"lambda *args: {DTYPE_MAP.get(ret_type, 'None')}" - else: - # Fallback to Any/None if ambiguous - return "lambda *args: None" + ret_type = next(iter(return_types)) + if _is_concrete_type(ret_type): + sig_expr = f"lambda *args: {_get_concrete_type_expr(ret_type)}" + return sig_expr, None + + _validate_types(impls) + + func_name = f"_{python_name.upper()}_SIG" + max_args = max(len(impl["args"]) for impl in impls) + + rendered = template.render( + func_name=func_name, + max_args=max_args, + impls=impls, + sql_name=sql_name, + dtype_map=DTYPE_MAP, + ) + + return func_name, rendered def _get_func_args(args_by_name, arg_order): @@ -191,7 +278,7 @@ def _get_test_args(args_by_name, arg_order): return test_args -def parse_scalar_functions(data, module_name): +def parse_scalar_functions(data, module_name, signature_def_template, is_global=False): ops_list = [] functions_list = [] @@ -201,7 +288,7 @@ def parse_scalar_functions(data, module_name): for func_data in data["scalar_functions"]: sql_name = func_data["name"] python_name = to_snake_case(sql_name) - if python_name.startswith(module_name + "_"): + if not is_global and python_name.startswith(module_name + "_"): python_name = python_name[len(module_name) + 1 :] internal_op_name = f"_{python_name.upper()}_OP" @@ -211,16 +298,25 @@ def parse_scalar_functions(data, module_name): # Build ArgSpecs arg_specs = _build_arg_specs(args_by_name, arg_order) + arg_specs_str = ", ".join(arg_specs) + if len(arg_specs) == 1: + arg_specs_str += "," # Determine return dtype - signature = _get_return_signature(func_data["impls"]) + sig_name, sig_def = _generate_signature_def( + python_name, + func_data["impls"], + sql_name, + signature_def_template, + ) ops_list.append( { "internal_name": internal_op_name, "sql_name": sql_name.upper(), - "arg_specs": ", ".join(arg_specs), - "signature": signature, + "arg_specs": arg_specs_str, + "signature": sig_name, + "signature_definition": sig_def, } ) @@ -244,8 +340,20 @@ def parse_scalar_functions(data, module_name): def run_ruff(path: pathlib.Path): + import sys + + subprocess.run( + [sys.executable, "-m", "ruff"] + + RUFF_CHECK_ARGS[1:] + + [ + str(path), + ], + check=True, + ) + subprocess.run( - RUFF_ARGS + [sys.executable, "-m", "ruff"] + + RUFF_FORMAT_ARGS[1:] + [ str(path), ], @@ -253,7 +361,21 @@ def run_ruff(path: pathlib.Path): ) -def process_yaml_file(yaml_file, template, test_template): +def ensure_init_py(directory: pathlib.Path, limit_dir: pathlib.Path, license_template): + """Ensures __init__.py exists in the directory and its parents up to limit_dir.""" + curr = directory + while curr != limit_dir and curr != curr.parent: + init_file = curr / "__init__.py" + if not init_file.exists(): + print(f" Creating {init_file}") + content = license_template.render() + with open(init_file, "w") as f: + f.write(content) + run_ruff(init_file) + curr = curr.parent + + +def process_yaml_file(yaml_file, templates): print(f"Processing {yaml_file}...") with open(yaml_file, "r") as f: data = yaml.safe_load(f) @@ -263,11 +385,18 @@ def process_yaml_file(yaml_file, template, test_template): module_name = module_path.name output_file = OUTPUT_DIR.joinpath(module_path).with_suffix(".py") - ops_list, functions_list = parse_scalar_functions(data, module_name) + is_global = "global_namespace" in module_path.parts + ops_list, functions_list = parse_scalar_functions( + data, + module_name, + templates["signature_def"], + is_global=is_global, + ) # Render and write output_file.parent.mkdir(parents=True, exist_ok=True) - content = template.render( + ensure_init_py(output_file.parent, OUTPUT_DIR.parent, templates["license"]) + content = templates["operation"].render( yaml_path=str(yaml_file), script_path="scripts/generate_bigframes_bigquery.py", ops=ops_list, @@ -280,17 +409,21 @@ def process_yaml_file(yaml_file, template, test_template): print(f" Generated {output_file}") # Render and write test - import_path = "bigframes.bigquery._operations." + ".".join(module_path.parts) + import_path = "bigframes.operations.googlesql." + ".".join(module_path.parts) test_output_file = TEST_OUTPUT_DIR.joinpath( module_path.with_name(f"test_{module_path.name}") ).with_suffix(".py") test_output_file.parent.mkdir(parents=True, exist_ok=True) - test_content = test_template.render( + ensure_init_py( + test_output_file.parent, TEST_OUTPUT_DIR.parent, templates["license"] + ) + test_content = templates["test_operation"].render( yaml_path=str(yaml_file), script_path="scripts/generate_bigframes_bigquery.py", import_path=import_path, short_name=module_path.name, + is_global=is_global, functions=functions_list, ) with open(test_output_file, "w") as f: @@ -299,22 +432,12 @@ def process_yaml_file(yaml_file, template, test_template): run_ruff(test_output_file) print(f" Generated {test_output_file}") - print(f" Updating snapshots for {test_output_file}...") - subprocess.run( - [ - "pytest", - str(test_output_file), - "--snapshot-update", - ], - check=False, - ) - def main(): - template, test_template = load_templates() + templates = load_templates() for yaml_file in sorted(DATA_DIR.glob("**/*.yaml")): - process_yaml_file(yaml_file, template, test_template) + process_yaml_file(yaml_file, templates) if __name__ == "__main__": diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock b/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock deleted file mode 100644 index 0c89fde6d406..000000000000 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py.lock +++ /dev/null @@ -1,104 +0,0 @@ -version = 1 -revision = 3 -requires-python = ">=3.14" - -[manifest] -requirements = [ - { name = "jinja2" }, - { name = "pyyaml" }, - { name = "ruff", specifier = "==0.14.14" }, -] - -[[package]] -name = "jinja2" -version = "3.1.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markupsafe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, -] - -[[package]] -name = "markupsafe" -version = "3.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" }, - { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" }, - { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" }, - { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" }, - { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" }, - { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" }, - { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" }, - { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" }, - { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" }, - { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" }, - { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" }, - { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" }, - { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" }, - { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" }, - { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" }, - { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" }, - { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" }, - { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" }, - { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, - { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, - { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, -] - -[[package]] -name = "pyyaml" -version = "6.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, - { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, - { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, - { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, - { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, - { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, - { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, - { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, - { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, - { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, - { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, - { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, - { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, - { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, - { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, -] - -[[package]] -name = "ruff" -version = "0.14.14" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/2e/06/f71e3a86b2df0dfa2d2f72195941cd09b44f87711cb7fa5193732cb9a5fc/ruff-0.14.14.tar.gz", hash = "sha256:2d0f819c9a90205f3a867dbbd0be083bee9912e170fd7d9704cc8ae45824896b", size = 4515732, upload-time = "2026-01-22T22:30:17.527Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/89/20a12e97bc6b9f9f68343952da08a8099c57237aef953a56b82711d55edd/ruff-0.14.14-py3-none-linux_armv6l.whl", hash = "sha256:7cfe36b56e8489dee8fbc777c61959f60ec0f1f11817e8f2415f429552846aed", size = 10467650, upload-time = "2026-01-22T22:30:08.578Z" }, - { url = "https://files.pythonhosted.org/packages/a3/b1/c5de3fd2d5a831fcae21beda5e3589c0ba67eec8202e992388e4b17a6040/ruff-0.14.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6006a0082336e7920b9573ef8a7f52eec837add1265cc74e04ea8a4368cd704c", size = 10883245, upload-time = "2026-01-22T22:30:04.155Z" }, - { url = "https://files.pythonhosted.org/packages/b8/7c/3c1db59a10e7490f8f6f8559d1db8636cbb13dccebf18686f4e3c9d7c772/ruff-0.14.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:026c1d25996818f0bf498636686199d9bd0d9d6341c9c2c3b62e2a0198b758de", size = 10231273, upload-time = "2026-01-22T22:30:34.642Z" }, - { url = "https://files.pythonhosted.org/packages/a1/6e/5e0e0d9674be0f8581d1f5e0f0a04761203affce3232c1a1189d0e3b4dad/ruff-0.14.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f666445819d31210b71e0a6d1c01e24447a20b85458eea25a25fe8142210ae0e", size = 10585753, upload-time = "2026-01-22T22:30:31.781Z" }, - { url = "https://files.pythonhosted.org/packages/23/09/754ab09f46ff1884d422dc26d59ba18b4e5d355be147721bb2518aa2a014/ruff-0.14.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c0f18b922c6d2ff9a5e6c3ee16259adc513ca775bcf82c67ebab7cbd9da5bc8", size = 10286052, upload-time = "2026-01-22T22:30:24.827Z" }, - { url = "https://files.pythonhosted.org/packages/c8/cc/e71f88dd2a12afb5f50733851729d6b571a7c3a35bfdb16c3035132675a0/ruff-0.14.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1629e67489c2dea43e8658c3dba659edbfd87361624b4040d1df04c9740ae906", size = 11043637, upload-time = "2026-01-22T22:30:13.239Z" }, - { url = "https://files.pythonhosted.org/packages/67/b2/397245026352494497dac935d7f00f1468c03a23a0c5db6ad8fc49ca3fb2/ruff-0.14.14-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:27493a2131ea0f899057d49d303e4292b2cae2bb57253c1ed1f256fbcd1da480", size = 12194761, upload-time = "2026-01-22T22:30:22.542Z" }, - { url = "https://files.pythonhosted.org/packages/5b/06/06ef271459f778323112c51b7587ce85230785cd64e91772034ddb88f200/ruff-0.14.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:01ff589aab3f5b539e35db38425da31a57521efd1e4ad1ae08fc34dbe30bd7df", size = 12005701, upload-time = "2026-01-22T22:30:20.499Z" }, - { url = "https://files.pythonhosted.org/packages/41/d6/99364514541cf811ccc5ac44362f88df66373e9fec1b9d1c4cc830593fe7/ruff-0.14.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc12d74eef0f29f51775f5b755913eb523546b88e2d733e1d701fe65144e89b", size = 11282455, upload-time = "2026-01-22T22:29:59.679Z" }, - { url = "https://files.pythonhosted.org/packages/ca/71/37daa46f89475f8582b7762ecd2722492df26421714a33e72ccc9a84d7a5/ruff-0.14.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb8481604b7a9e75eff53772496201690ce2687067e038b3cc31aaf16aa0b974", size = 11215882, upload-time = "2026-01-22T22:29:57.032Z" }, - { url = "https://files.pythonhosted.org/packages/2c/10/a31f86169ec91c0705e618443ee74ede0bdd94da0a57b28e72db68b2dbac/ruff-0.14.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:14649acb1cf7b5d2d283ebd2f58d56b75836ed8c6f329664fa91cdea19e76e66", size = 11180549, upload-time = "2026-01-22T22:30:27.175Z" }, - { url = "https://files.pythonhosted.org/packages/fd/1e/c723f20536b5163adf79bdd10c5f093414293cdf567eed9bdb7b83940f3f/ruff-0.14.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e8058d2145566510790eab4e2fad186002e288dec5e0d343a92fe7b0bc1b3e13", size = 10543416, upload-time = "2026-01-22T22:30:01.964Z" }, - { url = "https://files.pythonhosted.org/packages/3e/34/8a84cea7e42c2d94ba5bde1d7a4fae164d6318f13f933d92da6d7c2041ff/ruff-0.14.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e651e977a79e4c758eb807f0481d673a67ffe53cfa92209781dfa3a996cf8412", size = 10285491, upload-time = "2026-01-22T22:30:29.51Z" }, - { url = "https://files.pythonhosted.org/packages/55/ef/b7c5ea0be82518906c978e365e56a77f8de7678c8bb6651ccfbdc178c29f/ruff-0.14.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:cc8b22da8d9d6fdd844a68ae937e2a0adf9b16514e9a97cc60355e2d4b219fc3", size = 10733525, upload-time = "2026-01-22T22:30:06.499Z" }, - { url = "https://files.pythonhosted.org/packages/6a/5b/aaf1dfbcc53a2811f6cc0a1759de24e4b03e02ba8762daabd9b6bd8c59e3/ruff-0.14.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:16bc890fb4cc9781bb05beb5ab4cd51be9e7cb376bf1dd3580512b24eb3fda2b", size = 11315626, upload-time = "2026-01-22T22:30:36.848Z" }, - { url = "https://files.pythonhosted.org/packages/2c/aa/9f89c719c467dfaf8ad799b9bae0df494513fb21d31a6059cb5870e57e74/ruff-0.14.14-py3-none-win32.whl", hash = "sha256:b530c191970b143375b6a68e6f743800b2b786bbcf03a7965b06c4bf04568167", size = 10502442, upload-time = "2026-01-22T22:30:38.93Z" }, - { url = "https://files.pythonhosted.org/packages/87/44/90fa543014c45560cae1fffc63ea059fb3575ee6e1cb654562197e5d16fb/ruff-0.14.14-py3-none-win_amd64.whl", hash = "sha256:3dde1435e6b6fe5b66506c1dff67a421d0b7f6488d466f651c07f4cab3bf20fd", size = 11630486, upload-time = "2026-01-22T22:30:10.852Z" }, - { url = "https://files.pythonhosted.org/packages/9e/6a/40fee331a52339926a92e17ae748827270b288a35ef4a15c9c8f2ec54715/ruff-0.14.14-py3-none-win_arm64.whl", hash = "sha256:56e6981a98b13a32236a72a8da421d7839221fa308b223b9283312312e5ac76c", size = 10920448, upload-time = "2026-01-22T22:30:15.417Z" }, -] diff --git a/packages/bigframes/scripts/templates/operation.py.j2 b/packages/bigframes/scripts/templates/operation.py.j2 index 7e7c7f95b62f..720d867986e2 100644 --- a/packages/bigframes/scripts/templates/operation.py.j2 +++ b/packages/bigframes/scripts/templates/operation.py.j2 @@ -8,20 +8,24 @@ from __future__ import annotations import datetime +import decimal from typing import Any, Literal, Optional, TypeVar, Union from bigframes import dtypes -import bigframes.bigquery._googlesql import bigframes.core.col import bigframes.core.expression as ex +import bigframes.core.googlesql import bigframes.core.sentinels as sentinels from bigframes.operations import googlesql import bigframes.operations as ops import bigframes.series as series -T = TypeVar("T", series.Series, bigframes.core.col.Expression) - {% for op in ops %} +{% if op.signature_definition %} +{{ op.signature_definition }} + + +{% endif %} {{ op.internal_name }} = googlesql.GoogleSqlScalarOp( "{{ op.sql_name }}", args=({{ op.arg_specs }}), @@ -33,14 +37,14 @@ T = TypeVar("T", series.Series, bigframes.core.col.Expression) def {{ func.name }}( {% for arg in func.args %} - {{ arg.name }}: Union[T, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, + {{ arg.name }}: Union[series.Series, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, {% endfor %} -) -> T: - """{{ func.description }}""" - return bigframes.bigquery._googlesql.apply_googlesql_scalar_op( +) -> Union[series.Series, bigframes.core.col.Expression]: + """{{ func.description | indent(4) }}""" + return bigframes.core.googlesql.apply_googlesql_scalar_op( {{ func.op_name }}, {% for arg in func.args %} {{ arg.name }}, {% endfor %} - ) # type: ignore + ) {% endfor %} diff --git a/packages/bigframes/scripts/templates/signature_def.py.j2 b/packages/bigframes/scripts/templates/signature_def.py.j2 new file mode 100644 index 000000000000..ad1871f7df6d --- /dev/null +++ b/packages/bigframes/scripts/templates/signature_def.py.j2 @@ -0,0 +1,74 @@ +def {{ func_name }}(*args): + # Pad args with None to match max expected args + args = args + (None,) * ({{ max_args }} - len(args)) + {% for impl in impls %} + # Try matching impl {{ loop.index0 }} + any1_val = None + match_ok = True + {% for arg in impl.args %} + {% set idx = loop.index0 %} + if match_ok and args[{{ idx }}] is not None: + {% if arg.value == "any1" %} + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, args[{{ idx }}]) + except TypeError: + match_ok = False + else: + any1_val = args[{{ idx }}] + {% elif arg.value.startswith("list<") and arg.value.endswith(">") %} + {% set inner_type = arg.value[5:-1] %} + if not dtypes.is_array_like(args[{{ idx }}]): + match_ok = False + else: + inner = dtypes.get_array_inner_type(args[{{ idx }}]) + {% if inner_type == "any1" %} + if any1_val is not None: + try: + any1_val = dtypes.coerce_to_common(any1_val, inner) + except TypeError: + match_ok = False + else: + any1_val = inner + {% else %} + {% set dtype_expr = dtype_map[inner_type] %} + try: + if dtypes.coerce_to_common(inner, {{ dtype_expr }}) != {{ dtype_expr }}: + match_ok = False + except TypeError: + match_ok = False + {% endif %} + {% elif arg.value == "struct" %} + if not dtypes.is_struct_like(args[{{ idx }}]): + match_ok = False + {% else %} + {% set dtype_expr = dtype_map[arg.value] %} + try: + if dtypes.coerce_to_common(args[{{ idx }}], {{ dtype_expr }}) != {{ dtype_expr }}: + match_ok = False + except TypeError: + match_ok = False + {% endif %} + {% endfor %} + if match_ok: + {% set return_type_yaml = impl.return %} + {% if return_type_yaml == "any1" %} + return any1_val + {% elif return_type_yaml.startswith("list<") and return_type_yaml.endswith(">") %} + {% set inner_type = return_type_yaml[5:-1] %} + {% if inner_type == "any1" %} + if any1_val is not None: + return dtypes.list_type(any1_val) + else: + return None + {% else %} + {% set dtype_expr = dtype_map[inner_type] %} + return dtypes.list_type({{ dtype_expr }}) + {% endif %} + {% else %} + {% set dtype_expr = dtype_map[return_type_yaml] %} + return {{ dtype_expr }} + {% endif %} + + {% endfor %} + raise TypeError(f"Could not find matching signature for {{ sql_name }} with argument types: {[str(t) for t in args]}") diff --git a/packages/bigframes/scripts/templates/test_operation.py.j2 b/packages/bigframes/scripts/templates/test_operation.py.j2 index 940712f4da26..21db9cbfc8ba 100644 --- a/packages/bigframes/scripts/templates/test_operation.py.j2 +++ b/packages/bigframes/scripts/templates/test_operation.py.j2 @@ -5,25 +5,40 @@ # This file was generated from: {{ yaml_path }} # by the script: {{ script_path }} -from typing import cast - -import pytest - +import bigframes.core.col +import bigframes.core.expression as ex import bigframes.pandas as bpd -import {{ import_path }} as {{ short_name }} - -pytest.importorskip("pytest_snapshot") +import {{ import_path }} as {{ short_name }}_op +import bigframes.bigquery as bbq {% for func in functions %} -def test_{{ func.name }}(scalar_types_df: bpd.DataFrame, snapshot): - result = {{ short_name }}.{{ func.name }}( -{% for arg in func.test_args %} - cast(bpd.Series, scalar_types_df["{{ arg.col_name }}"]), -{% endfor %} - ).to_frame() - - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") +def test_{{ func.name }}_expression(): + # Call the function with col() expressions +{% if is_global %} + result = bbq.{{ func.name }}( +{% else %} + result = bbq.{{ short_name }}.{{ func.name }}( +{% endif %} + {% for arg in func.args %} + bpd.col("{{ arg.name }}"), + {% endfor %} + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == {{ short_name }}_op._{{ func.name | upper }}_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == {{ func.args | length }} + {% for arg in func.args %} + assert isinstance(expr.inputs[{{ loop.index0 }}], ex.UnboundVariableExpression) + assert expr.inputs[{{ loop.index0 }}].id == "{{ arg.name }}" + {% endfor %} {% endfor %} diff --git a/packages/bigframes/specs/bigframes-bigquery-generator.md b/packages/bigframes/specs/bigframes-bigquery-generator.md index 26d1d45c9f51..1078bbd05a3c 100644 --- a/packages/bigframes/specs/bigframes-bigquery-generator.md +++ b/packages/bigframes/specs/bigframes-bigquery-generator.md @@ -11,10 +11,9 @@ generates python submodules for the `bigframes.bigquery` module. When run without any arguments, it iterates through all yaml files at `packages/bigframes/scripts/data/sql-functions/**/*.yaml` to generate the code. -The script at `packages/bigframes/scripts/check_bigframes_bigquery.py` iterates -through all the same yaml files and checks that the functions have been included -in the `bigframes.bigquery` module, as the `__init__.py` file requires manual -updates. +The script also generates a unit test that verifies that the functions have been +included in the `bigframes.bigquery` module, which is important to check, as the +`__init__.py` file requires manual updates. ## Running the generator diff --git a/packages/bigframes/tests/system/small/bigquery/test_array.py b/packages/bigframes/tests/system/small/bigquery/test_array.py index 2c2b2001ebe8..148d84a22f3a 100644 --- a/packages/bigframes/tests/system/small/bigquery/test_array.py +++ b/packages/bigframes/tests/system/small/bigquery/test_array.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import cast + import numpy as np import pandas as pd import pytest @@ -72,8 +74,9 @@ def test_array_length(input_data, expected): index=pd.Index(range(len(input_data)), dtype="Int64"), dtype=bigframes.dtypes.INT_DTYPE, ) + result = cast(bpd.Series, bbq.array_length(series)) pd.testing.assert_series_equal( - bbq.array_length(series).to_pandas(), + result.to_pandas(), expected, check_index_type=False, ) diff --git a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_decrypt_bytes/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_decrypt_bytes/out.sql deleted file mode 100644 index 1a567e3b002b..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_decrypt_bytes/out.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT - `rowindex`, - DETERMINISTIC_DECRYPT_BYTES(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` -FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_decrypt_string/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_decrypt_string/out.sql deleted file mode 100644 index 84625951e6da..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_decrypt_string/out.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT - `rowindex`, - DETERMINISTIC_DECRYPT_STRING(`bytes_col`, `bytes_col`, `string_col`) AS `0` -FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_encrypt/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_encrypt/out.sql deleted file mode 100644 index 659097ded956..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/snapshots/test_aead_encryption/test_deterministic_encrypt/out.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT - `rowindex`, - DETERMINISTIC_ENCRYPT(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` -FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/test_aead_encryption.py b/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/test_aead_encryption.py deleted file mode 100644 index d7e14ccb4bef..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/global_namespace/test_aead_encryption.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# DO NOT MODIFY THIS FILE DIRECTLY. -# This file was generated from: scripts/data/sql-functions/global_namespace/aead_encryption.yaml -# by the script: scripts/generate_bigframes_bigquery.py - -from typing import cast - -import pytest - -import bigframes.bigquery._operations.global_namespace.aead_encryption as aead_encryption -import bigframes.pandas as bpd - -pytest.importorskip("pytest_snapshot") - - -def test_deterministic_decrypt_bytes(scalar_types_df: bpd.DataFrame, snapshot): - result = aead_encryption.deterministic_decrypt_bytes( - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - ).to_frame() - - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") - - -def test_deterministic_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): - result = aead_encryption.deterministic_decrypt_string( - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["string_col"]), - ).to_frame() - - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") - - -def test_deterministic_encrypt(scalar_types_df: bpd.DataFrame, snapshot): - result = aead_encryption.deterministic_encrypt( - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - ).to_frame() - - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql deleted file mode 100644 index 5b8b6416b36f..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_bytes/out.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT - `rowindex`, - AEAD.DECRYPT_BYTES(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` -FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql deleted file mode 100644 index 97b1ccff9c75..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_decrypt_string/out.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT - `rowindex`, - AEAD.DECRYPT_STRING(`bytes_col`, `bytes_col`, `string_col`) AS `0` -FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql b/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql deleted file mode 100644 index 9ab9f8c0a7bb..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/snapshots/test_aead/test_encrypt/out.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT - `rowindex`, - AEAD.ENCRYPT(`bytes_col`, `bytes_col`, `bytes_col`) AS `0` -FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py b/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py deleted file mode 100644 index 5ff0f8c7badf..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/test_aead.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# DO NOT MODIFY THIS FILE DIRECTLY. -# This file was generated from: scripts/data/sql-functions/aead.yaml -# by the script: scripts/generate_bigframes_bigquery.py - -from typing import cast - -import pytest - -import bigframes.bigquery._operations.aead as aead -import bigframes.pandas as bpd - -pytest.importorskip("pytest_snapshot") - - -def test_decrypt_bytes(scalar_types_df: bpd.DataFrame, snapshot): - result = aead.decrypt_bytes( - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - ).to_frame() - - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") - - -def test_decrypt_string(scalar_types_df: bpd.DataFrame, snapshot): - result = aead.decrypt_string( - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["string_col"]), - ).to_frame() - - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") - - -def test_encrypt(scalar_types_df: bpd.DataFrame, snapshot): - result = aead.encrypt( - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - cast(bpd.Series, scalar_types_df["bytes_col"]), - ).to_frame() - - snapshot.assert_match(result.sql.rstrip() + "\n", "out.sql") diff --git a/packages/bigframes/tests/unit/bigquery/_operations/test_io.py b/packages/bigframes/tests/unit/bigquery/_operations/test_io.py deleted file mode 100644 index b5dc9544aa85..000000000000 --- a/packages/bigframes/tests/unit/bigquery/_operations/test_io.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest import mock - -import pytest - -import bigframes.bigquery._operations.io -import bigframes.session - - -@pytest.fixture -def mock_session(): - return mock.create_autospec(spec=bigframes.session.Session) - - -@mock.patch("bigframes.bigquery._operations.io._get_table_metadata") -def test_load_data(get_table_metadata_mock, mock_session): - bigframes.bigquery._operations.io.load_data( - "my-project.my_dataset.my_table", - columns={"col1": "INT64", "col2": "STRING"}, - from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, - session=mock_session, - ) - mock_session.read_gbq_query.assert_called_once() - generated_sql = mock_session.read_gbq_query.call_args[0][0] - expected = "LOAD DATA INTO `my-project.my_dataset.my_table` (\n `col1` INT64,\n `col2` STRING\n) FROM FILES (format='CSV', uris=['gs://bucket/path*'])" - assert generated_sql == expected - get_table_metadata_mock.assert_called_once() diff --git a/packages/bigframes/tests/unit/bigquery/generated/__init__.py b/packages/bigframes/tests/unit/bigquery/generated/__init__.py new file mode 100644 index 000000000000..58d482ea3866 --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/generated/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/packages/bigframes/tests/unit/bigquery/generated/global_namespace/__init__.py b/packages/bigframes/tests/unit/bigquery/generated/global_namespace/__init__.py new file mode 100644 index 000000000000..58d482ea3866 --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/generated/global_namespace/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/packages/bigframes/tests/unit/bigquery/generated/global_namespace/test_aead_encryption.py b/packages/bigframes/tests/unit/bigquery/generated/global_namespace/test_aead_encryption.py new file mode 100644 index 000000000000..818151952ff0 --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/generated/global_namespace/test_aead_encryption.py @@ -0,0 +1,101 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# DO NOT MODIFY THIS FILE DIRECTLY. +# This file was generated from: scripts/data/sql-functions/global_namespace/aead_encryption.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +import bigframes.bigquery as bbq +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.operations.googlesql.global_namespace.aead_encryption as aead_encryption_op +import bigframes.pandas as bpd + + +def test_deterministic_decrypt_bytes_expression(): + # Call the function with col() expressions + result = bbq.deterministic_decrypt_bytes( + bpd.col("keyset"), + bpd.col("ciphertext"), + bpd.col("additional_data"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == aead_encryption_op._DETERMINISTIC_DECRYPT_BYTES_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "keyset" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "ciphertext" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "additional_data" + + +def test_deterministic_decrypt_string_expression(): + # Call the function with col() expressions + result = bbq.deterministic_decrypt_string( + bpd.col("keyset"), + bpd.col("ciphertext"), + bpd.col("additional_data"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == aead_encryption_op._DETERMINISTIC_DECRYPT_STRING_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "keyset" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "ciphertext" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "additional_data" + + +def test_deterministic_encrypt_expression(): + # Call the function with col() expressions + result = bbq.deterministic_encrypt( + bpd.col("keyset"), + bpd.col("plaintext"), + bpd.col("additional_data"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == aead_encryption_op._DETERMINISTIC_ENCRYPT_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "keyset" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "plaintext" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "additional_data" diff --git a/packages/bigframes/tests/unit/bigquery/generated/global_namespace/test_array.py b/packages/bigframes/tests/unit/bigquery/generated/global_namespace/test_array.py new file mode 100644 index 000000000000..56b853869024 --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/generated/global_namespace/test_array.py @@ -0,0 +1,339 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# DO NOT MODIFY THIS FILE DIRECTLY. +# This file was generated from: scripts/data/sql-functions/global_namespace/array.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +import bigframes.bigquery as bbq +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.operations.googlesql.global_namespace.array as array_op +import bigframes.pandas as bpd + + +def test_array_concat_expression(): + # Call the function with col() expressions + result = bbq.array_concat( + bpd.col("array_expression_1"), + bpd.col("array_expression_2"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_CONCAT_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 2 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_expression_1" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "array_expression_2" + + +def test_array_first_expression(): + # Call the function with col() expressions + result = bbq.array_first( + bpd.col("array_expression"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_FIRST_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 1 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_expression" + + +def test_array_first_n_expression(): + # Call the function with col() expressions + result = bbq.array_first_n( + bpd.col("input_array"), + bpd.col("n"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_FIRST_N_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 2 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "input_array" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "n" + + +def test_array_includes_expression(): + # Call the function with col() expressions + result = bbq.array_includes( + bpd.col("array_to_search"), + bpd.col("search_value"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_INCLUDES_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 2 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_to_search" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "search_value" + + +def test_array_includes_all_expression(): + # Call the function with col() expressions + result = bbq.array_includes_all( + bpd.col("array_to_search"), + bpd.col("search_values"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_INCLUDES_ALL_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 2 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_to_search" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "search_values" + + +def test_array_includes_any_expression(): + # Call the function with col() expressions + result = bbq.array_includes_any( + bpd.col("array_to_search"), + bpd.col("search_values"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_INCLUDES_ANY_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 2 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_to_search" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "search_values" + + +def test_array_is_distinct_expression(): + # Call the function with col() expressions + result = bbq.array_is_distinct( + bpd.col("array_expression"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_IS_DISTINCT_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 1 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_expression" + + +def test_array_last_expression(): + # Call the function with col() expressions + result = bbq.array_last( + bpd.col("array_expression"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_LAST_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 1 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_expression" + + +def test_array_length_expression(): + # Call the function with col() expressions + result = bbq.array_length( + bpd.col("series"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_LENGTH_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 1 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "series" + + +def test_array_reverse_expression(): + # Call the function with col() expressions + result = bbq.array_reverse( + bpd.col("value"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_REVERSE_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 1 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "value" + + +def test_array_slice_expression(): + # Call the function with col() expressions + result = bbq.array_slice( + bpd.col("array_to_slice"), + bpd.col("start_offset"), + bpd.col("end_offset"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_SLICE_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_to_slice" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "start_offset" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "end_offset" + + +def test_array_to_string_expression(): + # Call the function with col() expressions + result = bbq.array_to_string( + bpd.col("series"), + bpd.col("delimiter"), + bpd.col("null_text"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._ARRAY_TO_STRING_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "series" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "delimiter" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "null_text" + + +def test_flatten_expression(): + # Call the function with col() expressions + result = bbq.flatten( + bpd.col("array_to_flatten"), + bpd.col("depth"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._FLATTEN_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 2 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "array_to_flatten" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "depth" + + +def test_generate_array_expression(): + # Call the function with col() expressions + result = bbq.generate_array( + bpd.col("start_expression"), + bpd.col("end_expression"), + bpd.col("step_expression"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == array_op._GENERATE_ARRAY_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "start_expression" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "end_expression" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "step_expression" diff --git a/packages/bigframes/tests/unit/bigquery/generated/test_aead.py b/packages/bigframes/tests/unit/bigquery/generated/test_aead.py new file mode 100644 index 000000000000..ce728b41899d --- /dev/null +++ b/packages/bigframes/tests/unit/bigquery/generated/test_aead.py @@ -0,0 +1,101 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# DO NOT MODIFY THIS FILE DIRECTLY. +# This file was generated from: scripts/data/sql-functions/aead.yaml +# by the script: scripts/generate_bigframes_bigquery.py + +import bigframes.bigquery as bbq +import bigframes.core.col +import bigframes.core.expression as ex +import bigframes.operations.googlesql.aead as aead_op +import bigframes.pandas as bpd + + +def test_decrypt_bytes_expression(): + # Call the function with col() expressions + result = bbq.aead.decrypt_bytes( + bpd.col("keyset"), + bpd.col("ciphertext"), + bpd.col("additional_data"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == aead_op._DECRYPT_BYTES_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "keyset" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "ciphertext" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "additional_data" + + +def test_decrypt_string_expression(): + # Call the function with col() expressions + result = bbq.aead.decrypt_string( + bpd.col("keyset"), + bpd.col("ciphertext"), + bpd.col("additional_data"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == aead_op._DECRYPT_STRING_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "keyset" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "ciphertext" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "additional_data" + + +def test_encrypt_expression(): + # Call the function with col() expressions + result = bbq.aead.encrypt( + bpd.col("keyset"), + bpd.col("plaintext"), + bpd.col("additional_data"), + ) + + # Verify result is a col Expression + assert isinstance(result, bigframes.core.col.Expression) + + # Verify the internal expression structure + expr = result._value + assert isinstance(expr, ex.OpExpression) + assert expr.op == aead_op._ENCRYPT_OP + + # Verify arguments are free variables matching the names + assert len(expr.inputs) == 3 + assert isinstance(expr.inputs[0], ex.UnboundVariableExpression) + assert expr.inputs[0].id == "keyset" + assert isinstance(expr.inputs[1], ex.UnboundVariableExpression) + assert expr.inputs[1].id == "plaintext" + assert isinstance(expr.inputs[2], ex.UnboundVariableExpression) + assert expr.inputs[2].id == "additional_data" diff --git a/packages/bigframes/tests/unit/test_series_polars.py b/packages/bigframes/tests/unit/test_series_polars.py index f823aa5ce24e..2e22d6ed4b6b 100644 --- a/packages/bigframes/tests/unit/test_series_polars.py +++ b/packages/bigframes/tests/unit/test_series_polars.py @@ -5146,3 +5146,42 @@ def test_series_dt_total_seconds(scalars_df_index, scalars_pandas_df_index): # bigframes uses Float64, newer pandas may use double[pyarrow] check_dtype=False, ) + + +def test_series_where_with_expression(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + s1 = scalars_df["float64_col"] + s2 = scalars_df["bool_col"] + + bf_result = s1.where(s2, bpd.col("bool_col")).to_pandas() + + s1_pd = scalars_pandas_df["float64_col"] + s2_pd = scalars_pandas_df["bool_col"] + + pd_result = s1_pd.where(s2_pd, s2_pd) + + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_series_expression_unbound_fails(scalars_dfs): + scalars_df, _ = scalars_dfs + s1 = scalars_df["float64_col"] + s2 = scalars_df["bool_col"] + + with pytest.raises(ValueError, match="remains unbound"): + s1.where(s2, bpd.col("non_existent_column")) + + +def test_series_where_with_expression_resolving_to_self(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + s1 = scalars_df["float64_col"] + s2 = scalars_df["bool_col"] + + bf_result = s1.where(s2, bpd.col("float64_col")).to_pandas() + + s1_pd = scalars_pandas_df["float64_col"] + s2_pd = scalars_pandas_df["bool_col"] + + pd_result = s1_pd.where(s2_pd, s1_pd) + + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) diff --git a/packages/bigframes/third_party/bigframes_vendored/sqlglot/expressions.py b/packages/bigframes/third_party/bigframes_vendored/sqlglot/expressions.py index 9dde730dad45..e8e4cc8e10d6 100644 --- a/packages/bigframes/third_party/bigframes_vendored/sqlglot/expressions.py +++ b/packages/bigframes/third_party/bigframes_vendored/sqlglot/expressions.py @@ -5906,7 +5906,7 @@ class FarmFingerprint(Func): class Flatten(Func): - pass + arg_types = {"this": True, "depth": False} class Float64(Func):