From 92de28d003e3d05ce84c2c2c2e3a5babf8cf7e5e Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 03:55:25 -0800 Subject: [PATCH 01/18] Move dependent dpctl.tensor headers from utils and kernels namespace --- CMakeLists.txt | 1 - .../libtensor/include/kernels/alignment.hpp | 46 + .../include/kernels/dpctl_tensor_types.hpp | 40 + .../kernels/elementwise_functions/common.hpp | 1045 +++++++++++++++++ .../elementwise_functions/common_detail.hpp | 70 ++ .../elementwise_functions/logaddexp.hpp | 268 +++++ .../kernels/elementwise_functions/maximum.hpp | 322 +++++ .../kernels/elementwise_functions/minimum.hpp | 321 +++++ .../elementwise_functions/sycl_complex.hpp | 44 + .../elementwise_functions/vec_size_util.hpp | 70 ++ .../include/utils/indexing_utils.hpp | 153 +++ .../libtensor/include/utils/math_utils.hpp | 148 +++ .../include/utils/memory_overlap.hpp | 157 +++ .../libtensor/include/utils/offset_utils.hpp | 824 +++++++++++++ .../include/utils/output_validation.hpp | 79 ++ .../libtensor/include/utils/strided_iters.hpp | 996 ++++++++++++++++ .../include/utils/sycl_alloc_utils.hpp | 223 ++++ .../libtensor/include/utils/sycl_utils.hpp | 662 +++++++++++ .../libtensor/include/utils/type_dispatch.hpp | 134 +++ .../include/utils/type_dispatch_building.hpp | 300 +++++ .../libtensor/include/utils/type_utils.hpp | 164 +++ dpnp/backend/CMakeLists.txt | 1 - dpnp/backend/extensions/blas/CMakeLists.txt | 5 +- dpnp/backend/extensions/blas/dot_common.hpp | 1 + dpnp/backend/extensions/common/ext/common.hpp | 2 + dpnp/backend/extensions/fft/CMakeLists.txt | 5 +- .../extensions/indexing/CMakeLists.txt | 5 +- dpnp/backend/extensions/lapack/CMakeLists.txt | 5 +- .../extensions/statistics/CMakeLists.txt | 5 +- dpnp/backend/extensions/ufunc/CMakeLists.txt | 5 +- dpnp/backend/extensions/vm/CMakeLists.txt | 5 +- dpnp/backend/extensions/window/CMakeLists.txt | 5 +- pyproject.toml | 2 +- 33 files changed, 6102 insertions(+), 11 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/alignment.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/indexing_utils.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/math_utils.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/memory_overlap.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/offset_utils.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/output_validation.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/strided_iters.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/type_dispatch.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp create mode 100644 
dpctl/tensor/libtensor/include/utils/type_utils.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d676232f08e..386b17b44294 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,6 @@ find_package(Cython REQUIRED) find_package(Dpctl REQUIRED) message(STATUS "Dpctl_INCLUDE_DIR=" ${Dpctl_INCLUDE_DIR}) -message(STATUS "Dpctl_TENSOR_INCLUDE_DIR=" ${Dpctl_TENSOR_INCLUDE_DIR}) option(DPNP_USE_ONEMATH "Build DPNP with oneMath" OFF) set(DPNP_TARGET_CUDA diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp new file mode 100644 index 000000000000..a67e9b15306e --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include +#include + +namespace dpctl::tensor::kernels::alignment_utils +{ +inline constexpr std::size_t required_alignment = 64UL; + +template +bool is_aligned(Ptr p) +{ + return !(reinterpret_cast(p) % alignment); +} + +template +class disabled_sg_loadstore_wrapper_krn; +} // namespace dpctl::tensor::kernels::alignment_utils diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp new file mode 100644 index 000000000000..4db78e1805e3 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp @@ -0,0 +1,40 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace dpctl::tensor +{ +typedef std::ptrdiff_t ssize_t; +} // namespace dpctl::tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp new file mode 100644 index 000000000000..d19930b722a9 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -0,0 +1,1045 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines common code for elementwise tensor operations. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common_detail.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" + +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::elementwise_common +{ +using dpctl::tensor::ssize_t; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +/*! @brief Functor for unary function evaluation on contiguous array */ +template +struct UnaryContigFunctor +{ +private: + const argT *in = nullptr; + resT *out = nullptr; + std::size_t nelems_; + +public: + UnaryContigFunctor(const argT *inp, resT *res, const std::size_t n_elems) + : in(inp), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + UnaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value) + { + // value of operator is known to be a known constant + constexpr resT const_val = UnaryOperatorT::constant_value; + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + elems_per_wi * sgSize < nelems_) { + static constexpr sycl::vec res_vec(const_val); +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = const_val; + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && + UnaryOperatorT::supports_vec::value && (vec_sz > 1)) + { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + elems_per_wi * sgSize < 
nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec x = + sub_group_load(sg, in_multi_ptr); + const sycl::vec res_vec = op(x); + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + // scalar call + out[k] = op(in[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && + std::is_same_v) + { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + sycl::vec arg_vec = + sub_group_load(sg, in_multi_ptr); +#pragma unroll + for (std::uint32_t k = 0; k < vec_sz; ++k) { + arg_vec[k] = op(arg_vec[k]); + } + sub_group_store(sg, arg_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value) + { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg_vec = + sub_group_load(sg, in_multi_ptr); + sycl::vec res_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + res_vec[k] = op(arg_vec[k]); + } + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in[k]); + } + } + } + else { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = 
start; offset < end; offset += sgSize) { + out[offset] = op(in[offset]); + } + } + } +}; + +template +struct UnaryStridedFunctor +{ +private: + const argT *inp_ = nullptr; + resT *res_ = nullptr; + IndexerT inp_out_indexer_; + +public: + UnaryStridedFunctor(const argT *inp_p, + resT *res_p, + const IndexerT &inp_out_indexer) + : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &offsets_ = inp_out_indexer_(wid.get(0)); + const ssize_t &inp_offset = offsets_.get_first_offset(); + const ssize_t &res_offset = offsets_.get_second_offset(); + + UnaryOpT op{}; + + res_[res_offset] = op(inp_[inp_offset]); + } +}; + +template +SizeT select_lws(const sycl::device &, SizeT n_work_items_needed) +{ + // TODO: make the decision based on device descriptors + + // constexpr SizeT few_threshold = (SizeT(1) << 17); + static constexpr SizeT med_threshold = (SizeT(1) << 21); + + const SizeT lws = + (n_work_items_needed <= med_threshold ? SizeT(128) : SizeT(256)); + + return lws; +} + +template + class UnaryOutputType, + template + class ContigFunctorT, + template + class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event unary_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + const std::size_t n_work_items_needed = nelems / elems_per_wi; + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); + + const std::size_t n_groups = + ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename UnaryOutputType::value_type; + using BaseKernelName = kernel_name; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + if (is_aligned(arg_p) && + is_aligned(res_p)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = BaseKernelName; + using Impl = ContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = ContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + }); + + return comp_ev; +} + +template + class UnaryOutputType, + template + class StridedFunctorT, + template + class kernel_name> +sycl::event + unary_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename UnaryOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + using Impl = StridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg_tp, res_tp, indexer)); + }); + return 
comp_ev; +} + +template +struct BinaryContigFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + std::size_t nelems_; + +public: + BinaryContigFunctor(const argT1 *inp1, + const argT2 *inp2, + resT *res, + const std::size_t n_elems) + : in1(inp1), in2(inp2), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + BinaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value && + BinaryOperatorT::supports_vec::value && (vec_sz > 1)) + { + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { + sycl::vec res_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + std::size_t offset = base + it * sgSize; + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[offset]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg1_vec = + sub_group_load(sg, in1_multi_ptr); + const sycl::vec arg2_vec = + sub_group_load(sg, in2_multi_ptr); + res_vec = op(arg1_vec, arg2_vec); + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value) + { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[offset]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg1_vec = + sub_group_load(sg, in1_multi_ptr); + const sycl::vec arg2_vec = + sub_group_load(sg, in2_multi_ptr); + + sycl::vec res_vec; +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + res_vec[vec_id] = + op(arg1_vec[vec_id], arg2_vec[vec_id]); + } + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else { + const std::size_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t 
gid = ndit.get_global_linear_id(); + const std::size_t elems_per_sg = sgSize * elems_per_wi; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + out[offset] = op(in1[offset], in2[offset]); + } + } + } +}; + +template +struct BinaryStridedFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + ThreeOffsets_IndexerT three_offsets_indexer_; + +public: + BinaryStridedFunctor(const argT1 *inp1_tp, + const argT2 *inp2_tp, + resT *res_tp, + const ThreeOffsets_IndexerT &inps_res_indexer) + : in1(inp1_tp), in2(inp2_tp), out(res_tp), + three_offsets_indexer_(inps_res_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &three_offsets_ = + three_offsets_indexer_(static_cast(wid.get(0))); + + const auto &inp1_offset = three_offsets_.get_first_offset(); + const auto &inp2_offset = three_offsets_.get_second_offset(); + const auto &out_offset = three_offsets_.get_third_offset(); + + BinaryOperatorT op{}; + out[out_offset] = op(in1[inp1_offset], in2[inp2_offset]); + } +}; + +template +struct BinaryContigMatrixContigRowBroadcastingFunctor +{ +private: + const argT1 *mat; + const argT2 *padded_vec; + resT *res; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryContigMatrixContigRowBroadcastingFunctor(const argT1 *mat_tp, + const argT2 *row_tp, + resT *res_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : mat(mat_tp), padded_vec(row_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* NOTE: work-group size must be divisible by sub-group size */ + + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + const auto &sg = ndit.get_sub_group(); + const std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT1 mat_el = sub_group_load(sg, in1_multi_ptr); + const argT2 vec_el = sub_group_load(sg, in2_multi_ptr); + + resT res_el = op(mat_el, vec_el); + + sub_group_store(sg, res_el, out_multi_ptr); + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { + res[k] = op(mat[k], padded_vec[k % n1]); + } + } + } +}; + +template +struct BinaryContigRowContigMatrixBroadcastingFunctor +{ +private: + const argT1 *padded_vec; + const argT2 *mat; + resT *res; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryContigRowContigMatrixBroadcastingFunctor(const argT1 *row_tp, + const argT2 *mat_tp, + resT *res_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : padded_vec(row_tp), mat(mat_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* NOTE: work-group size must be divisible by sub-group size */ + 
BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + const auto &sg = ndit.get_sub_group(); + std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT2 mat_el = sub_group_load(sg, in2_multi_ptr); + const argT1 vec_el = sub_group_load(sg, in1_multi_ptr); + + resT res_el = op(vec_el, mat_el); + + sub_group_store(sg, res_el, out_multi_ptr); + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { + res[k] = op(padded_vec[k % n1], mat[k]); + } + } + } +}; + +// Typedefs for function pointers + +typedef sycl::event (*unary_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +typedef sycl::event (*unary_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +template + class BinaryOutputType, + template + class BinaryContigFunctorT, + template + class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event binary_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + const std::size_t n_work_items_needed = nelems / (n_vecs * vec_sz); + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); + + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename BinaryOutputType::value_type; + using BaseKernelName = kernel_name; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + sycl::event comp_ev = 
exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + if (is_aligned(arg1_tp) && + is_aligned(arg2_tp) && + is_aligned(res_tp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = BaseKernelName; + using Impl = BinaryContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg1_tp, arg2_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = BinaryContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg1_tp, arg2_tp, res_tp, nelems)); + } + }); + return comp_ev; +} + +template + class BinaryOutputType, + template + class BinaryStridedFunctorT, + template + class kernel_name> +sycl::event + binary_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename BinaryOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + const IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + using Impl = BinaryStridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template + class BinaryContigMatrixContigRowBroadcastFunctorT, + template + class kernel_name> +sycl::event binary_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(mat[i,j], vec[j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; + const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). 
+ // The vector is padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = + BinaryContigMatrixContigRowBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(mat, padded_vec, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +template + class BinaryContigRowContigMatrixBroadcastFunctorT, + template + class kernel_name> +sycl::event binary_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(vec[j], mat[i,j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; + const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). 
The vector is + // padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = + BinaryContigRowContigMatrixBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(padded_vec, mat, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +}; +} // namespace dpctl::tensor::kernels::elementwise_common diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp new file mode 100644 index 000000000000..b304b5ac3a39 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp @@ -0,0 +1,70 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines common code for elementwise tensor operations. 
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +namespace dpctl::tensor::kernels::elementwise_detail +{ +template +class populate_padded_vec_krn; + +template +sycl::event + populate_padded_vector(sycl::queue &exec_q, + const T *vec, + std::size_t vec_sz, + T *padded_vec, + size_t padded_vec_sz, + const std::vector &dependent_events) +{ + sycl::event populate_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + // ensure vec contains actual data + cgh.depends_on(dependent_events); + + sycl::range<1> gRange{padded_vec_sz}; + + cgh.parallel_for>( + gRange, [=](sycl::id<1> id) + { + std::size_t i = id[0]; + padded_vec[i] = vec[i % vec_sz]; + }); + }); + + return populate_padded_vec_ev; +} +} // namespace dpctl::tensor::kernels::elementwise_detail diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp new file mode 100644 index 000000000000..8565df2cf528 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -0,0 +1,268 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGADDEXP(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "common.hpp" +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::logaddexp +{ +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct LogAddExpFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; + auto diff = in1 - in2; // take advantange of faster vec arithmetic + +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (std::isfinite(diff[i])) { + res[i] = std::max(in1[i], in2[i]) + + impl_finite(-sycl::fabs(diff[i])); + } + else { + using dpctl::tensor::math_utils::logaddexp; + res[i] = logaddexp(in1[i], in2[i]); + } + } + + return res; + } + +private: + template + T impl_finite(T const &in) const + { + return (in > 0) ? (in + sycl::log1p(sycl::exp(-in))) + : sycl::log1p(sycl::exp(in)); + } +}; + +template +using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogAddExpFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogAddExpFunctor>; + +template +struct LogAddExpOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LogAddExpContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logaddexp_contig_kernel; + +template +sycl::event logaddexp_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LogAddExpHS = + hyperparam_detail::LogAddExpContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor, + logaddexp_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct LogAddExpContigFactory +{ + fnT get() + { + if constexpr (!LogAddExpOutputType::is_defined) { + fnT fn = nullptr; + 
return fn; + } + else { + fnT fn = logaddexp_contig_impl; + return fn; + } + } +}; + +template +struct LogAddExpTypeMapFactory +{ + /*! @brief get typeid for output type of logaddexp(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogAddExpOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logaddexp_strided_kernel; + +template +sycl::event + logaddexp_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogAddExpOutputType, LogAddExpStridedFunctor, + logaddexp_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LogAddExpStridedFactory +{ + fnT get() + { + if constexpr (!LogAddExpOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logaddexp_strided_impl; + return fn; + } + } +}; + +template +class logaddexp_matrix_row_broadcast_sg_krn; + +} // namespace dpctl::tensor::kernels::logaddexp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp new file mode 100644 index 000000000000..067ccd84f059 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -0,0 +1,322 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines kernels for elementwise evaluation of MAXIMUM(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "common.hpp" +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::maximum +{ +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MaximumFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) + { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::max_complex; + return max_complex(in1, in2); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + const bool choose_first = (sycl::isnan(in1) || (in1 > in2)); + return (choose_first) ? in1 : in2; + } + else { + return (in1 > in2) ? in1 : in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + const auto &v1 = in1[i]; + const auto &v2 = in2[i]; + if constexpr (std::is_floating_point_v || + std::is_same_v) + { + const bool choose_first = (sycl::isnan(v1) || (v1 > v2)); + res[i] = (choose_first) ? v1 : v2; + } + else { + res[i] = (v1 > v2) ? v1 : v2; + } + } + return res; + } +}; + +template +using MaximumContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MaximumStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MaximumFunctor>; + +template +struct MaximumOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct MaximumContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class maximum_contig_kernel; + +template +sycl::event maximum_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MaxHS = + 
hyperparam_detail::MaximumContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MaxHS::vec_sz; + static constexpr std::uint8_t n_vecs = MaxHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MaximumOutputType, MaximumContigFunctor, + maximum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct MaximumContigFactory +{ + fnT get() + { + if constexpr (!MaximumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = maximum_contig_impl; + return fn; + } + } +}; + +template +struct MaximumTypeMapFactory +{ + /*! @brief get typeid for output type of maximum(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MaximumOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class maximum_strided_kernel; + +template +sycl::event + maximum_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MaximumOutputType, MaximumStridedFunctor, + maximum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MaximumStridedFactory +{ + fnT get() + { + if constexpr (!MaximumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = maximum_strided_impl; + return fn; + } + } +}; +} // namespace dpctl::tensor::kernels::maximum diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp new file mode 100644 index 000000000000..a38945f89a25 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -0,0 +1,321 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines kernels for elementwise evaluation of MINIMUM(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "common.hpp" +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::minimum +{ +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MinimumFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) + { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::min_complex; + return min_complex(in1, in2); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + const bool choose_first = sycl::isnan(in1) || (in1 < in2); + return (choose_first) ? in1 : in2; + } + else { + return (in1 < in2) ? in1 : in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + const auto &v1 = in1[i]; + const auto &v2 = in2[i]; + if constexpr (std::is_floating_point_v || + std::is_same_v) + { + const bool choose_first = sycl::isnan(v1) || (v1 < v2); + res[i] = (choose_first) ? v1 : v2; + } + else { + res[i] = (v1 < v2) ? 
v1 : v2; + } + } + return res; + } +}; + +template +using MinimumContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MinimumStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MinimumFunctor>; + +template +struct MinimumOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct MinimumContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class minimum_contig_kernel; + +template +sycl::event minimum_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MinHS = + hyperparam_detail::MinimumContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MinHS::vec_sz; + static constexpr std::uint8_t n_vecs = MinHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MinimumOutputType, MinimumContigFunctor, + minimum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct MinimumContigFactory +{ + fnT get() + { + if constexpr (!MinimumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = minimum_contig_impl; + return fn; + } + } +}; + +template +struct MinimumTypeMapFactory +{ + /*! 
@brief get typeid for output type of minimum(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MinimumOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class minimum_strided_kernel; + +template +sycl::event + minimum_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MinimumOutputType, MinimumStridedFunctor, + minimum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MinimumStridedFactory +{ + fnT get() + { + if constexpr (!MinimumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = minimum_strided_impl; + return fn; + } + } +}; +} // namespace dpctl::tensor::kernels::minimum diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp new file mode 100644 index 000000000000..5cadec6ce2a4 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp @@ -0,0 +1,44 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines a macro for defining the SYCL_EXT_ONEAPI_COMPLEX macro +/// and indirect inclusion of the experimental oneAPI SYCL complex extension +/// header file. 
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#define SYCL_EXT_ONEAPI_COMPLEX
+#if __has_include(<sycl/ext/oneapi/experimental/sycl_complex.hpp>)
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#else
+#include <sycl/ext/oneapi/experimental/complex/complex.hpp>
+#endif
+
+namespace exprm_ns = sycl::ext::oneapi::experimental;
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
new file mode 100644
index 000000000000..bdbc7e50cc86
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
@@ -0,0 +1,70 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
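The exprm_ns alias introduced by sycl_complex.hpp above is the handle kernels use for the experimental complex type in device code. A rough sketch of the usual round-trip through that type (the exp call and the conversion from std::complex mirror how other elementwise kernels use the extension; treat the exact function set as an assumption of this sketch):

#include <complex>
#include "sycl_complex.hpp"

template <typename realT>
std::complex<realT> device_exp(const std::complex<realT> &in)
{
    // convert to the experimental complex type for device-side math
    exprm_ns::complex<realT> z(in);
    exprm_ns::complex<realT> w = exprm_ns::exp(z);
    return std::complex<realT>(w.real(), w.imag());
}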
+//***************************************************************************** +/// +/// \file +/// This file defines utilities for selection of hyperparameters for kernels +/// implementing unary and binary elementwise functions for contiguous inputs +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace dpctl::tensor::kernels::vec_size_utils +{ +template +struct BinaryContigHyperparameterSetEntry + : std::conjunction, std::is_same> +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; + +template +struct UnaryContigHyperparameterSetEntry : std::is_same +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; + +template +struct ContigHyperparameterSetDefault : std::true_type +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; +} // namespace dpctl::tensor::kernels::vec_size_utils diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp new file mode 100644 index 000000000000..d28c8174c39c --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp @@ -0,0 +1,153 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utilities for handling out-of-bounds integer indices in +/// kernels that involve indexing operations, such as take, put, or advanced +/// tensor integer indexing. 
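The hyperparameter traits in vec_size_util.hpp above are meant to be chained with std::disjunction: the first entry whose type test holds supplies vec_sz and n_vecs, and ContigHyperparameterSetDefault is the fallback, which is how MaximumContigHyperparameterSet earlier in this patch falls through to the library defaults. A small sketch of a per-type-pair override (the float/float specialization and its values are purely illustrative):

#include <cstdint>
#include <type_traits>

namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;

template <typename argTy1, typename argTy2>
struct ExampleContigHyperparameterSet
{
    using value_type = typename std::disjunction<
        // chosen when both arguments are float: wider vectors, single pass
        vsu_ns::BinaryContigHyperparameterSetEntry<argTy1, float, argTy2, float, 8, 1>,
        // fallback used for every other type combination
        vsu_ns::ContigHyperparameterSetDefault<4, 2>>;

    static constexpr std::uint8_t vec_sz = value_type::vec_sz;
    static constexpr std::uint8_t n_vecs = value_type::n_vecs;
};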
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::indexing_utils +{ +using dpctl::tensor::ssize_t; + +/* + * ssize_t for indices is a design choice, dpctl::tensor::usm_ndarray + * uses py::ssize_t for shapes and strides internally and Python uses + * py_ssize_t for sizes of e.g. lists. + */ + +template +struct WrapIndex +{ + static_assert(std::is_integral_v); + + ssize_t operator()(ssize_t max_item, IndT ind) const + { + ssize_t projected; + static constexpr ssize_t unit(1); + max_item = sycl::max(max_item, unit); + + static constexpr std::uintmax_t ind_max = + std::numeric_limits::max(); + static constexpr std::uintmax_t ssize_max = + std::numeric_limits::max(); + + if constexpr (std::is_signed_v) { + static constexpr std::intmax_t ind_min = + std::numeric_limits::min(); + static constexpr std::intmax_t ssize_min = + std::numeric_limits::min(); + + if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) { + const ssize_t ind_ = static_cast(ind); + const ssize_t lb = -max_item; + const ssize_t ub = max_item - 1; + projected = sycl::clamp(ind_, lb, ub); + } + else { + const IndT lb = static_cast(-max_item); + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::clamp(ind, lb, ub)); + } + return (projected < 0) ? projected + max_item : projected; + } + else { + if constexpr (ind_max <= ssize_max) { + const ssize_t ind_ = static_cast(ind); + const ssize_t ub = max_item - 1; + projected = sycl::min(ind_, ub); + } + else { + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::min(ind, ub)); + } + return projected; + } + } +}; + +template +struct ClipIndex +{ + static_assert(std::is_integral_v); + + ssize_t operator()(ssize_t max_item, IndT ind) const + { + ssize_t projected; + static constexpr ssize_t unit(1); + max_item = sycl::max(max_item, unit); + + static constexpr std::uintmax_t ind_max = + std::numeric_limits::max(); + static constexpr std::uintmax_t ssize_max = + std::numeric_limits::max(); + if constexpr (std::is_signed_v) { + static constexpr std::intmax_t ind_min = + std::numeric_limits::min(); + static constexpr std::intmax_t ssize_min = + std::numeric_limits::min(); + + if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) { + const ssize_t ind_ = static_cast(ind); + static constexpr ssize_t lb(0); + const ssize_t ub = max_item - 1; + projected = sycl::clamp(ind_, lb, ub); + } + else { + static constexpr IndT lb(0); + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::clamp(ind, lb, ub)); + } + } + else { + if constexpr (ind_max <= ssize_max) { + const ssize_t ind_ = static_cast(ind); + const ssize_t ub = max_item - 1; + projected = sycl::min(ind_, ub); + } + else { + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::min(ind, ub)); + } + } + return projected; + } +}; +} // namespace dpctl::tensor::indexing_utils diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp new file mode 100644 index 000000000000..d35eff0074dc --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/math_utils.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
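The two policies in indexing_utils.hpp above differ only in how out-of-range indices are projected: WrapIndex accepts Python-style negative indices (counting back from the end) and saturates anything further out of range, while ClipIndex clamps every index into [0, max_item - 1]. A few spot checks for an axis of length 5, written against the operators shown above:

#include <cassert>

void indexing_policy_examples()
{
    constexpr dpctl::tensor::ssize_t n = 5; // axis length

    const dpctl::tensor::indexing_utils::WrapIndex<int> wrap{};
    const dpctl::tensor::indexing_utils::ClipIndex<int> clip{};

    assert(wrap(n, -1) == 4); // negative index counts from the end
    assert(wrap(n, 7) == 4);  // too-large index saturates at n - 1
    assert(clip(n, -1) == 0); // negative index clamps to 0
    assert(clip(n, 7) == 4);  // too-large index clamps to n - 1
}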
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines math utility functions. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace dpctl::tensor::math_utils +{ +template +bool less_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 < imag2) + : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool greater_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 > imag2) + : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool less_equal_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 <= imag2) + : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool greater_equal_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 >= imag2) + : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +T max_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + bool isnan_imag1 = std::isnan(imag1); + bool gt = (real1 == real2) + ? 
(imag1 > imag2) + : (real1 > real2 && !isnan_imag1 && !std::isnan(imag2)); + return (std::isnan(real1) || isnan_imag1 || gt) ? x1 : x2; +} + +template +T min_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + bool isnan_imag1 = std::isnan(imag1); + bool lt = (real1 == real2) + ? (imag1 < imag2) + : (real1 < real2 && !isnan_imag1 && !std::isnan(imag2)); + return (std::isnan(real1) || isnan_imag1 || lt) ? x1 : x2; +} + +template +T logaddexp(T x, T y) +{ + if (x == y) { // handle signed infinities + const T log2 = sycl::log(T(2)); + return x + log2; + } + else { + const T tmp = x - y; + static constexpr T zero(0); + + return (tmp > zero) + ? (x + sycl::log1p(sycl::exp(-tmp))) + : ((tmp <= zero) ? y + sycl::log1p(sycl::exp(tmp)) + : std::numeric_limits::quiet_NaN()); + } +} +} // namespace dpctl::tensor::math_utils diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp new file mode 100644 index 000000000000..3b1bc772b514 --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp @@ -0,0 +1,157 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utility to determine whether two arrays have memory +/// overlap. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "dpctl4pybind11.hpp" + +/* @brief check for overlap of memory regions behind arrays. + +Presently assume that array occupies all bytes between smallest and largest +displaced elements. + +TODO: Write proper Frobenius solver to account for holes, e.g. 
+ overlap( x_contig[::2], x_contig[1::2]) should give False, + while this implementation gives True. +*/ +namespace dpctl::tensor::overlap +{ +namespace py = pybind11; + +struct MemoryOverlap +{ + bool operator()(dpctl::tensor::usm_ndarray ar1, + dpctl::tensor::usm_ndarray ar2) const + { + const char *ar1_data = ar1.get_data(); + + const auto &ar1_offsets = ar1.get_minmax_offsets(); + py::ssize_t ar1_elem_size = + static_cast(ar1.get_elemsize()); + + const char *ar2_data = ar2.get_data(); + const auto &ar2_offsets = ar2.get_minmax_offsets(); + py::ssize_t ar2_elem_size = + static_cast(ar2.get_elemsize()); + + /* Memory of array1 extends from */ + /* [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data + + * ar1_offsets.second * ar1_elem_size + ar1_elem_size] */ + /* Memory of array2 extends from */ + /* [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data + + * ar2_offsets.second * ar2_elem_size + ar2_elem_size] */ + + /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0 + * <= y1) + * && (x1 <=y0 || y1 <= x0 ) */ + /* Given that x0 <= x1 and y0 <= y1 are true by construction, the + * condition for overlap us (x1 > y0) && (y1 > x0) */ + + /* Applying: + (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size > + ar2_data + + ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second * + ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first * + ar1_elem_size) + */ + + auto byte_distance = static_cast(ar2_data - ar1_data); + + py::ssize_t x1_minus_y0 = + (-byte_distance + + (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) - + (ar2_offsets.first * ar2_elem_size))); + + py::ssize_t y1_minus_x0 = + (byte_distance + + (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) - + (ar1_offsets.first * ar1_elem_size))); + + bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0); + + return memory_overlap; + } +}; + +struct SameLogicalTensors +{ + bool operator()(dpctl::tensor::usm_ndarray ar1, + dpctl::tensor::usm_ndarray ar2) const + { + // Same ndim + int nd1 = ar1.get_ndim(); + if (nd1 != ar2.get_ndim()) + return false; + + // Same dtype + int tn1 = ar1.get_typenum(); + if (tn1 != ar2.get_typenum()) + return false; + + // Same pointer + const char *ar1_data = ar1.get_data(); + const char *ar2_data = ar2.get_data(); + + if (ar1_data != ar2_data) + return false; + + // Same shape and strides + const py::ssize_t *ar1_shape = ar1.get_shape_raw(); + const py::ssize_t *ar2_shape = ar2.get_shape_raw(); + + if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape)) + return false; + + // Same shape and strides + auto const &ar1_strides = ar1.get_strides_vector(); + auto const &ar2_strides = ar2.get_strides_vector(); + + auto ar1_beg_it = std::begin(ar1_strides); + auto ar1_end_it = std::end(ar1_strides); + + auto ar2_beg_it = std::begin(ar2_strides); + + if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it)) + return false; + + // all checks passed: arrays are logical views + // into the same memory + return true; + } +}; +} // namespace dpctl::tensor::overlap diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp new file mode 100644 index 000000000000..19664c3d4e12 --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -0,0 +1,824 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
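The MemoryOverlap predicate above boils down to an interval-intersection test on byte ranges, (x1 > y0) && (y1 > x0), under the stated assumption that an array occupies every byte between its lowest and highest addressed elements. A worked example with illustrative values:

// A float32 buffer of 4 elements viewed as two disjoint halves, a[0:2] and a[2:4]:
//   a[0:2] spans bytes [base + 0, base + 8)   (min/max offsets 0..1, elemsize 4)
//   a[2:4] spans bytes [base + 8, base + 16)  (its data pointer is base + 8)
//   byte_distance = 8, so x1_minus_y0 = -8 + (4 + 4 - 0) = 0, which is not > 0,
//   and no overlap is reported.
// For interleaved views such as a[0::2] and a[1::2] the byte ranges do intersect,
// so the predicate conservatively reports overlap (see the TODO above).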
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines Indexer callable operator to compute element offset in +/// an array addressed by gloabl_id. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "utils/strided_iters.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::offset_utils +{ +namespace detail +{ +struct sink_t +{ + sink_t(){}; + template + sink_t(T &&){}; +}; + +template +std::size_t __accumulate_size(std::size_t &s, V &&v) +{ + return s += v.size(); +} + +template +sink_t __appender(V &lhs, U &&rhs) +{ + lhs.insert(lhs.end(), rhs.begin(), rhs.end()); + return {}; +} + +template +std::vector concat(std::vector lhs, Vs &&...vs) +{ + std::size_t s = lhs.size(); + { + // limited scope ensures array is freed + [[maybe_unused]] sink_t tmp[] = {__accumulate_size(s, vs)..., 0}; + } + lhs.reserve(s); + { + // array of no-data objects ensures ordering of calls to the appender + [[maybe_unused]] sink_t tmp[] = { + __appender(lhs, std::forward(vs))..., 0}; + } + + return std::move(lhs); // prevent return-value optimization +} +} // namespace detail + +template +std::tuple, + std::size_t, + sycl::event> + device_allocate_and_pack(sycl::queue &q, + std::vector &host_task_events, + Vs &&...vs) +{ + + using dpctl::tensor::alloc_utils::usm_host_allocator; + + // memory transfer optimization, use USM-host for temporary speeds up + // transfer to device, especially on dGPUs + using usm_host_allocatorT = usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT usm_host_alloc(q); + shT empty{0, usm_host_alloc}; + shT packed_shape_strides = detail::concat(std::move(empty), vs...); + + auto packed_shape_strides_owner = + std::make_shared(std::move(packed_shape_strides)); + + auto sz = packed_shape_strides_owner->size(); + auto 
shape_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(sz, q); + indT *shape_strides = shape_strides_owner.get(); + + sycl::event copy_ev = + q.copy(packed_shape_strides_owner->data(), shape_strides, sz); + + sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_ev); + cgh.host_task([packed_shape_strides_owner = + std::move(packed_shape_strides_owner)] { + // increment shared pointer ref-count to keep it alive + // till copy operation completes; + }); + }); + host_task_events.push_back(cleanup_host_task_ev); + + return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev); +} + +struct NoOpIndexer +{ + constexpr NoOpIndexer() {} + constexpr std::size_t operator()(std::size_t gid) const + { + return gid; + } +}; + +using dpctl::tensor::ssize_t; + +/* @brief Indexer with shape and strides arrays of same size are packed */ +struct StridedIndexer +{ + StridedIndexer(int _nd, + ssize_t _offset, + ssize_t const *_packed_shape_strides) + : nd(_nd), starting_offset(_offset), + shape_strides(_packed_shape_strides) + { + } + + ssize_t operator()(ssize_t gid) const + { + return compute_offset(gid); + } + + ssize_t operator()(std::size_t gid) const + { + return compute_offset(static_cast(gid)); + } + +private: + int nd; + ssize_t starting_offset; + ssize_t const *shape_strides; + + ssize_t compute_offset(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd); + ssize_t relative_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + relative_offset); + return starting_offset + relative_offset; + } +}; + +// ensure that indexer is device copyable +static_assert(sycl::is_device_copyable_v); + +/* @brief Indexer with shape, strides provided separately */ +struct UnpackedStridedIndexer +{ + UnpackedStridedIndexer(int _nd, + ssize_t _offset, + ssize_t const *_shape, + ssize_t const *_strides) + : nd(_nd), starting_offset(_offset), shape(_shape), strides(_strides) + { + } + + ssize_t operator()(ssize_t gid) const + { + return compute_offset(gid); + } + + ssize_t operator()(std::size_t gid) const + { + return compute_offset(static_cast(gid)); + } + +private: + int nd; + ssize_t starting_offset; + ssize_t const *shape; + ssize_t const *strides; + + ssize_t compute_offset(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd); + ssize_t relative_offset(0); + _ind.get_displacement( + gid, + shape, // shape ptr + strides, // strides ptr + relative_offset); + return starting_offset + relative_offset; + } +}; + +// ensure that indexer is device copyable +static_assert(sycl::is_device_copyable_v); + +struct Strided1DIndexer +{ + Strided1DIndexer(std::size_t _size) : offset{}, size(_size), step(1) {} + Strided1DIndexer(ssize_t _size) + : offset{}, size(static_cast(_size)), step(1) + { + } + Strided1DIndexer(std::size_t _size, ssize_t _step) + : offset{}, size(_size), step(_step) + { + } + Strided1DIndexer(std::size_t _size, std::size_t _step) + : offset{}, size(_size), step(static_cast(_step)) + { + } + Strided1DIndexer(ssize_t _size, ssize_t _step) + : offset{}, size(static_cast(_size)), step(_step) + { + } + Strided1DIndexer(ssize_t _offset, std::size_t _size, ssize_t _step) + : offset(_offset), size(_size), step(_step) + { + } + Strided1DIndexer(ssize_t _offset, std::size_t _size, std::size_t _step) + : offset(_offset), size(_size), step(static_cast(_step)) + { + } + Strided1DIndexer(ssize_t _offset, 
ssize_t _size, ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + ssize_t operator()(std::size_t gid) const + { + // ensure 0 <= gid < size + return offset + std::min(gid, size - 1) * step; + } + +private: + ssize_t offset = 0; + std::size_t size = 1; + ssize_t step = 1; +}; + +static_assert(sycl::is_device_copyable_v); + +struct Strided1DCyclicIndexer +{ + Strided1DCyclicIndexer(ssize_t _offset, ssize_t _size, ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + ssize_t operator()(std::size_t gid) const + { + return offset + (gid % size) * step; + } + +private: + ssize_t offset = 0; + std::size_t size = 1; + ssize_t step = 1; +}; + +static_assert(sycl::is_device_copyable_v); + +template +struct TwoOffsets +{ + constexpr TwoOffsets() : first_offset(0), second_offset(0) {} + constexpr TwoOffsets(const displacementT &first_offset_, + const displacementT &second_offset_) + : first_offset(first_offset_), second_offset(second_offset_) + { + } + + constexpr displacementT get_first_offset() const + { + return first_offset; + } + constexpr displacementT get_second_offset() const + { + return second_offset; + } + +private: + displacementT first_offset = 0; + displacementT second_offset = 0; +}; + +struct TwoOffsets_StridedIndexer +{ + TwoOffsets_StridedIndexer(int common_nd, + ssize_t first_offset_, + ssize_t second_offset_, + ssize_t const *_packed_shape_strides) + : nd(common_nd), starting_first_offset(first_offset_), + starting_second_offset(second_offset_), + shape_strides(_packed_shape_strides) + { + } + + TwoOffsets operator()(ssize_t gid) const + { + return compute_offsets(gid); + } + + TwoOffsets operator()(std::size_t gid) const + { + return compute_offsets(static_cast(gid)); + } + +private: + int nd; + ssize_t starting_first_offset; + ssize_t starting_second_offset; + ssize_t const *shape_strides; + + TwoOffsets compute_offsets(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd); + ssize_t relative_first_offset(0); + ssize_t relative_second_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + shape_strides + 2 * nd, // strides ptr + relative_first_offset, relative_second_offset); + return TwoOffsets( + starting_first_offset + relative_first_offset, + starting_second_offset + relative_second_offset); + } +}; + +struct TwoZeroOffsets_Indexer +{ + constexpr TwoZeroOffsets_Indexer() {} + + constexpr TwoOffsets operator()(ssize_t) const + { + return TwoOffsets(); + } +}; + +static_assert(sycl::is_device_copyable_v); + +template +struct TwoOffsets_CombinedIndexer +{ +private: + FirstIndexerT first_indexer_; + SecondIndexerT second_indexer_; + +public: + constexpr TwoOffsets_CombinedIndexer(const FirstIndexerT &first_indexer, + const SecondIndexerT &second_indexer) + : first_indexer_(first_indexer), second_indexer_(second_indexer) + { + } + + constexpr TwoOffsets operator()(ssize_t gid) const + { + return TwoOffsets(first_indexer_(gid), second_indexer_(gid)); + } +}; + +template +struct ThreeOffsets +{ + constexpr ThreeOffsets() + : first_offset(0), second_offset(0), third_offset(0) + { + } + constexpr ThreeOffsets(const displacementT &first_offset_, + const displacementT &second_offset_, + const displacementT &third_offset_) + : first_offset(first_offset_), second_offset(second_offset_), + third_offset(third_offset_) + { + } + + constexpr displacementT get_first_offset() const + { + return first_offset; + } + constexpr 
displacementT get_second_offset() const + { + return second_offset; + } + constexpr displacementT get_third_offset() const + { + return third_offset; + } + +private: + displacementT first_offset = 0; + displacementT second_offset = 0; + displacementT third_offset = 0; +}; + +struct ThreeOffsets_StridedIndexer +{ + ThreeOffsets_StridedIndexer(int common_nd, + ssize_t first_offset_, + ssize_t second_offset_, + ssize_t third_offset_, + ssize_t const *_packed_shape_strides) + : nd(common_nd), starting_first_offset(first_offset_), + starting_second_offset(second_offset_), + starting_third_offset(third_offset_), + shape_strides(_packed_shape_strides) + { + } + + ThreeOffsets operator()(ssize_t gid) const + { + return compute_offsets(gid); + } + + ThreeOffsets operator()(std::size_t gid) const + { + return compute_offsets(static_cast(gid)); + } + +private: + int nd; + ssize_t starting_first_offset; + ssize_t starting_second_offset; + ssize_t starting_third_offset; + ssize_t const *shape_strides; + + ThreeOffsets compute_offsets(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd); + ssize_t relative_first_offset(0); + ssize_t relative_second_offset(0); + ssize_t relative_third_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + shape_strides + 2 * nd, // strides ptr + shape_strides + 3 * nd, // strides ptr + relative_first_offset, relative_second_offset, + relative_third_offset); + return ThreeOffsets( + starting_first_offset + relative_first_offset, + starting_second_offset + relative_second_offset, + starting_third_offset + relative_third_offset); + } +}; + +static_assert(sycl::is_device_copyable_v); + +struct ThreeZeroOffsets_Indexer +{ + constexpr ThreeZeroOffsets_Indexer() {} + + constexpr ThreeOffsets operator()(ssize_t) const + { + return ThreeOffsets(); + } + + constexpr ThreeOffsets operator()(std::size_t) const + { + return ThreeOffsets(); + } +}; + +static_assert(sycl::is_device_copyable_v); + +template +struct ThreeOffsets_CombinedIndexer +{ +private: + FirstIndexerT first_indexer_; + SecondIndexerT second_indexer_; + ThirdIndexerT third_indexer_; + +public: + constexpr ThreeOffsets_CombinedIndexer(const FirstIndexerT &first_indexer, + const SecondIndexerT &second_indexer, + const ThirdIndexerT &third_indexer) + : first_indexer_(first_indexer), second_indexer_(second_indexer), + third_indexer_(third_indexer) + { + } + + constexpr ThreeOffsets operator()(ssize_t gid) const + { + return ThreeOffsets(first_indexer_(gid), second_indexer_(gid), + third_indexer_(gid)); + } +}; + +template +struct FourOffsets +{ + constexpr FourOffsets() + : first_offset(0), second_offset(0), third_offset(0), fourth_offset(0) + { + } + constexpr FourOffsets(const displacementT &first_offset_, + const displacementT &second_offset_, + const displacementT &third_offset_, + const displacementT &fourth_offset_) + : first_offset(first_offset_), second_offset(second_offset_), + third_offset(third_offset_), fourth_offset(fourth_offset_) + { + } + + constexpr displacementT get_first_offset() const + { + return first_offset; + } + constexpr displacementT get_second_offset() const + { + return second_offset; + } + constexpr displacementT get_third_offset() const + { + return third_offset; + } + constexpr displacementT get_fourth_offset() const + { + return fourth_offset; + } + +private: + displacementT first_offset = 0; + displacementT second_offset = 0; + displacementT third_offset = 0; + displacementT 
fourth_offset = 0; +}; + +struct FourOffsets_StridedIndexer +{ + constexpr FourOffsets_StridedIndexer(int common_nd, + ssize_t first_offset_, + ssize_t second_offset_, + ssize_t third_offset_, + ssize_t fourth_offset_, + ssize_t const *_packed_shape_strides) + : nd(common_nd), starting_first_offset(first_offset_), + starting_second_offset(second_offset_), + starting_third_offset(third_offset_), + starting_fourth_offset(fourth_offset_), + shape_strides(_packed_shape_strides) + { + } + + constexpr FourOffsets operator()(ssize_t gid) const + { + return compute_offsets(gid); + } + + constexpr FourOffsets operator()(std::size_t gid) const + { + return compute_offsets(static_cast(gid)); + } + +private: + int nd; + ssize_t starting_first_offset; + ssize_t starting_second_offset; + ssize_t starting_third_offset; + ssize_t starting_fourth_offset; + ssize_t const *shape_strides; + + FourOffsets compute_offsets(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd); + ssize_t relative_first_offset(0); + ssize_t relative_second_offset(0); + ssize_t relative_third_offset(0); + ssize_t relative_fourth_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + shape_strides + 2 * nd, // strides ptr + shape_strides + 3 * nd, // strides ptr + shape_strides + 4 * nd, // strides ptr + relative_first_offset, relative_second_offset, + relative_third_offset, relative_fourth_offset); + return FourOffsets( + starting_first_offset + relative_first_offset, + starting_second_offset + relative_second_offset, + starting_third_offset + relative_third_offset, + starting_fourth_offset + relative_fourth_offset); + } +}; + +static_assert(sycl::is_device_copyable_v); + +struct FourZeroOffsets_Indexer +{ + constexpr FourZeroOffsets_Indexer() {} + + constexpr FourOffsets operator()(ssize_t) const + { + return FourOffsets(); + } +}; + +static_assert(sycl::is_device_copyable_v); + +struct NthStrideOffset +{ + NthStrideOffset(int common_nd, + ssize_t const *_offsets, + ssize_t const *_packed_shape_strides) + : _ind(common_nd), nd(common_nd), offsets(_offsets), + shape_strides(_packed_shape_strides) + { + } + + std::size_t operator()(ssize_t gid, int n) const + { + ssize_t relative_offset(0); + _ind.get_displacement( + gid, shape_strides, shape_strides + ((n + 1) * nd), + relative_offset); + + return relative_offset + offsets[n]; + } + +private: + dpctl::tensor::strides::CIndexer_vector _ind; + + int nd; + ssize_t const *offsets; + ssize_t const *shape_strides; +}; + +static_assert(sycl::is_device_copyable_v); + +template +struct FixedDimStridedIndexer +{ + FixedDimStridedIndexer(const std::array &_shape, + const std::array &_strides, + ssize_t _offset) + : _ind(_shape), strides(_strides), starting_offset(_offset) + { + } + std::size_t operator()(std::size_t gid) const + { + dpctl::tensor::strides::CIndexer_array local_indexer( + std::move(_ind)); + local_indexer.set(gid); + auto mi = local_indexer.get(); + + ssize_t relative_offset = 0; + +#pragma unroll + for (int i = 0; i < nd; ++i) { + relative_offset += mi[i] * strides[i]; + } + return starting_offset + relative_offset; + } + +private: + dpctl::tensor::strides::CIndexer_array _ind; + + std::array strides; + ssize_t starting_offset; +}; + +static_assert(sycl::is_device_copyable_v>); + +template +struct TwoOffsets_FixedDimStridedIndexer +{ + TwoOffsets_FixedDimStridedIndexer(const std::array &_shape, + const std::array &_strides1, + const std::array &_strides2, + ssize_t 
_offset1, + ssize_t _offset2) + : _ind(_shape), strides1(_strides1), strides2(_strides2), + starting_offset1(_offset1), starting_offset2(_offset2) + { + } + + TwoOffsets operator()(std::size_t gid) const + { + dpctl::tensor::strides::CIndexer_array local_indexer( + std::move(_ind)); + local_indexer.set(gid); + auto mi = local_indexer.get(); + + ssize_t relative_offset1 = 0; +#pragma unroll + for (int i = 0; i < nd; ++i) { + relative_offset1 += mi[i] * strides1[i]; + } + + ssize_t relative_offset2 = 0; +#pragma unroll + for (int i = 0; i < nd; ++i) { + relative_offset2 += mi[i] * strides2[i]; + } + + return TwoOffsets(starting_offset1 + relative_offset1, + starting_offset2 + relative_offset2); + } + +private: + dpctl::tensor::strides::CIndexer_array _ind; + + std::array strides1; + std::array strides2; + ssize_t starting_offset1; + ssize_t starting_offset2; +}; + +static_assert(sycl::is_device_copyable_v>); + +template +struct ThreeOffsets_FixedDimStridedIndexer +{ + ThreeOffsets_FixedDimStridedIndexer( + const std::array &_shape, + const std::array &_strides1, + const std::array &_strides2, + const std::array &_strides3, + ssize_t _offset1, + ssize_t _offset2, + ssize_t _offset3) + : _ind(_shape), strides1(_strides1), strides2(_strides2), + strides3(_strides3), starting_offset1(_offset1), + starting_offset2(_offset2), starting_offset3(_offset3) + { + } + + ThreeOffsets operator()(std::size_t gid) const + { + dpctl::tensor::strides::CIndexer_array local_indexer( + std::move(_ind)); + local_indexer.set(gid); + auto mi = local_indexer.get(); + + ssize_t relative_offset1 = 0; +#pragma unroll + for (int i = 0; i < nd; ++i) { + relative_offset1 += mi[i] * strides1[i]; + } + + ssize_t relative_offset2 = 0; +#pragma unroll + for (int i = 0; i < nd; ++i) { + relative_offset2 += mi[i] * strides2[i]; + } + + ssize_t relative_offset3 = 0; +#pragma unroll + for (int i = 0; i < nd; ++i) { + relative_offset3 += mi[i] * strides3[i]; + } + + return ThreeOffsets(starting_offset1 + relative_offset1, + starting_offset2 + relative_offset2, + starting_offset3 + relative_offset3); + } + +private: + dpctl::tensor::strides::CIndexer_array _ind; + + std::array strides1; + std::array strides2; + std::array strides3; + ssize_t starting_offset1; + ssize_t starting_offset2; + ssize_t starting_offset3; +}; + +static_assert( + sycl::is_device_copyable_v>); +} // namespace dpctl::tensor::offset_utils diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl/tensor/libtensor/include/utils/output_validation.hpp new file mode 100644 index 000000000000..1397efdee230 --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/output_validation.hpp @@ -0,0 +1,79 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
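The packing helper and the strided indexers in offset_utils.hpp above are normally used together when a strided kernel is launched: the host packs simplified shape and strides into one device allocation, and the submitted kernel builds an indexer over that packed buffer. A rough host-side fragment, assuming the caller already holds an execution queue q, the dimensionality nd, element offsets src_offset and dst_offset, and ssize_t vectors simplified_shape, simplified_src_strides and simplified_dst_strides (all of these names are placeholders):

using dpctl::tensor::ssize_t;
namespace ou_ns = dpctl::tensor::offset_utils;

// Sketch only: pack [shape, src strides, dst strides] into one USM allocation.
std::vector<sycl::event> host_tasks{};
auto ptr_sz_ev = ou_ns::device_allocate_and_pack<ssize_t>(
    q, host_tasks, simplified_shape, simplified_src_strides,
    simplified_dst_strides);
auto shape_strides_owner = std::move(std::get<0>(ptr_sz_ev));
const sycl::event &copy_ev = std::get<2>(ptr_sz_ev);
const ssize_t *shape_strides = shape_strides_owner.get();

// Any kernel submission using the indexer must depend on copy_ev; the indexer
// maps a flat work-item id to (src, dst) displacements via the packed buffer.
const ou_ns::TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset,
                                               shape_strides};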
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utilities for determining if an array is a valid output +/// array. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include + +#include "dpctl4pybind11.hpp" + +namespace dpctl::tensor::validation +{ +namespace py = pybind11; + +/*! @brief Raises a value error if an array is read-only. + + This should be called with an array before writing.*/ +struct CheckWritable +{ + static void throw_if_not_writable(const dpctl::tensor::usm_ndarray &arr) + { + if (!arr.is_writable()) { + throw py::value_error("output array is read-only."); + } + return; + } +}; + +/*! @brief Raises a value error if an array's memory is not sufficiently ample + to accommodate an input number of elements. + + This should be called with an array before writing.*/ +struct AmpleMemory +{ + template + static void throw_if_not_ample(const dpctl::tensor::usm_ndarray &arr, + T nelems) + { + auto arr_offsets = arr.get_minmax_offsets(); + T range = static_cast(arr_offsets.second - arr_offsets.first); + if (range + 1 < nelems) { + throw py::value_error("Memory addressed by the output array is not " + "sufficiently ample."); + } + return; + } +}; +} // namespace dpctl::tensor::validation diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp new file mode 100644 index 000000000000..0bed181802ae --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -0,0 +1,996 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
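The validators in output_validation.hpp above are meant to run on the host before any kernel writes to a user-supplied array. A minimal sketch of a typical call site (the function name and its arguments are placeholders):

#include <cstddef>

void validate_output(const dpctl::tensor::usm_ndarray &dst, std::size_t nelems)
{
    // throws py::value_error if the array is read-only
    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
    // throws py::value_error if fewer than nelems elements are addressable
    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems);
}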
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines CIndexer_array, and CIndexer_vector classes, as well +/// iteration space simplifiers. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace dpctl::tensor::strides +{ +/* An N-dimensional array can be stored in a single + * contiguous chunk of memory by contiguously laying + * array elements in lexicographinc order of their + * array indices. Such a layout is called C-contiguous. + * + * E.g. for (2, 3, 2) array `a` with zero-based indexing convention + * the C-array's elements are + * { a[0,0,0], a[0,0,1], a[0,1,0], a[0,1,1], a[0,2,0], a[0,2,1], + * a[1,0,0], a[1,0,1], a[1,1,0], a[1,1,1], a[1,2,0], a[1,2,1] } + * + * Indexer maps zero-based index in C-array to a multi-index + * for the purpose of computing element displacement in the + * strided array, i.e. in the above example for k = 5, the displacement + * is (s0*0 + s1*2 + s2*1), and for k = 7 it is (s0*1 + s1*0 + s2*1) + * for N-dimensional array with strides (s0, s1, s2). + * + * Cindexer_vector need not know array rank `dim` at compile time. + * Shape and strides are stored in std::vector, which are not trivially + * copyable. + * + * For the class to be trivially copyable for offloading displacement + * computation methods take accessor/pointer arguments of type T for + * shape and stride and modify displacement argument passed by reference. 
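 *
 * For example, with shape (2, 3, 2) and (illustrative) strides (12, 4, 1)
 * the flat index k = 7 unravels to the multi-index (1, 0, 1), so
 * get_displacement() reports 12*1 + 4*0 + 1*1 = 13.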
+ */ +template +class CIndexer_vector +{ + static_assert(std::is_integral::value, "Integral type is required"); + static_assert(std::is_signed::value, + "Signed integral type is required"); + int nd; + +public: + CIndexer_vector(int dim) : nd(dim) {} + + template + indT size(const ShapeTy &shape) const + { + indT s = static_cast(1); + for (int i = 0; i < nd; ++i) { + s *= shape[i]; + } + return s; + } + + template + void get_displacement(const indT i, + const ShapeTy &shape, + const StridesTy &stride, + indT &disp) const + { + if (nd == 1) { + disp = i * stride[0]; + return; + } + + indT i_ = i; + indT d = 0; + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + d += r * stride[dim]; + i_ = q; + } + disp = d + i_ * stride[0]; + } + + template + void get_displacement(const indT i, + const ShapeTy &shape, + const StridesTy &stride1, + const StridesTy &stride2, + indT &disp1, + indT &disp2) const + { + if (nd == 1) { + disp1 = i * stride1[0]; + disp2 = i * stride2[0]; + return; + } + + indT i_ = i; + indT d1 = 0, d2 = 0; + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + i_ = q; + d1 += r * stride1[dim]; + d2 += r * stride2[dim]; + } + disp1 = d1 + i_ * stride1[0]; + disp2 = d2 + i_ * stride2[0]; + return; + } + + template + void get_displacement(const indT i, + const ShapeTy &shape, + const StridesTy &stride1, + const StridesTy &stride2, + const StridesTy &stride3, + indT &disp1, + indT &disp2, + indT &disp3) const + { + if (nd == 1) { + disp1 = i * stride1[0]; + disp2 = i * stride2[0]; + disp3 = i * stride3[0]; + return; + } + + indT i_ = i; + indT d1 = 0, d2 = 0, d3 = 0; + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + i_ = q; + d1 += r * stride1[dim]; + d2 += r * stride2[dim]; + d3 += r * stride3[dim]; + }; + disp1 = d1 + i_ * stride1[0]; + disp2 = d2 + i_ * stride2[0]; + disp3 = d3 + i_ * stride3[0]; + return; + } + + template + void get_displacement(const indT i, + const ShapeTy &shape, + const StridesTy &stride1, + const StridesTy &stride2, + const StridesTy &stride3, + const StridesTy &stride4, + indT &disp1, + indT &disp2, + indT &disp3, + indT &disp4) const + { + if (nd == 1) { + disp1 = i * stride1[0]; + disp2 = i * stride2[0]; + disp3 = i * stride3[0]; + disp4 = i * stride4[0]; + return; + } + + indT i_ = i; + indT d1 = 0, d2 = 0, d3 = 0, d4 = 0; + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + i_ = q; + d1 += r * stride1[dim]; + d2 += r * stride2[dim]; + d3 += r * stride3[dim]; + d4 += r * stride4[dim]; + } + disp1 = d1 + i_ * stride1[0]; + disp2 = d2 + i_ * stride2[0]; + disp3 = d3 + i_ * stride3[0]; + disp4 = d4 + i_ * stride4[0]; + return; + } + + template + void get_displacement(const indT i, + const ShapeTy &shape, + const std::array &strides, + std::array &disps) const + { + if (nd == 1) { + for (int k = 0; k < nstrides; ++k) { + disps[k] = i * strides[k][0]; + } + return; + } + + indT i_ = i; + std::array ds; + for (int k = 0; k < nstrides; ++k) { + ds[k] = 0; + } + + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + for (int k = 0; k < nstrides; ++k) { + ds[k] += r * strides[k][dim]; + } + i_ = q; + }; + for (int k = 0; k < nstrides; ++k) { + disps[k] = ds[k] + i_ * strides[k][0]; + } + return; + } + + 
template + void get_left_rolled_displacement(const indT i, + const ShapeTy &shape, + const StridesTy &stride, + const StridesTy &shifts, + indT &disp) const + { + indT i_ = i; + indT d(0); + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + // assumes si > shifts[dim] >= 0 + const indT shifted_r = + (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]); + d += shifted_r * stride[dim]; + i_ = q; + } + const indT shifted_r = + (i_ < shifts[0] ? i_ + shape[0] - shifts[0] : i_ - shifts[0]); + disp = d + shifted_r * stride[0]; + } +}; + +/* + * CIndexer is for arrays whose array-rank is known at compile time. + * Statically allocated shape and multi_index arrays are members of + * the class instance, and it remains trivially copyable. + * + * Method `set(k)` populates work-item private array multi_index, which + * can be accessed using `get()` to compute the displacement as needed. + */ + +template +class CIndexer_array +{ + static constexpr int ndim = _ndim; + + static_assert(std::is_integral::value, "Integral type is required"); + static_assert(std::is_signed::value, + "Signed integral type is required"); + static_assert(ndim > 0, "Dimensionality must be positive"); + +private: + typedef std::array index_t; + + indT elem_count; + index_t shape; + index_t multi_index; + +public: + CIndexer_array() : elem_count(0), shape{}, multi_index{} {} + + explicit CIndexer_array(const index_t &input_shape) + : elem_count(0), shape{}, multi_index{} + { + indT s(1); + for (int i = 0; i < ndim; ++i) { + shape[i] = input_shape[i]; + s *= input_shape[i]; + } + elem_count = s; + } + + indT size() const + { + return elem_count; + } + indT rank() const + { + return ndim; + } + + void set(const indT i) + { + if (ndim == 1) { + multi_index[0] = i; + return; + } + + indT i_ = i; +#pragma unroll + for (int dim = ndim; --dim > 0;) { + indT si = shape[dim]; + indT q = i_ / si; + multi_index[dim] = i_ - q * si; + i_ = q; + } + multi_index[0] = i_; + } + + const index_t &get() const + { + return multi_index; + } +}; + +/* + For purposes of iterating over elements of array with + `shape` and `strides` given as pointers + `simplify_iteration_strides(nd, shape_ptr, strides_ptr, disp)` + may modify memory and returns new length of these arrays. + + The new shape and new strides, as well as the offset + `(new_shape, new_strides, disp)` are such that iterating over + them will traverse the same elements, possibly in + different order. + + ..Example: python + import itertools + # for some array Y over whose elements we iterate + csh, cst, cp = contract_iter(Y.shape, Y.strides) + def pointers_set(sh, st, p): + citers = itertools.product(*map(lambda s: range(s), sh)) + dot = lambda st, it: sum(st[k]*it[k] for k in range(len(st))) + return set(p + dot(st, it) for it in citers) + ps1 = pointers_set(csh, cst, cp) + ps2 = pointers_set(Y.shape, Y.strides, 0) + assert ps1 == ps2 + + */ +template +int simplify_iteration_stride(const int nd, + ShapeTy *shape, + StridesTy *strides, + StridesTy &disp) +{ + disp = StridesTy(0); + if (nd < 2) + return nd; + + std::vector pos(nd); + std::iota(pos.begin(), pos.end(), 0); + + std::stable_sort( + pos.begin(), pos.end(), [&strides, &shape](int i1, int i2) { + auto abs_str1 = (strides[i1] < 0) ? -strides[i1] : strides[i1]; + auto abs_str2 = (strides[i2] < 0) ? 
-strides[i2] : strides[i2]; + return (abs_str1 > abs_str2) || + (abs_str1 == abs_str2 && shape[i1] > shape[i2]); + }); + + std::vector shape_w; + std::vector strides_w; + int nd_ = nd; + shape_w.reserve(nd_); + strides_w.reserve(nd_); + + for (int i = 0; i < nd; ++i) { + auto p = pos[i]; + auto sh_p = shape[p]; + auto str_p = strides[p]; + shape_w.push_back(sh_p); + if (str_p < 0) { + disp += str_p * (sh_p - 1); + str_p = -str_p; + } + strides_w.push_back(str_p); + } + + { + bool changed; + do { + changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy step = strides_w[i + 1]; + StridesTy jump = strides_w[i] - (shape_w[i + 1] - 1) * step; + if (jump == step) { + changed = true; + for (int k = i; k + 1 < nd_; ++k) { + strides_w[k] = strides_w[k + 1]; + } + shape_w[i] *= shape_w[i + 1]; + for (int k = i + 1; k + 1 < nd_; ++k) { + shape_w[k] = shape_w[k + 1]; + } + --nd_; + } + } + } while (changed); + } + + for (int i = 0; i < nd_; ++i) { + shape[i] = shape_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides[i] = strides_w[i]; + } + + return nd_; +} + +/* + For purposes of iterating over pairs of elements of two arrays + with `shape` and strides `strides1`, `strides2` given as pointers + `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr, + strides2_ptr, disp1, disp2)` + may modify memory and returns new length of these arrays. + + The new shape and new strides, as well as the offset + `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that + iterating over them will traverse the same set of pairs of elements, + possibly in a different order. + */ +template +int simplify_iteration_two_strides(const int nd, + ShapeTy *shape, + StridesTy *strides1, + StridesTy *strides2, + StridesTy &disp1, + StridesTy &disp2) +{ + disp1 = StridesTy(0); + disp2 = StridesTy(0); + if (nd < 2) + return nd; + + std::vector pos(nd); + std::iota(pos.begin(), pos.end(), 0); + + std::stable_sort( + pos.begin(), pos.end(), [&strides1, &strides2, &shape](int i1, int i2) { + auto abs_str1_i1 = + (strides1[i1] < 0) ? -strides1[i1] : strides1[i1]; + auto abs_str1_i2 = + (strides1[i2] < 0) ? -strides1[i2] : strides1[i2]; + auto abs_str2_i1 = + (strides2[i1] < 0) ? -strides2[i1] : strides2[i1]; + auto abs_str2_i2 = + (strides2[i2] < 0) ? 
-strides2[i2] : strides2[i2]; + return (abs_str2_i1 > abs_str2_i2) || + (abs_str2_i1 == abs_str2_i2 && + (abs_str1_i1 > abs_str1_i2 || + (abs_str1_i1 == abs_str1_i2 && shape[i1] > shape[i2]))); + }); + + std::vector shape_w; + std::vector strides1_w; + std::vector strides2_w; + + bool contractable = true; + for (int i = 0; i < nd; ++i) { + auto p = pos[i]; + auto sh_p = shape[p]; + auto str1_p = strides1[p]; + auto str2_p = strides2[p]; + shape_w.push_back(sh_p); + if (str1_p <= 0 && str2_p <= 0 && std::min(str1_p, str2_p) < 0) { + disp1 += str1_p * (sh_p - 1); + str1_p = -str1_p; + disp2 += str2_p * (sh_p - 1); + str2_p = -str2_p; + } + if (str1_p < 0 || str2_p < 0) { + contractable = false; + } + strides1_w.push_back(str1_p); + strides2_w.push_back(str2_p); + } + + int nd_ = nd; + while (contractable) { + bool changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy str1 = strides1_w[i + 1]; + StridesTy str2 = strides2_w[i + 1]; + StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1; + StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2; + + if (jump1 == str1 && jump2 == str2) { + changed = true; + shape_w[i] *= shape_w[i + 1]; + for (int j = i; j < nd_; ++j) { + strides1_w[j] = strides1_w[j + 1]; + } + for (int j = i; j < nd_; ++j) { + strides2_w[j] = strides2_w[j + 1]; + } + for (int j = i + 1; j + 1 < nd_; ++j) { + shape_w[j] = shape_w[j + 1]; + } + --nd_; + break; + } + } + if (!changed) + break; + } + for (int i = 0; i < nd_; ++i) { + shape[i] = shape_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides1[i] = strides1_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides2[i] = strides2_w[i]; + } + + return nd_; +} + +template > +std::tuple contract_iter(const vecT &shape, const vecT &strides) +{ + const std::size_t dim = shape.size(); + if (dim != strides.size()) { + throw Error("Shape and strides must be of equal size."); + } + vecT out_shape = shape; + vecT out_strides = strides; + T disp(0); + + int nd = simplify_iteration_stride(dim, out_shape.data(), + out_strides.data(), disp); + out_shape.resize(nd); + out_strides.resize(nd); + return std::make_tuple(out_shape, out_strides, disp); +} + +template > +std::tuple contract_iter2(const vecT &shape, + const vecT &strides1, + const vecT &strides2) +{ + const std::size_t dim = shape.size(); + if (dim != strides1.size() || dim != strides2.size()) { + throw Error("Shape and strides must be of equal size."); + } + vecT out_shape = shape; + vecT out_strides1 = strides1; + vecT out_strides2 = strides2; + T disp1(0); + T disp2(0); + + int nd = simplify_iteration_two_strides(dim, out_shape.data(), + out_strides1.data(), + out_strides2.data(), disp1, disp2); + out_shape.resize(nd); + out_strides1.resize(nd); + out_strides2.resize(nd); + return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2); +} + +/* + For purposes of iterating over pairs of elements of three arrays + with `shape` and strides `strides1`, `strides2`, `strides3` given as + pointers `simplify_iteration_three_strides(nd, shape_ptr, strides1_ptr, + strides2_ptr, strides3_ptr, disp1, disp2, disp3)` + may modify memory and returns new length of these arrays. + + The new shape and new strides, as well as the offset + `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3)` + are such that iterating over them will traverse the same set of tuples of + elements, possibly in a different order. 
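+
+   For example, for shape (2, 3) with strides1 = strides2 = strides3 = (3, 1)
+   (three C-contiguous layouts), the iteration space collapses to shape (6,)
+   with unit strides and zero displacements.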
+ */ +template +int simplify_iteration_three_strides(const int nd, + ShapeTy *shape, + StridesTy *strides1, + StridesTy *strides2, + StridesTy *strides3, + StridesTy &disp1, + StridesTy &disp2, + StridesTy &disp3) +{ + disp1 = StridesTy(0); + disp2 = StridesTy(0); + if (nd < 2) + return nd; + + std::vector pos(nd); + std::iota(pos.begin(), pos.end(), 0); + + std::stable_sort(pos.begin(), pos.end(), + [&strides1, &strides2, &strides3, &shape](int i1, int i2) { + auto abs_str1_i1 = + (strides1[i1] < 0) ? -strides1[i1] : strides1[i1]; + auto abs_str1_i2 = + (strides1[i2] < 0) ? -strides1[i2] : strides1[i2]; + auto abs_str2_i1 = + (strides2[i1] < 0) ? -strides2[i1] : strides2[i1]; + auto abs_str2_i2 = + (strides2[i2] < 0) ? -strides2[i2] : strides2[i2]; + auto abs_str3_i1 = + (strides3[i1] < 0) ? -strides3[i1] : strides3[i1]; + auto abs_str3_i2 = + (strides3[i2] < 0) ? -strides3[i2] : strides3[i2]; + return (abs_str3_i1 > abs_str3_i2) || + ((abs_str3_i1 == abs_str3_i2) && + ((abs_str2_i1 > abs_str2_i2) || + ((abs_str2_i1 == abs_str2_i2) && + ((abs_str1_i1 > abs_str1_i2) || + ((abs_str1_i1 == abs_str1_i2) && + (shape[i1] > shape[i2])))))); + }); + + std::vector shape_w; + std::vector strides1_w; + std::vector strides2_w; + std::vector strides3_w; + + bool contractable = true; + for (int i = 0; i < nd; ++i) { + auto p = pos[i]; + auto sh_p = shape[p]; + auto str1_p = strides1[p]; + auto str2_p = strides2[p]; + auto str3_p = strides3[p]; + shape_w.push_back(sh_p); + if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && + std::min({str1_p, str2_p, str3_p}) < 0) + { + disp1 += str1_p * (sh_p - 1); + str1_p = -str1_p; + disp2 += str2_p * (sh_p - 1); + str2_p = -str2_p; + disp3 += str3_p * (sh_p - 1); + str3_p = -str3_p; + } + if (str1_p < 0 || str2_p < 0 || str3_p < 0) { + contractable = false; + } + strides1_w.push_back(str1_p); + strides2_w.push_back(str2_p); + strides3_w.push_back(str3_p); + } + int nd_ = nd; + while (contractable) { + bool changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy str1 = strides1_w[i + 1]; + StridesTy str2 = strides2_w[i + 1]; + StridesTy str3 = strides3_w[i + 1]; + StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1; + StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2; + StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3; + + if (jump1 == str1 && jump2 == str2 && jump3 == str3) { + changed = true; + shape_w[i] *= shape_w[i + 1]; + for (int j = i; j < nd_; ++j) { + strides1_w[j] = strides1_w[j + 1]; + } + for (int j = i; j < nd_; ++j) { + strides2_w[j] = strides2_w[j + 1]; + } + for (int j = i; j < nd_; ++j) { + strides3_w[j] = strides3_w[j + 1]; + } + for (int j = i + 1; j + 1 < nd_; ++j) { + shape_w[j] = shape_w[j + 1]; + } + --nd_; + break; + } + } + if (!changed) + break; + } + for (int i = 0; i < nd_; ++i) { + shape[i] = shape_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides1[i] = strides1_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides2[i] = strides2_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides3[i] = strides3_w[i]; + } + + return nd_; +} + +template > +std::tuple contract_iter3(const vecT &shape, + const vecT &strides1, + const vecT &strides2, + const vecT &strides3) +{ + const std::size_t dim = shape.size(); + if (dim != strides1.size() || dim != strides2.size() || + dim != strides3.size()) { + throw Error("Shape and strides must be of equal size."); + } + vecT out_shape = shape; + vecT out_strides1 = strides1; + vecT out_strides2 = strides2; + vecT out_strides3 = strides3; + T disp1(0); + T 
disp2(0); + T disp3(0); + + int nd = simplify_iteration_three_strides( + dim, out_shape.data(), out_strides1.data(), out_strides2.data(), + out_strides3.data(), disp1, disp2, disp3); + out_shape.resize(nd); + out_strides1.resize(nd); + out_strides2.resize(nd); + out_strides3.resize(nd); + return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2, + out_strides3, disp3); +} + +/* + For purposes of iterating over pairs of elements of four arrays + with `shape` and strides `strides1`, `strides2`, `strides3`, + `strides4` given as pointers `simplify_iteration_four_strides(nd, + shape_ptr, strides1_ptr, strides2_ptr, strides3_ptr, strides4_ptr, + disp1, disp2, disp3, disp4)` may modify memory and returns new + length of these arrays. + + The new shape and new strides, as well as the offset + `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3, + new_stride4, disp4)` are such that iterating over them will traverse the + same set of tuples of elements, possibly in a different order. + */ +template +int simplify_iteration_four_strides(const int nd, + ShapeTy *shape, + StridesTy *strides1, + StridesTy *strides2, + StridesTy *strides3, + StridesTy *strides4, + StridesTy &disp1, + StridesTy &disp2, + StridesTy &disp3, + StridesTy &disp4) +{ + disp1 = StridesTy(0); + disp2 = StridesTy(0); + if (nd < 2) + return nd; + + std::vector pos(nd); + std::iota(pos.begin(), pos.end(), 0); + + std::stable_sort( + pos.begin(), pos.end(), + [&strides1, &strides2, &strides3, &strides4, &shape](int i1, int i2) { + auto abs_str1_i1 = + (strides1[i1] < 0) ? -strides1[i1] : strides1[i1]; + auto abs_str1_i2 = + (strides1[i2] < 0) ? -strides1[i2] : strides1[i2]; + auto abs_str2_i1 = + (strides2[i1] < 0) ? -strides2[i1] : strides2[i1]; + auto abs_str2_i2 = + (strides2[i2] < 0) ? -strides2[i2] : strides2[i2]; + auto abs_str3_i1 = + (strides3[i1] < 0) ? -strides3[i1] : strides3[i1]; + auto abs_str3_i2 = + (strides3[i2] < 0) ? -strides3[i2] : strides3[i2]; + auto abs_str4_i1 = + (strides4[i1] < 0) ? -strides4[i1] : strides4[i1]; + auto abs_str4_i2 = + (strides4[i2] < 0) ? 
-strides4[i2] : strides4[i2]; + return (abs_str4_i1 > abs_str4_i2) || + ((abs_str4_i1 == abs_str4_i2) && + ((abs_str3_i1 > abs_str3_i2) || + ((abs_str3_i1 == abs_str3_i2) && + ((abs_str2_i1 > abs_str2_i2) || + ((abs_str2_i1 == abs_str2_i2) && + ((abs_str1_i1 > abs_str1_i2) || + ((abs_str1_i1 == abs_str1_i2) && + (shape[i1] > shape[i2])))))))); + }); + + std::vector shape_w; + std::vector strides1_w; + std::vector strides2_w; + std::vector strides3_w; + std::vector strides4_w; + + bool contractable = true; + for (int i = 0; i < nd; ++i) { + auto p = pos[i]; + auto sh_p = shape[p]; + auto str1_p = strides1[p]; + auto str2_p = strides2[p]; + auto str3_p = strides3[p]; + auto str4_p = strides4[p]; + shape_w.push_back(sh_p); + if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && str4_p <= 0 && + std::min({str1_p, str2_p, str3_p, str4_p}) < 0) + { + disp1 += str1_p * (sh_p - 1); + str1_p = -str1_p; + disp2 += str2_p * (sh_p - 1); + str2_p = -str2_p; + disp3 += str3_p * (sh_p - 1); + str3_p = -str3_p; + disp4 += str4_p * (sh_p - 1); + str4_p = -str4_p; + } + if (str1_p < 0 || str2_p < 0 || str3_p < 0 || str4_p < 0) { + contractable = false; + } + strides1_w.push_back(str1_p); + strides2_w.push_back(str2_p); + strides3_w.push_back(str3_p); + strides4_w.push_back(str4_p); + } + int nd_ = nd; + while (contractable) { + bool changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy str1 = strides1_w[i + 1]; + StridesTy str2 = strides2_w[i + 1]; + StridesTy str3 = strides3_w[i + 1]; + StridesTy str4 = strides4_w[i + 1]; + StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1; + StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2; + StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3; + StridesTy jump4 = strides4_w[i] - (shape_w[i + 1] - 1) * str4; + + if (jump1 == str1 && jump2 == str2 && jump3 == str3 && + jump4 == str4) { + changed = true; + shape_w[i] *= shape_w[i + 1]; + for (int j = i; j < nd_; ++j) { + strides1_w[j] = strides1_w[j + 1]; + } + for (int j = i; j < nd_; ++j) { + strides2_w[j] = strides2_w[j + 1]; + } + for (int j = i; j < nd_; ++j) { + strides3_w[j] = strides3_w[j + 1]; + } + for (int j = i; j < nd_; ++j) { + strides4_w[j] = strides4_w[j + 1]; + } + for (int j = i + 1; j + 1 < nd_; ++j) { + shape_w[j] = shape_w[j + 1]; + } + --nd_; + break; + } + } + if (!changed) + break; + } + for (int i = 0; i < nd_; ++i) { + shape[i] = shape_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides1[i] = strides1_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides2[i] = strides2_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides3[i] = strides3_w[i]; + } + for (int i = 0; i < nd_; ++i) { + strides4[i] = strides4_w[i]; + } + + return nd_; +} + +template > +std::tuple + contract_iter4(const vecT &shape, + const vecT &strides1, + const vecT &strides2, + const vecT &strides3, + const vecT &strides4) +{ + const std::size_t dim = shape.size(); + if (dim != strides1.size() || dim != strides2.size() || + dim != strides3.size() || dim != strides4.size()) + { + throw Error("Shape and strides must be of equal size."); + } + vecT out_shape = shape; + vecT out_strides1 = strides1; + vecT out_strides2 = strides2; + vecT out_strides3 = strides3; + vecT out_strides4 = strides4; + T disp1(0); + T disp2(0); + T disp3(0); + T disp4(0); + + int nd = simplify_iteration_four_strides( + dim, out_shape.data(), out_strides1.data(), out_strides2.data(), + out_strides3.data(), out_strides4.data(), disp1, disp2, disp3, disp4); + out_shape.resize(nd); + out_strides1.resize(nd); + 
out_strides2.resize(nd); + out_strides3.resize(nd); + out_strides4.resize(nd); + return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2, + out_strides3, disp3, out_strides4, disp4); +} + +/* + For purposes of iterating over elements of an array with `shape` and + strides `strides` given as pointers `compact_iteration(nd, shape, strides)` + may modify memory and returns the new length of the array. + + The new shape and new strides `(new_shape, new_strides)` are such that + iterating over them will traverse the same elements in the same order, + possibly with reduced dimensionality. + */ +template +int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides) +{ + if (nd < 2) + return nd; + + bool contractable = true; + for (int i = 0; i < nd; ++i) { + if (strides[i] < 0) { + contractable = false; + } + } + + int nd_ = nd; + while (contractable) { + bool changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy str = strides[i + 1]; + StridesTy jump = strides[i] - (shape[i + 1] - 1) * str; + + if (jump == str) { + changed = true; + shape[i] *= shape[i + 1]; + for (int j = i; j < nd_; ++j) { + strides[j] = strides[j + 1]; + } + for (int j = i + 1; j + 1 < nd_; ++j) { + shape[j] = shape[j + 1]; + } + --nd_; + break; + } + } + if (!changed) + break; + } + + return nd_; +} +} // namespace dpctl::tensor::strides diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp new file mode 100644 index 000000000000..76f0174b9fdf --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -0,0 +1,223 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines CIndexer_array, and CIndexer_vector classes, as well +/// iteration space simplifiers. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::alloc_utils +{ +template +class usm_host_allocator : public sycl::usm_allocator +{ +public: + using baseT = sycl::usm_allocator; + using baseT::baseT; + + template + struct rebind + { + typedef usm_host_allocator other; + }; + + void deallocate(T *ptr, std::size_t n) + { + try { + baseT::deallocate(ptr, n); + } catch (const std::exception &e) { + std::cerr + << "Exception caught in `usm_host_allocator::deallocate`: " + << e.what() << std::endl; + } + } +}; + +template +void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept +{ + try { + sycl::free(ptr, ctx); + } catch (const std::exception &e) { + std::cerr << "Call to sycl::free caught exception: " << e.what() + << std::endl; + } +} + +template +void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept +{ + sycl_free_noexcept(ptr, q.get_context()); +} + +class USMDeleter +{ +private: + sycl::context ctx_; + +public: + USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {} + USMDeleter(const sycl::context &ctx) : ctx_(ctx) {} + + template + void operator()(T *ptr) const + { + sycl_free_noexcept(ptr, ctx_); + } +}; + +template +std::unique_ptr + smart_malloc(std::size_t count, + const sycl::queue &q, + sycl::usm::alloc kind, + const sycl::property_list &propList = {}) +{ + T *ptr = sycl::malloc(count, q, kind, propList); + if (nullptr == ptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + auto usm_deleter = USMDeleter(q); + return std::unique_ptr(ptr, usm_deleter); +} + +template +std::unique_ptr + smart_malloc_device(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::device, propList); +} + +template +std::unique_ptr + smart_malloc_shared(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::shared, propList); +} + +template +std::unique_ptr + smart_malloc_host(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::host, propList); +} + +namespace detail +{ +template +struct valid_smart_ptr : public std::false_type +{ +}; + +template +struct valid_smart_ptr &> + : public std::is_same +{ +}; + +template +struct valid_smart_ptr> + : public std::is_same +{ +}; + +// base case +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = true; +}; + +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = valid_smart_ptr::value && + (all_valid_smart_ptrs::value); +}; +} // end of namespace detail + +/*! 
@brief Submit host_task and transfer ownership from smart pointers to it */ +template +sycl::event async_smart_free(sycl::queue &exec_q, + const std::vector &depends, + UniquePtrTs &&...unique_pointers) +{ + static constexpr std::size_t n = sizeof...(UniquePtrTs); + static_assert( + n > 0, "async_smart_free requires at least one smart pointer argument"); + + static_assert( + detail::all_valid_smart_ptrs::value, + "async_smart_free requires unique_ptr created with smart_malloc"); + + std::vector ptrs; + ptrs.reserve(n); + (ptrs.push_back(reinterpret_cast(unique_pointers.get())), ...); + + std::vector dels; + dels.reserve(n); + (dels.emplace_back(unique_pointers.get_deleter()), ...); + + sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.host_task([ptrs = std::move(ptrs), dels = std::move(dels)]() { + for (std::size_t i = 0; i < ptrs.size(); ++i) { + dels[i](ptrs[i]); + } + }); + }); + + // Upon successful submission of host_task, USM allocations are owned + // by the host_task. Release smart pointer ownership to avoid double + // deallocation + (unique_pointers.release(), ...); + + return ht_e; +} +} // namespace dpctl::tensor::alloc_utils diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp new file mode 100644 index 000000000000..1cb70adafeec --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -0,0 +1,662 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utilities used for kernel submission. 
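+/// It provides work-group size selection, custom work-group reduction and
+/// inclusive-scan helpers, reduction functors together with their identity
+/// values, and sub-group load/store wrappers.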
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "math_utils.hpp" + +namespace dpctl::tensor::sycl_utils +{ +namespace detail +{ +template +struct TypeList; + +template +struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> +struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template +struct IsContained : std::false_type +{ +}; + +template +struct IsComplex : std::false_type +{ +}; +template +struct IsComplex> : std::true_type +{ +}; +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template +struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value; +}; + +/*! @brief Find the smallest multiple of supported sub-group size larger than + * nelems */ +template +std::size_t choose_workgroup_size(const std::size_t nelems, + const std::vector &sg_sizes) +{ + std::vector wg_choices; + wg_choices.reserve(f * sg_sizes.size()); + + for (const auto &sg_size : sg_sizes) { +#pragma unroll + for (std::size_t i = 1; i <= f; ++i) { + wg_choices.push_back(sg_size * i); + } + } + std::sort(std::begin(wg_choices), std::end(wg_choices)); + + std::size_t wg = 1; + for (std::size_t i = 0; i < wg_choices.size(); ++i) { + if (wg_choices[i] == wg) { + continue; + } + wg = wg_choices[i]; + std::size_t n_groups = ((nelems + wg - 1) / wg); + if (n_groups == 1) + break; + } + + return wg; +} + +namespace detail +{ + +template +void _fold(LocAccT &local_mem_acc, + const std::uint32_t lid, + const std::uint32_t cutoff, + const std::uint32_t step, + const OpT &op) +{ + if (lid < cutoff) { + local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]); + } +} + +template +void _fold(LocAccT &local_mem_acc, + const std::uint32_t lid, + const std::uint32_t step, + const OpT &op) +{ + if (lid < step) { + local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]); + } +} + +} // end of namespace detail + +template +T custom_reduce_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T &local_val, + const OpT &op) +{ + // value experimentally tuned to achieve best runtime on Iris Xe, + // Arc A140V integrated Intel GPUs, and discrete Intel Max GPU. 
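+    // low_sz is the fold cut-off: partial values are tree-folded in local
+    // memory, halving the number of live work-items per step (separate paths
+    // below handle power-of-two and non-power-of-two work-group sizes), until
+    // roughly this many remain; the group leader then combines the remainder
+    // sequentially and the result is broadcast to the whole work-group.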
+ static constexpr std::uint32_t low_sz = 8u; + // maximal work-group size + static constexpr std::uint32_t high_sz = 1024u; + const std::uint32_t wgs = wg.get_local_linear_range(); + const std::uint32_t lid = wg.get_local_linear_id(); + + local_mem_acc[lid] = local_val; + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + std::uint32_t n_witems = wgs; + if (wgs & (wgs - 1)) { + // wgs is not a power of 2 +#pragma unroll + for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) { + if (n_witems >= sz) { + const std::uint32_t n_witems_ = (n_witems + 1) >> 1; + detail::_fold(local_mem_acc, lid, n_witems - n_witems_, + n_witems_, op); + sycl::group_barrier(wg, sycl::memory_scope::work_group); + n_witems = n_witems_; + } + } + } + else { + // wgs is a power of 2 +#pragma unroll + for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) { + if (n_witems >= sz) { + n_witems >>= 1; + detail::_fold(local_mem_acc, lid, n_witems, op); + sycl::group_barrier(wg, sycl::memory_scope::work_group); + } + } + } + + T red_val_over_wg = local_mem_acc[0]; + if (wg.leader()) { + for (std::uint32_t i = 1; i < n_witems; ++i) { + red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]); + } + } + + return sycl::group_broadcast(wg, red_val_over_wg, 0); +} + +template +T custom_inclusive_scan_over_group(GroupT &&wg, + SubGroupT &&sg, + LocAccT &&local_mem_acc, + const T &local_val, + const T &identity, + OpT &&op) +{ + const std::uint32_t local_id = wg.get_local_id(0); + const std::uint32_t wgs = wg.get_local_range(0); + + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sgSize = sg.get_local_range()[0]; + + T scan_val = local_val; + for (std::uint32_t step = 1; step < sgSize; step *= 2) { + const bool advanced_lane = (lane_id >= step); + const std::uint32_t src_lane_id = + (advanced_lane ? lane_id - step : lane_id); + const T modifier = sycl::select_from_group(sg, scan_val, src_lane_id); + if (advanced_lane) { + scan_val = op(scan_val, modifier); + } + } + + local_mem_acc[local_id] = scan_val; + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + const std::uint32_t max_sgSize = sg.get_max_local_range()[0]; + const std::uint32_t sgr_id = sg.get_group_id()[0]; + + // now scan + const std::uint32_t n_aggregates = 1 + ((wgs - 1) / max_sgSize); + const bool large_wg = (n_aggregates > max_sgSize); + if (large_wg) { + if (wg.leader()) { + T _scan_val = identity; + for (std::uint32_t i = 1; i <= n_aggregates - max_sgSize; ++i) { + _scan_val = op(local_mem_acc[i * max_sgSize - 1], _scan_val); + local_mem_acc[i * max_sgSize - 1] = _scan_val; + } + } + sycl::group_barrier(wg, sycl::memory_scope::work_group); + } + + if (sgr_id == 0) { + const std::uint32_t offset = + (large_wg) ? n_aggregates - max_sgSize : 0u; + const bool in_range = (lane_id < n_aggregates); + const bool in_bounds = in_range && (lane_id > 0 || large_wg); + + T __scan_val = (in_bounds) + ? local_mem_acc[(offset + lane_id) * max_sgSize - 1] + : identity; + for (std::uint32_t step = 1; step < sgSize; step *= 2) { + const bool advanced_lane = (lane_id >= step); + const std::uint32_t src_lane_id = + (advanced_lane ? 
lane_id - step : lane_id); + const T modifier = + sycl::select_from_group(sg, __scan_val, src_lane_id); + if (advanced_lane && in_range) { + __scan_val = op(__scan_val, modifier); + } + } + if (in_bounds) { + local_mem_acc[(offset + lane_id) * max_sgSize - 1] = __scan_val; + } + } + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + if (sgr_id > 0) { + const T modifier = local_mem_acc[sgr_id * max_sgSize - 1]; + scan_val = op(scan_val, modifier); + } + + // ensure all work-items finished reading from SLM + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return scan_val; +} + +// Reduction functors + +// Maximum + +template +struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template +struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? x : y; + } + } +}; + +// Define identities and operator checking structs + +template +struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMaximum = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMinimum = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? 
static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclPlus = std::bool_constant>>; + +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMultiplies = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// LogSumExp + +template +struct LogSumExp +{ + T operator()(const T &x, const T &y) const + { + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(x, y); + } +}; + +template +using IsLogSumExp = std::bool_constant>>; + +// only defined for types with infinity +template +struct GetIdentity::value>> +{ + static constexpr T value = -std::numeric_limits::infinity(); +}; + +// Hypot + +template +struct Hypot +{ + T operator()(const T &x, const T &y) const + { + return sycl::hypot(x, y); + } +}; + +template +using IsHypot = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = 0; +}; + +// Logical_And + +template +using IsLogicalAnd = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclLogicalAnd = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// Logical_Or + +template +using IsLogicalOr = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclLogicalOr = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(0); +}; + +// Identity + +template +struct Identity +{ +}; + +template +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + +// Sub-group load/store + +#ifndef USE_GROUP_LOAD_STORE +#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE) && \ + SYCL_EXT_ONEAPI_GROUP_LOAD_STORE +#define USE_GROUP_LOAD_STORE 1 +#else +#if defined(__LIBSYCL_MAJOR_VERSION) && (__LIBSYCL_MAJOR_VERSION >= 8u) +#define USE_GROUP_LOAD_STORE 1 +#else +#define USE_GROUP_LOAD_STORE 0 +#endif +#endif +#endif + +#if (USE_GROUP_LOAD_STORE) +namespace ls_ns = sycl::ext::oneapi::experimental; +#endif + +template +auto sub_group_load(const sycl::sub_group &sg, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + using ValueT = typename std::remove_cv_t; + sycl::vec x{}; + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_load(sg, m_ptr, x, striped); + return x; +#else + return sg.load(m_ptr); +#endif +} + +template +auto sub_group_load(const sycl::sub_group &sg, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + using ValueT = typename std::remove_cv_t; + ValueT x{}; + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_load(sg, m_ptr, x, striped); + return x; +#else + return sg.load(m_ptr); +#endif +} + +template +std::enable_if_t< + std::is_same_v, std::remove_cv_t>, + void> + sub_group_store(const sycl::sub_group &sg, + const sycl::vec &val, + sycl::multi_ptr m_ptr) +{ +#if 
(USE_GROUP_LOAD_STORE) + static_assert(std::is_same_v); + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_store(sg, val, m_ptr, striped); + return; +#else + sg.store(m_ptr, val); + return; +#endif +} + +template +std::enable_if_t< + std::is_same_v, std::remove_cv_t>, + void> + sub_group_store(const sycl::sub_group &sg, + const VecT &val, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_store(sg, val, m_ptr, striped); + return; +#else + sg.store(m_ptr, val); + return; +#endif +} +} // namespace dpctl::tensor::sycl_utils diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp new file mode 100644 index 000000000000..5ec84783c901 --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -0,0 +1,134 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +/// +/// \file +/// This file defines class to implement dispatch tables for pair of types +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include "dpctl4pybind11.hpp" + +#include "type_dispatch_building.hpp" + +namespace dpctl::tensor::type_dispatch +{ +struct usm_ndarray_types +{ + int typenum_to_lookup_id(int typenum) const + { + using typenum_t = ::dpctl::tensor::type_dispatch::typenum_t; + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + if (typenum == api.UAR_DOUBLE_) { + return static_cast(typenum_t::DOUBLE); + } + else if (typenum == api.UAR_INT64_) { + return static_cast(typenum_t::INT64); + } + else if (typenum == api.UAR_INT32_) { + return static_cast(typenum_t::INT32); + } + else if (typenum == api.UAR_BOOL_) { + return static_cast(typenum_t::BOOL); + } + else if (typenum == api.UAR_CDOUBLE_) { + return static_cast(typenum_t::CDOUBLE); + } + else if (typenum == api.UAR_FLOAT_) { + return static_cast(typenum_t::FLOAT); + } + else if (typenum == api.UAR_INT16_) { + return static_cast(typenum_t::INT16); + } + else if (typenum == api.UAR_INT8_) { + return static_cast(typenum_t::INT8); + } + else if (typenum == api.UAR_UINT64_) { + return static_cast(typenum_t::UINT64); + } + else if (typenum == api.UAR_UINT32_) { + return static_cast(typenum_t::UINT32); + } + else if (typenum == api.UAR_UINT16_) { + return static_cast(typenum_t::UINT16); + } + else if (typenum == api.UAR_UINT8_) { + return static_cast(typenum_t::UINT8); + } + else if (typenum == api.UAR_CFLOAT_) { + return static_cast(typenum_t::CFLOAT); + } + else if (typenum == api.UAR_HALF_) { + return static_cast(typenum_t::HALF); + } + else if (typenum == api.UAR_INT_ || typenum == api.UAR_UINT_) { + switch (sizeof(int)) { + case sizeof(std::int32_t): + return ((typenum == api.UAR_INT_) + ? static_cast(typenum_t::INT32) + : static_cast(typenum_t::UINT32)); + case sizeof(std::int64_t): + return ((typenum == api.UAR_INT_) + ? static_cast(typenum_t::INT64) + : static_cast(typenum_t::UINT64)); + default: + throw_unrecognized_typenum_error(typenum); + } + } + else if (typenum == api.UAR_LONGLONG_ || typenum == api.UAR_ULONGLONG_) + { + switch (sizeof(long long)) { + case sizeof(std::int64_t): + return ((typenum == api.UAR_LONGLONG_) + ? static_cast(typenum_t::INT64) + : static_cast(typenum_t::UINT64)); + default: + throw_unrecognized_typenum_error(typenum); + } + } + else { + throw_unrecognized_typenum_error(typenum); + } + // return code signalling error, should never be reached + assert(false); + return -1; + } + +private: + void throw_unrecognized_typenum_error(int typenum) const + { + throw std::runtime_error("Unrecognized typenum " + + std::to_string(typenum) + " encountered."); + } +}; +} // namespace dpctl::tensor::type_dispatch diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp new file mode 100644 index 000000000000..b1e02eb1513b --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp @@ -0,0 +1,300 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines class to implement dispatch tables for pair of types +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::type_dispatch +{ +enum class typenum_t : int +{ + BOOL = 0, + INT8, // 1 + UINT8, + INT16, + UINT16, + INT32, // 5 + UINT32, + INT64, + UINT64, + HALF, + FLOAT, // 10 + DOUBLE, + CFLOAT, + CDOUBLE, // 13 +}; +inline constexpr int num_types = 14; // number of elements in typenum_t + +template + typename factory, + int _num_types> +class DispatchTableBuilder +{ +private: + template + const std::vector row_per_dst_type() const + { + std::vector per_dstTy = { + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory>{}.get(), + factory>{}.get()}; + assert(per_dstTy.size() == _num_types); + return per_dstTy; + } + +public: + DispatchTableBuilder() = default; + ~DispatchTableBuilder() = default; + + void populate_dispatch_table(funcPtrT table[][_num_types]) const + { + const auto map_by_dst_type = {row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type>(), + row_per_dst_type>()}; + assert(map_by_dst_type.size() == _num_types); + int dst_id = 0; + for (const auto &row : map_by_dst_type) { + int src_id = 0; + for (const auto &fn_ptr : row) { + table[dst_id][src_id] = fn_ptr; + ++src_id; + } + ++dst_id; + } + } +}; + +template + typename factory, + int _num_types> +class DispatchVectorBuilder +{ +private: + template + const funcPtrT func_per_type() const + { + funcPtrT f 
= factory{}.get(); + return f; + } + +public: + DispatchVectorBuilder() = default; + ~DispatchVectorBuilder() = default; + + void populate_dispatch_vector(funcPtrT vector[]) const + { + const auto fn_map_by_type = {func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type>(), + func_per_type>()}; + assert(fn_map_by_type.size() == _num_types); + int ty_id = 0; + for (const auto &fn : fn_map_by_type) { + vector[ty_id] = fn; + ++ty_id; + } + } +}; + +/*! @brief struct to define result_type typename for Ty == ArgTy */ +template +struct TypeMapResultEntry : std::is_same +{ + using result_type = ResTy; +}; + +/*! @brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 == + * ArgTy2 */ +template +struct BinaryTypeMapResultEntry + : std::conjunction, std::is_same> +{ + using result_type = ResTy; +}; + +/*! @brief fall-through struct with specified result_type, usually void */ +template +struct DefaultResultEntry : std::true_type +{ + using result_type = Ty; +}; + +/*! @brief Utility struct to convert C++ type into typeid integer */ +template +struct GetTypeid +{ + int get() + { + if constexpr (std::is_same_v) { + return static_cast(typenum_t::BOOL); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::HALF); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::FLOAT); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::DOUBLE); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CFLOAT); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CDOUBLE); + } + else if constexpr (std::is_same_v) { // special token + return -1; + } + + assert(("Unsupported type T", false)); + return -2; + } +}; + +/*! @brief Class to generate vector of null function pointers */ +template +struct NullPtrVector +{ + + using value_type = FunPtrT; + using const_reference = value_type const &; + + NullPtrVector() : val(nullptr) {} + + const_reference operator[](int) const + { + return val; + } + +private: + value_type val; +}; + +/*! 
@brief Class to generate table of null function pointers */ +template +struct NullPtrTable +{ + using value_type = NullPtrVector; + using const_reference = value_type const &; + + NullPtrTable() : val() {} + + const_reference operator[](int) const + { + return val; + } + +private: + value_type val; +}; + +template +struct TypePairDefinedEntry + : std::conjunction, std::is_same> +{ + static constexpr bool is_defined = true; +}; + +struct NotDefinedEntry : std::true_type +{ + static constexpr bool is_defined = false; +}; +} // namespace dpctl::tensor::type_dispatch diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp new file mode 100644 index 000000000000..e5855081c727 --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions for value casting. 
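+/// It also provides device capability checks for types requiring fp16/fp64
+/// support and helpers for casting between sycl::vec value types.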
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::type_utils +{ +template +struct is_complex : public std::false_type +{ +}; + +template +struct is_complex< + T, + std::enable_if_t, std::complex> || + std::is_same_v, std::complex>>> + : public std::true_type +{ +}; + +template +inline constexpr bool is_complex_v = is_complex::value; + +template +dstTy convert_impl(const srcTy &v) +{ + if constexpr (std::is_same_v) { + return v; + } + else if constexpr (std::is_same_v) { + if constexpr (is_complex_v) { + // bool(complex_v) == + // (complex_v.real() != 0) && (complex_v.imag() !=0) + return (convert_impl(v.real()) || + convert_impl(v.imag())); + } + else { + return static_cast(v != srcTy{0}); + } + } + else if constexpr (std::is_same_v) { + // C++ interprets a byte of storage behind bool by only + // testing is least significant bit, leading to both + // 0x00 and 0x02 interpreted as False, while 0x01 and 0xFF + // interpreted as True. NumPy's interpretation of underlying + // storage is different: any bit set is interpreted as True, + // no bits set as False, see gh-2121 + const std::uint8_t &u = sycl::bit_cast(v); + if constexpr (is_complex_v) { + return (u == 0) ? dstTy{} : dstTy{1, 0}; + } + else { + return (u == 0) ? dstTy{} : dstTy{1}; + } + } + else if constexpr (is_complex_v && !is_complex_v) { + // real_t(complex_v) == real_t(complex_v.real()) + return convert_impl(v.real()); + } + else if constexpr (!std::is_integral_v && + !std::is_same_v && + std::is_integral_v && std::is_unsigned_v) + { + // first cast to signed variant, the cast to unsigned one + using signedT = typename std::make_signed_t; + return static_cast(convert_impl(v)); + } + else { + return static_cast(v); + } +} + +template +void validate_type_for_device(const sycl::device &d) +{ + if constexpr (std::is_same_v) { + if (!d.has(sycl::aspect::fp64)) { + throw std::runtime_error("Device " + + d.get_info() + + " does not support type 'float64'"); + } + } + else if constexpr (std::is_same_v>) { + if (!d.has(sycl::aspect::fp64)) { + throw std::runtime_error("Device " + + d.get_info() + + " does not support type 'complex128'"); + } + } + else if constexpr (std::is_same_v) { + if (!d.has(sycl::aspect::fp16)) { + throw std::runtime_error("Device " + + d.get_info() + + " does not support type 'float16'"); + } + } +} + +template +void validate_type_for_device(const sycl::queue &q) +{ + validate_type_for_device(q.get_device()); +} + +template +auto vec_cast_impl(const Vec &v, std::index_sequence) +{ + return Op{v[I]...}; +} + +template > +auto vec_cast(const sycl::vec &s) +{ + if constexpr (std::is_same_v) { + return s; + } + else { + return vec_cast_impl, sycl::vec>(s, + Indices{}); + } +} +} // namespace dpctl::tensor::type_utils diff --git a/dpnp/backend/CMakeLists.txt b/dpnp/backend/CMakeLists.txt index ddca557a08f4..433ab298d476 100644 --- a/dpnp/backend/CMakeLists.txt +++ b/dpnp/backend/CMakeLists.txt @@ -89,7 +89,6 @@ target_compile_definitions(${_trgt} PUBLIC PSTL_USE_PARALLEL_POLICIES=0) target_compile_definitions(${_trgt} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0) target_include_directories(${_trgt} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${_trgt} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) target_link_directories(${_trgt} PUBLIC "${Dpctl_INCLUDE_DIR}/..") target_link_libraries(${_trgt} PUBLIC DPCTLSyclInterface) diff --git 
a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 267567c69e71..2da35cc695ac 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -71,9 +71,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/blas/dot_common.hpp b/dpnp/backend/extensions/blas/dot_common.hpp index 1672e7217cba..369e3320473c 100644 --- a/dpnp/backend/extensions/blas/dot_common.hpp +++ b/dpnp/backend/extensions/blas/dot_common.hpp @@ -29,6 +29,7 @@ #pragma once #include + #include // dpctl tensor headers diff --git a/dpnp/backend/extensions/common/ext/common.hpp b/dpnp/backend/extensions/common/ext/common.hpp index d626b56ea00c..036eb635a3bd 100644 --- a/dpnp/backend/extensions/common/ext/common.hpp +++ b/dpnp/backend/extensions/common/ext/common.hpp @@ -29,8 +29,10 @@ #pragma once #include + #include #include + #include // dpctl tensor headers diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 50468857e3b9..0631b049ad72 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -65,9 +65,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index a6691f31f559..28d38bc28f21 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -68,9 +68,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 5e8b95963e94..aa0f6b718972 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -88,9 +88,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index 9561daf27ce2..2d59f679793c 100644 --- 
a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -77,9 +77,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index b24d5d131cfe..f1378bf52d88 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -90,9 +90,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(_dpnp_sycl_targets) # make fat binary diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 0e3a17df77e0..b7181616f546 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -113,9 +113,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index fc446f523e74..186668bb1662 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -68,9 +68,12 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ) +target_include_directories( + ${python_module_name} + PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include +) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/pyproject.toml b/pyproject.toml index d659428877fc..6394cf118dcf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT" quiet-level = 3 [tool.coverage.report] From 27a36233bfed2989c2c37b87cf786a07efb38e73 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 04:18:46 -0800 Subject: [PATCH 02/18] Remove unused include dir for building fft and statistics extensions --- dpnp/backend/extensions/fft/CMakeLists.txt | 8 -------- dpnp/backend/extensions/statistics/CMakeLists.txt | 8 -------- 2 files changed, 16 deletions(-) diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 
0631b049ad72..0c2c446fe8a0 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -57,14 +57,6 @@ set_target_properties( PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src -) target_include_directories( ${python_module_name} PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index 2d59f679793c..710a35346d63 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -65,14 +65,6 @@ set_target_properties( PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src -) target_include_directories( ${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common From 7c72c6eed1d423379feaee8b6d839fc6d8f2c115 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 04:25:54 -0800 Subject: [PATCH 03/18] Add dpnp4pybind11.hpp --- dpnp/backend/include/dpnp4pybind11.hpp | 1373 ++++++++++++++++++++++++ 1 file changed, 1373 insertions(+) create mode 100644 dpnp/backend/include/dpnp4pybind11.hpp diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp new file mode 100644 index 000000000000..cd287989bef2 --- /dev/null +++ b/dpnp/backend/include/dpnp4pybind11.hpp @@ -0,0 +1,1373 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include "dpctl_capi.h" + +#include +#include // for std::size_t for C++ linkage +#include +#include // for size_t for C linkage +#include +#include +#include + +#include + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace detail +{ +// Lookup a type according to its size, and return a value corresponding to the +// NumPy typenum. +template +constexpr int platform_typeid_lookup() +{ + return -1; +} + +template +constexpr int platform_typeid_lookup(int I, Ints... Is) +{ + return sizeof(Concrete) == sizeof(T) + ? I + : platform_typeid_lookup(Is...); +} + +class dpctl_capi +{ +public: + // dpctl type objects + PyTypeObject *Py_SyclDeviceType_; + PyTypeObject *PySyclDeviceType_; + PyTypeObject *Py_SyclContextType_; + PyTypeObject *PySyclContextType_; + PyTypeObject *Py_SyclEventType_; + PyTypeObject *PySyclEventType_; + PyTypeObject *Py_SyclQueueType_; + PyTypeObject *PySyclQueueType_; + PyTypeObject *Py_MemoryType_; + PyTypeObject *PyMemoryUSMDeviceType_; + PyTypeObject *PyMemoryUSMSharedType_; + PyTypeObject *PyMemoryUSMHostType_; + PyTypeObject *PyUSMArrayType_; + PyTypeObject *PySyclProgramType_; + PyTypeObject *PySyclKernelType_; + + DPCTLSyclDeviceRef (*SyclDevice_GetDeviceRef_)(PySyclDeviceObject *); + PySyclDeviceObject *(*SyclDevice_Make_)(DPCTLSyclDeviceRef); + + DPCTLSyclContextRef (*SyclContext_GetContextRef_)(PySyclContextObject *); + PySyclContextObject *(*SyclContext_Make_)(DPCTLSyclContextRef); + + DPCTLSyclEventRef (*SyclEvent_GetEventRef_)(PySyclEventObject *); + PySyclEventObject *(*SyclEvent_Make_)(DPCTLSyclEventRef); + + DPCTLSyclQueueRef (*SyclQueue_GetQueueRef_)(PySyclQueueObject *); + PySyclQueueObject *(*SyclQueue_Make_)(DPCTLSyclQueueRef); + + // memory + DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *); + void *(*Memory_GetOpaquePointer_)(Py_MemoryObject *); + DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *); + DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *); + size_t (*Memory_GetNumBytes_)(Py_MemoryObject *); + PyObject *(*Memory_Make_)(DPCTLSyclUSMRef, + size_t, + DPCTLSyclQueueRef, + PyObject *); + + // program + DPCTLSyclKernelRef (*SyclKernel_GetKernelRef_)(PySyclKernelObject *); + PySyclKernelObject *(*SyclKernel_Make_)(DPCTLSyclKernelRef, const char *); + + DPCTLSyclKernelBundleRef (*SyclProgram_GetKernelBundleRef_)( + PySyclProgramObject *); + PySyclProgramObject *(*SyclProgram_Make_)(DPCTLSyclKernelBundleRef); + + // tensor + char *(*UsmNDArray_GetData_)(PyUSMArrayObject *); + int (*UsmNDArray_GetNDim_)(PyUSMArrayObject *); + py::ssize_t *(*UsmNDArray_GetShape_)(PyUSMArrayObject *); + py::ssize_t *(*UsmNDArray_GetStrides_)(PyUSMArrayObject *); + int (*UsmNDArray_GetTypenum_)(PyUSMArrayObject *); + int (*UsmNDArray_GetElementSize_)(PyUSMArrayObject *); + int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *); + DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *); + py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *); + PyObject *(*UsmNDArray_GetUSMData_)(PyUSMArrayObject *); + void (*UsmNDArray_SetWritableFlag_)(PyUSMArrayObject *, int); + PyObject *(*UsmNDArray_MakeSimpleFromMemory_)(int, + const py::ssize_t *, + int, + Py_MemoryObject *, + py::ssize_t, + char); + PyObject *(*UsmNDArray_MakeSimpleFromPtr_)(size_t, + int, + DPCTLSyclUSMRef, + DPCTLSyclQueueRef, + PyObject *); + PyObject *(*UsmNDArray_MakeFromPtr_)(int, + const py::ssize_t *, + int, + const py::ssize_t *, + DPCTLSyclUSMRef, + 
DPCTLSyclQueueRef, + py::ssize_t, + PyObject *); + + int USM_ARRAY_C_CONTIGUOUS_; + int USM_ARRAY_F_CONTIGUOUS_; + int USM_ARRAY_WRITABLE_; + int UAR_BOOL_, UAR_BYTE_, UAR_UBYTE_, UAR_SHORT_, UAR_USHORT_, UAR_INT_, + UAR_UINT_, UAR_LONG_, UAR_ULONG_, UAR_LONGLONG_, UAR_ULONGLONG_, + UAR_FLOAT_, UAR_DOUBLE_, UAR_CFLOAT_, UAR_CDOUBLE_, UAR_TYPE_SENTINEL_, + UAR_HALF_; + int UAR_INT8_, UAR_UINT8_, UAR_INT16_, UAR_UINT16_, UAR_INT32_, UAR_UINT32_, + UAR_INT64_, UAR_UINT64_; + + bool PySyclDevice_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclDeviceType_) != 0; + } + bool PySyclContext_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclContextType_) != 0; + } + bool PySyclEvent_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclEventType_) != 0; + } + bool PySyclQueue_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclQueueType_) != 0; + } + bool PySyclKernel_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclKernelType_) != 0; + } + bool PySyclProgram_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclProgramType_) != 0; + } + + ~dpctl_capi() + { + as_usm_memory_.reset(); + default_usm_ndarray_.reset(); + default_usm_memory_.reset(); + default_sycl_queue_.reset(); + }; + + static auto &get() + { + static dpctl_capi api{}; + return api; + } + + py::object default_sycl_queue_pyobj() + { + return *default_sycl_queue_; + } + py::object default_usm_memory_pyobj() + { + return *default_usm_memory_; + } + py::object default_usm_ndarray_pyobj() + { + return *default_usm_ndarray_; + } + py::object as_usm_memory_pyobj() + { + return *as_usm_memory_; + } + +private: + struct Deleter + { + void operator()(py::object *p) const + { + const bool initialized = Py_IsInitialized(); +#if PY_VERSION_HEX < 0x30d0000 + const bool finalizing = _Py_IsFinalizing(); +#else + const bool finalizing = Py_IsFinalizing(); +#endif + const bool guard = initialized && !finalizing; + + if (guard) { + delete p; + } + } + }; + + std::shared_ptr default_sycl_queue_; + std::shared_ptr default_usm_memory_; + std::shared_ptr default_usm_ndarray_; + std::shared_ptr as_usm_memory_; + + dpctl_capi() + : Py_SyclDeviceType_(nullptr), PySyclDeviceType_(nullptr), + Py_SyclContextType_(nullptr), PySyclContextType_(nullptr), + Py_SyclEventType_(nullptr), PySyclEventType_(nullptr), + Py_SyclQueueType_(nullptr), PySyclQueueType_(nullptr), + Py_MemoryType_(nullptr), PyMemoryUSMDeviceType_(nullptr), + PyMemoryUSMSharedType_(nullptr), PyMemoryUSMHostType_(nullptr), + PyUSMArrayType_(nullptr), PySyclProgramType_(nullptr), + PySyclKernelType_(nullptr), SyclDevice_GetDeviceRef_(nullptr), + SyclDevice_Make_(nullptr), SyclContext_GetContextRef_(nullptr), + SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr), + SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr), + SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr), + Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr), + Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr), + Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr), + SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr), + SyclProgram_Make_(nullptr), UsmNDArray_GetData_(nullptr), + UsmNDArray_GetNDim_(nullptr), UsmNDArray_GetShape_(nullptr), + UsmNDArray_GetStrides_(nullptr), UsmNDArray_GetTypenum_(nullptr), + UsmNDArray_GetElementSize_(nullptr), UsmNDArray_GetFlags_(nullptr), + UsmNDArray_GetQueueRef_(nullptr), UsmNDArray_GetOffset_(nullptr), + 
UsmNDArray_GetUSMData_(nullptr), UsmNDArray_SetWritableFlag_(nullptr), + UsmNDArray_MakeSimpleFromMemory_(nullptr), + UsmNDArray_MakeSimpleFromPtr_(nullptr), + UsmNDArray_MakeFromPtr_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0), + USM_ARRAY_F_CONTIGUOUS_(0), USM_ARRAY_WRITABLE_(0), UAR_BOOL_(-1), + UAR_BYTE_(-1), UAR_UBYTE_(-1), UAR_SHORT_(-1), UAR_USHORT_(-1), + UAR_INT_(-1), UAR_UINT_(-1), UAR_LONG_(-1), UAR_ULONG_(-1), + UAR_LONGLONG_(-1), UAR_ULONGLONG_(-1), UAR_FLOAT_(-1), + UAR_DOUBLE_(-1), UAR_CFLOAT_(-1), UAR_CDOUBLE_(-1), + UAR_TYPE_SENTINEL_(-1), UAR_HALF_(-1), UAR_INT8_(-1), UAR_UINT8_(-1), + UAR_INT16_(-1), UAR_UINT16_(-1), UAR_INT32_(-1), UAR_UINT32_(-1), + UAR_INT64_(-1), UAR_UINT64_(-1), default_sycl_queue_{}, + default_usm_memory_{}, default_usm_ndarray_{}, as_usm_memory_{} + + { + // Import Cython-generated C-API for dpctl + // This imports python modules and initializes + // static variables such as function pointers for C-API, + // e.g. SyclDevice_GetDeviceRef, etc. + // pointers to Python types, i.e. PySyclDeviceType, etc. + // and exported constants, i.e. USM_ARRAY_C_CONTIGUOUS, etc. + import_dpctl(); + + // Python type objects for classes implemented by dpctl + this->Py_SyclDeviceType_ = &Py_SyclDeviceType; + this->PySyclDeviceType_ = &PySyclDeviceType; + this->Py_SyclContextType_ = &Py_SyclContextType; + this->PySyclContextType_ = &PySyclContextType; + this->Py_SyclEventType_ = &Py_SyclEventType; + this->PySyclEventType_ = &PySyclEventType; + this->Py_SyclQueueType_ = &Py_SyclQueueType; + this->PySyclQueueType_ = &PySyclQueueType; + this->Py_MemoryType_ = &Py_MemoryType; + this->PyMemoryUSMDeviceType_ = &PyMemoryUSMDeviceType; + this->PyMemoryUSMSharedType_ = &PyMemoryUSMSharedType; + this->PyMemoryUSMHostType_ = &PyMemoryUSMHostType; + this->PyUSMArrayType_ = &PyUSMArrayType; + this->PySyclProgramType_ = &PySyclProgramType; + this->PySyclKernelType_ = &PySyclKernelType; + + // SyclDevice API + this->SyclDevice_GetDeviceRef_ = SyclDevice_GetDeviceRef; + this->SyclDevice_Make_ = SyclDevice_Make; + + // SyclContext API + this->SyclContext_GetContextRef_ = SyclContext_GetContextRef; + this->SyclContext_Make_ = SyclContext_Make; + + // SyclEvent API + this->SyclEvent_GetEventRef_ = SyclEvent_GetEventRef; + this->SyclEvent_Make_ = SyclEvent_Make; + + // SyclQueue API + this->SyclQueue_GetQueueRef_ = SyclQueue_GetQueueRef; + this->SyclQueue_Make_ = SyclQueue_Make; + + // dpctl.memory API + this->Memory_GetUsmPointer_ = Memory_GetUsmPointer; + this->Memory_GetOpaquePointer_ = Memory_GetOpaquePointer; + this->Memory_GetContextRef_ = Memory_GetContextRef; + this->Memory_GetQueueRef_ = Memory_GetQueueRef; + this->Memory_GetNumBytes_ = Memory_GetNumBytes; + this->Memory_Make_ = Memory_Make; + + // dpctl.program API + this->SyclKernel_GetKernelRef_ = SyclKernel_GetKernelRef; + this->SyclKernel_Make_ = SyclKernel_Make; + this->SyclProgram_GetKernelBundleRef_ = SyclProgram_GetKernelBundleRef; + this->SyclProgram_Make_ = SyclProgram_Make; + + // dpctl.tensor.usm_ndarray API + this->UsmNDArray_GetData_ = UsmNDArray_GetData; + this->UsmNDArray_GetNDim_ = UsmNDArray_GetNDim; + this->UsmNDArray_GetShape_ = UsmNDArray_GetShape; + this->UsmNDArray_GetStrides_ = UsmNDArray_GetStrides; + this->UsmNDArray_GetTypenum_ = UsmNDArray_GetTypenum; + this->UsmNDArray_GetElementSize_ = UsmNDArray_GetElementSize; + this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags; + this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef; + this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset; + this->UsmNDArray_GetUSMData_ = 
UsmNDArray_GetUSMData; + this->UsmNDArray_SetWritableFlag_ = UsmNDArray_SetWritableFlag; + this->UsmNDArray_MakeSimpleFromMemory_ = + UsmNDArray_MakeSimpleFromMemory; + this->UsmNDArray_MakeSimpleFromPtr_ = UsmNDArray_MakeSimpleFromPtr; + this->UsmNDArray_MakeFromPtr_ = UsmNDArray_MakeFromPtr; + + // constants + this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS; + this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS; + this->USM_ARRAY_WRITABLE_ = USM_ARRAY_WRITABLE; + this->UAR_BOOL_ = UAR_BOOL; + this->UAR_BYTE_ = UAR_BYTE; + this->UAR_UBYTE_ = UAR_UBYTE; + this->UAR_SHORT_ = UAR_SHORT; + this->UAR_USHORT_ = UAR_USHORT; + this->UAR_INT_ = UAR_INT; + this->UAR_UINT_ = UAR_UINT; + this->UAR_LONG_ = UAR_LONG; + this->UAR_ULONG_ = UAR_ULONG; + this->UAR_LONGLONG_ = UAR_LONGLONG; + this->UAR_ULONGLONG_ = UAR_ULONGLONG; + this->UAR_FLOAT_ = UAR_FLOAT; + this->UAR_DOUBLE_ = UAR_DOUBLE; + this->UAR_CFLOAT_ = UAR_CFLOAT; + this->UAR_CDOUBLE_ = UAR_CDOUBLE; + this->UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL; + this->UAR_HALF_ = UAR_HALF; + + // deduced disjoint types + this->UAR_INT8_ = UAR_BYTE; + this->UAR_UINT8_ = UAR_UBYTE; + this->UAR_INT16_ = UAR_SHORT; + this->UAR_UINT16_ = UAR_USHORT; + this->UAR_INT32_ = + platform_typeid_lookup( + UAR_LONG, UAR_INT, UAR_SHORT); + this->UAR_UINT32_ = + platform_typeid_lookup(UAR_ULONG, UAR_UINT, + UAR_USHORT); + this->UAR_INT64_ = + platform_typeid_lookup( + UAR_LONG, UAR_LONGLONG, UAR_INT); + this->UAR_UINT64_ = + platform_typeid_lookup( + UAR_ULONG, UAR_ULONGLONG, UAR_UINT); + + // create shared pointers to python objects used in type-casters + // for dpctl::memory::usm_memory and dpctl::tensor::usm_ndarray + sycl::queue q_{}; + PySyclQueueObject *py_q_tmp = + SyclQueue_Make(reinterpret_cast(&q_)); + const py::object &py_sycl_queue = py::reinterpret_steal( + reinterpret_cast(py_q_tmp)); + + default_sycl_queue_ = std::shared_ptr( + new py::object(py_sycl_queue), Deleter{}); + + py::module_ mod_memory = py::module_::import("dpctl.memory"); + const py::object &py_as_usm_memory = mod_memory.attr("as_usm_memory"); + as_usm_memory_ = std::shared_ptr( + new py::object{py_as_usm_memory}, Deleter{}); + + auto mem_kl = mod_memory.attr("MemoryUSMHost"); + const py::object &py_default_usm_memory = + mem_kl(1, py::arg("queue") = py_sycl_queue); + default_usm_memory_ = std::shared_ptr( + new py::object{py_default_usm_memory}, Deleter{}); + + py::module_ mod_usmarray = + py::module_::import("dpctl.tensor._usmarray"); + auto tensor_kl = mod_usmarray.attr("usm_ndarray"); + + const py::object &py_default_usm_ndarray = + tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"), + py::arg("buffer") = py_default_usm_memory); + + default_usm_ndarray_ = std::shared_ptr( + new py::object{py_default_usm_ndarray}, Deleter{}); + } + + dpctl_capi(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi &&) = default; + +}; // struct dpctl_capi +} // namespace detail +} // namespace dpctl + +namespace pybind11::detail +{ +#define DPCTL_TYPE_CASTER(type, py_name) \ +protected: \ + std::unique_ptr value; \ + \ +public: \ + static constexpr auto name = py_name; \ + template < \ + typename T_, \ + ::pybind11::detail::enable_if_t< \ + std::is_same>::value, \ + int> = 0> \ + static ::pybind11::handle cast(T_ *src, \ + ::pybind11::return_value_policy policy, \ + ::pybind11::handle parent) \ + { \ + if (!src) \ + return ::pybind11::none().release(); \ + if (policy == ::pybind11::return_value_policy::take_ownership) { \ + auto 
h = cast(std::move(*src), policy, parent); \ + delete src; \ + return h; \ + } \ + return cast(*src, policy, parent); \ + } \ + operator type *() \ + { \ + return value.get(); \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + operator type &() \ + { \ + return *value; \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + operator type &&() && \ + { \ + return std::move(*value); \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + template \ + using cast_op_type = ::pybind11::detail::movable_cast_op_type + +/* This type caster associates ``sycl::queue`` C++ class with + * :class:`dpctl.SyclQueue` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclQueue_Check_(source)) { + DPCTLSyclQueueRef QRef = api.SyclQueue_GetQueueRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(QRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclQueue"); + } + } + + static handle cast(sycl::queue src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclQueue_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::queue, _("dpctl.SyclQueue")); +}; + +/* This type caster associates ``sycl::device`` C++ class with + * :class:`dpctl.SyclDevice` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclDevice_Check_(source)) { + DPCTLSyclDeviceRef DRef = api.SyclDevice_GetDeviceRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(DRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclDevice"); + } + } + + static handle cast(sycl::device src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclDevice_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::device, _("dpctl.SyclDevice")); +}; + +/* This type caster associates ``sycl::context`` C++ class with + * :class:`dpctl.SyclContext` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclContext_Check_(source)) { + DPCTLSyclContextRef CRef = api.SyclContext_GetContextRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(CRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclContext"); + } + } + + static handle cast(sycl::context src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclContext_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::context, _("dpctl.SyclContext")); +}; + +/* This type caster associates ``sycl::event`` C++ class with + * :class:`dpctl.SyclEvent` for the purposes of generation of + * Python bindings by pybind11. 
+ */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclEvent_Check_(source)) { + DPCTLSyclEventRef ERef = api.SyclEvent_GetEventRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(ERef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclEvent"); + } + } + + static handle cast(sycl::event src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclEvent_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::event, _("dpctl.SyclEvent")); +}; + +/* This type caster associates ``sycl::kernel`` C++ class with + * :class:`dpctl.program.SyclKernel` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclKernel_Check_(source)) { + DPCTLSyclKernelRef KRef = api.SyclKernel_GetKernelRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(KRef))); + return true; + } + else { + throw py::type_error("Input is of unexpected type, expected " + "dpctl.program.SyclKernel"); + } + } + + static handle cast(sycl::kernel src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclKernel_Make_(reinterpret_cast(&src), + "dpctl4pybind11_kernel"); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::kernel, _("dpctl.program.SyclKernel")); +}; + +/* This type caster associates + * ``sycl::kernel_bundle`` C++ class with + * :class:`dpctl.program.SyclProgram` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster> +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclProgram_Check_(source)) { + DPCTLSyclKernelBundleRef KBRef = + api.SyclProgram_GetKernelBundleRef_( + reinterpret_cast(source)); + value = std::make_unique< + sycl::kernel_bundle>( + *(reinterpret_cast< + sycl::kernel_bundle *>( + KBRef))); + return true; + } + else { + throw py::type_error("Input is of unexpected type, expected " + "dpctl.program.SyclProgram"); + } + } + + static handle cast(sycl::kernel_bundle src, + return_value_policy, + handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = api.SyclProgram_Make_( + reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::kernel_bundle, + _("dpctl.program.SyclProgram")); +}; + +/* This type caster associates + * ``sycl::half`` C++ class with Python :class:`float` for the purposes + * of generation of Python bindings by pybind11. 
+ */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool convert) + { + double py_value; + + if (!src) { + return false; + } + + PyObject *source = src.ptr(); + + if (convert || PyFloat_Check(source)) { + py_value = PyFloat_AsDouble(source); + } + else { + return false; + } + + bool py_err = (py_value == double(-1)) && PyErr_Occurred(); + + if (py_err) { + PyErr_Clear(); + if (convert && (PyNumber_Check(source) != 0)) { + auto tmp = reinterpret_steal(PyNumber_Float(source)); + return load(tmp, false); + } + return false; + } + value = static_cast(py_value); + return true; + } + + static handle cast(sycl::half src, return_value_policy, handle) + { + return PyFloat_FromDouble(static_cast(src)); + } + + PYBIND11_TYPE_CASTER(sycl::half, _("float")); +}; +} // namespace pybind11::detail + +namespace dpctl +{ +namespace memory +{ +// since PYBIND11_OBJECT_CVT uses error_already_set without namespace, +// this allows to avoid compilation error +using pybind11::error_already_set; + +class usm_memory : public py::object +{ +public: + PYBIND11_OBJECT_CVT( + usm_memory, + py::object, + [](PyObject *o) -> bool { + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().Py_MemoryType_) != + 0; + }, + [](PyObject *o) -> PyObject * { return as_usm_memory(o); }) + + usm_memory() + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_memory_pyobj(), + borrowed_t{}) + { + if (!m_ptr) + throw py::error_already_set(); + } + + /*! @brief Create usm_memory object from shared pointer that manages + * lifetime of the USM allocation. + */ + usm_memory(void *usm_ptr, + std::size_t nbytes, + const sycl::queue &q, + std::shared_ptr shptr) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef usm_ref = reinterpret_cast(usm_ptr); + auto q_uptr = std::make_unique(q); + DPCTLSyclQueueRef QRef = + reinterpret_cast(q_uptr.get()); + + auto vacuous_destructor = []() {}; + py::capsule mock_owner(vacuous_destructor); + + // create memory object owned by mock_owner, it is a new reference + PyObject *_memory = + api.Memory_Make_(usm_ref, nbytes, QRef, mock_owner.ptr()); + auto ref_count_decrementer = [](PyObject *o) noexcept { Py_DECREF(o); }; + + using py_uptrT = + std::unique_ptr; + + if (!_memory) { + throw py::error_already_set(); + } + + auto memory_uptr = py_uptrT(_memory, ref_count_decrementer); + std::shared_ptr *opaque_ptr = new std::shared_ptr(shptr); + + Py_MemoryObject *memobj = reinterpret_cast(_memory); + // replace mock_owner capsule as the owner + memobj->refobj = Py_None; + // set opaque ptr field, usm_memory now knowns that USM is managed + // by smart pointer + memobj->_opaque_ptr = reinterpret_cast(opaque_ptr); + + // _memory will delete created copies of sycl::queue, and + // std::shared_ptr and the deleter of the shared_ptr is + // supposed to free the USM allocation + m_ptr = _memory; + q_uptr.release(); + memory_uptr.release(); + } + + sycl::queue get_queue() const + { + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); + sycl::queue *obj_q = reinterpret_cast(QRef); + return *obj_q; + } + + char *get_pointer() const + { + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef MRef = api.Memory_GetUsmPointer_(mem_obj); + return reinterpret_cast(MRef); + } + + std::size_t get_nbytes() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + 
Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + return api.Memory_GetNumBytes_(mem_obj); + } + + bool is_managed_by_smart_ptr() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + return bool(opaque_ptr); + } + + const std::shared_ptr &get_smart_ptr_owner() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } + else { + throw std::runtime_error( + "Memory object does not have smart pointer " + "managing lifetime of USM allocation"); + } + } + +protected: + static PyObject *as_usm_memory(PyObject *o) + { + if (o == nullptr) { + PyErr_SetString(PyExc_ValueError, + "cannot create a usm_memory from a nullptr"); + return nullptr; + } + + auto converter = + ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); + + py::object res; + try { + res = converter(py::handle(o)); + } catch (const py::error_already_set &e) { + return nullptr; + } + return res.ptr(); + } +}; +} // end namespace memory + +namespace tensor +{ +inline std::vector + c_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector c_strides(nd, element_size); + for (int ic = nd - 1; ic > 0;) { + py::ssize_t next_v = c_strides[ic] * shape[ic]; + c_strides[--ic] = next_v; + } + return c_strides; + } + else { + return std::vector(); + } +} + +inline std::vector + f_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector f_strides(nd, element_size); + for (int i = 0; i < nd - 1;) { + py::ssize_t next_v = f_strides[i] * shape[i]; + f_strides[++i] = next_v; + } + return f_strides; + } + else { + return std::vector(); + } +} + +inline std::vector + c_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return c_contiguous_strides(shape.size(), shape.data(), element_size); +} + +inline std::vector + f_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return f_contiguous_strides(shape.size(), shape.data(), element_size); +} + +class usm_ndarray : public py::object +{ +public: + PYBIND11_OBJECT(usm_ndarray, py::object, [](PyObject *o) -> bool { + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().PyUSMArrayType_) != 0; + }) + + usm_ndarray() + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_ndarray_pyobj(), + borrowed_t{}) + { + if (!m_ptr) + throw py::error_already_set(); + } + + char *get_data() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetData_(raw_ar); + } + + template + T *get_data() const + { + return reinterpret_cast(get_data()); + } + + int get_ndim() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetNDim_(raw_ar); + } + + const py::ssize_t *get_shape_raw() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetShape_(raw_ar); + } + + std::vector get_shape_vector() const + { + auto raw_sh = get_shape_raw(); + auto nd = get_ndim(); + + std::vector shape_vector(raw_sh, raw_sh + nd); + 
return shape_vector; + } + + py::ssize_t get_shape(int i) const + { + auto shape_ptr = get_shape_raw(); + return shape_ptr[i]; + } + + const py::ssize_t *get_strides_raw() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetStrides_(raw_ar); + } + + std::vector get_strides_vector() const + { + auto raw_st = get_strides_raw(); + auto nd = get_ndim(); + + if (raw_st == nullptr) { + auto is_c_contig = is_c_contiguous(); + auto is_f_contig = is_f_contiguous(); + auto raw_sh = get_shape_raw(); + if (is_c_contig) { + const auto &contig_strides = c_contiguous_strides(nd, raw_sh); + return contig_strides; + } + else if (is_f_contig) { + const auto &contig_strides = f_contiguous_strides(nd, raw_sh); + return contig_strides; + } + else { + throw std::runtime_error("Invalid array encountered when " + "building strides"); + } + } + else { + std::vector st_vec(raw_st, raw_st + nd); + return st_vec; + } + } + + py::ssize_t get_size() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + int ndim = api.UsmNDArray_GetNDim_(raw_ar); + const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar); + + py::ssize_t nelems = 1; + for (int i = 0; i < ndim; ++i) { + nelems *= shape[i]; + } + + assert(nelems >= 0); + return nelems; + } + + std::pair get_minmax_offsets() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + int nd = api.UsmNDArray_GetNDim_(raw_ar); + const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar); + const py::ssize_t *strides = api.UsmNDArray_GetStrides_(raw_ar); + + py::ssize_t offset_min = 0; + py::ssize_t offset_max = 0; + if (strides == nullptr) { + py::ssize_t stride(1); + for (int i = 0; i < nd; ++i) { + offset_max += stride * (shape[i] - 1); + stride *= shape[i]; + } + } + else { + for (int i = 0; i < nd; ++i) { + py::ssize_t delta = strides[i] * (shape[i] - 1); + if (strides[i] > 0) { + offset_max += delta; + } + else { + offset_min += delta; + } + } + } + return std::make_pair(offset_min, offset_max); + } + + sycl::queue get_queue() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar); + return *(reinterpret_cast(QRef)); + } + + sycl::device get_device() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar); + return reinterpret_cast(QRef)->get_device(); + } + + int get_typenum() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetTypenum_(raw_ar); + } + + int get_flags() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetFlags_(raw_ar); + } + + int get_elemsize() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetElementSize_(raw_ar); + } + + bool is_c_contiguous() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_C_CONTIGUOUS_); + } + + bool is_f_contiguous() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags 
& api.USM_ARRAY_F_CONTIGUOUS_); + } + + bool is_writable() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_WRITABLE_); + } + + /*! @brief Get usm_data property of array */ + py::object get_usm_data() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + // UsmNDArray_GetUSMData_ gives a new reference + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + // pass reference ownership to py::object + return py::reinterpret_steal(usm_data); + } + + bool is_managed_by_smart_ptr() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) { + Py_DECREF(usm_data); + return false; + } + + Py_MemoryObject *mem_obj = + reinterpret_cast(usm_data); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + Py_DECREF(usm_data); + return bool(opaque_ptr); + } + + const std::shared_ptr &get_smart_ptr_owner() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) { + Py_DECREF(usm_data); + throw std::runtime_error( + "usm_ndarray object does not have Memory object " + "managing lifetime of USM allocation"); + } + + Py_MemoryObject *mem_obj = + reinterpret_cast(usm_data); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + Py_DECREF(usm_data); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } + else { + throw std::runtime_error( + "Memory object underlying usm_ndarray does not have " + "smart pointer managing lifetime of USM allocation"); + } + } + +private: + PyUSMArrayObject *usm_array_ptr() const + { + return reinterpret_cast(m_ptr); + } +}; +} // end namespace tensor + +namespace utils +{ +namespace detail +{ +struct ManagedMemory +{ + + static bool is_usm_managed_by_shared_ptr(const py::object &h) + { + if (py::isinstance(h)) { + const auto &usm_memory_inst = + py::cast(h); + return usm_memory_inst.is_managed_by_smart_ptr(); + } + else if (py::isinstance(h)) { + const auto &usm_array_inst = + py::cast(h); + return usm_array_inst.is_managed_by_smart_ptr(); + } + + return false; + } + + static const std::shared_ptr &extract_shared_ptr(const py::object &h) + { + if (py::isinstance(h)) { + const auto &usm_memory_inst = + py::cast(h); + return usm_memory_inst.get_smart_ptr_owner(); + } + else if (py::isinstance(h)) { + const auto &usm_array_inst = + py::cast(h); + return usm_array_inst.get_smart_ptr_owner(); + } + + throw std::runtime_error( + "Attempted extraction of shared_ptr on an unrecognized type"); + } +}; +} // end of namespace detail + +template +sycl::event keep_args_alive(sycl::queue &q, + const py::object (&py_objs)[num], + const std::vector &depends = {}) +{ + std::size_t n_objects_held = 0; + std::array, num> shp_arr{}; + + std::size_t n_usm_owners_held = 0; + std::array, num> shp_usm{}; + + for (std::size_t i = 0; i < num; ++i) { + const auto &py_obj_i = py_objs[i]; + if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) { + const auto &shp = + detail::ManagedMemory::extract_shared_ptr(py_obj_i); + shp_usm[n_usm_owners_held] = shp; + ++n_usm_owners_held; + } + else { + shp_arr[n_objects_held] = 
std::make_shared(py_obj_i); + shp_arr[n_objects_held]->inc_ref(); + ++n_objects_held; + } + } + + bool use_depends = true; + sycl::event host_task_ev; + + if (n_usm_owners_held > 0) { + host_task_ev = q.submit([&](sycl::handler &cgh) { + if (use_depends) { + cgh.depends_on(depends); + use_depends = false; + } + else { + cgh.depends_on(host_task_ev); + } + cgh.host_task([shp_usm = std::move(shp_usm)]() { + // no body, but shared pointers are captured in + // the lambda, ensuring that USM allocation is + // kept alive + }); + }); + } + + if (n_objects_held > 0) { + host_task_ev = q.submit([&](sycl::handler &cgh) { + if (use_depends) { + cgh.depends_on(depends); + use_depends = false; + } + else { + cgh.depends_on(host_task_ev); + } + cgh.host_task([n_objects_held, shp_arr = std::move(shp_arr)]() { + py::gil_scoped_acquire acquire; + + for (std::size_t i = 0; i < n_objects_held; ++i) { + shp_arr[i]->dec_ref(); + } + }); + }); + } + + return host_task_ev; +} + +/*! @brief Check if all allocation queues are the same as the + execution queue */ +template +bool queues_are_compatible(const sycl::queue &exec_q, + const sycl::queue (&alloc_qs)[num]) +{ + for (std::size_t i = 0; i < num; ++i) { + + if (exec_q != alloc_qs[i]) { + return false; + } + } + return true; +} + +/*! @brief Check if all allocation queues of usm_ndarays are the same as + the execution queue */ +template +bool queues_are_compatible(const sycl::queue &exec_q, + const ::dpctl::tensor::usm_ndarray (&arrs)[num]) +{ + for (std::size_t i = 0; i < num; ++i) { + + if (exec_q != arrs[i].get_queue()) { + return false; + } + } + return true; +} +} // end namespace utils +} // end namespace dpctl From 1eef13fd5b556d010696360dce39df05d341199d Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 04:27:03 -0800 Subject: [PATCH 04/18] Update windows extension to work with dpnp4pybind11.hpp --- .../libtensor/include/utils/memory_overlap.hpp | 4 ++++ .../libtensor/include/utils/output_validation.hpp | 4 ++++ .../libtensor/include/utils/type_dispatch.hpp | 4 ++++ dpnp/backend/extensions/window/CMakeLists.txt | 15 +++++---------- dpnp/backend/extensions/window/common.hpp | 4 ++-- dpnp/backend/extensions/window/kaiser.hpp | 4 ++-- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp index 3b1bc772b514..db9dfc30eb46 100644 --- a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp +++ b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp @@ -38,7 +38,11 @@ #include +#if __has_include() +#include "dpnp4pybind11.hpp" +#else #include "dpctl4pybind11.hpp" +#endif /* @brief check for overlap of memory regions behind arrays. 
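The `__has_include` guard added above to memory_overlap.hpp (and, per the diffstat, to output_validation.hpp and type_dispatch.hpp) follows the C++17 include-fallback idiom: prefer the dpnp-provided pybind11 glue header when it is reachable on the include path, otherwise fall back to the stock dpctl header. Below is a hedged sketch of that idiom as it might appear in a consuming extension header; the exact spelling of the `__has_include` argument is not fully visible in the hunk above, and the `example_op` declaration is hypothetical, shown only to illustrate how either glue header provides the dpctl::tensor::usm_ndarray wrapper used by extension entry points.

#pragma once

#include <vector>
#include <sycl/sycl.hpp>

// Prefer dpnp's bundled glue header; fall back to the header shipped with
// dpctl. Both provide the pybind11 integration (usm_ndarray wrapper, queue
// and device conversions) that the declaration below relies on.
#if __has_include(<dpnp4pybind11.hpp>)
#include "dpnp4pybind11.hpp"
#else
#include "dpctl4pybind11.hpp"
#endif

// Hypothetical extension entry point: the usm_ndarray arguments are converted
// from Python dpctl.tensor.usm_ndarray objects by the glue header's pybind11
// integration; the include directories are assumed to match the CMake changes
// in this patch series.
sycl::event example_op(sycl::queue &exec_q,
                       const dpctl::tensor::usm_ndarray &src,
                       const dpctl::tensor::usm_ndarray &dst,
                       const std::vector<sycl::event> &depends = {});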
diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl/tensor/libtensor/include/utils/output_validation.hpp index 1397efdee230..7a70f395dfe1 100644 --- a/dpctl/tensor/libtensor/include/utils/output_validation.hpp +++ b/dpctl/tensor/libtensor/include/utils/output_validation.hpp @@ -37,7 +37,11 @@ #include +#if __has_include() +#include "dpnp4pybind11.hpp" +#else #include "dpctl4pybind11.hpp" +#endif namespace dpctl::tensor::validation { diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp index 5ec84783c901..38b5b43ca696 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -36,7 +36,11 @@ #include #include +#if __has_include() +#include "dpnp4pybind11.hpp" +#else #include "dpctl4pybind11.hpp" +#endif #include "type_dispatch_building.hpp" diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 186668bb1662..6fe04e334f42 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -62,17 +62,12 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include -) - target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(WIN32) diff --git a/dpnp/backend/extensions/window/common.hpp b/dpnp/backend/extensions/window/common.hpp index cb084e972d78..b95aea6259e3 100644 --- a/dpnp/backend/extensions/window/common.hpp +++ b/dpnp/backend/extensions/window/common.hpp @@ -30,9 +30,10 @@ #include #include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" // dpctl tensor headers #include "utils/output_validation.hpp" @@ -41,7 +42,6 @@ namespace dpnp::extensions::window { - namespace dpctl_td_ns = dpctl::tensor::type_dispatch; namespace py = pybind11; diff --git a/dpnp/backend/extensions/window/kaiser.hpp b/dpnp/backend/extensions/window/kaiser.hpp index 0a4712cc594e..46227a60669f 100644 --- a/dpnp/backend/extensions/window/kaiser.hpp +++ b/dpnp/backend/extensions/window/kaiser.hpp @@ -28,9 +28,10 @@ #pragma once -#include #include +#include "dpnp4pybind11.hpp" + namespace dpnp::extensions::window { extern std::pair @@ -40,5 +41,4 @@ extern std::pair const std::vector &depends); extern void init_kaiser_dispatch_vectors(void); - } // namespace dpnp::extensions::window From 4dd62e8c655d16314df719f8d71267883fb41624 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 04:31:51 -0800 Subject: [PATCH 05/18] Update blas extension to work with dpnp4pybind11.hpp --- dpnp/backend/extensions/blas/CMakeLists.txt | 16 +++++----------- dpnp/backend/extensions/blas/gemm.hpp | 2 +- dpnp/backend/extensions/blas/gemv.hpp | 2 +- dpnp/backend/extensions/blas/syrk.hpp | 2 +- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 2da35cc695ac..0015eda84843 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ 
b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -65,18 +65,12 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include -) - -target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp index 997d515f98a0..59a3d911d885 100644 --- a/dpnp/backend/extensions/blas/gemm.hpp +++ b/dpnp/backend/extensions/blas/gemm.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/blas/gemv.hpp b/dpnp/backend/extensions/blas/gemv.hpp index afe0c6387aa9..6da71ed0964f 100644 --- a/dpnp/backend/extensions/blas/gemv.hpp +++ b/dpnp/backend/extensions/blas/gemv.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/blas/syrk.hpp b/dpnp/backend/extensions/blas/syrk.hpp index 580239b28008..f6cec189489a 100644 --- a/dpnp/backend/extensions/blas/syrk.hpp +++ b/dpnp/backend/extensions/blas/syrk.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { From 6157a52ecc836702016c27201f667e3fa31a58da Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 04:46:22 -0800 Subject: [PATCH 06/18] Update fft extension to work with dpnp4pybind11.hpp --- dpnp/backend/extensions/fft/CMakeLists.txt | 7 ++++--- dpnp/backend/extensions/fft/in_place.hpp | 5 ++++- dpnp/backend/extensions/fft/in_place.tpp | 10 +++++++++- dpnp/backend/extensions/fft/out_of_place.hpp | 5 ++++- dpnp/backend/extensions/fft/out_of_place.tpp | 12 +++++++++++- 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 0c2c446fe8a0..0569ecc8bca4 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -59,10 +59,11 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) - -target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(WIN32) target_compile_options( diff --git a/dpnp/backend/extensions/fft/in_place.hpp b/dpnp/backend/extensions/fft/in_place.hpp index 7eed11565b9e..bc35201b9b6e 100644 --- a/dpnp/backend/extensions/fft/in_place.hpp +++ b/dpnp/backend/extensions/fft/in_place.hpp @@ -28,10 +28,13 @@ #pragma once +#include +#include + #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::fft { diff --git a/dpnp/backend/extensions/fft/in_place.tpp b/dpnp/backend/extensions/fft/in_place.tpp index 4bc166b0e7ae..ace535284de6 100644 --- a/dpnp/backend/extensions/fft/in_place.tpp +++ 
b/dpnp/backend/extensions/fft/in_place.tpp @@ -27,15 +27,23 @@ //***************************************************************************** #pragma once + #include +#include +#include +#include + +#include #include #include -#include +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fft_utils.hpp" +#include "in_place.hpp" + // dpctl tensor headers #include "utils/output_validation.hpp" diff --git a/dpnp/backend/extensions/fft/out_of_place.hpp b/dpnp/backend/extensions/fft/out_of_place.hpp index 811a2bd6d1c4..55ca9383baaf 100644 --- a/dpnp/backend/extensions/fft/out_of_place.hpp +++ b/dpnp/backend/extensions/fft/out_of_place.hpp @@ -28,10 +28,13 @@ #pragma once +#include +#include + #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::fft { diff --git a/dpnp/backend/extensions/fft/out_of_place.tpp b/dpnp/backend/extensions/fft/out_of_place.tpp index 290408dc60bc..e468246ea7af 100644 --- a/dpnp/backend/extensions/fft/out_of_place.tpp +++ b/dpnp/backend/extensions/fft/out_of_place.tpp @@ -27,15 +27,25 @@ //***************************************************************************** #pragma once + +#include +#include #include +#include +#include +#include #include #include -#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fft_utils.hpp" +#include "out_of_place.hpp" + // dpctl tensor headers #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" From 156d3b78a37f06cf58ff9f5486a9bf3637a0cf41 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 04:52:26 -0800 Subject: [PATCH 07/18] Update indexing extension to work with dpnp4pybind11.hpp --- dpnp/backend/extensions/indexing/CMakeLists.txt | 14 ++++---------- dpnp/backend/extensions/indexing/choose.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index 28d38bc28f21..c0de75ae3146 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -62,17 +62,11 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include -) - target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(WIN32) diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp index 99d91744366f..05d1bfe15385 100644 --- a/dpnp/backend/extensions/indexing/choose.cpp +++ b/dpnp/backend/extensions/indexing/choose.cpp @@ -30,15 +30,17 @@ #include #include #include -#include -#include -#include #include #include #include +#include + +#include + +#include "dpnp4pybind11.hpp" + #include "choose_kernel.hpp" -#include "dpctl4pybind11.hpp" // utils extension header #include "ext/common.hpp" @@ -52,7 +54,6 @@ namespace dpnp::extensions::indexing { - namespace td_ns = dpctl::tensor::type_dispatch; static kernels::choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types] @@ -459,5 +460,4 @@ void init_choose(py::module_ m) return; } - } // namespace dpnp::extensions::indexing From 
d9d912d8a257222e759e90196a2e5d3867de50da Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 04:57:00 -0800 Subject: [PATCH 08/18] Update lapack extension to work with dpnp4pybind11.hpp --- dpnp/backend/extensions/lapack/CMakeLists.txt | 15 +++++---------- dpnp/backend/extensions/lapack/geqrf.hpp | 2 +- dpnp/backend/extensions/lapack/gesv.hpp | 2 +- dpnp/backend/extensions/lapack/gesvd.hpp | 2 +- dpnp/backend/extensions/lapack/getrf.hpp | 2 +- dpnp/backend/extensions/lapack/getri.hpp | 2 +- dpnp/backend/extensions/lapack/getrs.hpp | 2 +- dpnp/backend/extensions/lapack/orgqr.hpp | 2 +- dpnp/backend/extensions/lapack/potrf.hpp | 2 +- dpnp/backend/extensions/lapack/ungqr.hpp | 2 +- 10 files changed, 14 insertions(+), 19 deletions(-) diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index aa0f6b718972..76b25c3a6d10 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -82,17 +82,12 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include -) - target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(WIN32) diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp b/dpnp/backend/extensions/lapack/geqrf.hpp index 522006ace8ab..7be1fee971cf 100644 --- a/dpnp/backend/extensions/lapack/geqrf.hpp +++ b/dpnp/backend/extensions/lapack/geqrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp index d4198efae62e..a86039c9b72e 100644 --- a/dpnp/backend/extensions/lapack/gesv.hpp +++ b/dpnp/backend/extensions/lapack/gesv.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/gesvd.hpp b/dpnp/backend/extensions/lapack/gesvd.hpp index 116348e01d9f..b2fea5e47299 100644 --- a/dpnp/backend/extensions/lapack/gesvd.hpp +++ b/dpnp/backend/extensions/lapack/gesvd.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getrf.hpp b/dpnp/backend/extensions/lapack/getrf.hpp index 24ec473f4dc7..ce6dc3e788b5 100644 --- a/dpnp/backend/extensions/lapack/getrf.hpp +++ b/dpnp/backend/extensions/lapack/getrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getri.hpp b/dpnp/backend/extensions/lapack/getri.hpp index d8c8e58f3fcb..728af4a77e01 100644 --- a/dpnp/backend/extensions/lapack/getri.hpp +++ b/dpnp/backend/extensions/lapack/getri.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getrs.hpp b/dpnp/backend/extensions/lapack/getrs.hpp index f5a47c69c9ec..2728b0c4e04a 100644 --- a/dpnp/backend/extensions/lapack/getrs.hpp +++ 
b/dpnp/backend/extensions/lapack/getrs.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp index 962edc7b668f..2502fe567a1f 100644 --- a/dpnp/backend/extensions/lapack/orgqr.hpp +++ b/dpnp/backend/extensions/lapack/orgqr.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/potrf.hpp b/dpnp/backend/extensions/lapack/potrf.hpp index d5df48a9ddf4..02faf2c04fde 100644 --- a/dpnp/backend/extensions/lapack/potrf.hpp +++ b/dpnp/backend/extensions/lapack/potrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp index a149af1e24e1..8c9a36b3f4a6 100644 --- a/dpnp/backend/extensions/lapack/ungqr.hpp +++ b/dpnp/backend/extensions/lapack/ungqr.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { From 35681bbab6cb098a83b71cfed826c4316877fcf9 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 05:06:00 -0800 Subject: [PATCH 09/18] Update statistics extension to work with dpnp4pybind11.hpp --- .../extensions/common/ext/validation_utils.hpp | 7 +++++++ dpnp/backend/extensions/statistics/CMakeLists.txt | 10 ++++------ dpnp/backend/extensions/statistics/bincount.hpp | 3 ++- dpnp/backend/extensions/statistics/histogram.cpp | 5 ++--- dpnp/backend/extensions/statistics/histogram.hpp | 4 +++- .../extensions/statistics/histogram_common.cpp | 10 ++++++---- dpnp/backend/extensions/statistics/histogramdd.hpp | 4 +++- .../extensions/statistics/sliding_dot_product1d.cpp | 8 +++++--- .../extensions/statistics/sliding_window1d.cpp | 11 +++++++---- 9 files changed, 39 insertions(+), 23 deletions(-) diff --git a/dpnp/backend/extensions/common/ext/validation_utils.hpp b/dpnp/backend/extensions/common/ext/validation_utils.hpp index d41db8d5ca5a..0bb32c9f876a 100644 --- a/dpnp/backend/extensions/common/ext/validation_utils.hpp +++ b/dpnp/backend/extensions/common/ext/validation_utils.hpp @@ -32,7 +32,14 @@ #include #include +#if __has_include() +#include "dpnp4pybind11.hpp" +#else #include "dpctl4pybind11.hpp" +#endif + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" namespace ext::validation { diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index 710a35346d63..e04279b75e49 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -67,13 +67,11 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include -) - target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(WIN32) diff --git a/dpnp/backend/extensions/statistics/bincount.hpp b/dpnp/backend/extensions/statistics/bincount.hpp index 5e42952349b0..2fc477e71edc 100644 --- a/dpnp/backend/extensions/statistics/bincount.hpp +++ b/dpnp/backend/extensions/statistics/bincount.hpp @@ -31,7 +31,8 
@@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + #include "ext/dispatch_table.hpp" namespace dpctl_td_ns = dpctl::tensor::type_dispatch; diff --git a/dpnp/backend/extensions/statistics/histogram.cpp b/dpnp/backend/extensions/statistics/histogram.cpp index 6d7da6836f60..621aa36cfbd1 100644 --- a/dpnp/backend/extensions/statistics/histogram.cpp +++ b/dpnp/backend/extensions/statistics/histogram.cpp @@ -33,10 +33,10 @@ #include #include -#include + +#include "dpnp4pybind11.hpp" // dpctl tensor headers -#include "dpctl4pybind11.hpp" #include "utils/type_dispatch.hpp" #include "histogram.hpp" @@ -50,7 +50,6 @@ using namespace ext::common; namespace { - template struct HistogramEdges { diff --git a/dpnp/backend/extensions/statistics/histogram.hpp b/dpnp/backend/extensions/statistics/histogram.hpp index c6a79ec24ee3..d04d8edbf02b 100644 --- a/dpnp/backend/extensions/statistics/histogram.hpp +++ b/dpnp/backend/extensions/statistics/histogram.hpp @@ -31,7 +31,9 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/dispatch_table.hpp" namespace statistics::histogram diff --git a/dpnp/backend/extensions/statistics/histogram_common.cpp b/dpnp/backend/extensions/statistics/histogram_common.cpp index 82afa2bd965d..252e1cd7c7cc 100644 --- a/dpnp/backend/extensions/statistics/histogram_common.cpp +++ b/dpnp/backend/extensions/statistics/histogram_common.cpp @@ -31,15 +31,18 @@ #include #include -#include "dpctl4pybind11.hpp" -#include "utils/type_dispatch.hpp" - #include +#include "dpnp4pybind11.hpp" + #include "histogram_common.hpp" +// utils extension header #include "ext/validation_utils.hpp" +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + namespace dpctl_td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::usm_ndarray; using dpctl_td_ns::typenum_t; @@ -57,7 +60,6 @@ using ext::validation::name_of; namespace statistics::histogram { - void validate(const usm_ndarray &sample, const std::optional &bins, const std::optional &weights, diff --git a/dpnp/backend/extensions/statistics/histogramdd.hpp b/dpnp/backend/extensions/statistics/histogramdd.hpp index 327e9941dbc6..d7c46ae34b7d 100644 --- a/dpnp/backend/extensions/statistics/histogramdd.hpp +++ b/dpnp/backend/extensions/statistics/histogramdd.hpp @@ -31,7 +31,9 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/dispatch_table.hpp" namespace statistics::histogram diff --git a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp index b8f679f1030e..6c0e39a11a19 100644 --- a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp +++ b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp @@ -33,11 +33,14 @@ #include #include +#include "dpnp4pybind11.hpp" + +// utils extension header +#include "ext/common.hpp" + // dpctl tensor headers -#include "dpctl4pybind11.hpp" #include "utils/type_dispatch.hpp" -#include "ext/common.hpp" #include "sliding_dot_product1d.hpp" #include "sliding_window1d.hpp" @@ -51,7 +54,6 @@ using namespace ext::common; namespace { - template struct SlidingDotProductF { diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.cpp b/dpnp/backend/extensions/statistics/sliding_window1d.cpp index 3ae66daa332b..81f8ae40104e 100644 --- a/dpnp/backend/extensions/statistics/sliding_window1d.cpp +++ b/dpnp/backend/extensions/statistics/sliding_window1d.cpp 
@@ -29,11 +29,16 @@ #include #include -#include "dpctl4pybind11.hpp" -#include "utils/type_dispatch.hpp" #include +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/validation_utils.hpp" + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + #include "sliding_window1d.hpp" namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -48,7 +53,6 @@ using ext::validation::name_of; namespace statistics::sliding_window1d { - void validate(const usm_ndarray &a, const usm_ndarray &v, const usm_ndarray &out, @@ -89,5 +93,4 @@ void validate(const usm_ndarray &a, std::to_string(expected_output_size) + ")"); } } - } // namespace statistics::sliding_window1d From c35428e3aee3b14abadc550f8ef85e71860bc6e9 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 05:16:45 -0800 Subject: [PATCH 10/18] Update ufunc extension to work with dpnp4pybind11.hpp --- .../elementwise_functions.hpp | 8 +++++--- .../elementwise_functions_type_utils.cpp | 9 ++++++--- .../elementwise_functions_type_utils.hpp | 8 ++++++-- dpnp/backend/extensions/ufunc/CMakeLists.txt | 15 +++++---------- .../ufunc/elementwise_functions/bitwise_count.cpp | 2 +- .../ufunc/elementwise_functions/degrees.cpp | 2 +- .../ufunc/elementwise_functions/divmod.cpp | 2 +- .../ufunc/elementwise_functions/erf_funcs.cpp | 2 +- .../ufunc/elementwise_functions/fabs.cpp | 2 +- .../ufunc/elementwise_functions/float_power.cpp | 2 +- .../ufunc/elementwise_functions/fmax.cpp | 2 +- .../ufunc/elementwise_functions/fmin.cpp | 2 +- .../ufunc/elementwise_functions/fmod.cpp | 2 +- .../ufunc/elementwise_functions/frexp.cpp | 2 +- .../ufunc/elementwise_functions/gcd.cpp | 2 +- .../ufunc/elementwise_functions/heaviside.cpp | 2 +- .../extensions/ufunc/elementwise_functions/i0.cpp | 2 +- .../ufunc/elementwise_functions/interpolate.cpp | 2 +- .../ufunc/elementwise_functions/isclose.cpp | 2 +- .../ufunc/elementwise_functions/lcm.cpp | 2 +- .../ufunc/elementwise_functions/ldexp.cpp | 2 +- .../ufunc/elementwise_functions/logaddexp2.cpp | 2 +- .../ufunc/elementwise_functions/modf.cpp | 2 +- .../ufunc/elementwise_functions/nan_to_num.cpp | 2 +- .../ufunc/elementwise_functions/radians.cpp | 2 +- .../ufunc/elementwise_functions/sinc.cpp | 2 +- .../ufunc/elementwise_functions/spacing.cpp | 2 +- 27 files changed, 45 insertions(+), 41 deletions(-) diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp index c996ac07df02..9e8d98d875a3 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp @@ -34,12 +34,14 @@ #include #include +#include #include +#if __has_include() +#include "dpnp4pybind11.hpp" +#else #include "dpctl4pybind11.hpp" -#include -#include -#include +#endif #include "elementwise_functions_type_utils.hpp" #include "simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp index 62f7584a3e0c..c126428f0558 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp @@ -26,12 +26,15 @@ // THE POSSIBILITY OF SUCH DAMAGE. 
//***************************************************************************** -#include "dpctl4pybind11.hpp" - -#include #include #include +#if __has_include() +#include "dpnp4pybind11.hpp" +#else +#include "dpctl4pybind11.hpp" +#endif + #include "elementwise_functions_type_utils.hpp" // dpctl tensor headers diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp index 1bb6fedd7027..129a89a49dbe 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp @@ -28,10 +28,14 @@ #pragma once -#include "dpctl4pybind11.hpp" #include #include -#include + +#if __has_include() +#include "dpnp4pybind11.hpp" +#else +#include "dpctl4pybind11.hpp" +#endif // dpctl tensor headers #include "utils/type_dispatch.hpp" diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index f1378bf52d88..55a750f8423f 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -84,17 +84,12 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include -) - target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(_dpnp_sycl_targets) diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp index a0842f4ef259..14e8b7b5ed35 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "bitwise_count.hpp" #include "kernels/elementwise_functions/bitwise_count.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp index 77452a6b777f..511ea759ae35 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp @@ -31,7 +31,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "degrees.hpp" #include "kernels/elementwise_functions/degrees.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp index af87dcc85f53..93d04ed7940e 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "divmod.hpp" #include "kernels/elementwise_functions/divmod.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp index 5254e50d3faf..e209f72a83b2 100644 --- 
a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp @@ -31,7 +31,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "erf_funcs.hpp" #include "kernels/elementwise_functions/erf.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp index d2b6ae24ac4b..d673533e599b 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp @@ -31,7 +31,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fabs.hpp" #include "kernels/elementwise_functions/fabs.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp index 0994afc7c738..9d42630fd00c 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "float_power.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp index 5e1a9f33444b..70e8a434e7ac 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp @@ -30,7 +30,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmax.hpp" #include "kernels/elementwise_functions/fmax.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp index c0e1db654317..d9c94109fdd0 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp @@ -30,7 +30,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmin.hpp" #include "kernels/elementwise_functions/fmin.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp index 5b83595b3f7c..9db1f7873f5b 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmod.hpp" #include "kernels/elementwise_functions/fmod.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp index 4439f1e76993..0c4d1b1b9252 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp @@ -33,7 +33,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "frexp.hpp" #include "kernels/elementwise_functions/frexp.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp index ec10504fa15e..1a570488cc1f 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "gcd.hpp" #include 
"kernels/elementwise_functions/gcd.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp index e3212de86f7f..69db72c7142b 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp @@ -31,7 +31,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "heaviside.hpp" #include "kernels/elementwise_functions/heaviside.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp index 4d120a56e837..82c1c7cb27ad 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp @@ -31,7 +31,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "i0.hpp" #include "kernels/elementwise_functions/i0.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp index 33c7ab19b9ab..f8ce8f007369 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp @@ -37,7 +37,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp index b8179feb9263..34577aa7ba68 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp @@ -34,7 +34,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp index 4276ceb6b246..c2d2e801fed8 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/lcm.hpp" #include "lcm.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp index 3e2c4f3d0149..5e413b30735d 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/ldexp.hpp" #include "ldexp.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp index 57c7c60ca9cf..4f215c8b98a1 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp @@ -30,7 +30,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/logaddexp2.hpp" #include "logaddexp2.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp index f8aab23d5630..7885e26217f0 100644 --- 
a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp @@ -33,7 +33,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/modf.hpp" #include "modf.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp index 2490f1921a98..b430dc51f974 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp @@ -38,7 +38,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include #include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp index 7fc8ae5331dd..96c1fc2f601a 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp @@ -31,7 +31,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/radians.hpp" #include "populate.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp index abd02e1e6282..afba8db01bb2 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp @@ -32,7 +32,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/sinc.hpp" #include "populate.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp index 6e401c5388dd..ca4e9b8661b2 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp @@ -31,7 +31,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/spacing.hpp" #include "populate.hpp" From db83aeed4d32575f80333400ebc9ed8c1724202c Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 05:20:37 -0800 Subject: [PATCH 11/18] Update vm extension to work with dpnp4pybind11.hpp --- dpnp/backend/extensions/vm/CMakeLists.txt | 14 ++++---------- dpnp/backend/extensions/vm/abs.cpp | 2 +- dpnp/backend/extensions/vm/acos.cpp | 2 +- dpnp/backend/extensions/vm/acosh.cpp | 2 +- dpnp/backend/extensions/vm/add.cpp | 2 +- dpnp/backend/extensions/vm/arg.cpp | 2 +- dpnp/backend/extensions/vm/asin.cpp | 2 +- dpnp/backend/extensions/vm/asinh.cpp | 2 +- dpnp/backend/extensions/vm/atan.cpp | 2 +- dpnp/backend/extensions/vm/atan2.cpp | 2 +- dpnp/backend/extensions/vm/atanh.cpp | 2 +- dpnp/backend/extensions/vm/cbrt.cpp | 2 +- dpnp/backend/extensions/vm/ceil.cpp | 2 +- dpnp/backend/extensions/vm/common.hpp | 4 ++-- dpnp/backend/extensions/vm/conj.cpp | 2 +- dpnp/backend/extensions/vm/copysign.cpp | 2 +- dpnp/backend/extensions/vm/cos.cpp | 2 +- dpnp/backend/extensions/vm/cosh.cpp | 2 +- dpnp/backend/extensions/vm/div.cpp | 2 +- dpnp/backend/extensions/vm/erf_funcs.cpp | 2 +- dpnp/backend/extensions/vm/exp.cpp | 2 +- dpnp/backend/extensions/vm/exp2.cpp | 2 +- dpnp/backend/extensions/vm/expm1.cpp | 2 +- dpnp/backend/extensions/vm/floor.cpp | 2 +- dpnp/backend/extensions/vm/fmax.cpp | 2 +- dpnp/backend/extensions/vm/fmin.cpp | 2 +- 
dpnp/backend/extensions/vm/fmod.cpp | 2 +- dpnp/backend/extensions/vm/hypot.cpp | 2 +- dpnp/backend/extensions/vm/i0.cpp | 2 +- dpnp/backend/extensions/vm/inv.cpp | 2 +- dpnp/backend/extensions/vm/ln.cpp | 2 +- dpnp/backend/extensions/vm/log10.cpp | 2 +- dpnp/backend/extensions/vm/log1p.cpp | 2 +- dpnp/backend/extensions/vm/log2.cpp | 2 +- dpnp/backend/extensions/vm/modf.cpp | 2 +- dpnp/backend/extensions/vm/mul.cpp | 2 +- dpnp/backend/extensions/vm/nextafter.cpp | 2 +- dpnp/backend/extensions/vm/pow.cpp | 2 +- dpnp/backend/extensions/vm/rint.cpp | 2 +- dpnp/backend/extensions/vm/sin.cpp | 2 +- dpnp/backend/extensions/vm/sinh.cpp | 2 +- dpnp/backend/extensions/vm/sqr.cpp | 2 +- dpnp/backend/extensions/vm/sqrt.cpp | 2 +- dpnp/backend/extensions/vm/sub.cpp | 2 +- dpnp/backend/extensions/vm/tan.cpp | 2 +- dpnp/backend/extensions/vm/tanh.cpp | 2 +- dpnp/backend/extensions/vm/trunc.cpp | 2 +- 47 files changed, 51 insertions(+), 57 deletions(-) diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index b7181616f546..32d6a6765a00 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -107,17 +107,11 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include ) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common -) -target_include_directories( - ${python_module_name} - PRIVATE ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include -) - target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) if(WIN32) diff --git a/dpnp/backend/extensions/vm/abs.cpp b/dpnp/backend/extensions/vm/abs.cpp index 133f3077ac43..a2432f5bedc6 100644 --- a/dpnp/backend/extensions/vm/abs.cpp +++ b/dpnp/backend/extensions/vm/abs.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "abs.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/acos.cpp b/dpnp/backend/extensions/vm/acos.cpp index 0cb9bb32f4b8..01e4d5ab35f9 100644 --- a/dpnp/backend/extensions/vm/acos.cpp +++ b/dpnp/backend/extensions/vm/acos.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "acos.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/acosh.cpp b/dpnp/backend/extensions/vm/acosh.cpp index fa25ecf5cc1e..b1136163d684 100644 --- a/dpnp/backend/extensions/vm/acosh.cpp +++ b/dpnp/backend/extensions/vm/acosh.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "acosh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/add.cpp b/dpnp/backend/extensions/vm/add.cpp index 165671c93415..572eadb83cec 100644 --- a/dpnp/backend/extensions/vm/add.cpp +++ b/dpnp/backend/extensions/vm/add.cpp @@ -36,7 +36,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "add.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/arg.cpp b/dpnp/backend/extensions/vm/arg.cpp index e062f1f2ee06..40a15082f0ec 100644 --- a/dpnp/backend/extensions/vm/arg.cpp +++ b/dpnp/backend/extensions/vm/arg.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "arg.hpp" #include "common.hpp" diff --git 
a/dpnp/backend/extensions/vm/asin.cpp b/dpnp/backend/extensions/vm/asin.cpp index 8a2e1c079ed8..8cf73f3fe572 100644 --- a/dpnp/backend/extensions/vm/asin.cpp +++ b/dpnp/backend/extensions/vm/asin.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "asin.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/asinh.cpp b/dpnp/backend/extensions/vm/asinh.cpp index 176bacdb92a8..a3404d2c5415 100644 --- a/dpnp/backend/extensions/vm/asinh.cpp +++ b/dpnp/backend/extensions/vm/asinh.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "asinh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atan.cpp b/dpnp/backend/extensions/vm/atan.cpp index 21c8c8f1c9d5..a89cb8f9a308 100644 --- a/dpnp/backend/extensions/vm/atan.cpp +++ b/dpnp/backend/extensions/vm/atan.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "atan.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atan2.cpp b/dpnp/backend/extensions/vm/atan2.cpp index 1d4e5c333e68..bcdf1daae1b3 100644 --- a/dpnp/backend/extensions/vm/atan2.cpp +++ b/dpnp/backend/extensions/vm/atan2.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "atan2.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atanh.cpp b/dpnp/backend/extensions/vm/atanh.cpp index 7097fabf602f..d4ef24663d02 100644 --- a/dpnp/backend/extensions/vm/atanh.cpp +++ b/dpnp/backend/extensions/vm/atanh.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "atanh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/cbrt.cpp b/dpnp/backend/extensions/vm/cbrt.cpp index db3cdfcebd8d..47584c8f6811 100644 --- a/dpnp/backend/extensions/vm/cbrt.cpp +++ b/dpnp/backend/extensions/vm/cbrt.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "cbrt.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/ceil.cpp b/dpnp/backend/extensions/vm/ceil.cpp index 6f5aeba16f99..d170b66d7d2c 100644 --- a/dpnp/backend/extensions/vm/ceil.cpp +++ b/dpnp/backend/extensions/vm/ceil.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "ceil.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/common.hpp b/dpnp/backend/extensions/vm/common.hpp index 6ee73504ce96..81e113771def 100644 --- a/dpnp/backend/extensions/vm/common.hpp +++ b/dpnp/backend/extensions/vm/common.hpp @@ -34,10 +34,10 @@ #include #include +#include #include -#include -#include +#include "dpnp4pybind11.hpp" // utils extension header #include "ext/common.hpp" diff --git a/dpnp/backend/extensions/vm/conj.cpp b/dpnp/backend/extensions/vm/conj.cpp index 36710104750a..ee000d5ee40d 100644 --- a/dpnp/backend/extensions/vm/conj.cpp +++ b/dpnp/backend/extensions/vm/conj.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "conj.hpp" diff --git a/dpnp/backend/extensions/vm/copysign.cpp b/dpnp/backend/extensions/vm/copysign.cpp index cd90abf65a06..8b6714865204 100644 --- a/dpnp/backend/extensions/vm/copysign.cpp +++ b/dpnp/backend/extensions/vm/copysign.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "copysign.hpp" 
diff --git a/dpnp/backend/extensions/vm/cos.cpp b/dpnp/backend/extensions/vm/cos.cpp index 76db72594763..62ecff7ea6a8 100644 --- a/dpnp/backend/extensions/vm/cos.cpp +++ b/dpnp/backend/extensions/vm/cos.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "cos.hpp" diff --git a/dpnp/backend/extensions/vm/cosh.cpp b/dpnp/backend/extensions/vm/cosh.cpp index 464410b1accc..ec81142eb6ce 100644 --- a/dpnp/backend/extensions/vm/cosh.cpp +++ b/dpnp/backend/extensions/vm/cosh.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "cosh.hpp" diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp index ad96f9acf083..6b8c1f781955 100644 --- a/dpnp/backend/extensions/vm/div.cpp +++ b/dpnp/backend/extensions/vm/div.cpp @@ -36,7 +36,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "div.hpp" diff --git a/dpnp/backend/extensions/vm/erf_funcs.cpp b/dpnp/backend/extensions/vm/erf_funcs.cpp index 4e84403eb061..2d4be369dc13 100644 --- a/dpnp/backend/extensions/vm/erf_funcs.cpp +++ b/dpnp/backend/extensions/vm/erf_funcs.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "erf_funcs.hpp" diff --git a/dpnp/backend/extensions/vm/exp.cpp b/dpnp/backend/extensions/vm/exp.cpp index acd265d191f7..de5f34c404a8 100644 --- a/dpnp/backend/extensions/vm/exp.cpp +++ b/dpnp/backend/extensions/vm/exp.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "exp.hpp" diff --git a/dpnp/backend/extensions/vm/exp2.cpp b/dpnp/backend/extensions/vm/exp2.cpp index 82c6c32fb6c5..1f1aa6ab90a8 100644 --- a/dpnp/backend/extensions/vm/exp2.cpp +++ b/dpnp/backend/extensions/vm/exp2.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "exp2.hpp" diff --git a/dpnp/backend/extensions/vm/expm1.cpp b/dpnp/backend/extensions/vm/expm1.cpp index 93cef7b3272d..5f803622b1a3 100644 --- a/dpnp/backend/extensions/vm/expm1.cpp +++ b/dpnp/backend/extensions/vm/expm1.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "expm1.hpp" diff --git a/dpnp/backend/extensions/vm/floor.cpp b/dpnp/backend/extensions/vm/floor.cpp index fb1a86eda7bf..a12bdb18c719 100644 --- a/dpnp/backend/extensions/vm/floor.cpp +++ b/dpnp/backend/extensions/vm/floor.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "floor.hpp" diff --git a/dpnp/backend/extensions/vm/fmax.cpp b/dpnp/backend/extensions/vm/fmax.cpp index 32786a3e8fc2..db4ca265ec42 100644 --- a/dpnp/backend/extensions/vm/fmax.cpp +++ b/dpnp/backend/extensions/vm/fmax.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fmax.hpp" diff --git a/dpnp/backend/extensions/vm/fmin.cpp b/dpnp/backend/extensions/vm/fmin.cpp index d923b8c7ddfb..ca933a9f1869 100644 --- a/dpnp/backend/extensions/vm/fmin.cpp +++ b/dpnp/backend/extensions/vm/fmin.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fmin.hpp" diff --git 
a/dpnp/backend/extensions/vm/fmod.cpp b/dpnp/backend/extensions/vm/fmod.cpp index 6c8a4ac705e4..83337dc1f7fd 100644 --- a/dpnp/backend/extensions/vm/fmod.cpp +++ b/dpnp/backend/extensions/vm/fmod.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fmod.hpp" diff --git a/dpnp/backend/extensions/vm/hypot.cpp b/dpnp/backend/extensions/vm/hypot.cpp index 92b7c78f8ad6..bf01b8fb42b6 100644 --- a/dpnp/backend/extensions/vm/hypot.cpp +++ b/dpnp/backend/extensions/vm/hypot.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "hypot.hpp" diff --git a/dpnp/backend/extensions/vm/i0.cpp b/dpnp/backend/extensions/vm/i0.cpp index 5db3ef9d9669..afdf34e8cabc 100644 --- a/dpnp/backend/extensions/vm/i0.cpp +++ b/dpnp/backend/extensions/vm/i0.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "i0.hpp" diff --git a/dpnp/backend/extensions/vm/inv.cpp b/dpnp/backend/extensions/vm/inv.cpp index 1adeb1be23d0..6be886c0b0f2 100644 --- a/dpnp/backend/extensions/vm/inv.cpp +++ b/dpnp/backend/extensions/vm/inv.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "inv.hpp" diff --git a/dpnp/backend/extensions/vm/ln.cpp b/dpnp/backend/extensions/vm/ln.cpp index e60a0545005b..c6bfb930524d 100644 --- a/dpnp/backend/extensions/vm/ln.cpp +++ b/dpnp/backend/extensions/vm/ln.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "ln.hpp" diff --git a/dpnp/backend/extensions/vm/log10.cpp b/dpnp/backend/extensions/vm/log10.cpp index d26ec57ab9ce..7e6e611d01c8 100644 --- a/dpnp/backend/extensions/vm/log10.cpp +++ b/dpnp/backend/extensions/vm/log10.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log10.hpp" diff --git a/dpnp/backend/extensions/vm/log1p.cpp b/dpnp/backend/extensions/vm/log1p.cpp index 861804f8f6e0..579546f6b3f7 100644 --- a/dpnp/backend/extensions/vm/log1p.cpp +++ b/dpnp/backend/extensions/vm/log1p.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log1p.hpp" diff --git a/dpnp/backend/extensions/vm/log2.cpp b/dpnp/backend/extensions/vm/log2.cpp index e75e96c32fe9..7c3ecb0731d7 100644 --- a/dpnp/backend/extensions/vm/log2.cpp +++ b/dpnp/backend/extensions/vm/log2.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log2.hpp" diff --git a/dpnp/backend/extensions/vm/modf.cpp b/dpnp/backend/extensions/vm/modf.cpp index ef68c79d8b42..283cfadb9b78 100644 --- a/dpnp/backend/extensions/vm/modf.cpp +++ b/dpnp/backend/extensions/vm/modf.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "modf.hpp" diff --git a/dpnp/backend/extensions/vm/mul.cpp b/dpnp/backend/extensions/vm/mul.cpp index 0c9cf7fb79cc..a689e88ae0e1 100644 --- a/dpnp/backend/extensions/vm/mul.cpp +++ b/dpnp/backend/extensions/vm/mul.cpp @@ -36,7 +36,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "mul.hpp" diff --git a/dpnp/backend/extensions/vm/nextafter.cpp 
b/dpnp/backend/extensions/vm/nextafter.cpp index 59b205b3d62a..03b19529fc72 100644 --- a/dpnp/backend/extensions/vm/nextafter.cpp +++ b/dpnp/backend/extensions/vm/nextafter.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "nextafter.hpp" diff --git a/dpnp/backend/extensions/vm/pow.cpp b/dpnp/backend/extensions/vm/pow.cpp index 5969a4862730..1d8e8fe8afca 100644 --- a/dpnp/backend/extensions/vm/pow.cpp +++ b/dpnp/backend/extensions/vm/pow.cpp @@ -36,7 +36,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "pow.hpp" diff --git a/dpnp/backend/extensions/vm/rint.cpp b/dpnp/backend/extensions/vm/rint.cpp index 41cd20a944a0..f3d37b92a59a 100644 --- a/dpnp/backend/extensions/vm/rint.cpp +++ b/dpnp/backend/extensions/vm/rint.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "rint.hpp" diff --git a/dpnp/backend/extensions/vm/sin.cpp b/dpnp/backend/extensions/vm/sin.cpp index 9263c3c4ffcf..39258ceb60b9 100644 --- a/dpnp/backend/extensions/vm/sin.cpp +++ b/dpnp/backend/extensions/vm/sin.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sin.hpp" diff --git a/dpnp/backend/extensions/vm/sinh.cpp b/dpnp/backend/extensions/vm/sinh.cpp index a1bae13a5281..5aa5a31a8f84 100644 --- a/dpnp/backend/extensions/vm/sinh.cpp +++ b/dpnp/backend/extensions/vm/sinh.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sinh.hpp" diff --git a/dpnp/backend/extensions/vm/sqr.cpp b/dpnp/backend/extensions/vm/sqr.cpp index 88c2e833b483..bf008a68a68f 100644 --- a/dpnp/backend/extensions/vm/sqr.cpp +++ b/dpnp/backend/extensions/vm/sqr.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sqr.hpp" diff --git a/dpnp/backend/extensions/vm/sqrt.cpp b/dpnp/backend/extensions/vm/sqrt.cpp index 98cf2eea9253..8bd26c0fe1a9 100644 --- a/dpnp/backend/extensions/vm/sqrt.cpp +++ b/dpnp/backend/extensions/vm/sqrt.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sqrt.hpp" diff --git a/dpnp/backend/extensions/vm/sub.cpp b/dpnp/backend/extensions/vm/sub.cpp index 5ee01f239c06..b0503754194f 100644 --- a/dpnp/backend/extensions/vm/sub.cpp +++ b/dpnp/backend/extensions/vm/sub.cpp @@ -36,7 +36,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sub.hpp" diff --git a/dpnp/backend/extensions/vm/tan.cpp b/dpnp/backend/extensions/vm/tan.cpp index 46555ebd0178..9fe4cb64d41c 100644 --- a/dpnp/backend/extensions/vm/tan.cpp +++ b/dpnp/backend/extensions/vm/tan.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "tan.hpp" diff --git a/dpnp/backend/extensions/vm/tanh.cpp b/dpnp/backend/extensions/vm/tanh.cpp index 04d2febfac1d..70f4ef6142d5 100644 --- a/dpnp/backend/extensions/vm/tanh.cpp +++ b/dpnp/backend/extensions/vm/tanh.cpp @@ -35,7 +35,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "tanh.hpp" diff --git a/dpnp/backend/extensions/vm/trunc.cpp b/dpnp/backend/extensions/vm/trunc.cpp index 
c23a9a8180fb..c6cc4f9e8265 100644 --- a/dpnp/backend/extensions/vm/trunc.cpp +++ b/dpnp/backend/extensions/vm/trunc.cpp @@ -34,7 +34,7 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "trunc.hpp" From 00f6c70003c671c7ea3aa52cfcd91723346e796b Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 05:24:13 -0800 Subject: [PATCH 12/18] remove conditional include for dpnp4pybind11.hpp --- dpctl/tensor/libtensor/include/utils/memory_overlap.hpp | 4 ---- dpctl/tensor/libtensor/include/utils/output_validation.hpp | 4 ---- dpctl/tensor/libtensor/include/utils/type_dispatch.hpp | 4 ---- dpnp/backend/extensions/common/ext/validation_utils.hpp | 4 ---- .../elementwise_functions/elementwise_functions.hpp | 4 ---- .../elementwise_functions_type_utils.cpp | 4 ---- .../elementwise_functions_type_utils.hpp | 4 ---- 7 files changed, 28 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp index db9dfc30eb46..b534e55b3192 100644 --- a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp +++ b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp @@ -38,11 +38,7 @@ #include -#if __has_include() #include "dpnp4pybind11.hpp" -#else -#include "dpctl4pybind11.hpp" -#endif /* @brief check for overlap of memory regions behind arrays. diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl/tensor/libtensor/include/utils/output_validation.hpp index 7a70f395dfe1..26f1b29bd3d8 100644 --- a/dpctl/tensor/libtensor/include/utils/output_validation.hpp +++ b/dpctl/tensor/libtensor/include/utils/output_validation.hpp @@ -37,11 +37,7 @@ #include -#if __has_include() #include "dpnp4pybind11.hpp" -#else -#include "dpctl4pybind11.hpp" -#endif namespace dpctl::tensor::validation { diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp index 38b5b43ca696..242c2cf8724a 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -36,11 +36,7 @@ #include #include -#if __has_include() #include "dpnp4pybind11.hpp" -#else -#include "dpctl4pybind11.hpp" -#endif #include "type_dispatch_building.hpp" diff --git a/dpnp/backend/extensions/common/ext/validation_utils.hpp b/dpnp/backend/extensions/common/ext/validation_utils.hpp index 0bb32c9f876a..03e0718d4450 100644 --- a/dpnp/backend/extensions/common/ext/validation_utils.hpp +++ b/dpnp/backend/extensions/common/ext/validation_utils.hpp @@ -32,11 +32,7 @@ #include #include -#if __has_include() #include "dpnp4pybind11.hpp" -#else -#include "dpctl4pybind11.hpp" -#endif // dpctl tensor headers #include "utils/type_dispatch.hpp" diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp index 9e8d98d875a3..b1634eafef5a 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp @@ -37,11 +37,7 @@ #include #include -#if __has_include() #include "dpnp4pybind11.hpp" -#else -#include "dpctl4pybind11.hpp" -#endif #include "elementwise_functions_type_utils.hpp" #include "simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp 
b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp index c126428f0558..6798cb4f3154 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp @@ -29,11 +29,7 @@ #include #include -#if __has_include() #include "dpnp4pybind11.hpp" -#else -#include "dpctl4pybind11.hpp" -#endif #include "elementwise_functions_type_utils.hpp" diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp index 129a89a49dbe..58fe43c01589 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp @@ -31,11 +31,7 @@ #include #include -#if __has_include() #include "dpnp4pybind11.hpp" -#else -#include "dpctl4pybind11.hpp" -#endif // dpctl tensor headers #include "utils/type_dispatch.hpp" From 6944e0ace8fc5f18c5b4740f76d9b6d567a56f60 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 23 Jan 2026 08:22:15 -0800 Subject: [PATCH 13/18] Add missing pybind11 include --- dpnp/backend/extensions/indexing/choose.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp index 05d1bfe15385..7b5284418b00 100644 --- a/dpnp/backend/extensions/indexing/choose.cpp +++ b/dpnp/backend/extensions/indexing/choose.cpp @@ -37,6 +37,7 @@ #include #include +#include #include "dpnp4pybind11.hpp" From 135590e76c3afea27d7e985cb4d7982f933d0ace Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Sat, 24 Jan 2026 04:17:34 -0800 Subject: [PATCH 14/18] py::dtype requies to include --- .../extensions/common/ext/details/common_internal.hpp | 4 +++- .../elementwise_functions/elementwise_functions.hpp | 2 ++ .../elementwise_functions_type_utils.cpp | 2 ++ .../extensions/ufunc/elementwise_functions/bitwise_count.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/degrees.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/divmod.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/erf_funcs.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/float_power.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp | 3 +++ .../backend/extensions/ufunc/elementwise_functions/frexp.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/heaviside.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/interpolate.cpp | 5 +++-- .../extensions/ufunc/elementwise_functions/isclose.cpp | 5 +++-- dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp | 3 +++ .../backend/extensions/ufunc/elementwise_functions/ldexp.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/logaddexp2.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/radians.cpp | 3 +++ dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp | 3 +++ .../extensions/ufunc/elementwise_functions/spacing.cpp | 3 +++ 25 files changed, 73 insertions(+), 5 deletions(-) diff --git 
a/dpnp/backend/extensions/common/ext/details/common_internal.hpp b/dpnp/backend/extensions/common/ext/details/common_internal.hpp index 31d9671a0a43..8db72ce32318 100644 --- a/dpnp/backend/extensions/common/ext/details/common_internal.hpp +++ b/dpnp/backend/extensions/common/ext/details/common_internal.hpp @@ -30,9 +30,11 @@ #include +#include +#include + #include "ext/common.hpp" #include "utils/type_dispatch.hpp" -#include namespace dpctl_td_ns = dpctl::tensor::type_dispatch; diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp index b1634eafef5a..e23f74a678dc 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp @@ -34,7 +34,9 @@ #include #include +#include #include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp index 6798cb4f3154..7300f938eabb 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp @@ -26,7 +26,9 @@ // THE POSSIBILITY OF SUCH DAMAGE. //***************************************************************************** +#include #include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp index 14e8b7b5ed35..9fe5d5d43c7c 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp index 511ea759ae35..9b15f8b29895 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp @@ -29,6 +29,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp index 93d04ed7940e..599a5eca1518 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp index e209f72a83b2..c739cd5f119d 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp @@ -29,6 +29,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp index d673533e599b..640d8629edbd 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp @@ -29,6 +29,9 
@@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp index 9d42630fd00c..0f065e6dad0d 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp index 70e8a434e7ac..3882f24611a5 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp @@ -28,6 +28,9 @@ #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp index d9c94109fdd0..1fd8798572e5 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp @@ -28,6 +28,9 @@ #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp index 9db1f7873f5b..1dca65e622cf 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp index 0c4d1b1b9252..b1367bd82540 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp @@ -31,6 +31,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp index 1a570488cc1f..d21ef703fecf 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp index 69db72c7142b..29b8a475a8a9 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp @@ -29,6 +29,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp index 82c1c7cb27ad..9e7aaba5c90c 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp @@ -29,6 +29,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp index f8ce8f007369..d22c51c29fde 100644 --- 
a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp @@ -35,11 +35,12 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" -#include -#include // dpctl tensor headers #include "utils/type_dispatch.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp index 34577aa7ba68..37949016c905 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp @@ -32,11 +32,12 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" -#include -#include #include "kernels/elementwise_functions/isclose.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp index c2d2e801fed8..3d5d34aae4ab 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp index 5e413b30735d..15ceb91fbd78 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp index 4f215c8b98a1..a63b3e3431f0 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp @@ -28,6 +28,9 @@ #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp index 7885e26217f0..784c83b66cd5 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp @@ -31,6 +31,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp index 96c1fc2f601a..ea21ad42e140 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp @@ -29,6 +29,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp index afba8db01bb2..eebeb82b7124 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp @@ -30,6 +30,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp index ca4e9b8661b2..ec81be1bbe03 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp +++ 
b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp @@ -29,6 +29,9 @@ #include #include +#include +#include + #include #include "dpnp4pybind11.hpp" From 2884939ce327db01d152fb8fc92ed2a2069b6302 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Sat, 24 Jan 2026 06:15:26 -0800 Subject: [PATCH 15/18] Include is required to provide conversion from std::vector to python list --- dpnp/backend/extensions/lapack/heevd.cpp | 1 + dpnp/backend/extensions/lapack/heevd_batch.cpp | 1 + dpnp/backend/extensions/lapack/syevd.cpp | 1 + dpnp/backend/extensions/lapack/syevd_batch.cpp | 1 + dpnp/backend/extensions/statistics/histogram.cpp | 1 + .../extensions/ufunc/elementwise_functions/bitwise_count.cpp | 1 + .../backend/extensions/ufunc/elementwise_functions/degrees.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp | 1 + .../extensions/ufunc/elementwise_functions/erf_funcs.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp | 1 + .../extensions/ufunc/elementwise_functions/float_power.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp | 1 + .../extensions/ufunc/elementwise_functions/heaviside.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp | 1 + .../extensions/ufunc/elementwise_functions/interpolate.cpp | 1 + .../backend/extensions/ufunc/elementwise_functions/isclose.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp | 1 + .../extensions/ufunc/elementwise_functions/logaddexp2.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp | 1 + .../extensions/ufunc/elementwise_functions/nan_to_num.cpp | 3 ++- .../backend/extensions/ufunc/elementwise_functions/radians.cpp | 1 + dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp | 1 + .../backend/extensions/ufunc/elementwise_functions/spacing.cpp | 1 + dpnp/backend/extensions/vm/abs.cpp | 3 +++ dpnp/backend/extensions/vm/acos.cpp | 3 +++ dpnp/backend/extensions/vm/acosh.cpp | 3 +++ dpnp/backend/extensions/vm/add.cpp | 3 +++ dpnp/backend/extensions/vm/arg.cpp | 3 +++ dpnp/backend/extensions/vm/asin.cpp | 3 +++ dpnp/backend/extensions/vm/asinh.cpp | 3 +++ dpnp/backend/extensions/vm/atan.cpp | 3 +++ dpnp/backend/extensions/vm/atan2.cpp | 3 +++ dpnp/backend/extensions/vm/atanh.cpp | 3 +++ dpnp/backend/extensions/vm/cbrt.cpp | 3 +++ dpnp/backend/extensions/vm/ceil.cpp | 3 +++ dpnp/backend/extensions/vm/conj.cpp | 3 +++ dpnp/backend/extensions/vm/copysign.cpp | 3 +++ dpnp/backend/extensions/vm/cos.cpp | 3 +++ dpnp/backend/extensions/vm/cosh.cpp | 3 +++ dpnp/backend/extensions/vm/div.cpp | 3 +++ dpnp/backend/extensions/vm/erf_funcs.cpp | 3 +++ dpnp/backend/extensions/vm/exp.cpp | 3 +++ dpnp/backend/extensions/vm/exp2.cpp | 3 +++ dpnp/backend/extensions/vm/expm1.cpp | 3 +++ dpnp/backend/extensions/vm/floor.cpp | 3 +++ dpnp/backend/extensions/vm/fmax.cpp | 3 +++ dpnp/backend/extensions/vm/fmin.cpp | 3 +++ dpnp/backend/extensions/vm/fmod.cpp | 3 +++ dpnp/backend/extensions/vm/hypot.cpp | 3 +++ dpnp/backend/extensions/vm/i0.cpp | 3 +++ dpnp/backend/extensions/vm/inv.cpp | 3 +++ dpnp/backend/extensions/vm/ln.cpp | 3 +++ dpnp/backend/extensions/vm/log10.cpp | 3 +++ dpnp/backend/extensions/vm/log1p.cpp | 3 +++ 
dpnp/backend/extensions/vm/log2.cpp | 3 +++ dpnp/backend/extensions/vm/modf.cpp | 3 +++ dpnp/backend/extensions/vm/mul.cpp | 3 +++ dpnp/backend/extensions/vm/nextafter.cpp | 3 +++ dpnp/backend/extensions/vm/pow.cpp | 3 +++ dpnp/backend/extensions/vm/rint.cpp | 3 +++ dpnp/backend/extensions/vm/sin.cpp | 3 +++ dpnp/backend/extensions/vm/sinh.cpp | 3 +++ dpnp/backend/extensions/vm/sqr.cpp | 3 +++ dpnp/backend/extensions/vm/sqrt.cpp | 3 +++ dpnp/backend/extensions/vm/sub.cpp | 3 +++ dpnp/backend/extensions/vm/tan.cpp | 3 +++ dpnp/backend/extensions/vm/tanh.cpp | 3 +++ dpnp/backend/extensions/vm/trunc.cpp | 3 +++ 73 files changed, 164 insertions(+), 1 deletion(-) diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp index 5990e5344a17..923e950b1383 100644 --- a/dpnp/backend/extensions/lapack/heevd.cpp +++ b/dpnp/backend/extensions/lapack/heevd.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "evd_common.hpp" diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp index e1c1a96bc320..9d7c3300dbf7 100644 --- a/dpnp/backend/extensions/lapack/heevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "common_helpers.hpp" diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp index af69cf9e6b7e..3c09ca4f587b 100644 --- a/dpnp/backend/extensions/lapack/syevd.cpp +++ b/dpnp/backend/extensions/lapack/syevd.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "evd_common.hpp" diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp index 0c326e5d79bb..36d1c820f00d 100644 --- a/dpnp/backend/extensions/lapack/syevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "common_helpers.hpp" diff --git a/dpnp/backend/extensions/statistics/histogram.cpp b/dpnp/backend/extensions/statistics/histogram.cpp index 621aa36cfbd1..afc5d9638f48 100644 --- a/dpnp/backend/extensions/statistics/histogram.cpp +++ b/dpnp/backend/extensions/statistics/histogram.cpp @@ -33,6 +33,7 @@ #include #include +#include #include "dpnp4pybind11.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp index 9fe5d5d43c7c..761bd330a326 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp index 9b15f8b29895..729fcb576c77 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp @@ -31,6 +31,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp index 599a5eca1518..1bb3859a39f4 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp 
b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp index c739cd5f119d..fff0118d06aa 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp @@ -31,6 +31,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp index 640d8629edbd..f7c2183633af 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp @@ -31,6 +31,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp index 0f065e6dad0d..43927eb93806 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp index 3882f24611a5..9471feaf2166 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp @@ -30,6 +30,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp index 1fd8798572e5..8e279897f414 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp @@ -30,6 +30,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp index 1dca65e622cf..83fb750b6907 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp index b1367bd82540..17e09f3ee816 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp @@ -33,6 +33,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp index d21ef703fecf..0481365356ca 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp index 29b8a475a8a9..62affd206420 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp @@ -31,6 +31,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp index 9e7aaba5c90c..53ded341b58b 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp +++ 
b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp @@ -31,6 +31,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp index d22c51c29fde..82e96ab732de 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp @@ -37,6 +37,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp index 37949016c905..3025cbf16586 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp @@ -34,6 +34,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp index 3d5d34aae4ab..35138e903eac 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp index 15ceb91fbd78..44ef51726a6a 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp index a63b3e3431f0..e37f13b119d6 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp @@ -30,6 +30,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp index 784c83b66cd5..266103248521 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp @@ -33,6 +33,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp index b430dc51f974..c30d388f8afd 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp @@ -38,11 +38,12 @@ #include -#include "dpnp4pybind11.hpp" #include #include #include +#include "dpnp4pybind11.hpp" + #include "kernels/elementwise_functions/nan_to_num.hpp" #include "../../elementwise_functions/simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp index ea21ad42e140..0a481fd33d11 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp @@ -31,6 +31,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp index eebeb82b7124..87a911472db2 100644 --- 
a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp @@ -32,6 +32,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp index ec81be1bbe03..4c14582f30ae 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp @@ -31,6 +31,7 @@ #include #include +#include #include diff --git a/dpnp/backend/extensions/vm/abs.cpp b/dpnp/backend/extensions/vm/abs.cpp index a2432f5bedc6..1dc8143dd5ff 100644 --- a/dpnp/backend/extensions/vm/abs.cpp +++ b/dpnp/backend/extensions/vm/abs.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "abs.hpp" diff --git a/dpnp/backend/extensions/vm/acos.cpp b/dpnp/backend/extensions/vm/acos.cpp index 01e4d5ab35f9..15b4ce80cc3c 100644 --- a/dpnp/backend/extensions/vm/acos.cpp +++ b/dpnp/backend/extensions/vm/acos.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "acos.hpp" diff --git a/dpnp/backend/extensions/vm/acosh.cpp b/dpnp/backend/extensions/vm/acosh.cpp index b1136163d684..eed835b78e10 100644 --- a/dpnp/backend/extensions/vm/acosh.cpp +++ b/dpnp/backend/extensions/vm/acosh.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "acosh.hpp" diff --git a/dpnp/backend/extensions/vm/add.cpp b/dpnp/backend/extensions/vm/add.cpp index 572eadb83cec..a58aac727cd1 100644 --- a/dpnp/backend/extensions/vm/add.cpp +++ b/dpnp/backend/extensions/vm/add.cpp @@ -36,6 +36,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "add.hpp" diff --git a/dpnp/backend/extensions/vm/arg.cpp b/dpnp/backend/extensions/vm/arg.cpp index 40a15082f0ec..c50c4a33dee1 100644 --- a/dpnp/backend/extensions/vm/arg.cpp +++ b/dpnp/backend/extensions/vm/arg.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "arg.hpp" diff --git a/dpnp/backend/extensions/vm/asin.cpp b/dpnp/backend/extensions/vm/asin.cpp index 8cf73f3fe572..5af7033fed21 100644 --- a/dpnp/backend/extensions/vm/asin.cpp +++ b/dpnp/backend/extensions/vm/asin.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "asin.hpp" diff --git a/dpnp/backend/extensions/vm/asinh.cpp b/dpnp/backend/extensions/vm/asinh.cpp index a3404d2c5415..5b0f8ed13106 100644 --- a/dpnp/backend/extensions/vm/asinh.cpp +++ b/dpnp/backend/extensions/vm/asinh.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "asinh.hpp" diff --git a/dpnp/backend/extensions/vm/atan.cpp b/dpnp/backend/extensions/vm/atan.cpp index a89cb8f9a308..2255000c1c4b 100644 --- a/dpnp/backend/extensions/vm/atan.cpp +++ b/dpnp/backend/extensions/vm/atan.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "atan.hpp" diff --git a/dpnp/backend/extensions/vm/atan2.cpp b/dpnp/backend/extensions/vm/atan2.cpp index bcdf1daae1b3..bf29e2921a1d 100644 --- a/dpnp/backend/extensions/vm/atan2.cpp +++ b/dpnp/backend/extensions/vm/atan2.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "atan2.hpp" diff --git a/dpnp/backend/extensions/vm/atanh.cpp b/dpnp/backend/extensions/vm/atanh.cpp index d4ef24663d02..9daab09980e6 100644 
--- a/dpnp/backend/extensions/vm/atanh.cpp +++ b/dpnp/backend/extensions/vm/atanh.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "atanh.hpp" diff --git a/dpnp/backend/extensions/vm/cbrt.cpp b/dpnp/backend/extensions/vm/cbrt.cpp index 47584c8f6811..34ff8dd913ac 100644 --- a/dpnp/backend/extensions/vm/cbrt.cpp +++ b/dpnp/backend/extensions/vm/cbrt.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "cbrt.hpp" diff --git a/dpnp/backend/extensions/vm/ceil.cpp b/dpnp/backend/extensions/vm/ceil.cpp index d170b66d7d2c..e76a30d28317 100644 --- a/dpnp/backend/extensions/vm/ceil.cpp +++ b/dpnp/backend/extensions/vm/ceil.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "ceil.hpp" diff --git a/dpnp/backend/extensions/vm/conj.cpp b/dpnp/backend/extensions/vm/conj.cpp index ee000d5ee40d..f77020cf1d55 100644 --- a/dpnp/backend/extensions/vm/conj.cpp +++ b/dpnp/backend/extensions/vm/conj.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/copysign.cpp b/dpnp/backend/extensions/vm/copysign.cpp index 8b6714865204..15c0fceec413 100644 --- a/dpnp/backend/extensions/vm/copysign.cpp +++ b/dpnp/backend/extensions/vm/copysign.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/cos.cpp b/dpnp/backend/extensions/vm/cos.cpp index 62ecff7ea6a8..7c9b0c35d6ca 100644 --- a/dpnp/backend/extensions/vm/cos.cpp +++ b/dpnp/backend/extensions/vm/cos.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/cosh.cpp b/dpnp/backend/extensions/vm/cosh.cpp index ec81142eb6ce..a95c7075ba61 100644 --- a/dpnp/backend/extensions/vm/cosh.cpp +++ b/dpnp/backend/extensions/vm/cosh.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp index 6b8c1f781955..6e0cb4d0439f 100644 --- a/dpnp/backend/extensions/vm/div.cpp +++ b/dpnp/backend/extensions/vm/div.cpp @@ -36,6 +36,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/erf_funcs.cpp b/dpnp/backend/extensions/vm/erf_funcs.cpp index 2d4be369dc13..7be7f691edcf 100644 --- a/dpnp/backend/extensions/vm/erf_funcs.cpp +++ b/dpnp/backend/extensions/vm/erf_funcs.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/exp.cpp b/dpnp/backend/extensions/vm/exp.cpp index de5f34c404a8..31f50f36171d 100644 --- a/dpnp/backend/extensions/vm/exp.cpp +++ b/dpnp/backend/extensions/vm/exp.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/exp2.cpp b/dpnp/backend/extensions/vm/exp2.cpp index 1f1aa6ab90a8..41f18351fa7d 100644 --- a/dpnp/backend/extensions/vm/exp2.cpp +++ b/dpnp/backend/extensions/vm/exp2.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/expm1.cpp b/dpnp/backend/extensions/vm/expm1.cpp index 5f803622b1a3..37440cab9b0c 100644 --- 
a/dpnp/backend/extensions/vm/expm1.cpp +++ b/dpnp/backend/extensions/vm/expm1.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/floor.cpp b/dpnp/backend/extensions/vm/floor.cpp index a12bdb18c719..771d141e7f6a 100644 --- a/dpnp/backend/extensions/vm/floor.cpp +++ b/dpnp/backend/extensions/vm/floor.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/fmax.cpp b/dpnp/backend/extensions/vm/fmax.cpp index db4ca265ec42..d01b3ef3dc42 100644 --- a/dpnp/backend/extensions/vm/fmax.cpp +++ b/dpnp/backend/extensions/vm/fmax.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/fmin.cpp b/dpnp/backend/extensions/vm/fmin.cpp index ca933a9f1869..6fbebba556f8 100644 --- a/dpnp/backend/extensions/vm/fmin.cpp +++ b/dpnp/backend/extensions/vm/fmin.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/fmod.cpp b/dpnp/backend/extensions/vm/fmod.cpp index 83337dc1f7fd..1330453d6f84 100644 --- a/dpnp/backend/extensions/vm/fmod.cpp +++ b/dpnp/backend/extensions/vm/fmod.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/hypot.cpp b/dpnp/backend/extensions/vm/hypot.cpp index bf01b8fb42b6..a9b3d3c12288 100644 --- a/dpnp/backend/extensions/vm/hypot.cpp +++ b/dpnp/backend/extensions/vm/hypot.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/i0.cpp b/dpnp/backend/extensions/vm/i0.cpp index afdf34e8cabc..50f692ebd958 100644 --- a/dpnp/backend/extensions/vm/i0.cpp +++ b/dpnp/backend/extensions/vm/i0.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/inv.cpp b/dpnp/backend/extensions/vm/inv.cpp index 6be886c0b0f2..eda08a6d0cd5 100644 --- a/dpnp/backend/extensions/vm/inv.cpp +++ b/dpnp/backend/extensions/vm/inv.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/ln.cpp b/dpnp/backend/extensions/vm/ln.cpp index c6bfb930524d..a5365e4d5a8b 100644 --- a/dpnp/backend/extensions/vm/ln.cpp +++ b/dpnp/backend/extensions/vm/ln.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/log10.cpp b/dpnp/backend/extensions/vm/log10.cpp index 7e6e611d01c8..c04fb602f63d 100644 --- a/dpnp/backend/extensions/vm/log10.cpp +++ b/dpnp/backend/extensions/vm/log10.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/log1p.cpp b/dpnp/backend/extensions/vm/log1p.cpp index 579546f6b3f7..04416bf37185 100644 --- a/dpnp/backend/extensions/vm/log1p.cpp +++ b/dpnp/backend/extensions/vm/log1p.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/log2.cpp b/dpnp/backend/extensions/vm/log2.cpp index 7c3ecb0731d7..752caa261977 100644 --- 
a/dpnp/backend/extensions/vm/log2.cpp +++ b/dpnp/backend/extensions/vm/log2.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/modf.cpp b/dpnp/backend/extensions/vm/modf.cpp index 283cfadb9b78..418e4e44f7f7 100644 --- a/dpnp/backend/extensions/vm/modf.cpp +++ b/dpnp/backend/extensions/vm/modf.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/mul.cpp b/dpnp/backend/extensions/vm/mul.cpp index a689e88ae0e1..557cfb8882b3 100644 --- a/dpnp/backend/extensions/vm/mul.cpp +++ b/dpnp/backend/extensions/vm/mul.cpp @@ -36,6 +36,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/nextafter.cpp b/dpnp/backend/extensions/vm/nextafter.cpp index 03b19529fc72..a8ff710bda77 100644 --- a/dpnp/backend/extensions/vm/nextafter.cpp +++ b/dpnp/backend/extensions/vm/nextafter.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/pow.cpp b/dpnp/backend/extensions/vm/pow.cpp index 1d8e8fe8afca..f0db87d1ef48 100644 --- a/dpnp/backend/extensions/vm/pow.cpp +++ b/dpnp/backend/extensions/vm/pow.cpp @@ -36,6 +36,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/rint.cpp b/dpnp/backend/extensions/vm/rint.cpp index f3d37b92a59a..86931f259a04 100644 --- a/dpnp/backend/extensions/vm/rint.cpp +++ b/dpnp/backend/extensions/vm/rint.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/sin.cpp b/dpnp/backend/extensions/vm/sin.cpp index 39258ceb60b9..7bb6ec321d2a 100644 --- a/dpnp/backend/extensions/vm/sin.cpp +++ b/dpnp/backend/extensions/vm/sin.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/sinh.cpp b/dpnp/backend/extensions/vm/sinh.cpp index 5aa5a31a8f84..5c351afd3b82 100644 --- a/dpnp/backend/extensions/vm/sinh.cpp +++ b/dpnp/backend/extensions/vm/sinh.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/sqr.cpp b/dpnp/backend/extensions/vm/sqr.cpp index bf008a68a68f..9d5cb8af5f2c 100644 --- a/dpnp/backend/extensions/vm/sqr.cpp +++ b/dpnp/backend/extensions/vm/sqr.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/sqrt.cpp b/dpnp/backend/extensions/vm/sqrt.cpp index 8bd26c0fe1a9..5ab3489c1288 100644 --- a/dpnp/backend/extensions/vm/sqrt.cpp +++ b/dpnp/backend/extensions/vm/sqrt.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/sub.cpp b/dpnp/backend/extensions/vm/sub.cpp index b0503754194f..401588d4b65f 100644 --- a/dpnp/backend/extensions/vm/sub.cpp +++ b/dpnp/backend/extensions/vm/sub.cpp @@ -36,6 +36,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/tan.cpp b/dpnp/backend/extensions/vm/tan.cpp index 9fe4cb64d41c..590320034934 100644 --- a/dpnp/backend/extensions/vm/tan.cpp 
+++ b/dpnp/backend/extensions/vm/tan.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/tanh.cpp b/dpnp/backend/extensions/vm/tanh.cpp index 70f4ef6142d5..8febd94f2ec8 100644 --- a/dpnp/backend/extensions/vm/tanh.cpp +++ b/dpnp/backend/extensions/vm/tanh.cpp @@ -35,6 +35,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/trunc.cpp b/dpnp/backend/extensions/vm/trunc.cpp index c6cc4f9e8265..4ec788ccf949 100644 --- a/dpnp/backend/extensions/vm/trunc.cpp +++ b/dpnp/backend/extensions/vm/trunc.cpp @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "dpnp4pybind11.hpp" #include "common.hpp" From 617f6e1b8d84e968455b48cceea3c0ed15a014be Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 26 Jan 2026 05:29:21 -0800 Subject: [PATCH 16/18] Extend isort configuration to add known third party lib as dpctl --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6394cf118dcf..e88c44053dc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,8 +134,10 @@ source = [ ensure_newline_before_comments = true force_grid_wrap = 0 include_trailing_comma = true +known_third_party = ["dpctl"] line_length = 80 multi_line_output = 3 +profile = "black" skip = ["dpnp/__init__.py"] split_on_trailing_comma = true use_parentheses = true From 2d3f41d566518ee9f83983167a083ff111969e15 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 26 Jan 2026 05:44:20 -0800 Subject: [PATCH 17/18] =?UTF-8?q?Disable=20pylint=E2=80=99s=20import-order?= =?UTF-8?q?=20checks=20and=20fully=20rely=20on=20isort?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e88c44053dc0..2f6fcebc0e4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -143,6 +143,11 @@ split_on_trailing_comma = true use_parentheses = true [tool.pylint.basic] +disable = [ + "wrong-import-order", + "ungrouped-imports", + "wrong-import-position" +] include-naming-hint = true [tool.pylint.classes] From e2eb3cc6c061a6147d32f2a69599ca23c2d5628c Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 26 Jan 2026 05:53:44 -0800 Subject: [PATCH 18/18] Update pylint configuration to skip checking dpctl toplevel package and all dpctl submodules --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2f6fcebc0e4d..cdf592535d11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -148,6 +148,7 @@ disable = [ "ungrouped-imports", "wrong-import-position" ] +ignored-modules = ["dpctl", "dpctl.*"] include-naming-hint = true [tool.pylint.classes]
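
Note on PATCH 15: the commit adds one extra pybind11 include to every translation unit that hands a std::vector back to Python. Assuming the header in question is <pybind11/stl.h> (the pybind11 header that registers type casters for STL containers), the minimal sketch below illustrates the effect; the module and function names are hypothetical and do not appear in the patches above.

    // Minimal sketch, assuming <pybind11/stl.h> is the include added by PATCH 15.
    // Without it, pybind11 has no type caster for std::vector<T>, so a bound
    // function returning one cannot hand its result to Python; with it, the
    // vector is copied into a Python list at the call boundary.
    #include <vector>

    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h> // std::vector <-> Python list conversion

    static std::vector<int> make_shape()
    {
        return {2, 3, 4};
    }

    PYBIND11_MODULE(stl_example, m)
    {
        // Returns [2, 3, 4] as a Python list thanks to the stl.h caster.
        m.def("make_shape", &make_shape);
    }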