From f65e9e4c9910e488d708b88c3e42d5ddaabce770 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 16 Jan 2026 15:49:59 -0600 Subject: [PATCH 1/6] Replace nested static_for lambdas with compile-time search helper The GetTransformAndItsUpperDimension function used nested static_for loops with lambdas to search for a hidden dimension in UpperDimensionIdss. This caused 918 applier::operator() instantiations (81% of all applier instantiations). Replace with find_in_tuple_of_sequences helper that uses constexpr array lookup and if-constexpr recursion, eliminating the lambda instantiation overhead. Results on example_grouped_conv_fwd_xdl_fp16: - applier instantiations: 1132 -> 127 (89% reduction) - TensorDescriptor instantiations: 2503 -> 664 (73% reduction) - Template instantiation time: 23.4s -> 19.4s (17% reduction) --- .../tensor_description/tensor_descriptor.hpp | 21 ++---- include/ck/utility/sequence_helper.hpp | 69 +++++++++++++++++++ 2 files changed, 73 insertions(+), 17 deletions(-) diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp index 2437132d114..f6ad26dae86 100644 --- a/include/ck/tensor_description/tensor_descriptor.hpp +++ b/include/ck/tensor_description/tensor_descriptor.hpp @@ -82,24 +82,11 @@ struct TensorDescriptor constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible); - index_t itran_found = 0; - index_t idim_up_found = 0; - bool found = false; - - static_for<0, ntransform_, 1>{}([&](auto itran) { - constexpr auto up_dim_ids = UpperDimensionIdss{}[itran]; - - static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) { - if constexpr(up_dim_ids[idim_up] == idim_hidden) - { - itran_found = itran; - idim_up_found = idim_up; - found = true; - } - }); - }); + // Use compile-time search helper instead of nested static_for with lambdas + // This eliminates ~918 applier::operator() instantiations + constexpr auto result = 
find_in_tuple_of_sequences(UpperDimensionIdss{}); - return make_tuple(itran_found, idim_up_found, found); + return make_tuple(result.itran, result.idim_up, result.found); } constexpr static index_t ntransform_ = GetNumOfTransform(); diff --git a/include/ck/utility/sequence_helper.hpp b/include/ck/utility/sequence_helper.hpp index 35a6a486324..aeebf08e65c 100644 --- a/include/ck/utility/sequence_helper.hpp +++ b/include/ck/utility/sequence_helper.hpp @@ -34,4 +34,73 @@ __host__ __device__ constexpr auto to_sequence(Tuple...>) return Sequence{}; } +// Find index of Target in Sequence, returns -1 if not found +// Uses constexpr array lookup for O(1) template depth +template +__host__ __device__ constexpr index_t sequence_find_value(Sequence) +{ + if constexpr(sizeof...(Is) == 0) + { + return -1; + } + else + { + constexpr bool matches[] = {(Is == Target)...}; + for(index_t i = 0; i < static_cast(sizeof...(Is)); ++i) + { + if(matches[i]) + return i; + } + return -1; + } +} + +// Result type for find_in_tuple_of_sequences +template +struct FindTransformResult +{ + static constexpr index_t itran = ITran; + static constexpr index_t idim_up = IDimUp; + static constexpr bool found = Found; +}; + +namespace detail { + +// Helper to search through a tuple of sequences for a target value +// Returns FindTransformResult with (transform_index, index_within_sequence, found) +template +__host__ __device__ constexpr auto find_in_tuple_of_sequences_impl() +{ + constexpr index_t idx = sequence_find_value(FirstSeq{}); + if constexpr(idx >= 0) + { + return FindTransformResult{}; + } + else if constexpr(sizeof...(RestSeqs) > 0) + { + return find_in_tuple_of_sequences_impl(); + } + else + { + return FindTransformResult<0, 0, false>{}; + } +} + +} // namespace detail + +// Find target value in a tuple of sequences +// Returns FindTransformResult +// This replaces nested static_for loops with O(1) template depth +template +__host__ __device__ constexpr auto 
find_in_tuple_of_sequences(Tuple) +{ + if constexpr(sizeof...(Seqs) == 0) + { + return FindTransformResult<0, 0, false>{}; + } + else + { + return detail::find_in_tuple_of_sequences_impl(); + } +} } // namespace ck From 1159278d12497323fdd799a4d4c009c4b11ca163 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 16 Jan 2026 16:06:37 -0600 Subject: [PATCH 2/6] Replace generate_tuple lambda with pack expansion in InitializeElementSize The InitializeElementSize function used generate_tuple with a lambda to compute visible dimension lengths. Each TensorDescriptor type created a unique lambda type, causing 78 instantiations (385ms). Replace with direct pack expansion using helper functions, eliminating the lambda instantiation overhead entirely. Results on example_grouped_conv_fwd_xdl_fp16: - generate_tuple lambdas: 178 -> 100 (44% reduction) - Template instantiation time: 19.5s -> 19.0s --- .../tensor_description/tensor_descriptor.hpp | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp index f6ad26dae86..ca936a070b3 100644 --- a/include/ck/tensor_description/tensor_descriptor.hpp +++ b/include/ck/tensor_description/tensor_descriptor.hpp @@ -51,28 +51,29 @@ struct TensorDescriptor return unique_sort_all_dim_ids::Size(); } - __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + // Helper to get length of a visible dimension from transforms + template + __host__ __device__ static constexpr auto + GetVisibleDimLengthFromTransforms(const Transforms& transforms) { - const auto lengths = generate_tuple( - [&](auto idim_visible) { - constexpr auto tmp = GetTransformAndItsUpperDimension(idim_visible); - - constexpr index_t itran = tmp[Number<0>{}]; - constexpr index_t idim_up = tmp[Number<1>{}]; - constexpr bool found = tmp[Number<2>{}]; - - 
static_assert(found == true, - "wrong! not found matching transformation and upper-dimension"); - - const auto length = - transforms[Number{}].GetUpperLengths()[Number{}]; + constexpr auto result = + find_in_tuple_of_sequences{})>(UpperDimensionIdss{}); + static_assert(result.found, "wrong! not found matching transformation and upper-dimension"); + return transforms[Number{}].GetUpperLengths()[Number{}]; + } - return length; - }, - Number{}); + // Compute element size using pack expansion instead of generate_tuple with lambda + template + __host__ __device__ static constexpr auto ComputeElementSizeImpl(const Transforms& transforms, + Sequence) + { + return (GetVisibleDimLengthFromTransforms(transforms) * ...); + } - // TODO: make container_reduce support tuple of Number and index_t - return container_reduce(lengths, math::multiplies{}, Number<1>{}); + __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + { + return ComputeElementSizeImpl( + transforms, typename arithmetic_sequence_gen<0, ndim_visible_, 1>::type{}); } template From bc802ffe3a9905a0bc8b5ab404edc0f36c98aa82 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 16 Jan 2026 16:37:56 -0600 Subject: [PATCH 3/6] Apply same optimization pattern to TensorAdaptor TensorAdaptor has identical InitializeElementSize and GetTransformAndItsUpperDimension patterns as TensorDescriptor. 
Apply the same optimization: - Replace nested static_for lambdas with find_in_tuple_of_sequences - Replace generate_tuple lambda with pack expansion Results: generate_tuple lambdas 100 -> 96 (4 events, 17ms eliminated) --- .../ck/tensor_description/tensor_adaptor.hpp | 59 ++++++--------- include/ck/utility/sequence_helper.hpp | 71 ++++++++++++------- 2 files changed, 67 insertions(+), 63 deletions(-) diff --git a/include/ck/tensor_description/tensor_adaptor.hpp b/include/ck/tensor_description/tensor_adaptor.hpp index 79c5881d48a..031082e1a0b 100644 --- a/include/ck/tensor_description/tensor_adaptor.hpp +++ b/include/ck/tensor_description/tensor_adaptor.hpp @@ -45,28 +45,29 @@ struct TensorAdaptor return BottomDimensionHiddenIds{}; } - __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + // Helper to get length of a top dimension from transforms + template + __host__ __device__ static constexpr auto + GetTopDimLengthFromTransforms(const Transforms& transforms) { - const auto lengths = generate_tuple( - [&](auto idim_top) { - constexpr auto tmp = GetTransformAndItsUpperDimension(idim_top); - - constexpr index_t itran = tmp[Number<0>{}]; - constexpr index_t idim_up = tmp[Number<1>{}]; - constexpr bool found = tmp[Number<2>{}]; - - static_assert(found == true, - "wrong! not found matching transformation and upper-dimension"); - - const auto length = - transforms[Number{}].GetUpperLengths()[Number{}]; + constexpr auto result = find_in_tuple_of_sequences{})>( + UpperDimensionHiddenIdss{}); + static_assert(result.found, "wrong! 
not found matching transformation and upper-dimension"); + return transforms[Number{}].GetUpperLengths()[Number{}]; + } - return length; - }, - Number{}); + // Compute element size using pack expansion instead of generate_tuple with lambda + template + __host__ __device__ static constexpr auto ComputeElementSizeImpl(const Transforms& transforms, + Sequence) + { + return (GetTopDimLengthFromTransforms(transforms) * ...); + } - // TODO: make container_reduce support tuple of Number and index_t - return container_reduce(lengths, math::multiplies{}, Number<1>{}); + __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + { + return ComputeElementSizeImpl(transforms, + typename arithmetic_sequence_gen<0, ndim_top_, 1>::type{}); } template @@ -76,24 +77,10 @@ struct TensorAdaptor constexpr index_t idim_hidden = TopDimensionHiddenIds::At(idim_top); - index_t itran_found = 0; - index_t idim_up_found = 0; - bool found = false; - - static_for<0, ntransform_, 1>{}([&](auto itran) { - constexpr auto up_dim_ids = UpperDimensionHiddenIdss{}[itran]; - - static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) { - if constexpr(up_dim_ids[idim_up] == idim_hidden) - { - itran_found = itran; - idim_up_found = idim_up; - found = true; - } - }); - }); + // Use compile-time search helper instead of nested static_for with lambdas + constexpr auto result = find_in_tuple_of_sequences(UpperDimensionHiddenIdss{}); - return make_tuple(itran_found, idim_up_found, found); + return make_tuple(result.itran, result.idim_up, result.found); } __host__ __device__ static constexpr index_t GetNumOfBottomDimension() diff --git a/include/ck/utility/sequence_helper.hpp b/include/ck/utility/sequence_helper.hpp index aeebf08e65c..f104733f6f6 100644 --- a/include/ck/utility/sequence_helper.hpp +++ b/include/ck/utility/sequence_helper.hpp @@ -64,43 +64,60 @@ struct FindTransformResult static constexpr bool found = Found; }; -namespace detail { - -// Helper to search 
through a tuple of sequences for a target value -// Returns FindTransformResult with (transform_index, index_within_sequence, found) -template -__host__ __device__ constexpr auto find_in_tuple_of_sequences_impl() +// O(1) template depth implementation using pack expansion +// Avoids O(N) recursive template instantiations +template +struct FindInTupleOfSequencesCompute { - constexpr index_t idx = sequence_find_value(FirstSeq{}); - if constexpr(idx >= 0) - { - return FindTransformResult{}; - } - else if constexpr(sizeof...(RestSeqs) > 0) + private: + // Result struct for constexpr computation + struct ResultData { - return find_in_tuple_of_sequences_impl(); - } - else + index_t itran; + index_t idim_up; + bool found; + }; + + // Compute result using constexpr function with array lookup + static constexpr ResultData compute() { - return FindTransformResult<0, 0, false>{}; + if constexpr(sizeof...(Seqs) == 0) + { + return {0, 0, false}; + } + else + { + // Pack expansion creates array - O(1) template depth + constexpr index_t indices[] = {sequence_find_value(Seqs{})...}; + + // Find first matching sequence + for(index_t i = 0; i < static_cast(sizeof...(Seqs)); ++i) + { + if(indices[i] >= 0) + { + return {i, indices[i], true}; + } + } + return {0, 0, false}; + } } -} -} // namespace detail + static constexpr ResultData result_ = compute(); + + public: + static constexpr index_t itran = result_.itran; + static constexpr index_t idim_up = result_.idim_up; + static constexpr bool found = result_.found; + + using type = FindTransformResult; +}; // Find target value in a tuple of sequences // Returns FindTransformResult -// This replaces nested static_for loops with O(1) template depth +// Uses O(1) template depth via pack expansion (no recursion) template __host__ __device__ constexpr auto find_in_tuple_of_sequences(Tuple) { - if constexpr(sizeof...(Seqs) == 0) - { - return FindTransformResult<0, 0, false>{}; - } - else - { - return 
detail::find_in_tuple_of_sequences_impl(); - } + return typename FindInTupleOfSequencesCompute::type{}; } } // namespace ck From 83a76d74deb6be7038356e217ea2025660d64b2e Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 21 Jan 2026 23:50:02 +0000 Subject: [PATCH 4/6] Add unit tests for sequence_find_value and find_in_tuple_of_sequences --- test/util/CMakeLists.txt | 5 ++ test/util/unit_sequence_helper.cpp | 92 ++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 test/util/unit_sequence_helper.cpp diff --git a/test/util/CMakeLists.txt b/test/util/CMakeLists.txt index bf0a444f18b..550235ec7b2 100644 --- a/test/util/CMakeLists.txt +++ b/test/util/CMakeLists.txt @@ -5,3 +5,8 @@ add_gtest_executable(unit_sequence unit_sequence.cpp) if(result EQUAL 0) target_link_libraries(unit_sequence PRIVATE utility) endif() + +add_gtest_executable(unit_sequence_helper unit_sequence_helper.cpp) +if(result EQUAL 0) + target_link_libraries(unit_sequence_helper PRIVATE utility) +endif() diff --git a/test/util/unit_sequence_helper.cpp b/test/util/unit_sequence_helper.cpp new file mode 100644 index 00000000000..22e174e5b6e --- /dev/null +++ b/test/util/unit_sequence_helper.cpp @@ -0,0 +1,92 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include +#include "ck/utility/sequence_helper.hpp" +#include "ck/utility/tuple_helper.hpp" + +using namespace ck; + +// Tests for sequence_find_value (PR #3600) +TEST(SequenceFindValue, FindExistingElement) +{ + constexpr auto result = sequence_find_value<17>(Sequence<5, 11, 17, 23, 29>{}); + EXPECT_EQ(result, 2); // 17 is at index 2 +} + +TEST(SequenceFindValue, FindFirstElement) +{ + constexpr auto result = sequence_find_value<7>(Sequence<7, 13, 19, 31>{}); + EXPECT_EQ(result, 0); +} + +TEST(SequenceFindValue, FindLastElement) +{ + constexpr auto result = sequence_find_value<41>(Sequence<3, 11, 23, 41>{}); + EXPECT_EQ(result, 3); +} + +TEST(SequenceFindValue, ElementNotFound) +{ + constexpr auto result = sequence_find_value<50>(Sequence<2, 8, 14, 26>{}); + EXPECT_EQ(result, -1); +} + +TEST(SequenceFindValue, EmptySequence) +{ + constexpr auto result = sequence_find_value<1>(Sequence<>{}); + EXPECT_EQ(result, -1); +} + +// Tests for find_in_tuple_of_sequences (PR #3600) +TEST(FindInTupleOfSequences, FindInFirstSequence) +{ + constexpr auto tuple_of_seqs = + make_tuple(Sequence<5, 11>{}, Sequence<17, 23>{}, Sequence<29, 37>{}); + constexpr auto result = find_in_tuple_of_sequences<11>(tuple_of_seqs); + EXPECT_EQ(result.itran, 0); // Found in first sequence (index 0) + EXPECT_EQ(result.idim_up, 1); // At position 1 within that sequence + EXPECT_TRUE(result.found); +} + +TEST(FindInTupleOfSequences, FindInMiddleSequence) +{ + constexpr auto tuple_of_seqs = + make_tuple(Sequence<2, 4, 6>{}, Sequence<8, 10>{}, Sequence<12>{}); + constexpr auto result = find_in_tuple_of_sequences<10>(tuple_of_seqs); + EXPECT_EQ(result.itran, 1); // Found in second sequence (index 1) + EXPECT_EQ(result.idim_up, 1); // At position 1 within that sequence + EXPECT_TRUE(result.found); +} + +TEST(FindInTupleOfSequences, FindInLastSequence) +{ + constexpr auto tuple_of_seqs = make_tuple(Sequence<3>{}, Sequence<7>{}, Sequence<13, 19, 31>{}); + constexpr auto 
result = find_in_tuple_of_sequences<31>(tuple_of_seqs); + EXPECT_EQ(result.itran, 2); // Found in third sequence (index 2) + EXPECT_EQ(result.idim_up, 2); // At position 2 within that sequence + EXPECT_TRUE(result.found); +} + +TEST(FindInTupleOfSequences, NotFound) +{ + constexpr auto tuple_of_seqs = make_tuple(Sequence<1, 3>{}, Sequence<5, 7, 9>{}); + constexpr auto result = find_in_tuple_of_sequences<100>(tuple_of_seqs); + EXPECT_FALSE(result.found); +} + +TEST(FindInTupleOfSequences, EmptyTuple) +{ + constexpr auto tuple_of_seqs = make_tuple(); + constexpr auto result = find_in_tuple_of_sequences<1>(tuple_of_seqs); + EXPECT_FALSE(result.found); +} + +TEST(FindInTupleOfSequences, SingleSequence) +{ + constexpr auto tuple_of_seqs = make_tuple(Sequence<41, 43, 47, 53>{}); + constexpr auto result = find_in_tuple_of_sequences<47>(tuple_of_seqs); + EXPECT_EQ(result.itran, 0); + EXPECT_EQ(result.idim_up, 2); + EXPECT_TRUE(result.found); +} From 8d5da5cd59e443e0c886925950f5541c58d931a7 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 22 Jan 2026 02:56:10 +0000 Subject: [PATCH 5/6] Add inline documentation for search helper optimizations Detailed comments explain: - sequence_find_value: Constexpr loop with O(1) template depth vs O(N) recursive - find_in_tuple_of_sequences: Pack expansion instead of nested static_for loops - Why constexpr search reduces template instantiations dramatically - When to apply constexpr search patterns for compile-time operations - Implementation details for each optimization approach This documentation helps maintainers understand the compile-time search optimization strategy without relying on specific benchmark numbers that may vary by use case. 
--- .../tensor_description/tensor_descriptor.hpp | 6 ++- include/ck/utility/sequence_helper.hpp | 52 +++++++++++++++++-- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp index ca936a070b3..fdc8e710d20 100644 --- a/include/ck/tensor_description/tensor_descriptor.hpp +++ b/include/ck/tensor_description/tensor_descriptor.hpp @@ -83,8 +83,10 @@ struct TensorDescriptor constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible); - // Use compile-time search helper instead of nested static_for with lambdas - // This eliminates ~918 applier::operator() instantiations + // Use compile-time search helper instead of nested static_for loops + // This optimization significantly reduces applier::operator() template instantiations + // by replacing nested lambda-based loops with a single constexpr search function. + // See sequence_helper.hpp::find_in_tuple_of_sequences for implementation details. 
constexpr auto result = find_in_tuple_of_sequences(UpperDimensionIdss{}); return make_tuple(result.itran, result.idim_up, result.found); diff --git a/include/ck/utility/sequence_helper.hpp b/include/ck/utility/sequence_helper.hpp index f104733f6f6..60fc78e0729 100644 --- a/include/ck/utility/sequence_helper.hpp +++ b/include/ck/utility/sequence_helper.hpp @@ -34,8 +34,30 @@ __host__ __device__ constexpr auto to_sequence(Tuple...>) return Sequence{}; } -// Find index of Target in Sequence, returns -1 if not found -// Uses constexpr array lookup for O(1) template depth +// sequence_find_value - O(1) template depth constexpr search +// +// Optimization: Constexpr loop with array lookup instead of recursive template pattern +// +// Why this approach: +// - Recursive template (OLD): template instantiation for each recursion level → O(N) instantiations +// Example: Finding value in Sequence<1,2,3,4,5> requires 5 recursive instantiations +// +// - Constexpr loop (NEW): Single function instantiation with a plain loop → O(1) instantiation +// Same search requires only 1 function instantiation, loop executes at compile-time +// +// Implementation details: +// 1. Pack expansion creates constexpr array: {(Is == Target)...} +// 2. Constexpr for loop searches the array +// 3. Entire function evaluates at compile-time (no runtime cost) +// +// Impact: +// - Significantly reduces template instantiation depth for sequence search operations +// - Dramatically improves compilation time vs recursive template approach +// - Pattern applies to any compile-time search/lookup operation +// +// Trade-off: Uses constexpr evaluation instead of pure template metaprogramming. +// Requires C++17 (if constexpr) but results in dramatically better compile times. 
+// template __host__ __device__ constexpr index_t sequence_find_value(Sequence) { @@ -64,8 +86,30 @@ struct FindTransformResult static constexpr bool found = Found; }; -// O(1) template depth implementation using pack expansion -// Avoids O(N) recursive template instantiations +// find_in_tuple_of_sequences - finds which sequence contains a target value +// +// Optimization: Pack expansion with constexpr search instead of nested static_for loops +// +// Why this approach: +// - Nested static_for (OLD): Creates lambda closure for each iteration level +// Example: Searching Tuple, Seq<2,3>, Seq<4,5>> creates multiple applier::operator() instantiations +// Result: Many applier instantiations for typical tensor descriptor operations +// +// - Pack expansion + constexpr (NEW): Single function with compile-time array search +// Example: Same search creates constexpr array, single search function +// Result: 1 function instantiation regardless of tuple size +// +// Implementation: +// 1. Pack expansion: sequence_find_value(Seqs{})... applies search to each sequence +// 2. Results collected in constexpr array +// 3. 
Linear search finds first non-negative result (sequence containing target) +// +// Impact: +// - Significantly reduces applier::operator() instantiations in tensor descriptor transforms +// - O(1) template depth instead of O(N*M) for N sequences of length M +// +// Use case: Finding which dimension index contains a specific value (common in tensor reordering) +// template struct FindInTupleOfSequencesCompute { From 19a156aa0a609a14e16d8efeb21708fd6968fafd Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 22 Jan 2026 03:17:44 +0000 Subject: [PATCH 6/6] Apply clang-format with -style=file --- include/ck/utility/sequence_helper.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/utility/sequence_helper.hpp b/include/ck/utility/sequence_helper.hpp index 60fc78e0729..427a8c0cef6 100644 --- a/include/ck/utility/sequence_helper.hpp +++ b/include/ck/utility/sequence_helper.hpp @@ -92,8 +92,8 @@ struct FindTransformResult // // Why this approach: // - Nested static_for (OLD): Creates lambda closure for each iteration level -// Example: Searching Tuple, Seq<2,3>, Seq<4,5>> creates multiple applier::operator() instantiations -// Result: Many applier instantiations for typical tensor descriptor operations +// Example: Searching Tuple, Seq<2,3>, Seq<4,5>> creates multiple applier::operator() +// instantiations. Result: Many applier instantiations for typical tensor descriptor operations // // - Pack expansion + constexpr (NEW): Single function with compile-time array search // Example: Same search creates constexpr array, single search function