diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 6f891cf62b..e476e3a18c 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -632,63 +632,50 @@ namespace { template __declspec(noalias) void __cdecl _Reverse_impl(void* _First, void* _Last) noexcept { - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) { - const void* _Stop_at = _First; - constexpr size_t _Mask_32 = ~((static_cast(1) << 5) - 1); - _Advance_bytes(_Stop_at, (_Length >> 1) & _Mask_32); - do { - _Advance_bytes(_Last, -32); - - const uint8x16_t _Left1 = vld1q_u8(static_cast(_First) + 0); - const uint8x16_t _Left2 = vld1q_u8(static_cast(_First) + 16); - const uint8x16_t _Right1 = vld1q_u8(static_cast(_Last) + 0); - const uint8x16_t _Right2 = vld1q_u8(static_cast(_Last) + 16); - - const uint8x16_t _Left1_reversed = _Traits::_Rev(_Left1); - const uint8x16_t _Left2_reversed = _Traits::_Rev(_Left2); - const uint8x16_t _Right1_reversed = _Traits::_Rev(_Right1); - const uint8x16_t _Right2_reversed = _Traits::_Rev(_Right2); - vst1q_u8(static_cast(_First) + 0, _Right2_reversed); - vst1q_u8(static_cast(_First) + 16, _Right1_reversed); - vst1q_u8(static_cast(_Last) + 0, _Left2_reversed); - vst1q_u8(static_cast(_Last) + 16, _Left1_reversed); + // TEMP: Test overhead of feature checking. + if (_Use_FEAT_SVE()) { + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) { + const void* _Stop_at = _First; + constexpr size_t _Mask_32 = ~((static_cast(1) << 5) - 1); + _Advance_bytes(_Stop_at, (_Length >> 1) & _Mask_32); + do { + _Advance_bytes(_Last, -32); - _Advance_bytes(_First, 32); - } while (_First != _Stop_at); - } + const uint8x16_t _Left1 = vld1q_u8(static_cast(_First) + 0); + const uint8x16_t _Left2 = vld1q_u8(static_cast(_First) + 16); + const uint8x16_t _Right1 = vld1q_u8(static_cast(_Last) + 0); + const uint8x16_t _Right2 = vld1q_u8(static_cast(_Last) + 16); - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) { - _Advance_bytes(_Last, -16); - const uint8x16_t _Left = vld1q_u8(static_cast(_First)); - const uint8x16_t _Right = vld1q_u8(static_cast(_Last)); + const uint8x16_t _Left1_reversed = _Traits::_Rev(_Left1); + const uint8x16_t _Left2_reversed = _Traits::_Rev(_Left2); + const uint8x16_t _Right1_reversed = _Traits::_Rev(_Right1); + const uint8x16_t _Right2_reversed = _Traits::_Rev(_Right2); - const uint8x16_t _Left_reversed = _Traits::_Rev(_Left); - const uint8x16_t _Right_reversed = _Traits::_Rev(_Right); + vst1q_u8(static_cast(_First) + 0, _Right2_reversed); + vst1q_u8(static_cast(_First) + 16, _Right1_reversed); + vst1q_u8(static_cast(_Last) + 0, _Left2_reversed); + vst1q_u8(static_cast(_Last) + 16, _Left1_reversed); - vst1q_u8(static_cast(_First), _Right_reversed); - vst1q_u8(static_cast(_Last), _Left_reversed); - _Advance_bytes(_First, 16); - } + _Advance_bytes(_First, 32); + } while (_First != _Stop_at); + } - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) { - _Advance_bytes(_Last, -8); - const uint8x8_t _Left = vld1_u8(static_cast(_First)); - const uint8x8_t _Right = vld1_u8(static_cast(_Last)); + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) { + _Advance_bytes(_Last, -16); + const uint8x16_t _Left = vld1q_u8(static_cast(_First)); + const uint8x16_t _Right = vld1q_u8(static_cast(_Last)); - const uint8x8_t _Left_reversed = _Traits::_Rev(_Left); - const uint8x8_t _Right_reversed = _Traits::_Rev(_Right); + const uint8x16_t _Left_reversed = _Traits::_Rev(_Left); + const uint8x16_t _Right_reversed = _Traits::_Rev(_Right); - vst1_u8(static_cast(_First), _Right_reversed); - vst1_u8(static_cast(_Last), _Left_reversed); - _Advance_bytes(_First, 8); - } + vst1q_u8(static_cast(_First), _Right_reversed); + vst1q_u8(static_cast(_Last), _Left_reversed); + _Advance_bytes(_First, 16); + } - if constexpr (sizeof(_Ty) < 8) { - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) { + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) { _Advance_bytes(_Last, -8); - - // Intentional overlapped loads/stores: read both sides first, then write. const uint8x8_t _Left = vld1_u8(static_cast(_First)); const uint8x8_t _Right = vld1_u8(static_cast(_Last)); @@ -697,13 +684,32 @@ namespace { vst1_u8(static_cast(_First), _Right_reversed); vst1_u8(static_cast(_Last), _Left_reversed); + _Advance_bytes(_First, 8); + } - // Overlapped stores cover any 8-15B remainder, so do not fall through to scalar tail. - return; + if constexpr (sizeof(_Ty) < 8) { + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) { + _Advance_bytes(_Last, -8); + + // Intentional overlapped loads/stores: read both sides first, then write. + const uint8x8_t _Left = vld1_u8(static_cast(_First)); + const uint8x8_t _Right = vld1_u8(static_cast(_Last)); + + const uint8x8_t _Left_reversed = _Traits::_Rev(_Left); + const uint8x8_t _Right_reversed = _Traits::_Rev(_Right); + + vst1_u8(static_cast(_First), _Right_reversed); + vst1_u8(static_cast(_Last), _Left_reversed); + + // Overlapped stores cover any 8-15B remainder, so do not fall through to scalar tail. + return; + } } - } - if constexpr (sizeof(_Ty) < 4) { + if constexpr (sizeof(_Ty) < 4) { + _Reverse_tail(static_cast<_Ty*>(_First), static_cast<_Ty*>(_Last)); + } + } else { _Reverse_tail(static_cast<_Ty*>(_First), static_cast<_Ty*>(_Last)); } } @@ -711,60 +717,66 @@ namespace { template __declspec(noalias) void __cdecl _Reverse_copy_impl( const void* _First, const void* _Last, void* _Dest) noexcept { - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) { - const void* _Stop_at = _Dest; - constexpr size_t _Mask_64 = ~((static_cast(1) << 6) - 1); - _Advance_bytes(_Stop_at, _Length & _Mask_64); - do { - _Advance_bytes(_Last, -64); + // TEMP: Test overhead of feature checking. + if (_Use_FEAT_SVE()) { + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) { + const void* _Stop_at = _Dest; + constexpr size_t _Mask_64 = ~((static_cast(1) << 6) - 1); + _Advance_bytes(_Stop_at, _Length & _Mask_64); + do { + _Advance_bytes(_Last, -64); + const uint8x16_t _Block1 = vld1q_u8(static_cast(_Last) + 0); + const uint8x16_t _Block2 = vld1q_u8(static_cast(_Last) + 16); + const uint8x16_t _Block3 = vld1q_u8(static_cast(_Last) + 32); + const uint8x16_t _Block4 = vld1q_u8(static_cast(_Last) + 48); + + const uint8x16_t _Block1_reversed = _Traits::_Rev(_Block1); + const uint8x16_t _Block2_reversed = _Traits::_Rev(_Block2); + const uint8x16_t _Block3_reversed = _Traits::_Rev(_Block3); + const uint8x16_t _Block4_reversed = _Traits::_Rev(_Block4); + + vst1q_u8(static_cast(_Dest) + 0, _Block4_reversed); + vst1q_u8(static_cast(_Dest) + 16, _Block3_reversed); + vst1q_u8(static_cast(_Dest) + 32, _Block2_reversed); + vst1q_u8(static_cast(_Dest) + 48, _Block1_reversed); + _Advance_bytes(_Dest, 64); + } while (_Dest != _Stop_at); + } + + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) { + _Advance_bytes(_Last, -32); const uint8x16_t _Block1 = vld1q_u8(static_cast(_Last) + 0); const uint8x16_t _Block2 = vld1q_u8(static_cast(_Last) + 16); - const uint8x16_t _Block3 = vld1q_u8(static_cast(_Last) + 32); - const uint8x16_t _Block4 = vld1q_u8(static_cast(_Last) + 48); const uint8x16_t _Block1_reversed = _Traits::_Rev(_Block1); const uint8x16_t _Block2_reversed = _Traits::_Rev(_Block2); - const uint8x16_t _Block3_reversed = _Traits::_Rev(_Block3); - const uint8x16_t _Block4_reversed = _Traits::_Rev(_Block4); - - vst1q_u8(static_cast(_Dest) + 0, _Block4_reversed); - vst1q_u8(static_cast(_Dest) + 16, _Block3_reversed); - vst1q_u8(static_cast(_Dest) + 32, _Block2_reversed); - vst1q_u8(static_cast(_Dest) + 48, _Block1_reversed); - _Advance_bytes(_Dest, 64); - } while (_Dest != _Stop_at); - } - - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) { - _Advance_bytes(_Last, -32); - const uint8x16_t _Block1 = vld1q_u8(static_cast(_Last) + 0); - const uint8x16_t _Block2 = vld1q_u8(static_cast(_Last) + 16); - - const uint8x16_t _Block1_reversed = _Traits::_Rev(_Block1); - const uint8x16_t _Block2_reversed = _Traits::_Rev(_Block2); - vst1q_u8(static_cast(_Dest) + 0, _Block2_reversed); - vst1q_u8(static_cast(_Dest) + 16, _Block1_reversed); - _Advance_bytes(_Dest, 32); - } + vst1q_u8(static_cast(_Dest) + 0, _Block2_reversed); + vst1q_u8(static_cast(_Dest) + 16, _Block1_reversed); + _Advance_bytes(_Dest, 32); + } - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) { - _Advance_bytes(_Last, -16); - const uint8x16_t _Block = vld1q_u8(static_cast(_Last)); - const uint8x16_t _Block_reversed = _Traits::_Rev(_Block); - vst1q_u8(static_cast(_Dest), _Block_reversed); - _Advance_bytes(_Dest, 16); - } + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) { + _Advance_bytes(_Last, -16); + const uint8x16_t _Block = vld1q_u8(static_cast(_Last)); + const uint8x16_t _Block_reversed = _Traits::_Rev(_Block); + vst1q_u8(static_cast(_Dest), _Block_reversed); + _Advance_bytes(_Dest, 16); + } - if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) { - _Advance_bytes(_Last, -8); - const uint8x8_t _Block = vld1_u8(static_cast(_Last)); - const uint8x8_t _Block_reversed = _Traits::_Rev(_Block); - vst1_u8(static_cast(_Dest), _Block_reversed); - _Advance_bytes(_Dest, 8); - } + if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) { + _Advance_bytes(_Last, -8); + const uint8x8_t _Block = vld1_u8(static_cast(_Last)); + const uint8x8_t _Block_reversed = _Traits::_Rev(_Block); + vst1_u8(static_cast(_Dest), _Block_reversed); + _Advance_bytes(_Dest, 8); + } - if constexpr (sizeof(_Ty) < 8) { + if constexpr (sizeof(_Ty) < 8) { + _Reverse_copy_tail( + static_cast(_First), static_cast(_Last), static_cast<_Ty*>(_Dest)); + } + } else { _Reverse_copy_tail( static_cast(_First), static_cast(_Last), static_cast<_Ty*>(_Dest)); }