Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 108 additions & 96 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -632,63 +632,50 @@ namespace {

template <class _Traits, class _Ty>
__declspec(noalias) void __cdecl _Reverse_impl(void* _First, void* _Last) noexcept {
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) {
const void* _Stop_at = _First;
constexpr size_t _Mask_32 = ~((static_cast<size_t>(1) << 5) - 1);
_Advance_bytes(_Stop_at, (_Length >> 1) & _Mask_32);
do {
_Advance_bytes(_Last, -32);

const uint8x16_t _Left1 = vld1q_u8(static_cast<uint8_t*>(_First) + 0);
const uint8x16_t _Left2 = vld1q_u8(static_cast<uint8_t*>(_First) + 16);
const uint8x16_t _Right1 = vld1q_u8(static_cast<uint8_t*>(_Last) + 0);
const uint8x16_t _Right2 = vld1q_u8(static_cast<uint8_t*>(_Last) + 16);

const uint8x16_t _Left1_reversed = _Traits::_Rev(_Left1);
const uint8x16_t _Left2_reversed = _Traits::_Rev(_Left2);
const uint8x16_t _Right1_reversed = _Traits::_Rev(_Right1);
const uint8x16_t _Right2_reversed = _Traits::_Rev(_Right2);

vst1q_u8(static_cast<uint8_t*>(_First) + 0, _Right2_reversed);
vst1q_u8(static_cast<uint8_t*>(_First) + 16, _Right1_reversed);
vst1q_u8(static_cast<uint8_t*>(_Last) + 0, _Left2_reversed);
vst1q_u8(static_cast<uint8_t*>(_Last) + 16, _Left1_reversed);
// TEMP: Test overhead of feature checking.
if (_Use_FEAT_SVE()) {
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) {
const void* _Stop_at = _First;
constexpr size_t _Mask_32 = ~((static_cast<size_t>(1) << 5) - 1);
_Advance_bytes(_Stop_at, (_Length >> 1) & _Mask_32);
do {
_Advance_bytes(_Last, -32);

_Advance_bytes(_First, 32);
} while (_First != _Stop_at);
}
const uint8x16_t _Left1 = vld1q_u8(static_cast<uint8_t*>(_First) + 0);
const uint8x16_t _Left2 = vld1q_u8(static_cast<uint8_t*>(_First) + 16);
const uint8x16_t _Right1 = vld1q_u8(static_cast<uint8_t*>(_Last) + 0);
const uint8x16_t _Right2 = vld1q_u8(static_cast<uint8_t*>(_Last) + 16);

if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) {
_Advance_bytes(_Last, -16);
const uint8x16_t _Left = vld1q_u8(static_cast<uint8_t*>(_First));
const uint8x16_t _Right = vld1q_u8(static_cast<uint8_t*>(_Last));
const uint8x16_t _Left1_reversed = _Traits::_Rev(_Left1);
const uint8x16_t _Left2_reversed = _Traits::_Rev(_Left2);
const uint8x16_t _Right1_reversed = _Traits::_Rev(_Right1);
const uint8x16_t _Right2_reversed = _Traits::_Rev(_Right2);

const uint8x16_t _Left_reversed = _Traits::_Rev(_Left);
const uint8x16_t _Right_reversed = _Traits::_Rev(_Right);
vst1q_u8(static_cast<uint8_t*>(_First) + 0, _Right2_reversed);
vst1q_u8(static_cast<uint8_t*>(_First) + 16, _Right1_reversed);
vst1q_u8(static_cast<uint8_t*>(_Last) + 0, _Left2_reversed);
vst1q_u8(static_cast<uint8_t*>(_Last) + 16, _Left1_reversed);

vst1q_u8(static_cast<uint8_t*>(_First), _Right_reversed);
vst1q_u8(static_cast<uint8_t*>(_Last), _Left_reversed);
_Advance_bytes(_First, 16);
}
_Advance_bytes(_First, 32);
} while (_First != _Stop_at);
}

if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) {
_Advance_bytes(_Last, -8);
const uint8x8_t _Left = vld1_u8(static_cast<uint8_t*>(_First));
const uint8x8_t _Right = vld1_u8(static_cast<uint8_t*>(_Last));
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) {
_Advance_bytes(_Last, -16);
const uint8x16_t _Left = vld1q_u8(static_cast<uint8_t*>(_First));
const uint8x16_t _Right = vld1q_u8(static_cast<uint8_t*>(_Last));

const uint8x8_t _Left_reversed = _Traits::_Rev(_Left);
const uint8x8_t _Right_reversed = _Traits::_Rev(_Right);
const uint8x16_t _Left_reversed = _Traits::_Rev(_Left);
const uint8x16_t _Right_reversed = _Traits::_Rev(_Right);

vst1_u8(static_cast<uint8_t*>(_First), _Right_reversed);
vst1_u8(static_cast<uint8_t*>(_Last), _Left_reversed);
_Advance_bytes(_First, 8);
}
vst1q_u8(static_cast<uint8_t*>(_First), _Right_reversed);
vst1q_u8(static_cast<uint8_t*>(_Last), _Left_reversed);
_Advance_bytes(_First, 16);
}

if constexpr (sizeof(_Ty) < 8) {
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) {
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) {
_Advance_bytes(_Last, -8);

// Intentional overlapped loads/stores: read both sides first, then write.
const uint8x8_t _Left = vld1_u8(static_cast<uint8_t*>(_First));
const uint8x8_t _Right = vld1_u8(static_cast<uint8_t*>(_Last));

Expand All @@ -697,74 +684,99 @@ namespace {

vst1_u8(static_cast<uint8_t*>(_First), _Right_reversed);
vst1_u8(static_cast<uint8_t*>(_Last), _Left_reversed);
_Advance_bytes(_First, 8);
}

// Overlapped stores cover any 8-15B remainder, so do not fall through to scalar tail.
return;
if constexpr (sizeof(_Ty) < 8) {
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) {
_Advance_bytes(_Last, -8);

// Intentional overlapped loads/stores: read both sides first, then write.
const uint8x8_t _Left = vld1_u8(static_cast<uint8_t*>(_First));
const uint8x8_t _Right = vld1_u8(static_cast<uint8_t*>(_Last));

const uint8x8_t _Left_reversed = _Traits::_Rev(_Left);
const uint8x8_t _Right_reversed = _Traits::_Rev(_Right);

vst1_u8(static_cast<uint8_t*>(_First), _Right_reversed);
vst1_u8(static_cast<uint8_t*>(_Last), _Left_reversed);

// Overlapped stores cover any 8-15B remainder, so do not fall through to scalar tail.
return;
}
}
}

if constexpr (sizeof(_Ty) < 4) {
if constexpr (sizeof(_Ty) < 4) {
_Reverse_tail(static_cast<_Ty*>(_First), static_cast<_Ty*>(_Last));
}
} else {
_Reverse_tail(static_cast<_Ty*>(_First), static_cast<_Ty*>(_Last));
}
}

template <class _Traits, class _Ty>
__declspec(noalias) void __cdecl _Reverse_copy_impl(
const void* _First, const void* _Last, void* _Dest) noexcept {
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) {
const void* _Stop_at = _Dest;
constexpr size_t _Mask_64 = ~((static_cast<size_t>(1) << 6) - 1);
_Advance_bytes(_Stop_at, _Length & _Mask_64);
do {
_Advance_bytes(_Last, -64);
// TEMP: Test overhead of feature checking.
if (_Use_FEAT_SVE()) {
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 64) {
const void* _Stop_at = _Dest;
constexpr size_t _Mask_64 = ~((static_cast<size_t>(1) << 6) - 1);
_Advance_bytes(_Stop_at, _Length & _Mask_64);
do {
_Advance_bytes(_Last, -64);
const uint8x16_t _Block1 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 0);
const uint8x16_t _Block2 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 16);
const uint8x16_t _Block3 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 32);
const uint8x16_t _Block4 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 48);

const uint8x16_t _Block1_reversed = _Traits::_Rev(_Block1);
const uint8x16_t _Block2_reversed = _Traits::_Rev(_Block2);
const uint8x16_t _Block3_reversed = _Traits::_Rev(_Block3);
const uint8x16_t _Block4_reversed = _Traits::_Rev(_Block4);

vst1q_u8(static_cast<uint8_t*>(_Dest) + 0, _Block4_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 16, _Block3_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 32, _Block2_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 48, _Block1_reversed);
_Advance_bytes(_Dest, 64);
} while (_Dest != _Stop_at);
}

if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) {
_Advance_bytes(_Last, -32);
const uint8x16_t _Block1 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 0);
const uint8x16_t _Block2 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 16);
const uint8x16_t _Block3 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 32);
const uint8x16_t _Block4 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 48);

const uint8x16_t _Block1_reversed = _Traits::_Rev(_Block1);
const uint8x16_t _Block2_reversed = _Traits::_Rev(_Block2);
const uint8x16_t _Block3_reversed = _Traits::_Rev(_Block3);
const uint8x16_t _Block4_reversed = _Traits::_Rev(_Block4);

vst1q_u8(static_cast<uint8_t*>(_Dest) + 0, _Block4_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 16, _Block3_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 32, _Block2_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 48, _Block1_reversed);
_Advance_bytes(_Dest, 64);
} while (_Dest != _Stop_at);
}

if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 32) {
_Advance_bytes(_Last, -32);
const uint8x16_t _Block1 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 0);
const uint8x16_t _Block2 = vld1q_u8(static_cast<const uint8_t*>(_Last) + 16);

const uint8x16_t _Block1_reversed = _Traits::_Rev(_Block1);
const uint8x16_t _Block2_reversed = _Traits::_Rev(_Block2);

vst1q_u8(static_cast<uint8_t*>(_Dest) + 0, _Block2_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 16, _Block1_reversed);
_Advance_bytes(_Dest, 32);
}
vst1q_u8(static_cast<uint8_t*>(_Dest) + 0, _Block2_reversed);
vst1q_u8(static_cast<uint8_t*>(_Dest) + 16, _Block1_reversed);
_Advance_bytes(_Dest, 32);
}

if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) {
_Advance_bytes(_Last, -16);
const uint8x16_t _Block = vld1q_u8(static_cast<const uint8_t*>(_Last));
const uint8x16_t _Block_reversed = _Traits::_Rev(_Block);
vst1q_u8(static_cast<uint8_t*>(_Dest), _Block_reversed);
_Advance_bytes(_Dest, 16);
}
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 16) {
_Advance_bytes(_Last, -16);
const uint8x16_t _Block = vld1q_u8(static_cast<const uint8_t*>(_Last));
const uint8x16_t _Block_reversed = _Traits::_Rev(_Block);
vst1q_u8(static_cast<uint8_t*>(_Dest), _Block_reversed);
_Advance_bytes(_Dest, 16);
}

if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) {
_Advance_bytes(_Last, -8);
const uint8x8_t _Block = vld1_u8(static_cast<const uint8_t*>(_Last));
const uint8x8_t _Block_reversed = _Traits::_Rev(_Block);
vst1_u8(static_cast<uint8_t*>(_Dest), _Block_reversed);
_Advance_bytes(_Dest, 8);
}
if (const size_t _Length = _Byte_length(_First, _Last); _Length >= 8) {
_Advance_bytes(_Last, -8);
const uint8x8_t _Block = vld1_u8(static_cast<const uint8_t*>(_Last));
const uint8x8_t _Block_reversed = _Traits::_Rev(_Block);
vst1_u8(static_cast<uint8_t*>(_Dest), _Block_reversed);
_Advance_bytes(_Dest, 8);
}

if constexpr (sizeof(_Ty) < 8) {
if constexpr (sizeof(_Ty) < 8) {
_Reverse_copy_tail(
static_cast<const _Ty*>(_First), static_cast<const _Ty*>(_Last), static_cast<_Ty*>(_Dest));
}
} else {
_Reverse_copy_tail(
static_cast<const _Ty*>(_First), static_cast<const _Ty*>(_Last), static_cast<_Ty*>(_Dest));
}
Expand Down