#if HWY_COMPILER_GCC_ACTUAL
#if HWY_COMPILER_CLANGCL
#include <avx2intrin.h>
#include <f16cintrin.h>
#include <avx512fintrin.h>
#include <avx512vlintrin.h>
#include <avx512bwintrin.h>
#include <avx512dqintrin.h>
#include <avx512vlbwintrin.h>
#include <avx512vldqintrin.h>
#include <avx512bitalgintrin.h>
#include <avx512vlbitalgintrin.h>
#include <avx512vpopcntdqintrin.h>
#include <avx512vpopcntdqvlintrin.h>
#include <sanitizer/msan_interface.h>
return *this = (*this * other);
return *this = (*this / other);
return *this = (*this + other);
return *this = (*this - other);
return *this = (*this & other);
return *this = (*this | other);
return *this = (*this ^ other);
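// Example (sketch, not part of the original header): with the usual Highway
// setup (hwy/highway.h included, an AVX-512 target active so vectors are
// Vec512), the compound assignments above simply forward to the binary ops:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<float> d;
//   auto v = hn::Set(d, 2.0f);
//   const auto w = hn::Set(d, 3.0f);
//   v *= w;  // calls operator*=, i.e. v = v * w
//   v += w;  // likewise for /, +, -, &, |, ^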
return _mm512_castpd_si512(v);
template <typename T, typename FromT>
return Vec512<T>{_mm512_setzero_si512()};
_mm512_set1_epi64(static_cast<long long>(t))};
_mm512_set1_epi64(static_cast<long long>(t))};
return Vec512<T>{_mm512_undefined_epi32()};
Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
using VU = VFromD<decltype(du)>;
const __m512i ret = _mm512_ternarylogic_epi64(
using VU = VFromD<decltype(du)>;
const __m512i ret = _mm512_ternarylogic_epi64(
using VU = VFromD<decltype(du)>;
const __m512i ret = _mm512_ternarylogic_epi64(
using VU = VFromD<decltype(du)>;
#if HWY_TARGET == HWY_AVX3_DL
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#define HWY_NATIVE_POPCNT
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
const __m512i out = _mm512_ternarylogic_epi32(
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
const uint32_t all = ~uint32_t{0};
m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
return Mask512<T>{static_cast<__mmask64>(bits)};
const uint64_t all = ~uint64_t{0};
m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
return detail::FirstN<T>(n);
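// Usage sketch (not in the original source; buffer names are hypothetical):
// FirstN builds a mask of the first n lanes via BZHI, which pairs naturally
// with masked loads/stores for loop remainders:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   void StorePrefix(const float* in, float* out, size_t n) {
//     const hn::ScalableTag<float> d;
//     const auto m = hn::FirstN(d, n);  // lanes [0, n) active
//     hn::BlendedStore(hn::Load(d, in), m, d, out);
//   }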
static_assert(IsSigned<T>(), "Only works for signed/float");
template <typename T, HWY_IF_FLOAT(T)>
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
: (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
return shifted & Set(d8, 0xFF >> kBits);
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
return (shifted ^ shifted_sign) - shifted_sign;
static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
const auto shifted_sign =
    BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
return (shifted ^ shifted_sign) - shifted_sign;
template <typename T, HWY_IF_SIGNED(T)>
#ifdef HWY_NATIVE_I64MULLO
#undef HWY_NATIVE_I64MULLO
#define HWY_NATIVE_I64MULLO
template <typename T, HWY_IF_FLOAT(T)>
template <typename T, HWY_IF_NOT_FLOAT(T)>
return Zero(Full512<T>()) - v;
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
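// Quick reference (sketch, not in the source): the four roundscale modes
// above back the portable ops Round/Trunc/Ceil/Floor, e.g.:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<double> d;
//   const auto v = hn::Set(d, 2.5);
//   hn::Round(v);  // to nearest even -> 2.0
//   hn::Trunc(v);  // toward zero     -> 2.0
//   hn::Ceil(v);   // toward +inf     -> 3.0
//   hn::Floor(v);  // toward -inf     -> 2.0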
template <typename TFrom, typename TTo>
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
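// Usage sketch (hypothetical helper, not in the source): the size-tagged
// overloads above return register masks (__mmask*), which compose with
// CountTrue/AllTrue rather than full-width vector masks:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   size_t CountEqual(const int32_t* a, const int32_t* b) {
//     const hn::ScalableTag<int32_t> d;
//     const auto m = hn::Eq(hn::Load(d, a), hn::Load(d, b));
//     return hn::CountTrue(d, m);  // popcount of the __mmask16
//   }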
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
template <typename T>
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
return ShiftRight<15>(v);
return ShiftRight<31>(v);
template <typename T>
return Vec512<T>{_mm512_load_si512(aligned)};
template <typename T>
return Vec512<T>{_mm512_loadu_si512(p)};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
template <typename T>
return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
const __m128 x4 = _mm_loadu_ps(p);
const __m128d x2 = _mm_loadu_pd(p);
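// Sketch (not in the source): LoadDup128 reads one 16-byte block and
// broadcasts it into all four 128-bit blocks via _mm512_broadcast_i32x4,
// which is handy for per-block lookup tables:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   alignas(16) static constexpr uint8_t kTbl[16] = {
//       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
//   const hn::ScalableTag<uint8_t> d;
//   const auto tbl = hn::LoadDup128(d, kTbl);  // same 16 bytes per block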
template <typename T>
_mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
_mm512_store_ps(aligned, v.raw);
_mm512_store_pd(aligned, v.raw);
template <typename T>
_mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
_mm512_storeu_ps(p, v.raw);
_mm512_storeu_pd(p, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
_mm512_mask_storeu_epi8(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
_mm512_mask_storeu_epi16(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
_mm512_mask_storeu_epi32(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
_mm512_mask_storeu_epi64(p, m.raw, v.raw);
_mm512_mask_storeu_ps(p, m.raw, v.raw);
_mm512_mask_storeu_pd(p, m.raw, v.raw);
template <typename T>
_mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
_mm512_stream_ps(aligned, v.raw);
_mm512_stream_pd(aligned, v.raw);
template <typename T>
_mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
template <typename T>
_mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
template <typename T>
_mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
template <typename T>
_mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
_mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
_mm512_i32scatter_ps(base, index.raw, v.raw, 4);
_mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
_mm512_i64scatter_pd(base, index.raw, v.raw, 8);
template <typename T>
return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
template <typename T>
return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
template <typename T>
return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
template <typename T>
return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
template <typename T>
return Vec256<T>{_mm512_castsi512_si256(v.raw)};
template <typename T>
template <typename T>
return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
template <typename T>
alignas(64) T lanes[64 / sizeof(T)];
template <typename T>
alignas(64) T lanes[64 / sizeof(T)];
return Load(d, lanes);
template <typename T>
template <typename T>
return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
template <typename T>
return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
template <int kBytes, typename T>
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
template <int kBytes, typename T>
template <int kLanes, typename T>
template <int kLanes, typename T>
template <int kBytes, typename T>
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
template <int kLanes, typename T>
template <int kBytes, typename T, class V = Vec512<T>>
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T>
template <typename T, typename TI>
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));
template <typename T, typename TI>
const Rebind<TI, decltype(d)> di;
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr int16_t kReverse[32] = {
    31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
    15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0};
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                              7,  6,  5,  4,  3,  2,  1, 0};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr int16_t kReverse4[32] = {
    3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
    19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr int16_t kReverse8[32] = {
    7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13, 12, 11, 10, 9,  8,
    23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr int32_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                               15, 14, 13, 12, 11, 10, 9, 8};
const Vec512<int32_t> idx = Load(di, kReverse8);
_mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, class V = Vec512<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
template <typename T>
const __mmask32 mask = (0x0000FFFF);
const __mmask16 mask = (0x00FF);
const __mmask8 mask = (0x0F);
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
alignas(64) constexpr uint8_t kIdx[64] = {
    1,   3,   5,   7,   9,   11,  13,  15,  17,  19,  21,  23,  25,
    27,  29,  31,  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,
    53,  55,  57,  59,  61,  63,  65,  67,  69,  71,  73,  75,  77,
    79,  81,  83,  85,  87,  89,  91,  93,  95,  97,  99,  101, 103,
    105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
__mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr uint16_t kIdx[32] = {
    1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
    33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
return BitCast(d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
    __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                           17, 19, 21, 23, 25, 27, 29, 31};
return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
    __mmask16{0xFFFF}, BitCast(du, hi).raw)});
alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                           17, 19, 21, 23, 25, 27, 29, 31};
__mmask16{0xFFFF}, hi.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
    BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
__mmask8{0xFF}, hi.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
alignas(64) constexpr uint8_t kIdx[64] = {
    0,   2,   4,   6,   8,   10,  12,  14,  16,  18,  20,  22,  24,
    26,  28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,
    52,  54,  56,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,
    78,  80,  82,  84,  86,  88,  90,  92,  94,  96,  98,  100, 102,
    104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
__mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr uint16_t kIdx[32] = {
    0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
return BitCast(d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
    __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                           16, 18, 20, 22, 24, 26, 28, 30};
return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
    __mmask16{0xFFFF}, BitCast(du, hi).raw)});
alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                           16, 18, 20, 22, 24, 26, 28, 30};
__mmask16{0xFFFF}, hi.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
    BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
__mmask8{0xFF}, hi.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T>
constexpr size_t s = sizeof(T);
constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
template <typename T>
return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
_mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
_mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
template <typename T, typename TI>
template <typename T, typename TI, size_t NI>
const Half<decltype(d512)> d256;
const Half<decltype(d256)> d128;
const auto from_512 =
template <typename T, typename TI>
template <typename T, size_t N, typename TI>
const Half<decltype(d512)> d256;
const Half<decltype(d256)> d128;
const auto bytes_512 =
template <typename T, typename TI>
const Rebind<uint16_t, decltype(df32)> du16;
alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
_mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
                                                    0, 4, 8, 12, 0, 4, 8, 12};
const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
const Rebind<int32_t, decltype(dbf16)> di32;
const Rebind<uint32_t, decltype(dbf16)> du32;
const Rebind<uint16_t, decltype(dbf16)> du16;
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
const Repartition<uint32_t, decltype(dbf16)> du32;
alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
_mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
#if HWY_TARGET == HWY_AVX3_DL
alignas(16) static constexpr uint8_t k8From64[16] = {
    0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
_mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                                            0, 2, 4, 6, 8, 10, 12, 14};
_mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
alignas(16) static constexpr uint16_t k16From64[8] = {
    0, 4, 8, 12, 16, 20, 24, 28};
_mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)};
alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                                            0, 2, 4, 6, 8, 10, 12, 14};
_mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
#if HWY_TARGET == HWY_AVX3_DL
alignas(16) static constexpr uint8_t k8From32[16] = {
    0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
_mm512_permutexvar_epi32(LoadDup128(d8, k8From32).raw, v.raw)};
alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
_mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
alignas(64) static constexpr uint16_t k16From32[32] = {
    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
_mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)};
#if HWY_TARGET == HWY_AVX3_DL
alignas(64) static constexpr uint8_t k8From16[64] = {
    0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
    0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
_mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
alignas(16) static constexpr uint32_t k16From32[4] = {
    0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
alignas(64) static constexpr uint32_t kIndex32[16] = {
    0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
_mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)};
#if !defined(HWY_DISABLE_PCLMUL_AES)
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#define HWY_NATIVE_AES
#if HWY_TARGET == HWY_AVX3_DL
const Half<decltype(d)> d2;
#if HWY_TARGET == HWY_AVX3_DL
const Half<decltype(d)> d2;
#if HWY_TARGET == HWY_AVX3_DL
alignas(64) uint64_t a[8];
alignas(64) uint64_t b[8];
for (size_t i = 0; i < 8; i += 2) {
  Store(mul, d128, a + i);
#if HWY_TARGET == HWY_AVX3_DL
alignas(64) uint64_t a[8];
alignas(64) uint64_t b[8];
for (size_t i = 0; i < 8; i += 2) {
  Store(mul, d128, a + i);
template <typename T, typename T2>
for (size_t i = 0; i < 64 / sizeof(T); ++i) {
return Load(d, lanes);
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask64_u8(mask.raw, mask.raw);
return mask.raw == 0;
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask32_u8(mask.raw, mask.raw);
return mask.raw == 0;
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask16_u8(mask.raw, mask.raw);
return mask.raw == 0;
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask8_u8(mask.raw, mask.raw);
return mask.raw == 0;
template <typename T>
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask64_u8(mask.raw, mask.raw);
return mask.raw == 0xFFFFFFFFFFFFFFFFull;
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask32_u8(mask.raw, mask.raw);
return mask.raw == 0xFFFFFFFFull;
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask16_u8(mask.raw, mask.raw);
return mask.raw == 0xFFFFull;
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask8_u8(mask.raw, mask.raw);
return mask.raw == 0xFFull;
template <typename T>
template <typename T>
template <typename T>
const size_t kNumBytes = 8 / sizeof(T);
CopyBytes<kNumBytes>(&mask.raw, bits);
template <typename T>
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
const Mask512<T> mask) {
template <typename T>
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#define HWY_NATIVE_COMPRESS8
#if HWY_TARGET == HWY_AVX3_DL
_mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
_mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
_mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
_mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
_mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
_mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
_mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
_mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
_mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
_mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
_mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
_mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
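// Usage sketch (hypothetical filter, not in the source): CompressStore packs
// the lanes selected by a mask contiguously into memory, mapping to the
// _mm512_mask_compressstoreu_* intrinsics above, and returns the count:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   size_t KeepPositive(const float* in, float* out) {
//     const hn::ScalableTag<float> d;
//     const auto v = hn::Load(d, in);
//     return hn::CompressStore(v, hn::Gt(v, hn::Zero(d)), d, out);
//   }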
const Rebind<uint32_t, decltype(d)> d32;
const uint64_t mask_bits{mask.raw};
using M32 = MFromD<decltype(d32)>;
const M32 m0{static_cast<typename M32::Raw>(mask_bits)};
const Rebind<int32_t, decltype(d)> di32;
const MFromD<decltype(du32)> mask32{static_cast<__mmask8>(mask.raw)};
const Rebind<int32_t, decltype(d)> di32;
template <typename T, size_t N>
const uint64_t mask_bits{mask.raw};
const Half<decltype(d)> dh;
const Rebind<uint32_t, decltype(dh)> d32;
const uint64_t mask_bits{mask.raw};
const Rebind<uint32_t, decltype(dq)> d32;
static_cast<uint16_t>((mask_bits >> 16) & 0xFFFFu)};
static_cast<uint16_t>((mask_bits >> 32) & 0xFFFFu)};
const Half<decltype(d)> dh;
const uint64_t mask_bits{mask.raw};
const uint64_t maskL = mask_bits & 0xFFFF;
const uint64_t maskH = mask_bits >> 16;
StoreU(demoted0, dh, unaligned);
template <typename T>
return Load(d, buf);
HWY_ALIGN uint8_t buf[2 * 32 / sizeof(uint8_t)];
return Load(d, buf);
template <class V, class M, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x6)>
#if HWY_TARGET == HWY_AVX3_DL
template <class V, class M, HWY_IF_LANE_SIZE_V(V, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(16) constexpr uint64_t packed_array[256] = {
    0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
    0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
    0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
    0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
    0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
    0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
    0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
    0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
    0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
    0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
    0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
    0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
    0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
    0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
    0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
    0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
    0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
    0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
    0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
    0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
    0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
    0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
    0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
    0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
    0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
    0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
    0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
    0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
    0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
    0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
    0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
    0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
    0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
    0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
    0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
    0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
    0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
    0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
    0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
    0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
    0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
    0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
    0x10765432, 0x17654320, 0x07654321, 0x76543210};
const auto packed = Set(du64, packed_array[mask.raw]);
alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
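// How the table is consumed (sketch, not in the source): each 32-bit entry
// packs eight 4-bit lane indices, one hex digit per lane; broadcasting the
// entry and shifting by {0, 4, ..., 28} leaves lane i's index in its low
// nibble, ready for the variable permute. Scalar equivalent:
//
//   uint64_t IndexOfLane(uint64_t packed, size_t i) {  // i < 8
//     return (packed >> (4 * i)) & 0xF;
//   }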
template <class V, class M, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(16) constexpr uint64_t packed_array[256] = {
    0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
    0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
    0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
    0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
    0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
    0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
    0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
    0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
    0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
    0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
    0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
    0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
    0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
    0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
    0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
    0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
    0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
    0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
    0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
    0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
    0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
    0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
    0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
    0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
    0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
    0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
    0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
    0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
    0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
    0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
    0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
    0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
    0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
    0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
    0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
    0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
    0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
    0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
    0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
    0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
    0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
    0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
    0x76543210, 0x76543201, 0x76543210, 0x76543210};
const auto packed = Set(du64, packed_array[mask.raw]);
alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
template <class V, class M, hwy::EnableIf<(sizeof(V) > 16)>* = nullptr>
template <class V, class D, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x6)>
#if HWY_TARGET == HWY_AVX3_DL
template <class V, class D, HWY_IF_LANE_SIZE_ONE_OF_V(V, 0x110)>
using TU = TFromD<decltype(du)>;
TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
_mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
const size_t count = PopCount(uint64_t{mask.raw});
_mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
const size_t count = PopCount(uint64_t{mask.raw});
template <class D, typename T = TFromD<D>>
template <_MM_PERM_ENUM kPerm, typename T>
template <_MM_PERM_ENUM kPerm>
template <_MM_PERM_ENUM kPerm>
template <typename T>
constexpr size_t N = 64 / sizeof(T);
const Vec512<T> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
const Vec512<T> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);
template <typename T>
constexpr size_t N = 64 / sizeof(T);
const Vec512<T> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
const Vec512<T> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
const Vec512<T> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
const Vec512<T> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
A = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
B = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
C = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
D = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);
template <typename T>
constexpr size_t N = 64 / sizeof(T);
const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
const auto j1_i1_j0_i0 =
    detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
const auto j3_i3_j2_i2 =
    detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
StoreU(j1_i1_j0_i0, d, unaligned + 0 * N);
StoreU(j3_i3_j2_i2, d, unaligned + 1 * N);
template <typename T>
constexpr size_t N = 64 / sizeof(T);
const Vec512<T> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
const Vec512<T> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
const Vec512<T> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
StoreU(out0, d, unaligned + 0 * N);
StoreU(out1, d, unaligned + 1 * N);
StoreU(out2, d, unaligned + 2 * N);
template <typename T>
constexpr size_t N = 64 / sizeof(T);
const Vec512<T> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
const Vec512<T> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
const Vec512<T> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
const Vec512<T> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
StoreU(out0, d, unaligned + 0 * N);
StoreU(out1, d, unaligned + 1 * N);
StoreU(out2, d, unaligned + 2 * N);
StoreU(out3, d, unaligned + 3 * N);
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);
const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);
const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);
const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);
const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);
const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);
const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;
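// Scalar sketch of the schoolbook 64x64 -> 128-bit multiply used by
// MulEven/MulOdd above (my illustration): split both operands into 32-bit
// halves, form the four partial products and propagate the carries.
//
//   void Mul64To128(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
//     const uint64_t maskL = 0xFFFFFFFFULL;
//     const uint64_t aL = a & maskL, aH = a >> 32;
//     const uint64_t bL = b & maskL, bH = b >> 32;
//     const uint64_t aLbL = aL * bL;
//     const uint64_t t2 = aH * bL + (aLbL >> 32);
//     const uint64_t w2 = t2 & maskL, w1 = t2 >> 32;
//     const uint64_t t = aL * bH + w2;
//     *hi = aH * bH + w1 + (t >> 32);
//     *lo = (t << 32) + (aLbL & maskL);
//   }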
return Set(d, _mm512_reduce_add_epi32(v.raw));
return Set(d, _mm512_reduce_add_epi64(v.raw));
return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
return Set(d, _mm512_reduce_add_ps(v.raw));
return Set(d, _mm512_reduce_add_pd(v.raw));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(d32, even + odd);
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(d32, even + odd);
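// Why the 16-bit path differs (sketch, not in the source): there is no
// _mm512_reduce_add_epi16, so each 32-bit unit is split into its even (low)
// and odd (high) 16-bit lanes, which are summed in 32 bits before the 32-bit
// reduction runs. Scalar equivalent for one 32-bit unit (unsigned case):
//
//   uint32_t PartialSum(uint32_t packed) {
//     const uint32_t even = packed & 0xFFFF;  // low 16-bit lane
//     const uint32_t odd = packed >> 16;      // high 16-bit lane
//     return even + odd;                      // then SumOfLanes over 32-bit
//   }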
return Set(d, _mm512_reduce_min_epi32(v.raw));
return Set(d, _mm512_reduce_min_epi64(v.raw));
return Set(d, _mm512_reduce_min_epu32(v.raw));
return Set(d, _mm512_reduce_min_epu64(v.raw));
return Set(d, _mm512_reduce_min_ps(v.raw));
return Set(d, _mm512_reduce_min_pd(v.raw));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
return Set(d, _mm512_reduce_max_epi32(v.raw));
return Set(d, _mm512_reduce_max_epi64(v.raw));
return Set(d, _mm512_reduce_max_epu32(v.raw));
return Set(d, _mm512_reduce_max_epu64(v.raw));
return Set(d, _mm512_reduce_max_ps(v.raw));
return Set(d, _mm512_reduce_max_pd(v.raw));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition: arm_neon-inl.h:4412
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2260
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1986
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3425
typename D::Half Half
Definition: ops/shared-inl.h:227
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3327
N
Definition: rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1225
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3885
const vfloat64m1_t v
Definition: rvv-inl.h:1998
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4429
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3713
typename D::T TFromD
Definition: ops/shared-inl.h:203
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1861
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition: base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:806
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:865
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:796
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:593
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: x86_512-inl.h:2545
__m512i raw
Definition: x86_512-inl.h:2546
Definition: wasm_256-inl.h:61
Raw raw
Definition: x86_256-inl.h:150
Definition: x86_512-inl.h:148
typename detail::RawMask512< sizeof(T)>::type Raw
Definition: x86_512-inl.h:149
Raw raw
Definition: x86_512-inl.h:150
Definition: ops/shared-inl.h:52
HWY_INLINE __m512d operator()(__m512i v)
Definition: x86_512-inl.h:182
HWY_INLINE __m512 operator()(__m512i v)
Definition: x86_512-inl.h:178
Definition: x86_512-inl.h:173
HWY_INLINE __m512i operator()(__m512i v)
Definition: x86_512-inl.h:174
__m512d type
Definition: x86_512-inl.h:86
__m512 type
Definition: x86_512-inl.h:82
Definition: x86_512-inl.h:77
__m512i type
Definition: x86_512-inl.h:78
__mmask64 type
Definition: x86_512-inl.h:94
__mmask32 type
Definition: x86_512-inl.h:98
__mmask16 type
Definition: x86_512-inl.h:102
__mmask8 type
Definition: x86_512-inl.h:106
Definition: x86_512-inl.h:91