35 return *
this = (*
this * other);
38 return *
this = (*
this / other);
41 return *
this = (*
this + other);
44 return *
this = (*
this - other);
47 return *
this = (*
this & other);
50 return *
this = (*
this | other);
53 return *
this = (*
this ^ other);
68template <
typename T,
typename FromT>
70 const Half<
decltype(
d)> dh;
81 const Half<
decltype(
d)> dh;
83 ret.
v0 = ret.v1 =
Zero(dh);
93template <
typename T,
typename T2>
95 const Half<
decltype(
d)> dh;
97 ret.
v0 = ret.v1 =
Set(dh,
static_cast<T
>(t));
103 const Half<
decltype(
d)> dh;
109template <
typename T,
typename T2>
111 const Half<
decltype(
d)> dh;
116 static_cast<T
>(first),
Lanes(dh)));
174template <
int kBits,
typename T>
176 v.v0 = ShiftLeft<kBits>(
v.v0);
177 v.v1 = ShiftLeft<kBits>(
v.v1);
181template <
int kBits,
typename T>
183 v.v0 = ShiftRight<kBits>(
v.v0);
184 v.v1 = ShiftRight<kBits>(
v.v1);
189template <
int kBits,
typename T>
191 constexpr size_t kSizeInBits =
sizeof(T) * 8;
192 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
193 if (kBits == 0)
return v;
194 return Or(ShiftRight<kBits>(
v), ShiftLeft<kSizeInBits - kBits>(
v));
312 return mul * x + add;
319 return add - mul * x;
327 return mul * x - sub;
334 return Neg(mul) * x - sub;
350 return one /
Sqrt(
v);
390template <
typename T, HWY_IF_FLOAT(T)>
400template <
typename T, HWY_IF_FLOAT(T)>
409 const VFromD<
decltype(di)> exp =
411 return RebindMask(
d, Lt(exp,
Set(di, hwy::MaxExponentField<T>())));
418template <
typename TFrom,
typename TTo>
420 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
426 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
427 return (
v & bit) == bit;
525 return Xor(x1,
Xor(x2, x3));
530 return Or(o1,
Or(o2, o3));
535 return Or(o,
And(a1, a2));
564 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
571 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
588 const Half<
decltype(
d)> dh;
622template <
typename T, HWY_IF_FLOAT(T)>
682template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
697 const Half<
decltype(
d)> dh;
699 ret.
v0 =
Load(dh, aligned);
718 const Half<
decltype(
d)> dh;
720 ret.
v0 = ret.v1 =
Load(dh, p);
728 const Half<
decltype(
d)> dh;
754template <
typename T,
typename Offset>
757 constexpr size_t N = 32 /
sizeof(T);
758 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
760 alignas(32) T lanes[
N];
763 alignas(32) Offset offset_lanes[
N];
766 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
767 for (
size_t i = 0; i <
N; ++i) {
768 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
772template <
typename T,
typename Index>
775 constexpr size_t N = 32 /
sizeof(T);
776 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
778 alignas(32) T lanes[
N];
781 alignas(32) Index index_lanes[
N];
784 for (
size_t i = 0; i <
N; ++i) {
785 base[index_lanes[i]] = lanes[i];
791template <
typename T,
typename Offset>
794 constexpr size_t N = 32 /
sizeof(T);
795 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
797 alignas(32) Offset offset_lanes[
N];
800 alignas(32) T lanes[
N];
801 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
802 for (
size_t i = 0; i <
N; ++i) {
803 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
805 return Load(
d, lanes);
808template <
typename T,
typename Index>
811 constexpr size_t N = 32 /
sizeof(T);
812 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
814 alignas(32) Index index_lanes[
N];
817 alignas(32) T lanes[
N];
818 for (
size_t i = 0; i <
N; ++i) {
819 lanes[i] = base[index_lanes[i]];
821 return Load(
d, lanes);
829 alignas(32) T lanes[32 /
sizeof(T)];
838 alignas(32) T lanes[32 /
sizeof(T)];
841 return Load(
d, lanes);
864template <
int kBytes,
typename T>
866 const Half<
decltype(
d)> dh;
867 v.v0 = ShiftLeftBytes<kBytes>(dh,
v.v0);
868 v.v1 = ShiftLeftBytes<kBytes>(dh,
v.v1);
872template <
int kBytes,
typename T>
879template <
int kLanes,
typename T>
885template <
int kLanes,
typename T>
891template <
int kBytes,
typename T>
893 const Half<
decltype(
d)> dh;
894 v.v0 = ShiftRightBytes<kBytes>(dh,
v.v0);
895 v.v1 = ShiftRightBytes<kBytes>(dh,
v.v1);
900template <
int kLanes,
typename T>
915template <
int kBytes,
typename T,
class V = Vec256<T>>
917 const Half<
decltype(
d)> dh;
918 hi.v0 = CombineShiftRightBytes<kBytes>(dh, hi.v0, lo.v0);
919 hi.v1 = CombineShiftRightBytes<kBytes>(dh, hi.v1, lo.v1);
925template <
int kLane,
typename T>
928 ret.
v0 = Broadcast<kLane>(
v.v0);
929 ret.
v1 = Broadcast<kLane>(
v.v1);
936template <
typename T,
typename TI>
944template <
typename T,
typename TI,
size_t NI>
955template <
typename T,
size_t N,
typename TI>
965template <
class V,
class VI>
1001template <
typename T>
1008template <
typename T>
1018template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1024template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1030template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1042template <
typename T>
1048template <
typename T,
typename TI>
1050 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
1052 ret.
i0 = vec.
v0.raw;
1053 ret.
i1 = vec.
v1.raw;
1057template <
typename T,
typename TI>
1059 const Rebind<TI,
decltype(
d)> di;
1063template <
typename T>
1068 constexpr size_t kLanesPerHalf = 16 /
sizeof(TU);
1072 const Vec128<TU> mask =
Set(duh,
static_cast<TU
>(kLanesPerHalf - 1));
1089template <
typename T>
1096template <
typename T>
1098 const Half<
decltype(
d)> dh;
1106template <
typename T>
1108 const Half<
decltype(
d)> dh;
1117template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1119 const Half<
decltype(
d)> dh;
1126template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
1128 const Half<
decltype(
d)> dh;
1136template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1142template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1144 const Half<
decltype(
d)> dh;
1151template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
1153 const Half<
decltype(
d)> dh;
1161template <
typename T>
1172template <
typename T,
class V = Vec256<T>>
1174 const Half<
decltype(
d)> dh;
1184template <
typename T,
class DW = RepartitionToW
ide<Full256<T>>>
1188template <
typename T,
class D = Full256<T>,
class DW = RepartitionToW
ide<D>>
1193template <
typename T,
class D = Full256<T>,
class DW = RepartitionToW
ide<D>>
1201template <
typename T>
1210template <
typename T>
1212 const Half<
decltype(
d)> dh;
1217template <
typename T>
1227template <
typename T>
1237template <
typename T>
1247template <
typename T>
1257template <
typename T>
1260 const Half<
decltype(
d)> dh;
1268template <
typename T>
1271 const Half<
decltype(
d)> dh;
1279template <
typename T>
1287template <
typename T>
1295template <
typename T>
1303template <
typename T>
1310template <
typename T>
1319template <
typename T>
1338 wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(
v.raw))};
1347 wasm_u32x4_extend_high_u16x8(wasm_u16x8_extend_high_u8x16(
v.raw))};
1370 wasm_i32x4_extend_high_i16x8(wasm_i16x8_extend_high_i8x16(
v.raw))};
1394 const auto sign = ShiftRight<15>(bits16);
1395 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
1396 const auto mantissa = bits16 &
Set(du32, 0x3FF);
1397 const auto subnormal =
1399 Set(df32, 1.0f / 16384 / 1024));
1401 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
1402 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
1403 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
1404 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
1405 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
1417template <
typename T,
typename TN>
1419 const Half<
decltype(
d)> dh;
1427template <
typename TW,
typename TN>
1429 const Half<
decltype(
d)> dh;
1452 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.v0.raw,
v.v1.raw);
1453 return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
1463 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.v0.raw,
v.v1.raw);
1464 return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
1480 const Half<
decltype(d16)> d16h;
1488 const Half<
decltype(dbf16)> dbf16h;
1491 return Combine(dbf16, hi, lo);
1505 return Vec32<uint8_t>{wasm_i8x16_shuffle(
v.v0.raw,
v.v1.raw, 0, 8, 16, 24, 0,
1506 8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
1513 17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
1520 9, 10, 11, 16, 17, 18, 19, 24, 25,
1526 return Vec64<uint8_t>{wasm_i8x16_shuffle(
v.v0.raw,
v.v1.raw, 0, 4, 8, 12, 16,
1527 20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
1534 9, 12, 13, 16, 17, 20, 21, 24, 25,
1541 10, 12, 14, 16, 18, 20, 22, 24, 26,
1554 const Half<
decltype(d16)> d16h;
1563template <
typename TTo,
typename TFrom>
1565 const Half<
decltype(
d)> dh;
1581template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>
1584 const Half<
decltype(
d)> dh;
1589 constexpr size_t kBitsPerHalf = 16 /
sizeof(T);
1590 const uint8_t bits_upper[8] = {
static_cast<uint8_t
>(bits[0] >> kBitsPerHalf)};
1595template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
1598 const Half<
decltype(
d)> dh;
1601 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
1602 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
1603 static_assert(kBytesPerHalf != 0,
"Lane size <= 16 bits => at least 8 lanes");
1611template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>
1614 const Half<
decltype(
d)> dh;
1616 const uint8_t lo = bits[0];
1620 constexpr size_t kBitsPerHalf = 16 /
sizeof(T);
1621 bits[0] =
static_cast<uint8_t
>(lo | (bits[0] << kBitsPerHalf));
1622 return (kBitsPerHalf * 2 + 7) / 8;
1625template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
1628 const Half<
decltype(
d)> dh;
1629 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
1630 constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
1631 static_assert(kBytesPerHalf != 0,
"Lane size <= 16 bits => at least 8 lanes");
1634 return kBytesPerHalf * 2;
1637template <
typename T>
1639 const Half<
decltype(
d)> dh;
1643template <
typename T>
1645 const Half<
decltype(
d)> dh;
1649template <
typename T>
1651 const Half<
decltype(
d)> dh;
1655template <
typename T>
1657 const Half<
decltype(
d)> dh;
1659 constexpr size_t kLanesPerHalf = 16 /
sizeof(T);
1660 return lo >= 0 ?
static_cast<size_t>(lo)
1664template <
typename T>
1666 const Half<
decltype(
d)> dh;
1669 if (lo < 0 && hi < 0)
return lo;
1670 constexpr int kLanesPerHalf = 16 /
sizeof(T);
1671 return lo >= 0 ? lo : hi + kLanesPerHalf;
1675template <
typename T>
1678 const Half<
decltype(
d)> dh;
1681 return count + count2;
1685template <
typename T>
1688 const Half<
decltype(
d)> dh;
1691 return count + count2;
1696template <
typename T>
1706template <
typename T>
1709 alignas(32) T lanes[32 /
sizeof(T)] = {};
1711 return Load(
d, lanes);
1715template <
typename T>
1731template <
typename T>
1751template <
typename T>
1755 constexpr size_t N = 32 /
sizeof(T);
1775template <
typename T>
1780 constexpr size_t N = 32 /
sizeof(T);
1806template <
typename T>
1810 constexpr size_t N = 32 /
sizeof(T);
1813 StoreU(out0,
d, unaligned + 0 *
N);
1814 StoreU(out1,
d, unaligned + 1 *
N);
1825template <
typename T>
1829 constexpr size_t N = 32 /
sizeof(T);
1833 StoreU(out0,
d, unaligned + 0 *
N);
1834 StoreU(out1,
d, unaligned + 1 *
N);
1835 StoreU(out2,
d, unaligned + 2 *
N);
1848template <
typename T>
1852 constexpr size_t N = 32 /
sizeof(T);
1856 StoreU(out0,
d, unaligned + 0 *
N);
1857 StoreU(out1,
d, unaligned + 1 *
N);
1860 StoreU(out2,
d, unaligned + 2 *
N);
1861 StoreU(out3,
d, unaligned + 3 *
N);
1867template <
typename TN,
typename TW>
1871 const Half<
decltype(
d)> dh;
1878template <
typename TW>
1887template <
typename T>
1889 const Half<
decltype(
d)> dh;
1894template <
typename T>
1896 const Half<
decltype(
d)> dh;
1901template <
typename T>
1903 const Half<
decltype(
d)> dh;
1910template <
typename T>
1912 const Half<
decltype(
d)> dh;
1919template <
typename T>
1921 const Half<
decltype(
d)> dh;
1928template <
typename T>
1930 const Half<
decltype(
d)> dh;
1937template <
typename T>
1939 const Half<
decltype(
d)> dh;
1946template <
typename T>
1948 const Half<
decltype(
d)> dh;
1955template <
typename T>
1957 const Half<
decltype(
d)> dh;
1964template <
typename T>
1966 const Half<
decltype(
d)> dh;
1973template <
typename T>
1975 const Half<
decltype(
d)> dh;
1982template <
typename T>
1984 const Half<
decltype(
d)> dh;
1991template <
typename T>
1993 const Half<
decltype(
d)> dh;
#define HWY_RESTRICT
Definition: base.h:64
#define HWY_API
Definition: base.h:129
#define HWY_INLINE
Definition: base.h:70
#define HWY_ASSERT(condition)
Definition: base.h:192
Definition: arm_neon-inl.h:825
Definition: arm_neon-inl.h:778
Raw raw
Definition: arm_neon-inl.h:814
Definition: wasm_256-inl.h:27
HWY_INLINE Vec256 & operator^=(const Vec256 other)
Definition: wasm_256-inl.h:52
HWY_INLINE Vec256 & operator&=(const Vec256 other)
Definition: wasm_256-inl.h:46
HWY_INLINE Vec256 & operator-=(const Vec256 other)
Definition: wasm_256-inl.h:43
HWY_INLINE Vec256 & operator+=(const Vec256 other)
Definition: wasm_256-inl.h:40
Vec128< T > v1
Definition: wasm_256-inl.h:57
HWY_INLINE Vec256 & operator|=(const Vec256 other)
Definition: wasm_256-inl.h:49
HWY_INLINE Vec256 & operator/=(const Vec256 other)
Definition: wasm_256-inl.h:37
static constexpr size_t kPrivateN
Definition: wasm_256-inl.h:30
Vec128< T > v0
Definition: wasm_256-inl.h:56
T PrivateT
Definition: wasm_256-inl.h:29
HWY_INLINE Vec256 & operator*=(const Vec256 other)
Definition: wasm_256-inl.h:34
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2413
HWY_API void LoadTransposedBlocks3(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C)
Definition: generic_ops-inl.h:159
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2451
HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: generic_ops-inl.h:470
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition: emu128-inl.h:535
HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: generic_ops-inl.h:862
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N, kPow2 > df, svfloat16_t v)
Definition: arm_sve-inl.h:1299
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2432
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:670
HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: generic_ops-inl.h:505
HWY_API Vec128< ToT, N > ConvertTo(hwy::FloatTag, Simd< ToT, N, 0 >, Vec128< FromT, N > from)
Definition: emu128-inl.h:1685
HWY_API void LoadTransposedBlocks4(Simd< T, N, 0 > d, const T *HWY_RESTRICT unaligned, V &A, V &B, V &C, V &D)
Definition: generic_ops-inl.h:340
d
Definition: rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:4448
HWY_API Vec256< T > TableLookupLanesOr0(Vec256< T > v, Indices256< T > idx)
Definition: wasm_256-inl.h:1090
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition: arm_neon-inl.h:2025
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition: arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2517
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6677
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2758
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition: arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2753
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:1413
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition: arm_neon-inl.h:2314
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition: arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition: emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition: arm_neon-inl.h:4412
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2260
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition: arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition: arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3425
typename D::Half Half
Definition: ops/shared-inl.h:227
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3327
N
Definition: rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:6651
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3885
const vfloat64m1_t v
Definition: rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3713
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1861
Definition: aligned_allocator.h:27
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition: base.h:906
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:593
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:595
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
Definition: arm_neon-inl.h:3968
Definition: wasm_256-inl.h:1043
__v128_u i0
Definition: wasm_256-inl.h:1044
__v128_u i1
Definition: wasm_256-inl.h:1045
Definition: wasm_256-inl.h:61
Mask128< T > m1
Definition: wasm_256-inl.h:63
Mask128< T > m0
Definition: wasm_256-inl.h:62
Definition: ops/shared-inl.h:52