core/stdarch/crates/core_arch/src/x86/
avx512bf16.rs

1//! [AVX512BF16 intrinsics].
2//!
3//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
4
5use crate::arch::asm;
6use crate::core_arch::{simd::*, x86::*};
7use crate::intrinsics::simd::*;
8
9#[cfg(test)]
10use stdarch_test::assert_instr;
11
// Raw bindings to the LLVM intrinsics that back the public functions in this file. Each
// declaration is tied to its intrinsic through `link_name`; the numeric suffix of the
// intrinsic name matches the bit width of the `f32` operand vector(s).
//
// NOTE(review): `improper_ctypes` is presumably silenced because the SIMD vector types used
// in these signatures trip the lint — confirm.
#[allow(improper_ctypes)]
extern "C" {
    // Two `f32` vectors in, one vector of 16-bit lanes out (same total width as one input).
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"]
    fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"]
    fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"]
    fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32;
    // One `f32` vector in, a half-width vector of 16-bit lanes out. Only the 256- and 512-bit
    // forms are bound here; `_mm_cvtneps_pbh` and its masked variants use inline assembly.
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"]
    fn cvtneps2bf16_256(a: f32x8) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"]
    fn cvtneps2bf16_512(a: f32x16) -> i16x16;
    // `a` is the `f32` accumulator; `b` and `c` carry the BF16 operands viewed as 32-bit
    // integer lanes (the callers pass them through `as_i32x4()` and friends), i.e. two
    // 16-bit values per lane.
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"]
    fn dpbf16ps(a: f32x4, b: i32x4, c: i32x4) -> f32x4;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"]
    fn dpbf16ps_256(a: f32x8, b: i32x8, c: i32x8) -> f32x8;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"]
    fn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16;
}
31
32/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors
33/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
34/// 128-bit wide vector.
35/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh)
36#[inline]
37#[target_feature(enable = "avx512bf16,avx512vl")]
38#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
39#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
40pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
41    transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4()))
42}
43
44/// Convert packed single-precision (32-bit) floating-point elements in two vectors
45/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
46/// in single vector dst using writemask k (elements are copied from src when the
47/// corresponding mask bit is not set).
48/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh)
49#[inline]
50#[target_feature(enable = "avx512bf16,avx512vl")]
51#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
52#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
53pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
54    let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
55    transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
56}
57
58/// Convert packed single-precision (32-bit) floating-point elements in two vectors
59/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
60/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
61/// mask bit is not set).
62/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh)
63#[inline]
64#[target_feature(enable = "avx512bf16,avx512vl")]
65#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
66#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
67pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
68    let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
69    transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
70}
71
72/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
73/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
74/// 256-bit wide vector.
75/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh)
76#[inline]
77#[target_feature(enable = "avx512bf16,avx512vl")]
78#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
79#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
80pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
81    transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8()))
82}
83
84/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
85/// to packed BF16 (16-bit) floating-point elements and store the results in single vector
86/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
87/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh)
88#[inline]
89#[target_feature(enable = "avx512bf16,avx512vl")]
90#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
91#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
92pub unsafe fn _mm256_mask_cvtne2ps_pbh(
93    src: __m256bh,
94    k: __mmask16,
95    a: __m256,
96    b: __m256,
97) -> __m256bh {
98    let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
99    transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
100}
101
102/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
103/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector
104/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
105/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh)
106#[inline]
107#[target_feature(enable = "avx512bf16,avx512vl")]
108#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
109#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
110pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
111    let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
112    transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
113}
114
115/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
116/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
117/// 512-bit wide vector.
118/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh)
119#[inline]
120#[target_feature(enable = "avx512bf16,avx512f")]
121#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
122#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
123pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
124    transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16()))
125}
126
127/// Convert packed single-precision (32-bit) floating-point elements in two vectors
128/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
129/// in single vector dst using writemask k (elements are copied from src when the
130/// corresponding mask bit is not set).
131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh)
132#[inline]
133#[target_feature(enable = "avx512bf16,avx512f")]
134#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
135#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
136pub unsafe fn _mm512_mask_cvtne2ps_pbh(
137    src: __m512bh,
138    k: __mmask32,
139    a: __m512,
140    b: __m512,
141) -> __m512bh {
142    let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
143    transmute(simd_select_bitmask(k, cvt, src.as_u16x32()))
144}
145
146/// Convert packed single-precision (32-bit) floating-point elements in two vectors
147/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
148/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
149/// mask bit is not set).
150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh)
151#[inline]
152#[target_feature(enable = "avx512bf16,avx512f")]
153#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
154#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
155pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
156    let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
157    transmute(simd_select_bitmask(k, cvt, u16x32::ZERO))
158}
159
160/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
161/// floating-point elements, and store the results in dst.
162/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh)
163#[inline]
164#[target_feature(enable = "avx512bf16,avx512vl")]
165#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
166#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
167pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
168    transmute(cvtneps2bf16_256(a.as_f32x8()))
169}
170
171/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
172/// floating-point elements, and store the results in dst using writemask k
173/// (elements are copied from src when the corresponding mask bit is not set).
174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh)
175#[inline]
176#[target_feature(enable = "avx512bf16,avx512vl")]
177#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
178#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
179pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
180    let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
181    transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
182}
183
184/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
185/// floating-point elements, and store the results in dst using zeromask k
186/// (elements are zeroed out when the corresponding mask bit is not set).
187/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh)
188#[inline]
189#[target_feature(enable = "avx512bf16,avx512vl")]
190#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
191#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
192pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
193    let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
194    transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
195}
196
197/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
198/// floating-point elements, and store the results in dst.
199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh)
200#[inline]
201#[target_feature(enable = "avx512bf16,avx512f")]
202#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
203#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
204pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
205    transmute(cvtneps2bf16_512(a.as_f32x16()))
206}
207
208/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
209/// floating-point elements, and store the results in dst using writemask k
210/// (elements are copied from src when the corresponding mask bit is not set).
211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh)
212#[inline]
213#[target_feature(enable = "avx512bf16,avx512f")]
214#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
215#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
216pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
217    let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
218    transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
219}
220
221/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
222/// floating-point elements, and store the results in dst using zeromask k
223/// (elements are zeroed out when the corresponding mask bit is not set).
224/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh)
225#[inline]
226#[target_feature(enable = "avx512bf16,avx512f")]
227#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
228#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
229pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
230    let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
231    transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
232}
233
234/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
235/// accumulating the intermediate single-precision (32-bit) floating-point elements
236/// with elements in src, and store the results in dst.
237/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps)
238#[inline]
239#[target_feature(enable = "avx512bf16,avx512vl")]
240#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
241#[cfg_attr(test, assert_instr("vdpbf16ps"))]
242pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
243    transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4()))
244}
245
246/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
247/// accumulating the intermediate single-precision (32-bit) floating-point elements
248/// with elements in src, and store the results in dst using writemask k
249/// (elements are copied from src when the corresponding mask bit is not set).
250/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps)
251#[inline]
252#[target_feature(enable = "avx512bf16,avx512vl")]
253#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
254#[cfg_attr(test, assert_instr("vdpbf16ps"))]
255pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
256    let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
257    transmute(simd_select_bitmask(k, rst, src.as_f32x4()))
258}
259
260/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
261/// accumulating the intermediate single-precision (32-bit) floating-point elements
262/// with elements in src, and store the results in dst using zeromask k
263/// (elements are zeroed out when the corresponding mask bit is not set).
264/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps)
265#[inline]
266#[target_feature(enable = "avx512bf16,avx512vl")]
267#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
268#[cfg_attr(test, assert_instr("vdpbf16ps"))]
269pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
270    let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
271    let zero = _mm_set1_ps(0.0_f32).as_f32x4();
272    transmute(simd_select_bitmask(k, rst, zero))
273}
274
275/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
276/// accumulating the intermediate single-precision (32-bit) floating-point elements
277/// with elements in src, and store the results in dst.
278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps)
279#[inline]
280#[target_feature(enable = "avx512bf16,avx512vl")]
281#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
282#[cfg_attr(test, assert_instr("vdpbf16ps"))]
283pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
284    transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8()))
285}
286
287/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
288/// accumulating the intermediate single-precision (32-bit) floating-point elements
289/// with elements in src, and store the results in dst using writemask k
290/// (elements are copied from src when the corresponding mask bit is not set).
291/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps)
292#[inline]
293#[target_feature(enable = "avx512bf16,avx512vl")]
294#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
295#[cfg_attr(test, assert_instr("vdpbf16ps"))]
296pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
297    let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
298    transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
299}
300
301/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
302/// accumulating the intermediate single-precision (32-bit) floating-point elements
303/// with elements in src, and store the results in dst using zeromask k
304/// (elements are zeroed out when the corresponding mask bit is not set).
305/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps)
306#[inline]
307#[target_feature(enable = "avx512bf16,avx512vl")]
308#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
309#[cfg_attr(test, assert_instr("vdpbf16ps"))]
310pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
311    let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
312    transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
313}
314
315/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
316/// accumulating the intermediate single-precision (32-bit) floating-point elements
317/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit)
318/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit)
319/// floating-point elements with elements in src, and store the results in dst.
320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps)
321#[inline]
322#[target_feature(enable = "avx512bf16,avx512f")]
323#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
324#[cfg_attr(test, assert_instr("vdpbf16ps"))]
325pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
326    transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16()))
327}
328
329/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
330/// accumulating the intermediate single-precision (32-bit) floating-point elements
331/// with elements in src, and store the results in dst using writemask k
332/// (elements are copied from src when the corresponding mask bit is not set).
333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps)
334#[inline]
335#[target_feature(enable = "avx512bf16,avx512f")]
336#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
337#[cfg_attr(test, assert_instr("vdpbf16ps"))]
338pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
339    let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
340    transmute(simd_select_bitmask(k, rst, src.as_f32x16()))
341}
342
343/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
344/// accumulating the intermediate single-precision (32-bit) floating-point elements
345/// with elements in src, and store the results in dst using zeromask k
346/// (elements are zeroed out when the corresponding mask bit is not set).
347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps)
348#[inline]
349#[target_feature(enable = "avx512bf16,avx512f")]
350#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
351#[cfg_attr(test, assert_instr("vdpbf16ps"))]
352pub unsafe fn _mm512_maskz_dpbf16_ps(
353    k: __mmask16,
354    src: __m512,
355    a: __m512bh,
356    b: __m512bh,
357) -> __m512 {
358    let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
359    transmute(simd_select_bitmask(k, rst, f32x16::ZERO))
360}
361
362/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
363/// floating-point elements, and store the results in dst.
364///
365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
366#[inline]
367#[target_feature(enable = "avx512bf16,avx512f")]
368#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
369pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
370    _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a))))
371}
372
373/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
374/// floating-point elements, and store the results in dst using writemask k (elements are copied
375/// from src when the corresponding mask bit is not set).
376///
377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
378#[inline]
379#[target_feature(enable = "avx512bf16,avx512f")]
380#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
381pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 {
382    let cvt = _mm512_cvtpbh_ps(a);
383    transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16()))
384}
385
386/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
387/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
388/// when the corresponding mask bit is not set).
389///
390/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
391#[inline]
392#[target_feature(enable = "avx512bf16,avx512f")]
393#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
394pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
395    let cvt = _mm512_cvtpbh_ps(a);
396    transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO))
397}
398
399/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
400/// floating-point elements, and store the results in dst.
401///
402/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
403#[inline]
404#[target_feature(enable = "avx512bf16,avx512vl")]
405#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
406pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
407    _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a))))
408}
409
410/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
411/// floating-point elements, and store the results in dst using writemask k (elements are copied
412/// from src when the corresponding mask bit is not set).
413///
414/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
415#[inline]
416#[target_feature(enable = "avx512bf16,avx512vl")]
417#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
418pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 {
419    let cvt = _mm256_cvtpbh_ps(a);
420    transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8()))
421}
422
423/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
424/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
425/// when the corresponding mask bit is not set).
426///
427/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
428#[inline]
429#[target_feature(enable = "avx512bf16,avx512vl")]
430#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
431pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
432    let cvt = _mm256_cvtpbh_ps(a);
433    transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO))
434}
435
436/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
437/// elements, and store the results in dst.
438///
439/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
440#[inline]
441#[target_feature(enable = "avx512bf16,avx512vl")]
442#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
443pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
444    _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a))))
445}
446
447/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
448/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding
449/// mask bit is not set).
450///
451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
452#[inline]
453#[target_feature(enable = "avx512bf16,avx512vl")]
454#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
455pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 {
456    let cvt = _mm_cvtpbh_ps(a);
457    transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4()))
458}
459
460/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
461/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
462/// mask bit is not set).
463///
464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
465#[inline]
466#[target_feature(enable = "avx512bf16,avx512vl")]
467#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
468pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
469    let cvt = _mm_cvtpbh_ps(a);
470    transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO))
471}
472
473/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
474/// element, and store the result in dst.
475///
476/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
477#[inline]
478#[target_feature(enable = "avx512bf16,avx512f")]
479#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
480pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 {
481    f32::from_bits((a.to_bits() as u32) << 16)
482}
483
/// Converts packed single-precision (32-bit) floating-point elements in `a` to packed BF16
/// (16-bit) floating-point elements, and returns the results.
///
/// The 128-bit form is emitted through inline assembly; no LLVM intrinsic binding for it is
/// declared in this file (only the 256- and 512-bit forms are).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
#[inline]
#[target_feature(enable = "avx512bf16,avx512vl")]
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
    // Written only by the asm block below.
    let mut dst: __m128bh;
    asm!(
        "vcvtneps2bf16 {dst}, {src}",
        // Output-only operand; `lateout` permits it to share a register with an input.
        dst = lateout(xmm_reg) dst,
        src = in(xmm_reg) a,
        // Declared as a pure register-to-register computation: no memory access, no stack
        // use, flags left untouched.
        options(pure, nomem, nostack, preserves_flags)
    );
    dst
}
502
/// Converts packed single-precision (32-bit) floating-point elements in `a` to packed BF16
/// (16-bit) floating-point elements, and returns the results using writemask `k` (elements
/// are copied from `src` when the corresponding mask bit is not set).
///
/// The 128-bit form is emitted through inline assembly; no LLVM intrinsic binding for it is
/// declared in this file (only the 256- and 512-bit forms are).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
#[inline]
#[target_feature(enable = "avx512bf16,avx512vl")]
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
    // The destination register starts out holding `src`, which provides the values for the
    // elements whose mask bit is not set.
    let mut dst = src;
    asm!(
        // `{{` / `}}` are escaped braces: the template renders as `dst{k},src`, i.e. the
        // destination decorated with the mask register.
        "vcvtneps2bf16 {dst}{{{k}}},{src}",
        // `inlateout`: the register is read (it carries `src`) and then overwritten.
        dst = inlateout(xmm_reg) dst,
        src = in(xmm_reg) a,
        k = in(kreg) k,
        // Declared as a pure register-to-register computation: no memory access, no stack
        // use, flags left untouched.
        options(pure, nomem, nostack, preserves_flags)
    );
    dst
}
523
/// Converts packed single-precision (32-bit) floating-point elements in `a` to packed BF16
/// (16-bit) floating-point elements, and returns the results using zeromask `k` (elements
/// are zeroed out when the corresponding mask bit is not set).
///
/// The 128-bit form is emitted through inline assembly; no LLVM intrinsic binding for it is
/// declared in this file (only the 256- and 512-bit forms are).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
#[inline]
#[target_feature(enable = "avx512bf16,avx512vl")]
#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
    // Written only by the asm block below; with zero-masking there is no pass-through value
    // to preload.
    let mut dst: __m128bh;
    asm!(
        // `{{` / `}}` are escaped braces: the template renders as `dst{k}{z},src`, i.e. the
        // destination decorated with the mask register and the zeroing marker.
        "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}",
        // Output-only operand; `lateout` permits it to share a register with an input.
        dst = lateout(xmm_reg) dst,
        src = in(xmm_reg) a,
        k = in(kreg) k,
        // Declared as a pure register-to-register computation: no memory access, no stack
        // use, flags left untouched.
        options(pure, nomem, nostack, preserves_flags)
    );
    dst
}
544
545/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point
546/// element, and store the result in dst.
547///
548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
549#[inline]
550#[target_feature(enable = "avx512bf16,avx512vl")]
551#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
552pub unsafe fn _mm_cvtness_sbh(a: f32) -> bf16 {
553    let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0);
554    bf16::from_bits(value)
555}
556
557#[cfg(test)]
558mod tests {
559    use crate::core_arch::simd::u16x4;
560    use crate::{
561        core_arch::x86::*,
562        mem::{transmute, transmute_copy},
563    };
564    use stdarch_test::simd_test;
565
566    #[simd_test(enable = "avx512bf16,avx512vl")]
567    unsafe fn test_mm_cvtne2ps_pbh() {
568        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
569        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
570        let a: __m128 = transmute(a_array);
571        let b: __m128 = transmute(b_array);
572        let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
573        let result: [u16; 8] = transmute(c.as_u16x8());
574        #[rustfmt::skip]
575        let expected_result: [u16; 8] = [
576            0b1_10000110_0110010,
577            0b1_10000010_0101000,
578            0b1_10000000_1110000,
579            0b1_10000100_1001001,
580            0b0_10000110_0110010,
581            0b0_10000010_0101000,
582            0b0_10000000_1110000,
583            0b0_10000100_1001001,
584        ];
585        assert_eq!(result, expected_result);
586    }
587
588    #[simd_test(enable = "avx512bf16,avx512vl")]
589    unsafe fn test_mm_mask_cvtne2ps_pbh() {
590        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
591        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
592        #[rustfmt::skip]
593        let src_array: [u16; 8] = [
594            0b0_10000110_0110010,
595            0b0_10000010_0101000,
596            0b0_10000000_1110000,
597            0b0_10000100_1001001,
598            0b0_10000110_0110010,
599            0b0_10000010_0101000,
600            0b0_10000000_1110000,
601            0b0_10000100_1001001,
602        ];
603        let src: __m128bh = transmute(src_array);
604        let a: __m128 = transmute(a_array);
605        let b: __m128 = transmute(b_array);
606        let k: __mmask8 = 0b1111_1111;
607        let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
608        let result: [u16; 8] = transmute(c.as_u16x8());
609        #[rustfmt::skip]
610        let expected_result: [u16; 8] = [
611            0b1_10000110_0110010,
612            0b1_10000010_0101000,
613            0b1_10000000_1110000,
614            0b1_10000100_1001001,
615            0b0_10000110_0110010,
616            0b0_10000010_0101000,
617            0b0_10000000_1110000,
618            0b0_10000100_1001001,
619        ];
620        assert_eq!(result, expected_result);
621        let k = 0b0000_0000;
622        let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
623        let result: [u16; 8] = transmute(c.as_u16x8());
624        let expected_result = src_array;
625        assert_eq!(result, expected_result);
626    }
627
628    #[simd_test(enable = "avx512bf16,avx512vl")]
629    unsafe fn test_mm_maskz_cvtne2ps_pbh() {
630        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
631        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
632        let a: __m128 = transmute(a_array);
633        let b: __m128 = transmute(b_array);
634        let k: __mmask8 = 0b1111_1111;
635        let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
636        let result: [u16; 8] = transmute(c.as_u16x8());
637        #[rustfmt::skip]
638        let expected_result: [u16; 8] = [
639            0b1_10000110_0110010,
640            0b1_10000010_0101000,
641            0b1_10000000_1110000,
642            0b1_10000100_1001001,
643            0b0_10000110_0110010,
644            0b0_10000010_0101000,
645            0b0_10000000_1110000,
646            0b0_10000100_1001001,
647        ];
648        assert_eq!(result, expected_result);
649        let k = 0b0011_1100;
650        let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
651        let result: [u16; 8] = transmute(c.as_u16x8());
652        #[rustfmt::skip]
653        let expected_result: [u16; 8] = [
654            0,
655            0,
656            0b1_10000000_1110000,
657            0b1_10000100_1001001,
658            0b0_10000110_0110010,
659            0b0_10000010_0101000,
660            0,
661            0,
662        ];
663        assert_eq!(result, expected_result);
664    }
665
666    #[simd_test(enable = "avx512bf16,avx512vl")]
667    unsafe fn test_mm256_cvtne2ps_pbh() {
668        #[rustfmt::skip]
669        let a_array = [
670            178.125_f32,
671            10.5_f32,
672            3.75_f32,
673            50.25_f32,
674            16.5_f32,
675            255.11_f32,
676            1000.158_f32,
677            575.575_f32,
678        ];
679        let b_array = [
680            -178.125_f32,
681            -10.5_f32,
682            -3.75_f32,
683            -50.25_f32,
684            -16.5_f32,
685            -255.11_f32,
686            -1000.158_f32,
687            -575.575_f32,
688        ];
689        let a: __m256 = transmute(a_array);
690        let b: __m256 = transmute(b_array);
691        let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
692        let result: [u16; 16] = transmute(c.as_u16x16());
693        #[rustfmt::skip]
694        let expected_result: [u16; 16] = [
695            0b1_10000110_0110010,
696            0b1_10000010_0101000,
697            0b1_10000000_1110000,
698            0b1_10000100_1001001,
699            0b1_10000011_0000100,
700            0b1_10000110_1111111,
701            0b1_10001000_1111010,
702            0b1_10001000_0010000,
703            0b0_10000110_0110010,
704            0b0_10000010_0101000,
705            0b0_10000000_1110000,
706            0b0_10000100_1001001,
707            0b0_10000011_0000100,
708            0b0_10000110_1111111,
709            0b0_10001000_1111010,
710            0b0_10001000_0010000,
711        ];
712        assert_eq!(result, expected_result);
713    }
714
715    #[simd_test(enable = "avx512bf16,avx512vl")]
716    unsafe fn test_mm256_mask_cvtne2ps_pbh() {
717        #[rustfmt::skip]
718        let a_array = [
719            178.125_f32,
720            10.5_f32,
721            3.75_f32,
722            50.25_f32,
723            16.5_f32,
724            255.11_f32,
725            1000.158_f32,
726            575.575_f32,
727        ];
728        let b_array = [
729            -178.125_f32,
730            -10.5_f32,
731            -3.75_f32,
732            -50.25_f32,
733            -16.5_f32,
734            -255.11_f32,
735            -1000.158_f32,
736            -575.575_f32,
737        ];
738        let src_array: [u16; 16] = [
739            0b0_10000110_0110010,
740            0b0_10000010_0101000,
741            0b0_10000000_1110000,
742            0b0_10000100_1001001,
743            0b0_10000110_0110010,
744            0b0_10000010_0101000,
745            0b0_10000000_1110000,
746            0b0_10000100_1001001,
747            0b0_10000110_0110010,
748            0b0_10000010_0101000,
749            0b0_10000000_1110000,
750            0b0_10000100_1001001,
751            0b0_10000110_0110010,
752            0b0_10000010_0101000,
753            0b0_10000000_1110000,
754            0b0_10000100_1001001,
755        ];
756        let src: __m256bh = transmute(src_array);
757        let a: __m256 = transmute(a_array);
758        let b: __m256 = transmute(b_array);
759        let k: __mmask16 = 0xffff;
760        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
761        let result: [u16; 16] = transmute(c.as_u16x16());
762        #[rustfmt::skip]
763        let expected_result: [u16; 16] = [
764            0b1_10000110_0110010,
765            0b1_10000010_0101000,
766            0b1_10000000_1110000,
767            0b1_10000100_1001001,
768            0b1_10000011_0000100,
769            0b1_10000110_1111111,
770            0b1_10001000_1111010,
771            0b1_10001000_0010000,
772            0b0_10000110_0110010,
773            0b0_10000010_0101000,
774            0b0_10000000_1110000,
775            0b0_10000100_1001001,
776            0b0_10000011_0000100,
777            0b0_10000110_1111111,
778            0b0_10001000_1111010,
779            0b0_10001000_0010000,
780        ];
781        assert_eq!(result, expected_result);
782        let k: __mmask16 = 0;
783        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
784        let result: [u16; 16] = transmute(c.as_u16x16());
785        let expected_result = src_array;
786        assert_eq!(result, expected_result);
787    }
788
789    #[simd_test(enable = "avx512bf16,avx512vl")]
790    unsafe fn test_mm256_maskz_cvtne2ps_pbh() {
791        #[rustfmt::skip]
792        let a_array = [
793            178.125_f32,
794            10.5_f32,
795            3.75_f32,
796            50.25_f32,
797            16.5_f32,
798            255.11_f32,
799            1000.158_f32,
800            575.575_f32,
801        ];
802        let b_array = [
803            -178.125_f32,
804            -10.5_f32,
805            -3.75_f32,
806            -50.25_f32,
807            -16.5_f32,
808            -255.11_f32,
809            -1000.158_f32,
810            -575.575_f32,
811        ];
812        let a: __m256 = transmute(a_array);
813        let b: __m256 = transmute(b_array);
814        let k: __mmask16 = 0xffff;
815        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
816        let result: [u16; 16] = transmute(c.as_u16x16());
817        #[rustfmt::skip]
818        let expected_result: [u16; 16] = [
819            0b1_10000110_0110010,
820            0b1_10000010_0101000,
821            0b1_10000000_1110000,
822            0b1_10000100_1001001,
823            0b1_10000011_0000100,
824            0b1_10000110_1111111,
825            0b1_10001000_1111010,
826            0b1_10001000_0010000,
827            0b0_10000110_0110010,
828            0b0_10000010_0101000,
829            0b0_10000000_1110000,
830            0b0_10000100_1001001,
831            0b0_10000011_0000100,
832            0b0_10000110_1111111,
833            0b0_10001000_1111010,
834            0b0_10001000_0010000,
835        ];
836        assert_eq!(result, expected_result);
837        let k: __mmask16 = 0b0110_1100_0011_0110;
838        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
839        let result: [u16; 16] = transmute(c.as_u16x16());
840        #[rustfmt::skip]
841        let expected_result: [u16; 16] = [
842            0,
843            0b1_10000010_0101000,
844            0b1_10000000_1110000,
845            0,
846            0b1_10000011_0000100,
847            0b1_10000110_1111111,
848            0,
849            0,
850            0,
851            0,
852            0b0_10000000_1110000,
853            0b0_10000100_1001001,
854            0,
855            0b0_10000110_1111111,
856            0b0_10001000_1111010,
857            0,
858        ];
859        assert_eq!(result, expected_result);
860    }
861
862    #[simd_test(enable = "avx512bf16,avx512f")]
863    unsafe fn test_mm512_cvtne2ps_pbh() {
864        #[rustfmt::skip]
865        let a_array = [
866            178.125_f32,
867            10.5_f32,
868            3.75_f32,
869            50.25_f32,
870            16.5_f32,
871            255.11_f32,
872            1000.158_f32,
873            575.575_f32,
874            178.125_f32,
875            10.5_f32,
876            3.75_f32,
877            50.25_f32,
878            16.5_f32,
879            255.11_f32,
880            1000.158_f32,
881            575.575_f32,
882        ];
883        let b_array = [
884            -178.125_f32,
885            -10.5_f32,
886            -3.75_f32,
887            -50.25_f32,
888            -16.5_f32,
889            -255.11_f32,
890            -1000.158_f32,
891            -575.575_f32,
892            -178.125_f32,
893            -10.5_f32,
894            -3.75_f32,
895            -50.25_f32,
896            -16.5_f32,
897            -255.11_f32,
898            -1000.158_f32,
899            -575.575_f32,
900        ];
901        let a: __m512 = transmute(a_array);
902        let b: __m512 = transmute(b_array);
903        let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
904        let result: [u16; 32] = transmute(c.as_u16x32());
905        #[rustfmt::skip]
906        let expected_result: [u16; 32] = [
907            0b1_10000110_0110010,
908            0b1_10000010_0101000,
909            0b1_10000000_1110000,
910            0b1_10000100_1001001,
911            0b1_10000011_0000100,
912            0b1_10000110_1111111,
913            0b1_10001000_1111010,
914            0b1_10001000_0010000,
915            0b1_10000110_0110010,
916            0b1_10000010_0101000,
917            0b1_10000000_1110000,
918            0b1_10000100_1001001,
919            0b1_10000011_0000100,
920            0b1_10000110_1111111,
921            0b1_10001000_1111010,
922            0b1_10001000_0010000,
923            0b0_10000110_0110010,
924            0b0_10000010_0101000,
925            0b0_10000000_1110000,
926            0b0_10000100_1001001,
927            0b0_10000011_0000100,
928            0b0_10000110_1111111,
929            0b0_10001000_1111010,
930            0b0_10001000_0010000,
931            0b0_10000110_0110010,
932            0b0_10000010_0101000,
933            0b0_10000000_1110000,
934            0b0_10000100_1001001,
935            0b0_10000011_0000100,
936            0b0_10000110_1111111,
937            0b0_10001000_1111010,
938            0b0_10001000_0010000,
939        ];
940        assert_eq!(result, expected_result);
941    }
942
943    #[simd_test(enable = "avx512bf16,avx512f")]
944    unsafe fn test_mm512_mask_cvtne2ps_pbh() {
945        #[rustfmt::skip]
946        let a_array = [
947            178.125_f32,
948            10.5_f32,
949            3.75_f32,
950            50.25_f32,
951            16.5_f32,
952            255.11_f32,
953            1000.158_f32,
954            575.575_f32,
955            178.125_f32,
956            10.5_f32,
957            3.75_f32,
958            50.25_f32,
959            16.5_f32,
960            255.11_f32,
961            1000.158_f32,
962            575.575_f32,
963        ];
964        let b_array = [
965            -178.125_f32,
966            -10.5_f32,
967            -3.75_f32,
968            -50.25_f32,
969            -16.5_f32,
970            -255.11_f32,
971            -1000.158_f32,
972            -575.575_f32,
973            -178.125_f32,
974            -10.5_f32,
975            -3.75_f32,
976            -50.25_f32,
977            -16.5_f32,
978            -255.11_f32,
979            -1000.158_f32,
980            -575.575_f32,
981        ];
982        let src_array: [u16; 32] = [
983            0b0_10000110_0110010,
984            0b0_10000010_0101000,
985            0b0_10000000_1110000,
986            0b0_10000100_1001001,
987            0b0_10000110_0110010,
988            0b0_10000010_0101000,
989            0b0_10000000_1110000,
990            0b0_10000100_1001001,
991            0b0_10000110_0110010,
992            0b0_10000010_0101000,
993            0b0_10000000_1110000,
994            0b0_10000100_1001001,
995            0b0_10000110_0110010,
996            0b0_10000010_0101000,
997            0b0_10000000_1110000,
998            0b0_10000100_1001001,
999            0b0_10000110_0110010,
1000            0b0_10000010_0101000,
1001            0b0_10000000_1110000,
1002            0b0_10000100_1001001,
1003            0b0_10000110_0110010,
1004            0b0_10000010_0101000,
1005            0b0_10000000_1110000,
1006            0b0_10000100_1001001,
1007            0b0_10000110_0110010,
1008            0b0_10000010_0101000,
1009            0b0_10000000_1110000,
1010            0b0_10000100_1001001,
1011            0b0_10000110_0110010,
1012            0b0_10000010_0101000,
1013            0b0_10000000_1110000,
1014            0b0_10000100_1001001,
1015        ];
1016        let src: __m512bh = transmute(src_array);
1017        let a: __m512 = transmute(a_array);
1018        let b: __m512 = transmute(b_array);
1019        let k: __mmask32 = 0xffffffff;
1020        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
1021        let result: [u16; 32] = transmute(c.as_u16x32());
1022        #[rustfmt::skip]
1023        let expected_result: [u16; 32] = [
1024            0b1_10000110_0110010,
1025            0b1_10000010_0101000,
1026            0b1_10000000_1110000,
1027            0b1_10000100_1001001,
1028            0b1_10000011_0000100,
1029            0b1_10000110_1111111,
1030            0b1_10001000_1111010,
1031            0b1_10001000_0010000,
1032            0b1_10000110_0110010,
1033            0b1_10000010_0101000,
1034            0b1_10000000_1110000,
1035            0b1_10000100_1001001,
1036            0b1_10000011_0000100,
1037            0b1_10000110_1111111,
1038            0b1_10001000_1111010,
1039            0b1_10001000_0010000,
1040            0b0_10000110_0110010,
1041            0b0_10000010_0101000,
1042            0b0_10000000_1110000,
1043            0b0_10000100_1001001,
1044            0b0_10000011_0000100,
1045            0b0_10000110_1111111,
1046            0b0_10001000_1111010,
1047            0b0_10001000_0010000,
1048            0b0_10000110_0110010,
1049            0b0_10000010_0101000,
1050            0b0_10000000_1110000,
1051            0b0_10000100_1001001,
1052            0b0_10000011_0000100,
1053            0b0_10000110_1111111,
1054            0b0_10001000_1111010,
1055            0b0_10001000_0010000,
1056        ];
1057        assert_eq!(result, expected_result);
1058        let k: __mmask32 = 0;
1059        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
1060        let result: [u16; 32] = transmute(c.as_u16x32());
1061        let expected_result = src_array;
1062        assert_eq!(result, expected_result);
1063    }
1064
1065    #[simd_test(enable = "avx512bf16,avx512f")]
1066    unsafe fn test_mm512_maskz_cvtne2ps_pbh() {
1067        #[rustfmt::skip]
1068        let a_array = [
1069            178.125_f32,
1070            10.5_f32,
1071            3.75_f32,
1072            50.25_f32,
1073            16.5_f32,
1074            255.11_f32,
1075            1000.158_f32,
1076            575.575_f32,
1077            178.125_f32,
1078            10.5_f32,
1079            3.75_f32,
1080            50.25_f32,
1081            16.5_f32,
1082            255.11_f32,
1083            1000.158_f32,
1084            575.575_f32,
1085        ];
1086        let b_array = [
1087            -178.125_f32,
1088            -10.5_f32,
1089            -3.75_f32,
1090            -50.25_f32,
1091            -16.5_f32,
1092            -255.11_f32,
1093            -1000.158_f32,
1094            -575.575_f32,
1095            -178.125_f32,
1096            -10.5_f32,
1097            -3.75_f32,
1098            -50.25_f32,
1099            -16.5_f32,
1100            -255.11_f32,
1101            -1000.158_f32,
1102            -575.575_f32,
1103        ];
1104        let a: __m512 = transmute(a_array);
1105        let b: __m512 = transmute(b_array);
1106        let k: __mmask32 = 0xffffffff;
1107        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
1108        let result: [u16; 32] = transmute(c.as_u16x32());
1109        #[rustfmt::skip]
1110        let expected_result: [u16; 32] = [
1111            0b1_10000110_0110010,
1112            0b1_10000010_0101000,
1113            0b1_10000000_1110000,
1114            0b1_10000100_1001001,
1115            0b1_10000011_0000100,
1116            0b1_10000110_1111111,
1117            0b1_10001000_1111010,
1118            0b1_10001000_0010000,
1119            0b1_10000110_0110010,
1120            0b1_10000010_0101000,
1121            0b1_10000000_1110000,
1122            0b1_10000100_1001001,
1123            0b1_10000011_0000100,
1124            0b1_10000110_1111111,
1125            0b1_10001000_1111010,
1126            0b1_10001000_0010000,
1127            0b0_10000110_0110010,
1128            0b0_10000010_0101000,
1129            0b0_10000000_1110000,
1130            0b0_10000100_1001001,
1131            0b0_10000011_0000100,
1132            0b0_10000110_1111111,
1133            0b0_10001000_1111010,
1134            0b0_10001000_0010000,
1135            0b0_10000110_0110010,
1136            0b0_10000010_0101000,
1137            0b0_10000000_1110000,
1138            0b0_10000100_1001001,
1139            0b0_10000011_0000100,
1140            0b0_10000110_1111111,
1141            0b0_10001000_1111010,
1142            0b0_10001000_0010000,
1143        ];
1144        assert_eq!(result, expected_result);
1145        let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110;
1146        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
1147        let result: [u16; 32] = transmute(c.as_u16x32());
1148        #[rustfmt::skip]
1149        let expected_result: [u16; 32] = [
1150            0,
1151            0b1_10000010_0101000,
1152            0b1_10000000_1110000,
1153            0,
1154            0b1_10000011_0000100,
1155            0,
1156            0b1_10001000_1111010,
1157            0,
1158            0b1_10000110_0110010,
1159            0b1_10000010_0101000,
1160            0,
1161            0,
1162            0,
1163            0b1_10000110_1111111,
1164            0,
1165            0b1_10001000_0010000,
1166            0,
1167            0b0_10000010_0101000,
1168            0b0_10000000_1110000,
1169            0,
1170            0b0_10000011_0000100,
1171            0,
1172            0,
1173            0b0_10001000_0010000,
1174            0,
1175            0b0_10000010_0101000,
1176            0,
1177            0b0_10000100_1001001,
1178            0,
1179            0,
1180            0b0_10001000_1111010,
1181            0b0_10001000_0010000,
1182        ];
1183        assert_eq!(result, expected_result);
1184    }
1185
1186    #[simd_test(enable = "avx512bf16,avx512vl")]
1187    unsafe fn test_mm256_cvtneps_pbh() {
1188        #[rustfmt::skip]
1189        let a_array = [
1190            178.125_f32,
1191            10.5_f32,
1192            3.75_f32,
1193            50.25_f32,
1194            16.5_f32,
1195            255.11_f32,
1196            1000.158_f32,
1197            575.575_f32,
1198        ];
1199        let a: __m256 = transmute(a_array);
1200        let c: __m128bh = _mm256_cvtneps_pbh(a);
1201        let result: [u16; 8] = transmute(c.as_u16x8());
1202        #[rustfmt::skip]
1203        let expected_result: [u16; 8] = [
1204            0b0_10000110_0110010,
1205            0b0_10000010_0101000,
1206            0b0_10000000_1110000,
1207            0b0_10000100_1001001,
1208            0b0_10000011_0000100,
1209            0b0_10000110_1111111,
1210            0b0_10001000_1111010,
1211            0b0_10001000_0010000,
1212        ];
1213        assert_eq!(result, expected_result);
1214    }
1215
1216    #[simd_test(enable = "avx512bf16,avx512vl")]
1217    unsafe fn test_mm256_mask_cvtneps_pbh() {
1218        #[rustfmt::skip]
1219        let a_array = [
1220            178.125_f32,
1221            10.5_f32,
1222            3.75_f32,
1223            50.25_f32,
1224            16.5_f32,
1225            255.11_f32,
1226            1000.158_f32,
1227            575.575_f32,
1228        ];
1229        let src_array: [u16; 8] = [
1230            0b1_10000110_0110010,
1231            0b1_10000010_0101000,
1232            0b1_10000000_1110000,
1233            0b1_10000100_1001001,
1234            0b1_10000011_0000100,
1235            0b1_10000110_1111111,
1236            0b1_10001000_1111010,
1237            0b1_10001000_0010000,
1238        ];
1239        let src: __m128bh = transmute(src_array);
1240        let a: __m256 = transmute(a_array);
1241        let k: __mmask8 = 0xff;
1242        let b = _mm256_mask_cvtneps_pbh(src, k, a);
1243        let result: [u16; 8] = transmute(b.as_u16x8());
1244        #[rustfmt::skip]
1245        let expected_result: [u16; 8] = [
1246            0b0_10000110_0110010,
1247            0b0_10000010_0101000,
1248            0b0_10000000_1110000,
1249            0b0_10000100_1001001,
1250            0b0_10000011_0000100,
1251            0b0_10000110_1111111,
1252            0b0_10001000_1111010,
1253            0b0_10001000_0010000,
1254        ];
1255        assert_eq!(result, expected_result);
1256        let k: __mmask8 = 0x0;
1257        let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a);
1258        let result: [u16; 8] = transmute(b.as_u16x8());
1259        let expected_result: [u16; 8] = src_array;
1260        assert_eq!(result, expected_result);
1261    }
1262
1263    #[simd_test(enable = "avx512bf16,avx512vl")]
1264    unsafe fn test_mm256_maskz_cvtneps_pbh() {
1265        #[rustfmt::skip]
1266        let a_array = [
1267            178.125_f32,
1268            10.5_f32,
1269            3.75_f32,
1270            50.25_f32,
1271            16.5_f32,
1272            255.11_f32,
1273            1000.158_f32,
1274            575.575_f32,
1275        ];
1276        let a: __m256 = transmute(a_array);
1277        let k: __mmask8 = 0xff;
1278        let b = _mm256_maskz_cvtneps_pbh(k, a);
1279        let result: [u16; 8] = transmute(b.as_u16x8());
1280        #[rustfmt::skip]
1281        let expected_result: [u16; 8] = [
1282            0b0_10000110_0110010,
1283            0b0_10000010_0101000,
1284            0b0_10000000_1110000,
1285            0b0_10000100_1001001,
1286            0b0_10000011_0000100,
1287            0b0_10000110_1111111,
1288            0b0_10001000_1111010,
1289            0b0_10001000_0010000,
1290        ];
1291        assert_eq!(result, expected_result);
1292        let k: __mmask8 = 0x6;
1293        let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a);
1294        let result: [u16; 8] = transmute(b.as_u16x8());
1295        let expected_result: [u16; 8] =
1296            [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
1297        assert_eq!(result, expected_result);
1298    }
1299
1300    #[simd_test(enable = "avx512bf16,avx512f")]
1301    unsafe fn test_mm512_cvtneps_pbh() {
1302        #[rustfmt::skip]
1303        let a_array = [
1304            178.125_f32,
1305            10.5_f32,
1306            3.75_f32,
1307            50.25_f32,
1308            16.5_f32,
1309            255.11_f32,
1310            1000.158_f32,
1311            575.575_f32,
1312            178.125_f32,
1313            10.5_f32,
1314            3.75_f32,
1315            50.25_f32,
1316            16.5_f32,
1317            255.11_f32,
1318            1000.158_f32,
1319            575.575_f32,
1320        ];
1321        let a: __m512 = transmute(a_array);
1322        let c: __m256bh = _mm512_cvtneps_pbh(a);
1323        let result: [u16; 16] = transmute(c.as_u16x16());
1324        #[rustfmt::skip]
1325        let expected_result: [u16; 16] = [
1326            0b0_10000110_0110010,
1327            0b0_10000010_0101000,
1328            0b0_10000000_1110000,
1329            0b0_10000100_1001001,
1330            0b0_10000011_0000100,
1331            0b0_10000110_1111111,
1332            0b0_10001000_1111010,
1333            0b0_10001000_0010000,
1334            0b0_10000110_0110010,
1335            0b0_10000010_0101000,
1336            0b0_10000000_1110000,
1337            0b0_10000100_1001001,
1338            0b0_10000011_0000100,
1339            0b0_10000110_1111111,
1340            0b0_10001000_1111010,
1341            0b0_10001000_0010000,
1342        ];
1343        assert_eq!(result, expected_result);
1344    }
1345
1346    #[simd_test(enable = "avx512bf16,avx512f")]
1347    unsafe fn test_mm512_mask_cvtneps_pbh() {
1348        #[rustfmt::skip]
1349        let a_array = [
1350            178.125_f32,
1351            10.5_f32,
1352            3.75_f32,
1353            50.25_f32,
1354            16.5_f32,
1355            255.11_f32,
1356            1000.158_f32,
1357            575.575_f32,
1358            178.125_f32,
1359            10.5_f32,
1360            3.75_f32,
1361            50.25_f32,
1362            16.5_f32,
1363            255.11_f32,
1364            1000.158_f32,
1365            575.575_f32,
1366        ];
1367        let src_array: [u16; 16] = [
1368            0b1_10000110_0110010,
1369            0b1_10000010_0101000,
1370            0b1_10000000_1110000,
1371            0b1_10000100_1001001,
1372            0b1_10000011_0000100,
1373            0b1_10000110_1111111,
1374            0b1_10001000_1111010,
1375            0b1_10001000_0010000,
1376            0b1_10000110_0110010,
1377            0b1_10000010_0101000,
1378            0b1_10000000_1110000,
1379            0b1_10000100_1001001,
1380            0b1_10000011_0000100,
1381            0b1_10000110_1111111,
1382            0b1_10001000_1111010,
1383            0b1_10001000_0010000,
1384        ];
1385        let src: __m256bh = transmute(src_array);
1386        let a: __m512 = transmute(a_array);
1387        let k: __mmask16 = 0xffff;
1388        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1389        let result: [u16; 16] = transmute(c.as_u16x16());
1390        #[rustfmt::skip]
1391        let expected_result: [u16; 16] = [
1392            0b0_10000110_0110010,
1393            0b0_10000010_0101000,
1394            0b0_10000000_1110000,
1395            0b0_10000100_1001001,
1396            0b0_10000011_0000100,
1397            0b0_10000110_1111111,
1398            0b0_10001000_1111010,
1399            0b0_10001000_0010000,
1400            0b0_10000110_0110010,
1401            0b0_10000010_0101000,
1402            0b0_10000000_1110000,
1403            0b0_10000100_1001001,
1404            0b0_10000011_0000100,
1405            0b0_10000110_1111111,
1406            0b0_10001000_1111010,
1407            0b0_10001000_0010000,
1408        ];
1409        assert_eq!(result, expected_result);
1410        let k: __mmask16 = 0;
1411        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1412        let result: [u16; 16] = transmute(c.as_u16x16());
1413        let expected_result = src_array;
1414        assert_eq!(result, expected_result);
1415    }
1416
1417    #[simd_test(enable = "avx512bf16,avx512f")]
1418    unsafe fn test_mm512_maskz_cvtneps_pbh() {
1419        #[rustfmt::skip]
1420        let a_array = [
1421            178.125_f32,
1422            10.5_f32,
1423            3.75_f32,
1424            50.25_f32,
1425            16.5_f32,
1426            255.11_f32,
1427            1000.158_f32,
1428            575.575_f32,
1429            178.125_f32,
1430            10.5_f32,
1431            3.75_f32,
1432            50.25_f32,
1433            16.5_f32,
1434            255.11_f32,
1435            1000.158_f32,
1436            575.575_f32,
1437        ];
1438        let a: __m512 = transmute(a_array);
1439        let k: __mmask16 = 0xffff;
1440        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1441        let result: [u16; 16] = transmute(c.as_u16x16());
1442        #[rustfmt::skip]
1443        let expected_result: [u16; 16] = [
1444            0b0_10000110_0110010,
1445            0b0_10000010_0101000,
1446            0b0_10000000_1110000,
1447            0b0_10000100_1001001,
1448            0b0_10000011_0000100,
1449            0b0_10000110_1111111,
1450            0b0_10001000_1111010,
1451            0b0_10001000_0010000,
1452            0b0_10000110_0110010,
1453            0b0_10000010_0101000,
1454            0b0_10000000_1110000,
1455            0b0_10000100_1001001,
1456            0b0_10000011_0000100,
1457            0b0_10000110_1111111,
1458            0b0_10001000_1111010,
1459            0b0_10001000_0010000,
1460        ];
1461        assert_eq!(result, expected_result);
1462        let k: __mmask16 = 0x653a;
1463        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1464        let result: [u16; 16] = transmute(c.as_u16x16());
1465        #[rustfmt::skip]
1466        let expected_result: [u16; 16] = [
1467            0,
1468            0b0_10000010_0101000,
1469            0,
1470            0b0_10000100_1001001,
1471            0b0_10000011_0000100,
1472            0b0_10000110_1111111,
1473            0,
1474            0,
1475            0b0_10000110_0110010,
1476            0,
1477            0b0_10000000_1110000,
1478            0,
1479            0,
1480            0b0_10000110_1111111,
1481            0b0_10001000_1111010,
1482            0,
1483        ];
1484        assert_eq!(result, expected_result);
1485    }
1486
1487    #[simd_test(enable = "avx512bf16,avx512vl")]
1488    unsafe fn test_mm_dpbf16_ps() {
1489        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1490        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1491        let a1: __m128 = transmute(a_array);
1492        let b1: __m128 = transmute(b_array);
1493        let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
1494        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1495        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1496        let c: __m128 = _mm_dpbf16_ps(src, a, b);
1497        let result: [f32; 4] = transmute(c.as_f32x4());
1498        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1499        assert_eq!(result, expected_result);
1500    }
1501
1502    #[simd_test(enable = "avx512bf16,avx512vl")]
1503    unsafe fn test_mm_mask_dpbf16_ps() {
1504        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1505        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1506        let a1: __m128 = transmute(a_array);
1507        let b1: __m128 = transmute(b_array);
1508        let k: __mmask8 = 0xf3;
1509        let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
1510        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1511        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1512        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1513        let result: [f32; 4] = transmute(c.as_f32x4());
1514        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
1515        assert_eq!(result, expected_result);
1516        let k: __mmask8 = 0xff;
1517        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1518        let result: [f32; 4] = transmute(c.as_f32x4());
1519        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1520        assert_eq!(result, expected_result);
1521        let k: __mmask8 = 0;
1522        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1523        let result: [f32; 4] = transmute(c.as_f32x4());
1524        let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
1525        assert_eq!(result, expected_result);
1526    }
1527
1528    #[simd_test(enable = "avx512bf16,avx512vl")]
1529    unsafe fn test_mm_maskz_dpbf16_ps() {
1530        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1531        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1532        let a1: __m128 = transmute(a_array);
1533        let b1: __m128 = transmute(b_array);
1534        let k: __mmask8 = 0xf3;
1535        let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
1536        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1537        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1538        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1539        let result: [f32; 4] = transmute(c.as_f32x4());
1540        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0];
1541        assert_eq!(result, expected_result);
1542        let k: __mmask8 = 0xff;
1543        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1544        let result: [f32; 4] = transmute(c.as_f32x4());
1545        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1546        assert_eq!(result, expected_result);
1547        let k: __mmask8 = 0;
1548        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1549        let result: [f32; 4] = transmute(c.as_f32x4());
1550        let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0];
1551        assert_eq!(result, expected_result);
1552    }
1553
1554    #[simd_test(enable = "avx512bf16,avx512vl")]
1555    unsafe fn test_mm256_dpbf16_ps() {
1556        #[rustfmt::skip]
1557        let a_array = [
1558            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1559        ];
1560        let b_array = [
1561            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1562        ];
1563        let a1: __m256 = transmute(a_array);
1564        let b1: __m256 = transmute(b_array);
1565        #[rustfmt::skip]
1566        let src: __m256 = transmute([
1567            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1568        ]);
1569        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1570        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1571        let c: __m256 = _mm256_dpbf16_ps(src, a, b);
1572        let result: [f32; 8] = transmute(c.as_f32x8());
1573        #[rustfmt::skip]
1574        let expected_result: [f32; 8] = [
1575            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1576        ];
1577        assert_eq!(result, expected_result);
1578    }
1579
1580    #[simd_test(enable = "avx512bf16,avx512vl")]
1581    unsafe fn test_mm256_mask_dpbf16_ps() {
1582        #[rustfmt::skip]
1583        let a_array = [
1584            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1585        ];
1586        let b_array = [
1587            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1588        ];
1589        let a1: __m256 = transmute(a_array);
1590        let b1: __m256 = transmute(b_array);
1591        let k: __mmask8 = 0x33;
1592        #[rustfmt::skip]
1593        let src: __m256 = transmute([
1594            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1595        ]);
1596        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1597        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1598        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1599        let result: [f32; 8] = transmute(c.as_f32x8());
1600        #[rustfmt::skip]
1601        let expected_result: [f32; 8] = [
1602            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1603        ];
1604        assert_eq!(result, expected_result);
1605        let k: __mmask8 = 0xff;
1606        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1607        let result: [f32; 8] = transmute(c.as_f32x8());
1608        #[rustfmt::skip]
1609        let expected_result: [f32; 8] = [
1610            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1611        ];
1612        assert_eq!(result, expected_result);
1613        let k: __mmask8 = 0;
1614        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1615        let result: [f32; 8] = transmute(c.as_f32x8());
1616        #[rustfmt::skip]
1617        let expected_result: [f32; 8] = [
1618            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1619        ];
1620        assert_eq!(result, expected_result);
1621    }
1622
1623    #[simd_test(enable = "avx512bf16,avx512vl")]
1624    unsafe fn test_mm256_maskz_dpbf16_ps() {
1625        #[rustfmt::skip]
1626        let a_array = [
1627            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1628        ];
1629        let b_array = [
1630            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1631        ];
1632        let a1: __m256 = transmute(a_array);
1633        let b1: __m256 = transmute(b_array);
1634        let k: __mmask8 = 0x33;
1635        #[rustfmt::skip]
1636        let src: __m256 = transmute([
1637            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1638        ]);
1639        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1640        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1641        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1642        let result: [f32; 8] = transmute(c.as_f32x8());
1643        #[rustfmt::skip]
1644        let expected_result: [f32; 8] = [
1645            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1646        ];
1647        assert_eq!(result, expected_result);
1648        let k: __mmask8 = 0xff;
1649        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1650        let result: [f32; 8] = transmute(c.as_f32x8());
1651        #[rustfmt::skip]
1652        let expected_result: [f32; 8] = [
1653            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1654        ];
1655        assert_eq!(result, expected_result);
1656        let k: __mmask8 = 0;
1657        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1658        let result: [f32; 8] = transmute(c.as_f32x8());
1659        let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
1660        assert_eq!(result, expected_result);
1661    }
1662
1663    #[simd_test(enable = "avx512bf16,avx512f")]
1664    unsafe fn test_mm512_dpbf16_ps() {
1665        #[rustfmt::skip]
1666        let a_array = [
1667            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1668            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1669        ];
1670        let b_array = [
1671            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1672            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1673        ];
1674        let a1: __m512 = transmute(a_array);
1675        let b1: __m512 = transmute(b_array);
1676        let src: __m512 = transmute([
1677            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1678            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1679        ]);
1680        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1681        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1682        let c: __m512 = _mm512_dpbf16_ps(src, a, b);
1683        let result: [f32; 16] = transmute(c.as_f32x16());
1684        #[rustfmt::skip]
1685        let expected_result: [f32; 16] = [
1686            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1687            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1688        ];
1689        assert_eq!(result, expected_result);
1690    }
1691
1692    #[simd_test(enable = "avx512bf16,avx512f")]
1693    unsafe fn test_mm512_mask_dpbf16_ps() {
1694        #[rustfmt::skip]
1695        let a_array = [
1696            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1697            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1698        ];
1699        let b_array = [
1700            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1701            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1702        ];
1703        let a1: __m512 = transmute(a_array);
1704        let b1: __m512 = transmute(b_array);
1705        let k: __mmask16 = 0x3333;
1706        #[rustfmt::skip]
1707        let src: __m512 = transmute([
1708            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1709            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1710        ]);
1711        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1712        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1713        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1714        let result: [f32; 16] = transmute(c.as_f32x16());
1715        #[rustfmt::skip]
1716        let expected_result: [f32; 16] = [
1717            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1718            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1719        ];
1720        assert_eq!(result, expected_result);
1721        let k: __mmask16 = 0xffff;
1722        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1723        let result: [f32; 16] = transmute(c.as_f32x16());
1724        #[rustfmt::skip]
1725        let expected_result: [f32; 16] = [
1726            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1727            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1728        ];
1729        assert_eq!(result, expected_result);
1730        let k: __mmask16 = 0;
1731        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1732        let result: [f32; 16] = transmute(c.as_f32x16());
1733        #[rustfmt::skip]
1734        let expected_result: [f32; 16] = [
1735            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1736            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1737        ];
1738        assert_eq!(result, expected_result);
1739    }
1740
1741    #[simd_test(enable = "avx512bf16,avx512f")]
1742    unsafe fn test_mm512_maskz_dpbf16_ps() {
1743        #[rustfmt::skip]
1744        let a_array = [
1745            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1746            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1747        ];
1748        let b_array = [
1749            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1750            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1751        ];
1752        let a1: __m512 = transmute(a_array);
1753        let b1: __m512 = transmute(b_array);
1754        let k: __mmask16 = 0x3333;
1755        #[rustfmt::skip]
1756        let src: __m512 = transmute([
1757            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1758            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1759        ]);
1760        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1761        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1762        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1763        let result: [f32; 16] = transmute(c.as_f32x16());
1764        #[rustfmt::skip]
1765        let expected_result: [f32; 16] = [
1766            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32,
1767            0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1768        ];
1769        assert_eq!(result, expected_result);
1770        let k: __mmask16 = 0xffff;
1771        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1772        let result: [f32; 16] = transmute(c.as_f32x16());
1773        #[rustfmt::skip]
1774        let expected_result: [f32; 16] = [
1775            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1776            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1777        ];
1778        assert_eq!(result, expected_result);
1779        let k: __mmask16 = 0;
1780        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1781        let result: [f32; 16] = transmute(c.as_f32x16());
1782        #[rustfmt::skip]
1783        let expected_result: [f32; 16] = [
1784            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1785        ];
1786        assert_eq!(result, expected_result);
1787    }
1788
1789    const BF16_ONE: u16 = 0b0_01111111_0000000;
1790    const BF16_TWO: u16 = 0b0_10000000_0000000;
1791    const BF16_THREE: u16 = 0b0_10000000_1000000;
1792    const BF16_FOUR: u16 = 0b0_10000001_0000000;
1793    const BF16_FIVE: u16 = 0b0_10000001_0100000;
1794    const BF16_SIX: u16 = 0b0_10000001_1000000;
1795    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
1796    const BF16_EIGHT: u16 = 0b0_10000010_0000000;
1797
1798    #[simd_test(enable = "avx512bf16")]
1799    unsafe fn test_mm512_cvtpbh_ps() {
1800        let a = __m256bh([
1801            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1802            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1803        ]);
1804        let r = _mm512_cvtpbh_ps(a);
1805        let e = _mm512_setr_ps(
1806            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
1807        );
1808        assert_eq_m512(r, e);
1809    }
1810
1811    #[simd_test(enable = "avx512bf16")]
1812    unsafe fn test_mm512_mask_cvtpbh_ps() {
1813        let a = __m256bh([
1814            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1815            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1816        ]);
1817        let src = _mm512_setr_ps(
1818            9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16.,
1819        );
1820        let k = 0b1010_1010_1010_1010;
1821        let r = _mm512_mask_cvtpbh_ps(src, k, a);
1822        let e = _mm512_setr_ps(
1823            9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8.,
1824        );
1825        assert_eq_m512(r, e);
1826    }
1827
1828    #[simd_test(enable = "avx512bf16")]
1829    unsafe fn test_mm512_maskz_cvtpbh_ps() {
1830        let a = __m256bh([
1831            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1832            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1833        ]);
1834        let k = 0b1010_1010_1010_1010;
1835        let r = _mm512_maskz_cvtpbh_ps(k, a);
1836        let e = _mm512_setr_ps(
1837            0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8.,
1838        );
1839        assert_eq_m512(r, e);
1840    }
1841
1842    #[simd_test(enable = "avx512bf16,avx512vl")]
1843    unsafe fn test_mm256_cvtpbh_ps() {
1844        let a = __m128bh([
1845            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1846        ]);
1847        let r = _mm256_cvtpbh_ps(a);
1848        let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
1849        assert_eq_m256(r, e);
1850    }
1851
1852    #[simd_test(enable = "avx512bf16,avx512vl")]
1853    unsafe fn test_mm256_mask_cvtpbh_ps() {
1854        let a = __m128bh([
1855            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1856        ]);
1857        let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
1858        let k = 0b1010_1010;
1859        let r = _mm256_mask_cvtpbh_ps(src, k, a);
1860        let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.);
1861        assert_eq_m256(r, e);
1862    }
1863
1864    #[simd_test(enable = "avx512bf16,avx512vl")]
1865    unsafe fn test_mm256_maskz_cvtpbh_ps() {
1866        let a = __m128bh([
1867            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1868        ]);
1869        let k = 0b1010_1010;
1870        let r = _mm256_maskz_cvtpbh_ps(k, a);
1871        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
1872        assert_eq_m256(r, e);
1873    }
1874
1875    #[simd_test(enable = "avx512bf16,avx512vl")]
1876    unsafe fn test_mm_cvtpbh_ps() {
1877        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1878        let r = _mm_cvtpbh_ps(a);
1879        let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1880        assert_eq_m128(r, e);
1881    }
1882
1883    #[simd_test(enable = "avx512bf16,avx512vl")]
1884    unsafe fn test_mm_mask_cvtpbh_ps() {
1885        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1886        let src = _mm_setr_ps(9., 10., 11., 12.);
1887        let k = 0b1010;
1888        let r = _mm_mask_cvtpbh_ps(src, k, a);
1889        let e = _mm_setr_ps(9., 2., 11., 4.);
1890        assert_eq_m128(r, e);
1891    }
1892
1893    #[simd_test(enable = "avx512bf16,avx512vl")]
1894    unsafe fn test_mm_maskz_cvtpbh_ps() {
1895        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1896        let k = 0b1010;
1897        let r = _mm_maskz_cvtpbh_ps(k, a);
1898        let e = _mm_setr_ps(0., 2., 0., 4.);
1899        assert_eq_m128(r, e);
1900    }
1901
1902    #[simd_test(enable = "avx512bf16")]
1903    unsafe fn test_mm_cvtsbh_ss() {
1904        let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE));
1905        assert_eq!(r, 1.);
1906    }
1907
1908    #[simd_test(enable = "avx512bf16,avx512vl")]
1909    unsafe fn test_mm_cvtneps_pbh() {
1910        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1911        let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a));
1912        let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
1913        assert_eq!(r, e);
1914    }
1915
1916    #[simd_test(enable = "avx512bf16,avx512vl")]
1917    unsafe fn test_mm_mask_cvtneps_pbh() {
1918        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1919        let src = __m128bh([5, 6, 7, 8, !0, !0, !0, !0]);
1920        let k = 0b1010;
1921        let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a));
1922        let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR);
1923        assert_eq!(r, e);
1924    }
1925
1926    #[simd_test(enable = "avx512bf16,avx512vl")]
1927    unsafe fn test_mm_maskz_cvtneps_pbh() {
1928        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1929        let k = 0b1010;
1930        let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a));
1931        let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR);
1932        assert_eq!(r, e);
1933    }
1934
1935    #[simd_test(enable = "avx512bf16,avx512vl")]
1936    unsafe fn test_mm_cvtness_sbh() {
1937        let r = _mm_cvtness_sbh(1.);
1938        assert_eq!(r.to_bits(), BF16_ONE);
1939    }
1940}