Vector shift
Meta
This post describes some in-progress API design work for additions to the bitslice crate.
Introduction
Bit shifts are useful. Sometimes as we go about abusing our CPUs, we want to shift lots of bits. Most CPUs support bitshifts on 64-bit words, but sometimes we want to bitshift wider vectors of bits. How do we do this efficiently?
Consider the shift associated function on the BitArray trait in the bitslice crate.
pub trait BitArray {
    /// Left shifts a by offset, pulling in bits from b.
    fn shift(a: Self, b: Self, offset: usize) -> Self {
        let mut target = Self::splat(false);
        for i in 0..Self::LEN - offset {
            target.set_bit(i, a.get_bit(i + offset));
        }
        for i in 0..offset {
            target.set_bit((Self::LEN - offset) + i, b.get_bit(i));
        }
        target
    }
}
It takes two words and puts the bits into the target, offset by the amount specified in offset.
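For example, with some 128-bit implementer of the trait (the type name B128 here is assumed, not necessarily the crate's), a call might look like:

```rust
// Hypothetical usage; B128 stands in for whichever 128-bit type
// implements BitArray.
let a = B128::splat(true);
let b = B128::splat(false);
// Bit i of the result is bit i + 3 of `a`; the top 3 bits come from `b`.
let shifted = B128::shift(a, b, 3);
```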
This will be very slow.
We can make it faster.
The general principle is to use a byte-level bit shift, and then OR the resulting bytes together.
However, we wish to use SIMD instructions to make it go faster. We want to shift wide bit vectors, of 128, 256, or 512 bits.
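As a scalar point of reference, here is a minimal sketch of that principle for a 128-bit word, one byte at a time (a hypothetical helper, mirroring what the SIMD versions below compute):

```rust
/// Scalar sketch (not part of bitslice): shift a 16-byte word by
/// `offset` bits, pulling in bits from `b`. Valid for offset < 128.
fn scalar_bit_shift_128(a: [u8; 16], b: [u8; 16], offset: usize) -> [u8; 16] {
    let byte_offset = offset / 8;
    let bit_shift = offset % 8;
    // Concatenate the inputs so indexing past the end of `a` falls into `b`.
    let mut src = [0u8; 32];
    src[..16].copy_from_slice(&a);
    src[16..].copy_from_slice(&b);
    let mut out = [0u8; 16];
    for i in 0..16 {
        let lo = src[i + byte_offset];
        let hi = src[i + byte_offset + 1];
        // Each output byte combines the low bits of one source byte
        // (shifted up) with the high bits of the next (shifted down).
        out[i] = if bit_shift == 0 {
            lo
        } else {
            (lo << bit_shift) | (hi >> (8 - bit_shift))
        };
    }
    out
}
```

The SIMD versions below do exactly this, sixteen or more bytes at a time.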
Neon
Consider Arm Neon. It is a good and simple instruction set.
128-bit
#[cfg(target_feature = "neon")]
#[inline(always)]
pub fn neon_bit_shift_128(left: &[u8; 16], right: &[u8; 16], offset: usize) -> [u8; 16] {
    use std::arch::aarch64::*;
    use std::mem;
    unsafe {
        let byte_offset = offset / 8;
        let bit_shift = (offset % 8) as i8;
        // Concatenate the inputs so one unaligned load can span both.
        let mut src = [0u8; 32];
        src[0..16].copy_from_slice(left);
        src[16..32].copy_from_slice(right);
        let src_ptr = src.as_ptr().add(byte_offset);
        // vshl with a negative count shifts right, so bit_shift - 8
        // right-shifts the neighbouring bytes into position.
        let shift_left_vec = vdupq_n_s8(bit_shift);
        let shift_right_vec = vdupq_n_s8(bit_shift - 8);
        let shifted_left = vshlq_u8(vld1q_u8(src_ptr), shift_left_vec);
        // NB: when byte_offset > 0 this second load reads past `src`;
        // only its first byte survives the vext below.
        let next_bytes = vextq_u8(vld1q_u8(src_ptr), vld1q_u8(src_ptr.add(16)), 1);
        let shifted_right = vshlq_u8(next_bytes, shift_right_vec);
        let res = vorrq_u8(shifted_left, shifted_right);
        mem::transmute(res)
    }
}
We copy the source words into a temporary buffer, perform unaligned loads, and shift the resulting vectors.
Sadly, vextq_u8 requires a constant byte index. Were it otherwise, we could extract directly from aligned input vectors and skip the temporary buffer; sadly, we cannot.
Consider how rustc 1.90.0-nightly (LLVM 20.1) compiles it.
bit_shift_demo18bit_shift_demo_128:
sub sp, sp, #0x20
ldr q0, [x1]
mov x8, sp
ldr q1, [x2]
add x8, x8, x3, lsr #3
orr w9, w3, #0xfffffff8
stp q0, q1, [sp]
dup v3.16b, w9
ldp q0, q1, [x8]
and w8, w3, #0x7
dup v2.16b, w8
ext v1.16b, v0.16b, v1.16b, #1
ushl v0.16b, v0.16b, v2.16b
ushl v1.16b, v1.16b, v3.16b
orr v0.16b, v1.16b, v0.16b
str q0, [x0]
add sp, sp, #0x20
ret
At its core, it performs two shifts and an OR.
Arm is a good and upright instruction set.
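As a sanity check, the Neon version can be compared against the scalar sketch from the introduction (a hypothetical test, assuming both functions live in the same module):

```rust
#[cfg(all(test, target_feature = "neon"))]
mod tests {
    use super::*;

    #[test]
    fn neon_matches_scalar_reference() {
        let a: [u8; 16] = core::array::from_fn(|i| (i as u8).wrapping_mul(37).wrapping_add(1));
        let b: [u8; 16] = core::array::from_fn(|i| (i as u8).wrapping_mul(91).wrapping_add(5));
        for offset in 0..128 {
            assert_eq!(neon_bit_shift_128(&a, &b, offset), scalar_bit_shift_128(a, b, offset));
        }
    }
}
```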
256-bit
Consider 256-bit.
#[cfg(target_feature = "neon")]
#[inline(always)]
pub fn neon_bit_shift_256(left: &[u8; 32], right: &[u8; 32], offset: usize) -> [u8; 32] {
    use std::arch::aarch64::*;
    unsafe {
        let byte_offset = offset / 8;
        let bit_shift = (offset % 8) as i8;
        let mut src = [0u8; 64];
        src[0..32].copy_from_slice(left);
        src[32..64].copy_from_slice(right);
        let src_ptr = src.as_ptr().add(byte_offset);
        let mut result = [0u8; 32];
        let dst_ptr = result.as_mut_ptr();
        let shift_left_vec = vdupq_n_s8(bit_shift);
        let shift_right_vec = vdupq_n_s8(bit_shift - 8);
        let vecs = [
            vld1q_u8(src_ptr),
            vld1q_u8(src_ptr.add(16)),
            vld1q_u8(src_ptr.add(32)),
        ];
        for i in 0..2 {
            let v_curr = vecs[i];
            let v_next = vecs[i + 1];
            let shifted_left = vshlq_u8(v_curr, shift_left_vec);
            let next_bytes = vextq_u8(v_curr, v_next, 1);
            let shifted_right = vshlq_u8(next_bytes, shift_right_vec);
            let res = vorrq_u8(shifted_left, shifted_right);
            vst1q_u8(dst_ptr.add(i * 16), res);
        }
        result
    }
}
As before, we concatenate the source words, and perform unaligned loads. However, now we must loop.
bit_shift_demo18bit_shift_demo_256:
ldp q0, q1, [x1]
stp q0, q1, [sp, #-80]!
ldp q0, q2, [x2]
mov x8, sp
add x8, x8, x3, lsr #3
stp xzr, xzr, [sp, #64]
orr w9, w3, #0xfffffff8
dup v5.16b, w9
stp q0, q2, [sp, #32]
ldp q0, q1, [x8]
ldr q2, [x8, #32]
and w8, w3, #0x7
dup v4.16b, w8
ext v3.16b, v0.16b, v1.16b, #1
ext v2.16b, v1.16b, v2.16b, #1
ushl v0.16b, v0.16b, v4.16b
ushl v1.16b, v1.16b, v4.16b
ushl v3.16b, v3.16b, v5.16b
ushl v2.16b, v2.16b, v5.16b
orr v0.16b, v3.16b, v0.16b
orr v1.16b, v2.16b, v1.16b
stp q0, q1, [x0]
add sp, sp, #0x50
ret
Thankfully the compiler unrolls the loop, and we are left with a pleasingly small sequence of instructions.
512-bit
512 bits is similar, but uses a loop of size 4.
#[inline(always)]
#[cfg(target_feature = "neon")]
pub fn neon_bit_shift_512(left: &[u8; 64], right: &[u8; 64], offset: usize) -> [u8; 64] {
    use std::arch::aarch64::*;
    unsafe {
        let byte_offset = offset / 8;
        let bit_shift = (offset % 8) as i8;
        let mut src = [0u8; 128];
        src[0..64].copy_from_slice(left);
        src[64..128].copy_from_slice(right);
        let src_ptr = src.as_ptr().add(byte_offset);
        let mut result = [0u8; 64];
        let dst_ptr = result.as_mut_ptr();
        let shift_left_vec = vdupq_n_s8(bit_shift);
        let shift_right_vec = vdupq_n_s8(bit_shift - 8);
        let vecs = [
            vld1q_u8(src_ptr),
            vld1q_u8(src_ptr.add(16)),
            vld1q_u8(src_ptr.add(32)),
            vld1q_u8(src_ptr.add(48)),
            vld1q_u8(src_ptr.add(64)),
        ];
        for i in 0..4 {
            let v_curr = vecs[i];
            let v_next = vecs[i + 1];
            let shifted_left = vshlq_u8(v_curr, shift_left_vec);
            let next_bytes = vextq_u8(v_curr, v_next, 1);
            let shifted_right = vshlq_u8(next_bytes, shift_right_vec);
            let res = vorrq_u8(shifted_left, shifted_right);
            vst1q_u8(dst_ptr.add(i * 16), res);
        }
        result
    }
}
shift_demo18bit_shift_demo_512:
sub sp, sp, #0x90
ldp q1, q0, [x1, #32]
mov x8, sp
add x8, x8, x3, lsr #3
and w9, w3, #0x7
stp xzr, xzr, [sp, #128]
stp q1, q0, [sp, #32]
ldp q0, q2, [x1]
stp q0, q2, [sp]
ldp q1, q0, [x2]
stp q1, q0, [sp, #64]
ldp q2, q0, [x2, #32]
stp q2, q0, [sp, #96]
dup v0.16b, w9
orr w9, w3, #0xfffffff8
ldp q1, q2, [x8]
dup v6.16b, w9
ldp q3, q5, [x8, #32]
ldr q7, [x8, #64]
ext v4.16b, v1.16b, v2.16b, #1
ushl v1.16b, v1.16b, v0.16b
ext v16.16b, v2.16b, v3.16b, #1
ext v17.16b, v3.16b, v5.16b, #1
ext v7.16b, v5.16b, v7.16b, #1
ushl v2.16b, v2.16b, v0.16b
ushl v3.16b, v3.16b, v0.16b
ushl v0.16b, v5.16b, v0.16b
ushl v4.16b, v4.16b, v6.16b
ushl v16.16b, v16.16b, v6.16b
ushl v17.16b, v17.16b, v6.16b
orr v1.16b, v4.16b, v1.16b
ushl v4.16b, v7.16b, v6.16b
orr v2.16b, v16.16b, v2.16b
orr v3.16b, v17.16b, v3.16b
orr v0.16b, v4.16b, v0.16b
stp q1, q2, [x0]
stp q3, q0, [x0, #32]
add sp, sp, #0x90
ret
Here, too, the compiler unrolls the loop, and we are left with a pleasingly small sequence of instructions.
width | instruction count |
---|---|
128 | 17 |
256 | 23 |
512 | 38 |
AVX2
Many older computers lack support for AVX-512, and thus we must support AVX2.
Unlike Neon, AVX2 lacks byte-level shift support. Therefore, we must hack it together with 64-bit shifts and masks.
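In scalar form the trick looks like this (a minimal sketch, not part of the crate): shift an entire 64-bit lane at once, then mask away the bits that leaked across byte boundaries.

```rust
/// Shift each byte of a 64-bit lane left by `bit_shift` (0..8),
/// discarding bits that would otherwise cross into the next byte.
fn shl_bytes_in_u64(lane: u64, bit_shift: u32) -> u64 {
    // After the plain 64-bit shift, the low `bit_shift` bits of every
    // byte hold bits leaked in from the byte below; the mask keeps only
    // the bits that legitimately stayed within their byte.
    let mask_byte = ((0xFFu16 << bit_shift) & 0xFF) as u64;
    let mask = mask_byte * 0x0101_0101_0101_0101; // broadcast to all 8 bytes
    (lane << bit_shift) & mask
}
```

The AVX2 versions below do the same thing on whole vectors: one variable 64-bit shift plus mask for the in-byte bits, and a second, right-shifting pass over the data one byte later for the bits pulled in from the neighbouring byte.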
128-bit
#[cfg(target_feature = "avx2")]
#[inline(always)]
pub fn avx2_bit_shift_128(left: &[u8; 16], right: &[u8; 16], offset: usize) -> [u8; 16] {
    use std::arch::x86_64::*;
    use std::mem;
    unsafe {
        let mut concatenated = [0u8; 32];
        concatenated[0..16].copy_from_slice(left);
        concatenated[16..32].copy_from_slice(right);
        let byte_shift = offset / 8;
        let bit_shift = offset % 8;
        // Shift whole 64-bit lanes left, then mask off the bits that
        // leaked across byte boundaries.
        let shifted_low_vec = {
            let low_bytes_ptr = concatenated.as_ptr().add(byte_shift);
            let data_vec = _mm_loadu_si128(low_bytes_ptr as *const _);
            let shift_count = _mm_set1_epi64x(bit_shift as i64);
            let shifted_vec = _mm_sllv_epi64(data_vec, shift_count);
            let mask_byte = (0xFFu16 << bit_shift) as u8;
            let mask_vec = _mm_set1_epi8(mask_byte as i8);
            _mm_and_si128(shifted_vec, mask_vec)
        };
        // Same again, one byte over and shifting right, to recover the
        // bits that belong in the low end of each output byte.
        let shifted_high_vec = {
            let high_bytes_ptr = concatenated.as_ptr().add(byte_shift + 1);
            let data_vec = _mm_loadu_si128(high_bytes_ptr as *const _);
            let shift_count = _mm_set1_epi64x(8 - bit_shift as i64);
            let shifted_vec = _mm_srlv_epi64(data_vec, shift_count);
            let mask_byte = 0xFFu16 >> (8 - bit_shift);
            let mask_vec = _mm_set1_epi8(mask_byte as i8);
            _mm_and_si128(shifted_vec, mask_vec)
        };
        let result_vec = _mm_or_si128(shifted_low_vec, shifted_high_vec);
        mem::transmute(result_vec)
    }
}
128-bit wide is fairly simple.
shift_demo18bit_shift_demo_128:
vmovups xmm0,XMMWORD PTR [rsi]
mov rax,rcx
and ecx,0x7
shr rax,0x3
vmovaps XMMWORD PTR [rsp-0x28],xmm0
vmovups xmm0,XMMWORD PTR [rdx]
vmovaps XMMWORD PTR [rsp-0x18],xmm0
vmovq xmm0,rcx
vmovdqu xmm1,XMMWORD PTR [rsp+rax*1-0x28]
vpsllq xmm0,xmm1,xmm0
vmovdqu xmm1,XMMWORD PTR [rsp+rax*1-0x27]
mov al,0xff
shl al,cl
vmovd xmm2,eax
mov eax,0x8
sub rax,rcx
mov ecx,0xff
vpbroadcastb xmm2,xmm2
vpand xmm0,xmm0,xmm2
vmovq xmm2,rax
shrx eax,ecx,eax
vpsrlq xmm1,xmm1,xmm2
vmovd xmm2,eax
vpbroadcastb xmm2,xmm2
vpand xmm1,xmm1,xmm2
vpor xmm0,xmm1,xmm0
vmovdqa XMMWORD PTR [rdi],xmm0
ret
256-bit
#[cfg(target_feature = "avx2")]
#[inline(always)]
pub fn avx2_bit_shift_256(left: &[u8; 32], right: &[u8; 32], offset: usize) -> [u8; 32] {
    use std::arch::x86_64::*;
    use std::mem;
    unsafe {
        let mut concatenated = [0u8; 64];
        concatenated[0..32].copy_from_slice(left);
        concatenated[32..64].copy_from_slice(right);
        let byte_shift = offset / 8;
        let bit_shift = offset % 8;
        let shifted_low_vec = {
            let low_bytes_ptr = concatenated.as_ptr().add(byte_shift);
            let data_vec = _mm256_loadu_si256(low_bytes_ptr as *const _);
            let shift_count = _mm256_set1_epi64x(bit_shift as i64);
            let shifted_vec = _mm256_sllv_epi64(data_vec, shift_count);
            let mask_byte = (0xFFu16 << bit_shift) as u8;
            let mask_vec = _mm256_set1_epi8(mask_byte as i8);
            _mm256_and_si256(shifted_vec, mask_vec)
        };
        let shifted_high_vec = {
            let high_bytes_ptr = concatenated.as_ptr().add(byte_shift + 1);
            let data_vec = _mm256_loadu_si256(high_bytes_ptr as *const _);
            let shift_count = _mm256_set1_epi64x(8 - bit_shift as i64);
            let shifted_vec = _mm256_srlv_epi64(data_vec, shift_count);
            let mask_byte = 0xFFu16 >> (8 - bit_shift);
            let mask_vec = _mm256_set1_epi8(mask_byte as i8);
            _mm256_and_si256(shifted_vec, mask_vec)
        };
        let result_vec = _mm256_or_si256(shifted_low_vec, shifted_high_vec);
        mem::transmute(result_vec)
    }
}
The 256-bit version is approximately the same; only the instruction variants differ.
shift_demo18bit_shift_demo_256:
vmovups ymm0,YMMWORD PTR [rsi]
mov rax,rcx
and ecx,0x7
shr rax,0x3
vmovups YMMWORD PTR [rsp-0x48],ymm0
vmovups ymm0,YMMWORD PTR [rdx]
mov dl,0xff
shl dl,cl
vmovups YMMWORD PTR [rsp-0x28],ymm0
vmovq xmm0,rcx
vmovdqu ymm1,YMMWORD PTR [rsp+rax*1-0x48]
vmovdqu ymm2,YMMWORD PTR [rsp+rax*1-0x47]
mov eax,0x8
sub rax,rcx
mov ecx,0xff
vpsllq ymm0,ymm1,xmm0
vmovd xmm1,edx
vpbroadcastb ymm1,xmm1
vpand ymm0,ymm0,ymm1
vmovq xmm1,rax
shrx eax,ecx,eax
vpsrlq ymm1,ymm2,xmm1
vmovd xmm2,eax
vpbroadcastb ymm2,xmm2
vpand ymm1,ymm1,ymm2
vpor ymm0,ymm1,ymm0
vmovdqa YMMWORD PTR [rdi],ymm0
vzeroupper
ret
The compiler adds a vzeroupper to zero the upper halves of the YMM registers, so that later SSE code does not pay AVX-to-SSE transition penalties; the function is otherwise essentially identical to the 128-bit version.
512-bit
#[cfg(target_feature = "avx2")]
#[inline(always)]
pub fn avx2_bit_shift_512(left: &[u8; 64], right: &[u8; 64], offset: usize) -> [u8; 64] {
    use std::arch::x86_64::*;
    unsafe {
        use std::mem::transmute;
        let mut concatenated = [0u8; 128];
        concatenated[0..64].copy_from_slice(left);
        concatenated[64..128].copy_from_slice(right);
        let byte_shift = offset / 8;
        let bit_shift = offset % 8;
        let result = [0, 1].map(|i| {
            let shifted_low_vec = {
                let low_bytes_ptr = concatenated.as_ptr().add(i * 32 + byte_shift);
                let data_vec = _mm256_loadu_si256(low_bytes_ptr as *const __m256i);
                let shift_count = _mm256_set1_epi64x(bit_shift as i64);
                let shifted_vec = _mm256_sllv_epi64(data_vec, shift_count);
                let mask_byte = (0xFFu16 << bit_shift) as u8;
                let mask_vec = _mm256_set1_epi8(mask_byte as i8);
                _mm256_and_si256(shifted_vec, mask_vec)
            };
            let shifted_high_vec = {
                let high_bytes_ptr = concatenated.as_ptr().add(i * 32 + byte_shift + 1);
                let data_vec = _mm256_loadu_si256(high_bytes_ptr as *const __m256i);
                let shift_count = _mm256_set1_epi64x(8 - bit_shift as i64);
                let shifted_vec = _mm256_srlv_epi64(data_vec, shift_count);
                let mask_byte = 0xFFu16 >> (8 - bit_shift);
                let mask_vec = _mm256_set1_epi8(mask_byte as i8);
                _mm256_and_si256(shifted_vec, mask_vec)
            };
            _mm256_or_si256(shifted_low_vec, shifted_high_vec)
        });
        transmute(result)
    }
}
512-bit requires a loop, but is otherwise similar.
shift_demo18bit_shift_demo_512:
push rax
vmovups ymm0,YMMWORD PTR [rsi]
vmovups ymm1,YMMWORD PTR [rsi+0x20]
mov rax,rcx
and ecx,0x7
mov r8b,0x8
shr rax,0x3
sub r8b,cl
vmovq xmm2,rcx
vmovups YMMWORD PTR [rsp-0x60],ymm1
vmovups YMMWORD PTR [rsp-0x80],ymm0
vmovups ymm0,YMMWORD PTR [rdx]
vmovups ymm1,YMMWORD PTR [rdx+0x20]
mov edx,0xff
shlx esi,edx,ecx
shrx edx,edx,r8d
vmovups YMMWORD PTR [rsp-0x40],ymm0
vmovups YMMWORD PTR [rsp-0x20],ymm1
vmovd xmm0,esi
mov esi,0x8
vmovd xmm1,edx
vmovdqu ymm3,YMMWORD PTR [rsp+rax*1-0x80]
vmovdqu ymm4,YMMWORD PTR [rsp+rax*1-0x7f]
vmovdqu ymm5,YMMWORD PTR [rsp+rax*1-0x60]
vmovdqu ymm6,YMMWORD PTR [rsp+rax*1-0x5f]
sub esi,ecx
vmovd xmm7,esi
vpbroadcastb ymm0,xmm0
vpbroadcastb ymm1,xmm1
vpsllq ymm3,ymm3,xmm2
vpsrlq ymm4,ymm4,xmm7
vpsllq ymm2,ymm5,xmm2
vpand ymm3,ymm3,ymm0
vpand ymm4,ymm4,ymm1
vpand ymm0,ymm2,ymm0
vpor ymm3,ymm4,ymm3
vpsrlq ymm4,ymm6,xmm7
vpand ymm1,ymm4,ymm1
vmovdqa YMMWORD PTR [rdi],ymm3
vpor ymm0,ymm1,ymm0
vmovdqa YMMWORD PTR [rdi+0x20],ymm0
pop rax
vzeroupper
ret
width | instruction count |
---|---|
128 | 27 |
256 | 28 |
512 | 43 |
AVX-512
Since AVX-512 support implies AVX2, we can keep calling the AVX2 128-bit and 256-bit versions on AVX-512 hardware; only the 512-bit case needs a dedicated implementation, as sketched below.
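A width dispatch might then look something like the following (a sketch with assumed names, not the crate's actual plumbing; avx512_bit_shift_512 is the function shown in the next listing):

```rust
// Sketch: on AVX-512 hardware only the 512-bit width needs a dedicated
// implementation; the narrower widths reuse the AVX2 functions above.
#[cfg(all(target_feature = "avx2", target_feature = "avx512f"))]
mod avx512_dispatch {
    use super::*;

    #[inline(always)]
    pub fn bit_shift_128(left: &[u8; 16], right: &[u8; 16], offset: usize) -> [u8; 16] {
        avx2_bit_shift_128(left, right, offset)
    }

    #[inline(always)]
    pub fn bit_shift_256(left: &[u8; 32], right: &[u8; 32], offset: usize) -> [u8; 32] {
        avx2_bit_shift_256(left, right, offset)
    }

    #[inline(always)]
    pub fn bit_shift_512(left: &[u8; 64], right: &[u8; 64], offset: usize) -> [u8; 64] {
        avx512_bit_shift_512(left, right, offset)
    }
}
```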
512-bit
#[cfg(target_feature = "avx512f")]
#[inline(always)]
pub fn avx512_bit_shift_512(left: &[u8; 64], right: &[u8; 64], offset: usize) -> [u8; 64] {
    use std::arch::x86_64::*;
    use std::mem;
    unsafe {
        let mut concatenated = [0u8; 128];
        concatenated[0..64].copy_from_slice(left);
        concatenated[64..128].copy_from_slice(right);
        let byte_shift = offset / 8;
        let bit_shift = offset % 8;
        let shifted_low_vec = {
            let low_bytes_ptr = concatenated.as_ptr().add(byte_shift);
            let data_vec = _mm512_loadu_si512(low_bytes_ptr as *const _);
            let shift_count = _mm512_set1_epi64(bit_shift as i64);
            let shifted_vec = _mm512_sllv_epi64(data_vec, shift_count);
            let mask_byte = (0xFFu16 << bit_shift) as u8;
            let mask_vec = _mm512_set1_epi8(mask_byte as i8);
            _mm512_and_si512(shifted_vec, mask_vec)
        };
        let shifted_high_vec = {
            let high_bytes_ptr = concatenated.as_ptr().add(byte_shift + 1);
            let data_vec = _mm512_loadu_si512(high_bytes_ptr as *const _);
            let shift_count = _mm512_set1_epi64(8 - bit_shift as i64);
            let shifted_vec = _mm512_srlv_epi64(data_vec, shift_count);
            let mask_byte = 0xFFu16 >> (8 - bit_shift);
            let mask_vec = _mm512_set1_epi8(mask_byte as i8);
            _mm512_and_si512(shifted_vec, mask_vec)
        };
        let result_vec = _mm512_or_si512(shifted_low_vec, shifted_high_vec);
        mem::transmute(result_vec)
    }
}
shift_demo18bit_shift_demo_512:
push rax
vmovups zmm0,ZMMWORD PTR [rsi]
mov rax,rcx
shr rax,0x3
and ecx,0x7
vmovups ZMMWORD PTR [rsp-0x80],zmm0
vmovups zmm0,ZMMWORD PTR [rdx]
vmovups ZMMWORD PTR [rsp-0x40],zmm0
vmovq xmm0,rcx
vmovdqu64 zmm2,ZMMWORD PTR [rsp+rax*1-0x7f]
vmovdqu64 zmm1,ZMMWORD PTR [rsp+rax*1-0x80]
mov al,0xff
shl al,cl
vpbroadcastb zmm4,eax
mov eax,0x8
sub rax,rcx
mov ecx,0xff
vmovq xmm3,rax
shrx eax,ecx,eax
vpsrlq zmm2,zmm2,xmm3
vpbroadcastb zmm3,eax
vpsllq zmm0,zmm1,xmm0
vpandd zmm2,zmm2,zmm3
vpternlogd zmm2,zmm0,zmm4,0xf8
vmovdqa64 ZMMWORD PTR [rdi],zmm2
pop rax
vzeroupper
ret
Both the source and the assembly are approximately the same as for AVX2; the instruction count is identical to the 256-bit AVX2 version, since the same structure now operates on 512-bit registers.
Constant Shifts
Often shifts are known at compile time. This permits optimizations.
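The listings below were presumably produced by something like a thin wrapper that passes a literal offset; a hypothetical harness (names assumed, not from the post) might look like:

```rust
// Hypothetical harness: with a literal offset, byte_offset and bit_shift
// fold to constants, so the compiler can emit immediate-form shifts
// (vpsllq/vpsrlq with an immediate on x86, shl/usra on Arm).
// `bit_shift_128` stands for whichever 128-bit backend is compiled in.
#[inline(never)]
pub fn bitshift_const_shift_5(left: &[u8; 16], right: &[u8; 16]) -> [u8; 16] {
    bit_shift_128(left, right, 5)
}
```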
Consider a shift of 5 bits.
shift_demo22bitshift_const_shift_5:
117c0: c5 f8 28 06 vmovaps xmm0,XMMWORD PTR [rsi]
117c4: c5 f8 29 44 24 e8 vmovaps XMMWORD PTR [rsp-0x18],xmm0
117ca: 88 54 24 f8 mov BYTE PTR [rsp-0x8],dl
117ce: c5 f9 6f 44 24 e8 vmovdqa xmm0,XMMWORD PTR [rsp-0x18]
117d4: c5 fa 6f 4c 24 e9 vmovdqu xmm1,XMMWORD PTR [rsp-0x17]
117da: c5 f9 73 f0 05 vpsllq xmm0,xmm0,0x5
117df: c5 f1 73 d1 03 vpsrlq xmm1,xmm1,0x3
117e4: c5 f9 db 05 a4 2c ff vpand xmm0,xmm0,XMMWORD PTR [rip+0xffffffffffff2ca4]
117eb: ff
117ec: c5 f1 db 0d bc 2c ff vpand xmm1,xmm1,XMMWORD PTR [rip+0xffffffffffff2cbc]
117f3: ff
117f4: c5 f1 eb c0 vpor xmm0,xmm1,xmm0
117f8: c5 f9 7f 07 vmovdqa XMMWORD PTR [rdi],xmm0
117fc: c3 ret
Arm is even more beautiful thanks to byte-level shifts and usra.
shift_demo22bitshift_const_shift_5:
ext v1.16b, v0.16b, v1.16b, #1
shl v0.16b, v0.16b, #5
usra v0.16b, v1.16b, #3
str q0, [x0]
ret
Or consider the even more degenerate case of an 8-bit shift.
shift_demo22bitshift_const_shift_8:
11800: c4 e3 79 20 c6 0f vpinsrb xmm0,xmm0,esi,0xf
11806: c5 f9 7f 07 vmovdqa XMMWORD PTR [rdi],xmm0
1180a: c3 ret
And consider a loop of shifts 1 to 5:
shift_demo11bitshift_15
ext v1.16b, v0.16b, v1.16b, #1
add v2.16b, v0.16b, v0.16b
shl v3.16b, v0.16b, #2
shl v4.16b, v0.16b, #3
shl v0.16b, v0.16b, #4
usra v2.16b, v1.16b, #7
usra v3.16b, v1.16b, #6
usra v4.16b, v1.16b, #5
usra v0.16b, v1.16b, #4
eor v1.16b, v2.16b, v3.16b
eor v0.16b, v4.16b, v0.16b
eor v0.16b, v1.16b, v0.16b
mvn v0.16b, v0.16b
str q0, [x0]
ret
Benchmarking
Bare functions as small as these cannot be meaningfully benchmarked in isolation. In real-world usage, the function would be called multiple times with different shifts on the same bits, and/or the shifts would be constant.
We benchmark as follows:
// These rely on the nightly generic_const_exprs feature for the `L * 2 + 1` array lengths.
#[inline(always)]
fn core_shift<V: BitArray, const L: usize>(left: V, center: V, right: V) -> [V; L * 2 + 1] {
    let mut acc = [V::splat(false); L * 2 + 1];
    for i in 0..L {
        acc[i] = V::shift(left, center, V::LEN - (L - i));
    }
    acc[L] = center;
    for i in 0..L {
        acc[i + L + 1] = V::shift(center, right, i + 1);
    }
    acc
}

#[inline(never)]
fn benchmark<V: BitArray, const L: usize>(
    left: &Vec<V>,
    center: &Vec<V>,
    right: &Vec<V>,
) -> ([V; L * 2 + 1], f64)
where
    [(); L * 2 + 1]:,
{
    assert_eq!(left.len(), center.len());
    assert_eq!(center.len(), right.len());
    let start = std::time::Instant::now();
    let mut acc = [V::splat(false); L * 2 + 1];
    for j in 0..center.len() {
        let results = core_shift(left[j], center[j], right[j]);
        for i in 0..L * 2 + 1 {
            acc[i] = acc[i] ^ results[i];
        }
    }
    let total_time = start.elapsed();
    (acc, total_time.as_nanos() as f64 / center.len() as f64)
}
We cut out windows from L bits to the left of center, and L bits to the right of center.
We XOR the windows of each iteration together to prevent excessive optimization.
Now, consider the numbers.
Apple M2
L | B128 ns | B256 ns | B512 ns |
---|---|---|---|
1 | 1.10 | 2.20 | 4.39 |
2 | 1.35 | 2.88 | 5.91 |
3 | 1.86 | 4.11 | 9.20 |
4 | 13.96 | 16.42 | 13.79 |
5 | 12.81 | 18.70 | 17.79 |
6 | 13.08 | 23.42 | 21.69 |
7 | 17.75 | 26.80 | 24.85 |
8 | 38.49 | 50.79 | 31.48 |
9 | 21.24 | 50.02 | 35.35 |
10 | 43.42 | 53.21 | 39.51 |
11 | 24.81 | 57.00 | 116.36 |
12 | 95.97 | 63.06 | 133.27 |
13 | 30.72 | 71.03 | 148.66 |
14 | 116.85 | 81.08 | 165.19 |
15 | 39.61 | 87.41 | 159.74 |
16 | 41.32 | 112.38 | 179.40 |
17 | 43.54 | 100.77 | 203.32 |
Zen+ (2950X)
L | B128 ns | B256 ns | B512 ns |
---|---|---|---|
1 | 12.30 | 12.31 | 15.35 |
2 | 23.39 | 23.05 | 24.19 |
3 | 34.91 | 34.96 | 36.32 |
4 | 46.64 | 47.67 | 43.71 |
5 | 57.94 | 55.03 | 50.40 |
6 | 68.56 | 68.70 | 63.54 |
7 | 79.50 | 79.36 | 82.59 |
8 | 90.57 | 87.32 | 108.37 |
9 | 102.75 | 103.13 | 120.03 |
10 | 114.19 | 132.06 | 136.73 |
11 | 162.78 | 190.03 | 143.69 |
12 | 178.39 | 206.58 | 169.57 |
13 | 190.51 | 221.45 | 184.26 |
14 | 204.37 | 241.32 | 197.38 |
15 | 219.56 | 257.24 | 218.03 |
16 | 242.83 | 274.53 | 248.15 |
17 | 261.72 | 295.75 | 482.02 |
Zen 4 (7950X)
L | B128 ns | B256 ns | B512 ns |
---|---|---|---|
1 | 8.02 | 8.40 | 9.78 |
2 | 15.87 | 16.61 | 19.47 |
3 | 23.79 | 24.87 | 28.56 |
4 | 31.50 | 32.93 | 37.86 |
5 | 39.25 | 41.06 | 48.48 |
6 | 46.99 | 49.11 | 56.17 |
7 | 54.23 | 56.76 | 66.91 |
8 | 61.76 | 64.68 | 78.17 |
9 | 69.69 | 72.80 | 85.12 |
10 | 77.50 | 81.12 | 94.09 |
11 | 92.81 | 99.53 | 106.80 |
12 | 101.38 | 111.43 | 118.82 |
13 | 110.06 | 119.22 | 126.10 |
14 | 119.45 | 130.44 | 141.43 |
15 | 145.12 | 166.63 | 146.27 |
16 | 155.99 | 167.24 | 155.59 |
17 | 166.73 | 175.70 | 164.40 |
Zen 5 (9950X)
L | B128 ns | B256 ns | B512 ns |
---|---|---|---|
1 | 5.90 | 5.93 | 7.06 |
2 | 11.66 | 11.67 | 13.88 |
3 | 17.60 | 17.58 | 19.40 |
4 | 23.19 | 23.20 | 25.38 |
5 | 28.80 | 28.82 | 31.51 |
6 | 34.41 | 34.44 | 38.02 |
7 | 39.66 | 39.71 | 44.35 |
8 | 44.75 | 44.79 | 49.26 |
9 | 50.99 | 51.07 | 56.70 |
10 | 56.68 | 56.77 | 62.75 |
11 | 76.79 | 82.66 | 68.96 |
12 | 83.58 | 89.23 | 76.01 |
13 | 91.33 | 98.23 | 82.22 |
14 | 99.13 | 105.25 | 88.75 |
15 | 120.46 | 127.64 | 97.25 |
16 | 128.54 | 132.41 | 103.29 |
17 | 137.18 | 138.21 | 109.82 |
The M2, running Arm (which, as mentioned, is a good and upright instruction set) on a powerful core, delivers respectable performance at 128 bits wide, although it has some interesting performance anomalies which we have not yet investigated, and it cannot match full-width AVX-512 on wider vectors. Zen+ has an anemic vector engine, and thus its poor performance is understandable. Zen 4 and particularly Zen 5 perform well at wider vector widths. I am amused that 256-bit is slower than 512-bit on Zen 5.
However, much of the work is redundant when the set of shifts is known at compile time. If the loop is large enough, the compiler will not unroll it and therefore cannot exploit the constant offsets.
Perhaps in future posts we will improve performance for known sets of shifts.