/*
 * slice_contains reports whether `value` occurs in the first `size` bytes
 * of `slice`.
 *
 * slice: pointer to the bytes to search; only dereferenced when size > 0.
 * size:  number of valid bytes at `slice`.
 * value: byte to look for.
 *
 * Returns true if at least one of the `size` bytes equals `value`.
 *
 * Full 16-byte chunks are compared with NEON; the remaining (size % 16)
 * bytes are compared one at a time so that no byte at or beyond
 * slice[size] is ever read.
 */
bool slice_contains(const uint8_t *slice, size_t size, uint8_t value) {
    const uint8x16_t needle = vdupq_n_u8(value);
    size_t i = 0;

    /* Vector path: only while a whole 16-byte chunk is still in bounds. */
    for (; size - i >= 16; i += 16) {
        uint8x16_t chunk = vld1q_u8(&slice[i]);
        /* Each lane becomes 0xFF where the byte matches, 0x00 otherwise. */
        uint8x16_t eq = vceqq_u8(chunk, needle);
        /* The lane-wise maximum is non-zero iff at least one lane matched. */
        if (vmaxvq_u8(eq) != 0) {
            return true;
        }
    }

    /* Scalar tail: the last size % 16 bytes. */
    for (; i < size; i++) {
        if (slice[i] == value) {
            return true;
        }
    }
    return false;
}
import "C"
import (
"unsafe"
)
func SliceContains(data []uint8, target uint8) bool {
return bool(C.slice_contains((*C.uint8_t)(unsafe.SliceData(data)),
C.size_t(len(data)), C.uint8_t(target)))
}go test -bench=. -benchmem -cpu=1
goos: darwin
goarch: arm64
pkg: asm/simd/slice_contains
cpu: Apple M3 Pro
BenchmarkSliceContains/SliceContainsV1_(SIMD) 63256 18263 ns/op
BenchmarkSliceContains/SliceContainsV0 4813 252514 ns/op
BenchmarkSliceContains/SliceContainsCgo 56691 21913 ns/op
PASS
ok asm/simd/slice_contains 4.359s
• WORD
• BYTE
gcc main.c
otool -tvj a.out
0000000100003f24 3dc02fe1 ldr q1, [sp, #0xb0]
0000000100003f28 4e21d400 fadd.4s v0, v0, v1
0000000100003f2c 3d802be0 str q0, [sp, #0xa0]
0000000100003f30 3dc02be0 ldr q0, [sp, #0xa0]
ENDIAN!
#include "textflag.h"
// func vectorFloatAdditionV1(first, second, dst []float32)
//
// Stores first[i] + second[i] into dst[i] for every i in [0, len(dst)).
// Whole groups of four float32 values are added with one NEON instruction;
// the remaining len(dst) % 4 elements are added one at a time so that nothing
// past the end of the slices is read or written.
//
// Only len(dst) bounds the loops: len(first) and len(second) are loaded but
// never checked, so callers must pass inputs at least as long as dst.
//
// Registers:
//   R0, R2, R4  cursors into first, second, dst
//   R5          len(dst)
//   R6          len(dst) rounded down to a multiple of 4
//   R7          index of the next element to process
TEXT ·vectorFloatAdditionV1(SB), NOSPLIT, $0-72
	LDP first_base+0(FP), (R0, R1)
	LDP second_base+24(FP), (R2, R3)
	LDP dst_base+48(FP), (R4, R5)
	MOVD $0, R7
	AND $~3, R5, R6 // number of elements covered by full 4-lane vectors

vector_loop:
	CMP R6, R7
	BGE tail_loop
	VLD1 (R0), [V0.S4]
	VLD1 (R2), [V1.S4]
	WORD $0x4e21d400 // fadd.4s v0, v0, v1 (emitted as a raw encoding)
	VST1 [V0.S4], (R4)
	ADD $4, R7  // four elements consumed
	ADD $16, R4 // 4 * sizeof(float32) bytes
	ADD $16, R0
	ADD $16, R2
	B vector_loop

tail_loop:
	CMP R5, R7
	BGE done
	FMOVS (R0), F0
	FMOVS (R2), F1
	FADDS F1, F0, F0 // F0 = F0 + F1
	FMOVS F0, (R4)
	ADD $1, R7
	ADD $4, R4
	ADD $4, R0
	ADD $4, R2
	B tail_loop

done:
	RET