// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// countByte(s []byte, c byte) int
//
// countByte returns the number of bytes in s that are equal to c.
//
// Arguments and the result are accessed through FP:
//	s_base+0(FP)  pointer to the first byte of s
//	s_len+8(FP)   length of s in bytes
//	c+24(FP)      byte to count
//	ret+32(FP)    resulting count
//
// Strategy:
//	1. An empty slice returns 0 immediately.
//	2. Slices shorter than 32 bytes are counted one byte at a time (tail).
//	3. Otherwise the leading bytes are counted one at a time until the
//	   pointer is 32-byte aligned (head_loop).
//	4. Whole 32-byte chunks are counted with vector instructions (chunk_loop).
//	5. Whatever is left (fewer than 32 bytes) is counted one byte at a
//	   time (tail).
//
// Register usage:
//	R0  = pointer to the next unread byte
//	R1  = byte to search for (zero-extended)
//	R2  = bytes not yet consumed; after the chunk setup, length of the tail
//	R3  = end address of the head phase, then of the chunk phase
//	R5  = scratch (loaded byte, constant 1)
//	R6  = scratch (vector total moved to a general register)
//	R9  = scratch (alignment bits, then total size of the 32-byte chunks)
//	R11 = running count of matches (the return value)
//	V0  = c replicated into all 16 byte lanes
//	V1, V2 = the 32 input bytes of the current chunk
//	V3, V4 = per-lane match indicators for V1 and V2
//	V5  = 1 replicated into all 16 byte lanes
//	V6  = pairwise sums of the match indicators
//	V7  = number of matches in the current chunk
//	V8  = number of matches accumulated over all chunks
TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+24(FP), R1
	// R11 = number of matching bytes found so far
	MOVD	$0, R11
	// short path to handle 0-byte case
	CBZ	R2, done
	CMP	$0x20, R2
	// jump directly to tail if length < 32 (unsigned compare)
	BLO	tail
	// Test the low 5 address bits; only the flags are consumed here,
	// R9 is recomputed at chunk.
	ANDS	$0x1f, R0, R9
	// Already 32-byte aligned: no head to process
	BEQ	chunk
	// Work with not 32-byte aligned head
	// R3 = R0 rounded up to the next 32-byte boundary
	BIC	$0x1f, R0, R3
	ADD	$0x20, R3
	// Count byte by byte until R0 reaches the boundary in R3.
	// The length is at least 32 here and the head is at most 31 bytes,
	// so R2 is still at least 1 when this loop exits.
head_loop:
	// Load one byte and advance R0 by 1 (post-increment)
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	// R11++ if the loaded byte equals the byte to search
	CINC	EQ, R11, R11
	SUB	$1, R2, R2
	CMP	R0, R3
	BNE	head_loop
	// Work with 32-byte aligned chunks
chunk:
	// R9 = remaining length rounded down to a multiple of 32,
	// i.e. the number of bytes covered by whole 32-byte chunks
	BIC	$0x1f, R2, R9
	// The first chunk can also be the last:
	// fewer than 32 bytes remain, so go straight to the tail
	CBZ	R9, tail
	// R3 = end of 32-byte chunks
	ADD	R0, R9, R3
	// V5 = 1 in each of the 16 byte lanes (mask for the compare results)
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// R2 = length of tail
	SUB	R9, R2, R2
	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
	VMOV	R1, V0.B16
	// Clear the low 64-bit element of V7 and V8
	// (V8 is the accumulator; V7 is rewritten by VUADDLV in every iteration)
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	// Count the target byte in 32-byte chunk
chunk_loop:
	// Load 32 bytes into V1 and V2 and advance R0 past them
	VLD1.P	(R0), [V1.B16, V2.B16]
	// Set the loop-exit flags early; the vector instructions below leave
	// the condition flags untouched, so BNE at the bottom tests this compare.
	CMP	R0, R3
	// Each lane becomes all ones where the input byte equals the
	// byte to search, and zero elsewhere
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Clear the higher 7 bits so that every matching lane holds exactly 1
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Count the lanes that match the requested byte:
	// pairwise add folds the 32 indicators into 16 lanes, then the
	// across-lanes add sums those 16 lanes into V7
	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
	VUADDLV	V6.B16, V7
	// Accumulate the count in low 64-bit element of V8 when inside the loop
	VADD	V7, V8
	BNE	chunk_loop
	// Fold the vector total into the scalar count
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	// Nothing left after the last chunk
	CBZ	R2, done
tail:
	// Work with tail shorter than 32 bytes.
	// Every path that reaches this label has R2 >= 1, so the loop body
	// runs once before R2 is tested.
	MOVBU.P	1(R0), R5
	SUB	$1, R2, R2
	CMP	R5, R1
	// R11++ if the loaded byte equals the byte to search
	CINC	EQ, R11, R11
	CBNZ	R2, tail
done:
	// Store the total number of matches as the result
	MOVD	R11, ret+32(FP)
	RET