// Inferno's libkern/memmove-arm.s
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-arm.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "textflag.h"

// Register roles
// --------------
// Several names alias the same physical register; the aliases are only valid
// in the phases noted below.

// TE or TS are spilled to the stack during bulk register moves.
#define TS	R0	/* destination start ("to"); walks upward in the forward copy */
#define TE	R8	/* destination end (to+n); walks downward in the backward copy */

// Warning: the linker will use R11 to synthesize certain instructions. Please
// take care and double check with objdump.
#define FROM	R11	/* source pointer */
#define N	R12	/* byte count; only read before TMP is first written */
#define TMP	R12	/* N and TMP don't overlap */
#define TMP1	R5	/* word scratch for the aligned 4-byte loops */

// Used only on the unaligned-source paths.
#define RSHIFT	R5	/* right-shift amount in bits */
#define LSHIFT	R6	/* left-shift amount in bits (32 - RSHIFT) */
#define OFFSET	R7	/* bytes needed to undo the source word-alignment at loop exit */

// Backward unaligned loop: BRn are the words read, BWn the words written.
// Each BWn deliberately aliases BR(n+1) so a source word is consumed just
// before its register is overwritten with the merged result.
#define BR0	R0	/* shared with TS */
#define BW0	R1
#define BR1	R1
#define BW1	R2
#define BR2	R2
#define BW2	R3
#define BR3	R3
#define BW3	R4

// Forward unaligned loop: FRn are the words read, FWn the words written.
#define FW0	R1
#define FR0	R2
#define FW1	R2
#define FR1	R3
#define FW2	R3
#define FR2	R4
#define FW3	R4
#define FR3	R8	/* shared with TE */

// runtime·memmove copies n bytes from "from" to "to".
//
// Arguments are read from the caller's frame: to+0(FP), from+4(FP), n+8(FP)
// (12 bytes of arguments). The 4-byte local frame holds a single spill slot,
// addressed as savedts-4(SP) or savedte-4(SP) depending on copy direction.
// On return R0 is reloaded with the original "to" argument.
//
// Direction: when to <= from (unsigned) the copy runs upward from the start
// of both buffers; otherwise it runs downward from the ends, so a destination
// that lies above the source is never read after it has been overwritten.
//
// Strategy in each direction:
//   1. n < 4: plain byte loop.
//   2. Copy single bytes until the destination pointer is 4-byte aligned.
//   3. Source also aligned: 32-byte multi-register blocks, then single words,
//      then trailing bytes.
//   4. Source not aligned: read aligned source words and merge neighbouring
//      words with shifts to produce aligned 16-byte destination blocks, then
//      finish with the byte loop.
//
// Registers written: R0-R8, R11, R12 and the condition flags.
TEXT runtime·memmove(SB), NOSPLIT, $4-12
_memmove:
	MOVW	to+0(FP), TS
	MOVW	from+4(FP), FROM
	MOVW	n+8(FP), N

	ADD	N, TS, TE		/* to end pointer */

	CMP	FROM, TS
	BLS	_forward		/* to <= from (unsigned): copy upward */

	// ---- backward copy: TE and FROM walk down from the buffer ends ----
_back:
	ADD	N, FROM			/* from end pointer */
	CMP	$4, N			/* need at least 4 bytes to copy */
	BLO	_b1tail			/* unsigned: n is a length, not a signed value */

_b4align:				/* align destination on 4 */
	AND.S	$3, TE, TMP
	BEQ	_b4aligned

	MOVBU.W	-1(FROM), TMP		/* pre-indexed */
	MOVBU.W	TMP, -1(TE)		/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, FROM, TMP		/* TMP = FROM & 3, consumed by _bunaligned */
	BNE	_bunaligned

	ADD	$31, TS, TMP		/* do 32-byte chunks if possible */
	MOVW	TS, savedts-4(SP)	/* R0 (TS) is clobbered by the block move */
_b32loop:
	CMP	TMP, TE
	BLS	_b4tail			/* fewer than 32 bytes left */

	MOVM.DB.W (FROM), [R0-R7]
	MOVM.DB.W [R0-R7], (TE)
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	MOVW	savedts-4(SP), TS
	ADD	$3, TS, TMP
_b4loop:
	CMP	TMP, TE
	BLS	_b1tail			/* fewer than 4 bytes left */

	MOVW.W	-4(FROM), TMP1		/* pre-indexed */
	MOVW.W	TMP1, -4(TE)		/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	TE, TS
	BEQ	_return

	MOVBU.W	-1(FROM), TMP		/* pre-indexed */
	MOVBU.W	TMP, -1(TE)		/* pre-indexed */
	B	_b1tail

	// ---- forward copy: TS and FROM walk up from the buffer starts ----
_forward:
	CMP	$4, N			/* need at least 4 bytes to copy */
	BLO	_f1tail			/* unsigned: n is a length, not a signed value */

_f4align:				/* align destination on 4 */
	AND.S	$3, TS, TMP
	BEQ	_f4aligned

	MOVBU.P	1(FROM), TMP		/* implicit write back */
	MOVBU.P	TMP, 1(TS)		/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, FROM, TMP		/* TMP = FROM & 3, consumed by _funaligned */
	BNE	_funaligned

	SUB	$31, TE, TMP		/* do 32-byte chunks if possible */
	MOVW	TE, savedte-4(SP)	/* R8 (TE) is clobbered by the block move */
_f32loop:
	CMP	TMP, TS
	BHS	_f4tail			/* fewer than 32 bytes left */

	MOVM.IA.W (FROM), [R1-R8]
	MOVM.IA.W [R1-R8], (TS)
	B	_f32loop

_f4tail:
	MOVW	savedte-4(SP), TE
	SUB	$3, TE, TMP		/* do remaining words if possible */
_f4loop:
	CMP	TMP, TS
	BHS	_f1tail			/* fewer than 4 bytes left */

	MOVW.P	4(FROM), TMP1		/* implicit write back */
	MOVW.P	TMP1, 4(TS)		/* implicit write back */
	B	_f4loop

_f1tail:				/* remaining bytes */
	CMP	TS, TE
	BEQ	_return

	MOVBU.P	1(FROM), TMP		/* implicit write back */
	MOVBU.P	TMP, 1(TS)		/* implicit write back */
	B	_f1tail

_return:
	MOVW	to+0(FP), R0		/* leave the original destination in R0 */
	RET

	// ---- backward copy, destination aligned but source is not ----
	// On entry TMP = FROM & 3 (1, 2 or 3). Pick the shift pair that
	// merges two adjacent aligned source words into one destination
	// word, and OFFSET = FROM & 3 so the rounding-down done by BIC
	// below can be undone in _bu1tail.
_bunaligned:
	CMP	$2, TMP			/* is TMP < 2 ? */

	MOVW.LT	$8, RSHIFT		/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, LSHIFT
	MOVW.LT	$1, OFFSET

	MOVW.EQ	$16, RSHIFT		/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, LSHIFT
	MOVW.EQ	$2, OFFSET

	MOVW.GT	$24, RSHIFT		/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, LSHIFT
	MOVW.GT	$3, OFFSET

	ADD	$16, TS, TMP		/* do 16-byte chunks if possible */
	CMP	TMP, TE
	BLS	_b1tail			/* 16 bytes or fewer: FROM untouched, use byte loop */

	BIC	$3, FROM		/* align source */
	MOVW	TS, savedts-4(SP)	/* R0 (TS) doubles as BR0 inside the loop */
	MOVW	(FROM), BR0		/* prime first block register */

_bu16loop:
	CMP	TMP, TE
	BLS	_bu1tail

	/* the high part of the top output word comes from the previous BR0 */
	MOVW	BR0<<LSHIFT, BW3
	MOVM.DB.W (FROM), [BR0-BR3]
	ORR	BR3>>RSHIFT, BW3

	MOVW	BR3<<LSHIFT, BW2
	ORR	BR2>>RSHIFT, BW2

	MOVW	BR2<<LSHIFT, BW1
	ORR	BR1>>RSHIFT, BW1

	MOVW	BR1<<LSHIFT, BW0
	ORR	BR0>>RSHIFT, BW0

	MOVM.DB.W [BW0-BW3], (TE)
	B	_bu16loop

_bu1tail:
	MOVW	savedts-4(SP), TS
	ADD	OFFSET, FROM		/* restore the byte-exact source position */
	B	_b1tail

	// ---- forward copy, destination aligned but source is not ----
	// On entry TMP = FROM & 3 (1, 2 or 3). OFFSET = 4 - (FROM & 3):
	// after BIC rounds FROM down and the priming load advances it by 4,
	// FROM is that many bytes ahead of the true source position.
_funaligned:
	CMP	$2, TMP

	MOVW.LT	$8, RSHIFT		/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, LSHIFT
	MOVW.LT	$3, OFFSET

	MOVW.EQ	$16, RSHIFT		/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, LSHIFT
	MOVW.EQ	$2, OFFSET

	MOVW.GT	$24, RSHIFT		/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, LSHIFT
	MOVW.GT	$1, OFFSET

	SUB	$16, TE, TMP		/* do 16-byte chunks if possible */
	CMP	TMP, TS
	BHS	_f1tail			/* 16 bytes or fewer: FROM untouched, use byte loop */

	BIC	$3, FROM		/* align source */
	MOVW	TE, savedte-4(SP)	/* R8 (TE) doubles as FR3 inside the loop */
	MOVW.P	4(FROM), FR3		/* prime last block register, implicit write back */

_fu16loop:
	CMP	TMP, TS
	BHS	_fu1tail

	/* the low part of the first output word comes from the previous FR3 */
	MOVW	FR3>>RSHIFT, FW0
	MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
	ORR	FR0<<LSHIFT, FW0

	MOVW	FR0>>RSHIFT, FW1
	ORR	FR1<<LSHIFT, FW1

	MOVW	FR1>>RSHIFT, FW2
	ORR	FR2<<LSHIFT, FW2

	MOVW	FR2>>RSHIFT, FW3
	ORR	FR3<<LSHIFT, FW3

	MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
	B	_fu16loop

_fu1tail:
	MOVW	savedte-4(SP), TE
	SUB	OFFSET, FROM		/* restore the byte-exact source position */
	B	_f1tail