// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package amd64
import (
"cmd/compile/internal/gc"
"cmd/internal/obj"
"cmd/internal/obj/x86"
)
// blockcopy emits amd64 instructions that copy w bytes from the object
// addressed by n to the object addressed by ns.
//
// The address of n is placed in SI and the address of ns in DI, the implicit
// operands of the MOVS string instructions used below. CX is saved with savex
// before any copy instruction is emitted and restored with restx at the end;
// in between it serves as the REP count and as a scratch register for
// explicit load/store pairs.
//
// osrc and odst are the offsets of the source and destination (stack offsets,
// going by how the overlap check is described). They are used only to detect
// overlap: when the destination begins inside the source, the copy is emitted
// back to front with the direction flag set; otherwise it is emitted front to
// back.
func blockcopy(n, ns *gc.Node, osrc, odst, w int64) {
	// Register nodes standing for DI (destination) and SI (source).
	var dst gc.Node
	gc.Nodreg(&dst, gc.Types[gc.Tptr], x86.REG_DI)
	var src gc.Node
	gc.Nodreg(&src, gc.Types[gc.Tptr], x86.REG_SI)

	var dstAddr, srcAddr gc.Node

	// genSrc evaluates the address of n, asking for SI as the result register.
	genSrc := func() {
		gc.Agenr(n, &srcAddr, &src)
	}
	// genDst evaluates the address of ns, asking for DI as the result
	// register. A named destination gets a Gvardef first.
	genDst := func() {
		if ns.Op == gc.ONAME {
			gc.Gvardef(ns)
		}
		gc.Agenr(ns, &dstAddr, &dst)
	}

	// Evaluate the operand with the larger Ullman number first.
	if n.Ullman >= ns.Ullman {
		genSrc()
		genDst()
	} else {
		genDst()
		genSrc()
	}

	// Agenr may have picked other registers; move the addresses where the
	// string instructions expect them, then release the temporaries.
	if dstAddr.Reg != x86.REG_DI {
		gmove(&dstAddr, &dst)
	}
	if srcAddr.Reg != x86.REG_SI {
		gmove(&srcAddr, &src)
	}
	gc.Regfree(&dstAddr)
	gc.Regfree(&srcAddr)

	quads := w / 8 // number of whole 8-byte words
	tail := w % 8  // bytes left over after the quads

	var oldcx gc.Node
	var cx gc.Node
	savex(x86.REG_CX, &cx, &oldcx, nil, gc.Types[gc.TINT64])

	// moveAt emits a load from off(SI) into CX followed by a store of CX to
	// off(DI). The caller sets the Op and Type of src, dst and cx beforehand.
	moveAt := func(off int64) {
		src.Xoffset = off
		dst.Xoffset = off
		gmove(&src, &cx)
		gmove(&cx, &dst)
	}

	// The destination starts inside the source: a front-to-back copy would
	// overwrite source bytes before they are read, so copy back to front.
	backward := osrc < odst && odst < osrc+w

	if backward {
		gins(x86.ASTD, nil, nil) // set direction flag

		if tail > 0 {
			// Point at the last byte and copy the odd bytes first.
			gconreg(addptr, w-1, x86.REG_SI)
			gconreg(addptr, w-1, x86.REG_DI)
			gconreg(movptr, tail, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSB, nil, nil) // MOVB *(SI)-,*(DI)-
		}

		if quads > 0 {
			// Point at the last quad: w-8 from the start when no bytes
			// were copied, otherwise -7 from where the byte copy left
			// SI and DI.
			adjust := w - 8
			if tail > 0 {
				adjust = -7
			}
			gconreg(addptr, adjust, x86.REG_SI)
			gconreg(addptr, adjust, x86.REG_DI)
			gconreg(movptr, quads, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)-,*(DI)-
		}

		// Leave with the direction flag clear again.
		gins(x86.ACLD, nil, nil)
	} else {
		// Forward copy: whole quads first.
		switch {
		case quads > 128 || (gc.Nacl && quads >= 4):
			// More than 128 quads, or at least 4 on NaCl: use REP MOVSQ.
			gconreg(movptr, quads, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+

		case quads >= 4:
			// 4 to 128 quads: jump into the runtime's duffcopy routine.
			duff := gins(obj.ADUFFCOPY, nil, nil)
			duff.To.Type = obj.TYPE_ADDR
			duff.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))
			// 14 and 128 = magic constants: see ../../runtime/asm_amd64.s
			duff.To.Offset = 14 * (128 - quads)

		case !gc.Nacl && tail == 0:
			// We don't need the MOVSQ side-effect of updating SI and DI,
			// and issuing a sequence of MOVQs directly is faster.
			src.Op = gc.OINDREG
			dst.Op = gc.OINDREG
			for i := int64(0); i < quads; i++ {
				gmove(&src, &cx) // MOVQ x+(SI),CX
				gmove(&cx, &dst) // MOVQ CX,x+(DI)
				src.Xoffset += 8
				dst.Xoffset += 8
			}

		default:
			for i := int64(0); i < quads; i++ {
				gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+
			}
		}

		// Then the remaining tail bytes.
		srcInsideDst := odst < osrc && osrc < odst+w
		switch {
		case w < 4 || tail <= 1 || srcInsideDst:
			// Tiny copies, at most one leftover byte, or a source that
			// starts inside the destination: copy byte by byte.
			for i := int64(0); i < tail; i++ {
				gins(x86.AMOVSB, nil, nil) // MOVB *(SI)+,*(DI)+
			}

		case w < 8 || tail <= 4:
			// One or two 4-byte moves. The last one is placed at tail-4
			// so that it ends on the final byte of the tail.
			word := gc.Types[gc.TINT32]
			src.Op, dst.Op = gc.OINDREG, gc.OINDREG
			cx.Type, src.Type, dst.Type = word, word, word
			if tail > 4 {
				moveAt(0)
			}
			moveAt(tail - 4)

		default:
			// A single 8-byte move placed at tail-8 so that it ends on
			// the final byte of the tail.
			word := gc.Types[gc.TINT64]
			src.Op, dst.Op = gc.OINDREG, gc.OINDREG
			cx.Type, src.Type, dst.Type = word, word, word
			moveAt(tail - 8)
		}
	}

	restx(&cx, &oldcx)
}