; Tests basics and corner cases of x86-32 and x86-64 sandboxing, using
; -Om1 in the hope that the output will remain stable.  When packing
; bundles, we try to limit to a few instructions with well known sizes
; and minimal use of registers and stack slots in the lowering sequence.

; XFAIL: filtype=asm
; RUN: %p2i -i %s --sandbox --filetype=obj --disassemble --args -Om1 \
; RUN:   -allow-externally-defined-symbols \
; RUN:   -ffunction-sections | FileCheck %s

; RUN: %p2i -i %s --sandbox --filetype=obj --disassemble --target=x8664 \
; RUN:   --args -Om1 -allow-externally-defined-symbols  \
; RUN:   -ffunction-sections | FileCheck %s --check-prefix X8664

; External callee used by the call tests below; it is only declared here,
; not defined in this file.
declare void @call_target()
; Zero-initialized internal globals of 1, 2 and 4 bytes.  The tests store
; to them through i8/i16/i32 pointers so that each store has the encoded
; size noted next to it in the test bodies.
@global_byte = internal global [1 x i8] zeroinitializer
@global_short = internal global [2 x i8] zeroinitializer
@global_int = internal global [4 x i8] zeroinitializer

; A direct call is preceded by nop padding so that the call instruction
; ends exactly at a bundle boundary: the x86-32 expectations place the
; call at offset 0x1b and the following instruction at 0x20.
; The x86-64 expectations instead look for a push of a relocated address
; inside test_direct_call (ending in 20) followed by a jmp to call_target.
define internal void @test_direct_call() {
entry:
  call void @call_target()
  ret void
}
; CHECK-LABEL: test_direct_call
; CHECK: nop
; CHECK: 1b: {{.*}} call 1c
; CHECK-NEXT: 20:
; X8664-LABEL: test_direct_call
; X8664: push {{.*}} R_X86_64_32S test_direct_call+{{.*}}20
; X8664: jmp {{.*}} call_target

; An indirect call sequence uses the right mask and register-call sequence.
; The i32 argument is converted to a function pointer and called.  For
; x86-32 the target is loaded from the stack into a register, padded with
; nops, masked with 0xffffffe0 at offset 0x1b and then called, so that the
; instruction after the call starts at the bundle boundary 0x20.  For
; x86-64 the expectations are a push of the return address, a 32-bit mask
; of the target register, an add of r15 and a jmp through the register.
define internal void @test_indirect_call(i32 %target) {
entry:
  %__1 = inttoptr i32 %target to void ()*
  call void %__1()
  ret void
}
; CHECK-LABEL: test_indirect_call
; CHECK: mov [[REG:.*]],DWORD PTR [esp
; CHECK-NEXT: nop
; CHECK: 1b: {{.*}} and [[REG]],0xffffffe0
; CHECK-NEXT: call [[REG]]
; CHECK-NEXT: 20:
; X8664-LABEL: test_indirect_call
; X8664: push {{.*}} R_X86_64_32S test_indirect_call+{{.*}}20
; X8664: {{.*}} and e[[REG:..]],0xffffffe0
; X8664: add r[[REG]],r15
; X8664: jmp r[[REG]]

; A return sequence uses the right pop / mask / jmp sequence.
; The function body is only a return.  The x86-32 expectations are three
; consecutive instructions: pop the return address into ecx, mask it with
; 0xffffffe0, and jmp through ecx.  The x86-64 expectations additionally
; look for an add of r15 to rcx before the jmp.
define internal void @test_ret() {
entry:
  ret void
}
; CHECK-LABEL: test_ret
; CHECK: pop ecx
; CHECK-NEXT: and ecx,0xffffffe0
; CHECK-NEXT: jmp ecx
; X8664-LABEL: test_ret
; X8664: pop rcx
; X8664: and ecx,0xffffffe0
; X8664: add rcx,r15
; X8664: jmp rcx

; A perfectly packed bundle should not have nops at the end.
; The leading call pushes the following code to offset 0x20.  The next
; four stores total 7+9+7+9 = 32 bytes and so fill 0x20..0x3f exactly;
; the expectations require the fifth store to follow immediately at 0x40
; with no intervening nop.
define internal void @packed_bundle() {
entry:
  call void @call_target()
  ; bundle boundary
  %addr_byte = bitcast [1 x i8]* @global_byte to i8*
  %addr_short = bitcast [2 x i8]* @global_short to i16*
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  ; bundle boundary
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  ret void
}
; CHECK-LABEL: packed_bundle
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov WORD PTR
; CHECK-NEXT: 30: {{.*}} mov BYTE PTR
; CHECK-NEXT: 37: {{.*}} mov WORD PTR
; CHECK-NEXT: 40: {{.*}} mov BYTE PTR
; CHECK-NEXT: 47: {{.*}} mov WORD PTR

; An imperfectly packed bundle should have one or more nops at the end.
; Three 9-byte stores occupy 0x20..0x3a (27 bytes).  A fourth 9-byte
; store would straddle the 0x40 boundary, so the expectations require nop
; padding starting at 0x3b and the fourth store to begin at 0x40.
define internal void @nonpacked_bundle() {
entry:
  call void @call_target()
  ; bundle boundary
  %addr_short = bitcast [2 x i8]* @global_short to i16*
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  ; nop padding
  ; bundle boundary
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  ret void
}
; CHECK-LABEL: nonpacked_bundle
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov WORD PTR
; CHECK-NEXT: 29: {{.*}} mov WORD PTR
; CHECK-NEXT: 32: {{.*}} mov WORD PTR
; CHECK-NEXT: 3b: {{.*}} nop
; CHECK: 40: {{.*}} mov WORD PTR

; A zero-byte instruction (e.g. local label definition) at a bundle
; boundary should not trigger nop padding.
; The 10-byte i32 store plus the 22-byte select lowering fill 0x20..0x3f
; exactly, so the branch target of the select (the jne to 40) lands on
; the bundle boundary.  The expectations require the final store to sit
; at 0x40 directly after the select sequence, with no nop in between.
; Note: %farg1 and %farg2 are not referenced in the body.
define internal void @label_at_boundary(i32 %arg, float %farg1, float %farg2) {
entry:
  %argi8 = trunc i32 %arg to i8
  call void @call_target()
  ; bundle boundary
  %addr_short = bitcast [2 x i8]* @global_short to i16*
  %addr_int = bitcast [4 x i8]* @global_int to i32*
  store i32 0, i32* %addr_int, align 1           ; 10-byte instruction
  %blah = select i1 true, i8 %argi8, i8 %argi8   ; 22-byte lowering sequence
  ; label is here
  store i16 0, i16* %addr_short, align 1         ; 9-byte instruction
  ret void
}
; CHECK-LABEL: label_at_boundary
; CHECK: call
; We rely on a particular 7-instruction 22-byte Om1 lowering sequence
; for select.
; CHECK-NEXT: 20: {{.*}} mov DWORD PTR
; CHECK-NEXT: 2a: {{.*}} mov {{.*}},0x1
; CHECK-NEXT: 2c: {{.*}} cmp {{.*}},0x0
; CHECK-NEXT: 2e: {{.*}} mov {{.*}},BYTE PTR
; CHECK-NEXT: 32: {{.*}} mov BYTE PTR
; CHECK-NEXT: 36: {{.*}} jne 40
; CHECK-NEXT: 38: {{.*}} mov {{.*}},BYTE PTR
; CHECK-NEXT: 3c: {{.*}} mov BYTE PTR
; CHECK-NEXT: 40: {{.*}} mov WORD PTR

; Bundle lock without padding.
; A single 9-byte store followed by the return.  The expectations require
; the pop / mask / jmp return sequence to follow the store on consecutive
; lines, i.e. with no nop inserted ahead of the locked mask/jmp pair.
define internal void @bundle_lock_without_padding() {
entry:
  %addr_short = bitcast [2 x i8]* @global_short to i16*
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  ret void
}
; CHECK-LABEL: bundle_lock_without_padding
; CHECK: mov WORD PTR
; CHECK-NEXT: pop ecx
; CHECK-NEXT: and ecx,0xffffffe0
; CHECK-NEXT: jmp ecx

; Bundle lock with padding.
; The stores (7+9+9 bytes), the stack restore (3 bytes) and the pop
; (1 byte) bring the offset to 0x3d.  The locked mask+jmp pair needs
; 5 bytes and would cross the 0x40 boundary, so the expectations require
; a nop at 0x3d and the mask to start at 0x40.
define internal void @bundle_lock_with_padding() {
entry:
  call void @call_target()
  ; bundle boundary
  %addr_byte = bitcast [1 x i8]* @global_byte to i8*
  %addr_short = bitcast [2 x i8]* @global_short to i16*
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  ret void
  ; 3 bytes to restore stack pointer
  ; 1 byte to pop ecx
  ; bundle_lock
  ; 3 bytes to mask ecx
  ; This is now 32 bytes from the beginning of the bundle, so
  ; a 3-byte nop will need to be emitted before the bundle_lock.
  ; 2 bytes to jump to ecx
  ; bundle_unlock
}
; CHECK-LABEL: bundle_lock_with_padding
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov WORD PTR
; CHECK-NEXT: 30: {{.*}} mov WORD PTR
; CHECK-NEXT: 39: {{.*}} add esp,
; CHECK-NEXT: 3c: {{.*}} pop ecx
; CHECK-NEXT: 3d: {{.*}} nop
; CHECK-NEXT: 40: {{.*}} and ecx,0xffffffe0
; CHECK-NEXT: 43: {{.*}} jmp ecx

; Bundle lock align_to_end without any padding.
; Three 9-byte stores occupy 0x20..0x3a, leaving exactly 5 bytes before
; the 0x40 boundary.  The 5-byte call therefore already ends at the
; boundary, and the expectations require it at 0x3b directly after the
; last store with no nop.
define internal void @bundle_lock_align_to_end_padding_0() {
entry:
  call void @call_target()
  ; bundle boundary
  %addr_short = bitcast [2 x i8]* @global_short to i16*
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  call void @call_target()                 ; 5-byte instruction
  ret void
}
; CHECK-LABEL: bundle_lock_align_to_end_padding_0
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov WORD PTR
; CHECK-NEXT: 29: {{.*}} mov WORD PTR
; CHECK-NEXT: 32: {{.*}} mov WORD PTR
; CHECK-NEXT: 3b: {{.*}} call

; Bundle lock align_to_end with one bunch of padding.
; Three 7-byte stores occupy 0x20..0x34.  To make the 5-byte call end at
; the 0x40 boundary it has to start at 0x3b, so the expectations require
; nop padding beginning at 0x35 and the call at 0x3b.
define internal void @bundle_lock_align_to_end_padding_1() {
entry:
  call void @call_target()
  ; bundle boundary
  %addr_byte = bitcast [1 x i8]* @global_byte to i8*
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  call void @call_target()                 ; 5-byte instruction
  ret void
}
; CHECK-LABEL: bundle_lock_align_to_end_padding_1
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov BYTE PTR
; CHECK-NEXT: 2e: {{.*}} mov BYTE PTR
; CHECK-NEXT: 35: {{.*}} nop
; CHECK: 3b: {{.*}} call

; Bundle lock align_to_end with two bunches of padding.
; The stores (7+9+9 bytes) and the 4-byte load of %target end at 0x3d.
; The locked mask+call pair (5 bytes) does not fit before 0x40, so the
; expectations require a first nop run at 0x3d up to the boundary, a
; second nop run from 0x40, and then the mask at 0x5b and the call at
; 0x5e so that the call ends at the next boundary, 0x60.
define internal void @bundle_lock_align_to_end_padding_2(i32 %target) {
entry:
  call void @call_target()
  ; bundle boundary
  %addr_byte = bitcast [1 x i8]* @global_byte to i8*
  %addr_short = bitcast [2 x i8]* @global_short to i16*
  %__1 = inttoptr i32 %target to void ()*
  store i8 0, i8* %addr_byte, align 1      ; 7-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  store i16 0, i16* %addr_short, align 1   ; 9-byte instruction
  call void %__1()
  ; 4 bytes to load %target into a register
  ; bundle_lock align_to_end
  ; 3 bytes to mask the register
  ; This is now 32 bytes from the beginning of the bundle, so
  ; a 3-byte nop will need to be emitted before the bundle_lock,
  ; followed by a 27-byte nop before the mask/call.
  ; 2 bytes to call through the register
  ; bundle_unlock
  ret void
}
; CHECK-LABEL: bundle_lock_align_to_end_padding_2
; CHECK: call
; CHECK-NEXT: 20: {{.*}} mov BYTE PTR
; CHECK-NEXT: 27: {{.*}} mov WORD PTR
; CHECK-NEXT: 30: {{.*}} mov WORD PTR
; CHECK-NEXT: 39: {{.*}} mov [[REG:.*]],DWORD PTR [esp
; CHECK-NEXT: 3d: {{.*}} nop
; CHECK: 40: {{.*}} nop
; CHECK: 5b: {{.*}} and [[REG]],0xffffffe0
; CHECK-NEXT: 5e: {{.*}} call [[REG]]

; Tests the pad_to_end bundle alignment with no padding bytes needed.
; Only x86-64 expectations are given for this test.  After the first
; call, the add (12 bytes) and trunc (10 bytes) lowerings bring the
; offset to 0x56; the second call is a 5-byte push at 0x56 plus a 5-byte
; jmp at 0x5b, ending exactly at 0x60, which is also the return address
; encoded in the push relocation.  %x and %y are not used afterwards;
; presumably they exist only to produce instructions of known size.
define internal void @bundle_lock_pad_to_end_padding_0(i32 %arg0, i32 %arg1,
                                                       i32 %arg3, i32 %arg4,
                                                       i32 %arg5, i32 %arg6) {
  call void @call_target()
  ; bundle boundary
  %x = add i32 %arg5, %arg6  ; 12 bytes
  %y = trunc i32 %x to i16   ; 10 bytes
  call void @call_target()   ; 10 bytes
  ; bundle boundary
  ret void
}
; X8664: 56: {{.*}} push {{.*}} R_X86_64_32S bundle_lock_pad_to_end_padding_0+{{.*}}60
; X8664: 5b: {{.*}} jmp {{.*}} call_target
; X8664: 60: {{.*}} add

; Tests the pad_to_end bundle alignment with 11 padding bytes needed, and some
; instructions before the call.
; Only x86-64 expectations are given for this test.  The 11-byte add
; lowering puts the push at 0x4b and the jmp at 0x50, so the call
; sequence ends at 0x55.  The expectations require nops at 0x55 and 0x5d
; to fill the remaining 11 bytes, with the next real instruction (and the
; return address in the push relocation) at 0x60.
define internal void @bundle_lock_pad_to_end_padding_11(i32 %arg0, i32 %arg1,
                                                        i32 %arg3, i32 %arg4,
                                                        i32 %arg5, i32 %arg6) {
  call void @call_target()
  ; bundle boundary
  %x = add i32 %arg5, %arg6  ; 11 bytes
  call void @call_target()   ; 10 bytes
                             ; 11 bytes of nop
  ; bundle boundary
  ret void
}
; X8664: 4b: {{.*}} push {{.*}} R_X86_64_32S bundle_lock_pad_to_end_padding_11+{{.*}}60
; X8664: 50: {{.*}} jmp {{.*}} call_target
; X8664: 55: {{.*}} nop
; X8664: 5d: {{.*}} nop
; X8664: 60: {{.*}} add

; Tests the pad_to_end bundle alignment with 22 padding bytes needed, and no
; instructions before the call.
; Only x86-64 expectations are given for this test.  The second call
; starts right at the 0x40 boundary (push at 0x40, jmp at 0x45) and ends
; at 0x4a.  The expectations require nops at 0x4a, 0x52 and 0x5a to fill
; the remaining 22 bytes, with the next real instruction (and the return
; address in the push relocation) at 0x60.
define internal void @bundle_lock_pad_to_end_padding_22(i32 %arg0, i32 %arg1,
                                                        i32 %arg3, i32 %arg4,
                                                        i32 %arg5, i32 %arg6) {
  call void @call_target()
  ; bundle boundary
  call void @call_target()   ; 10 bytes
                             ; 22 bytes of nop
  ; bundle boundary
  ret void
}
; X8664: 40: {{.*}} push {{.*}} R_X86_64_32S bundle_lock_pad_to_end_padding_22+{{.*}}60
; X8664: 45: {{.*}} jmp {{.*}} call_target
; X8664: 4a: {{.*}} nop
; X8664: 52: {{.*}} nop
; X8664: 5a: {{.*}} nop
; X8664: 60: {{.*}} add

; Stack adjustment state during an argument push sequence gets
; properly checkpointed and restored during the two passes, as
; observed by the stack adjustment for accessing stack-allocated
; variables.
; The function first calls call_target and then calls itself, passing
; %arg through.  With a 0x1c-byte frame, the expectations require %arg to
; be loaded from [esp+0x20] and stored to the outgoing argument slot at
; [esp]; a load from [esp+0x30] would indicate a stale stack adjustment.
define internal void @checkpoint_restore_stack_adjustment(i32 %arg) {
entry:
  call void @call_target()
  ; bundle boundary
  call void @checkpoint_restore_stack_adjustment(i32 %arg)
  ret void
}
; CHECK-LABEL: checkpoint_restore_stack_adjustment
; CHECK: sub esp,0x1c
; CHECK: call
; The address of %arg should be [esp+0x20], not [esp+0x30].
; CHECK-NEXT: mov [[REG:.*]],DWORD PTR [esp+0x20]
; CHECK-NEXT: mov DWORD PTR [esp],[[REG]]
; CHECK: call
; CHECK: add esp,0x1c