;; -*- fundamental -*-
;; -----------------------------------------------------------------------
;;
;;   Copyright 1994-2008 H. Peter Anvin - All Rights Reserved
;;   Copyright 2009 Intel Corporation; author: H. Peter Anvin
;;
;;   This program is free software; you can redistribute it and/or modify
;;   it under the terms of the GNU General Public License as published by
;;   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
;;   Boston MA 02111-1307, USA; either version 2 of the License, or
;;   (at your option) any later version; incorporated herein by reference.
;;
;; -----------------------------------------------------------------------

;;
;; init16.asm
;;
;; Routine to initialize and to trampoline into 32-bit
;; protected memory.  This code is derived from bcopy32.inc and
;; com32.inc in the main SYSLINUX distribution.
;;

%include '../version.gen'

; Real-mode segment this code copies itself to (see "start"), and the
; linear address of that segment.
MY_CS		equ 0x0800		; Segment address to use
CS_BASE		equ (MY_CS << 4)	; Corresponding address

; Low memory bounce buffer
; (segment 0x1000 paragraphs, i.e. 64K, above MY_CS)
BOUNCE_SEG	equ (MY_CS+0x1000)

; Set nonzero to assemble try_wbinvd and the calls to it in the A20 routines
%define DO_WBINVD 0

		; Declare every section up front together with its alignment
		section .rodata align=16
		section .data   align=16
		section .bss    align=16
		section .stack	align=16 nobits
; 512-byte real-mode stack; "start" loads ESP with stack_end
stack		resb 512
stack_end	equ $

;; -----------------------------------------------------------------------
;;  Kernel image header
;; -----------------------------------------------------------------------

		section .text		; Must be first in image
		bits 16

; First 512 bytes (the "boot sector"): a 497-byte command-line buffer
; followed by the legacy header fields.  This places setup_sects at
; offset 0x1F1 and boot_flag at offset 0x1FE.
; NOTE(review): field names appear to follow the Linux/x86 boot protocol
; setup header; the layout must stay byte-exact.
cmdline		times 497 db 0		; We put the command line here
setup_sects	db 0			; Read by "start"; presumably patched at build time - confirm
root_flags	dw 0
syssize		dw 0
swap_dev	dw 0
ram_size	dw 0
vid_mode	dw 0
root_dev	dw 0
boot_flag	dw 0xAA55

; Offset 0x200: entry point, immediately followed by the "HdrS" header
_start:		jmp short start

		db "HdrS"		; Header signature
		dw 0x0203		; Header version number

realmode_swtch	dw 0, 0			; default_switch, SETUPSEG
start_sys_seg	dw 0x1000		; obsolete
version_ptr	dw memdisk_version-0x200	; version string ptr
type_of_loader	db 0			; Filled in by boot loader
loadflags	db 1			; Please load high
setup_move_size	dw 0			; Unused
code32_start	dd 0x100000		; 32-bit start address
ramdisk_image	dd 0			; Loaded ramdisk image address
ramdisk_size	dd 0			; Size of loaded ramdisk
bootsect_kludge	dw 0, 0
heap_end_ptr	dw 0
pad1		dw 0
cmd_line_ptr	dd 0			; Command line
ramdisk_max	dd 0xffffffff		; Highest allowed ramdisk address

;
; These fields aren't real setup fields, they're poked in by the
; 32-bit code.
;
; They are read back (through CS) by the hand-off code that follows the
; "call init32" to load the registers for the boot sector.
;
b_esdi		dd 0			; ES:DI for boot sector invocation
b_edx		dd 0			; EDX for boot sector invocation
b_sssp		dd 0			; SS:SP on boot sector invocation
b_csip		dd 0			; CS:IP on boot sector invocation

		section .rodata
; NUL-terminated version banner; version_ptr in the header refers to it
; (as an offset relative to 0x200).  VERSION_STR and DATE are not defined
; in this file - presumably they come from the included version.gen.
memdisk_version:
		db "MEMDISK ", VERSION_STR, " ", DATE, 0

;; -----------------------------------------------------------------------
;;  End kernel image header
;; -----------------------------------------------------------------------

;
; Move ourselves down into memory to reduce the risk of conflicts;
; then canonicalize CS to match the other segments.
;
		section .text
		bits 16
;
; start: reached from the short jump at _start.  Copies the boot sector
; plus setup_sects setup sectors from DS:0 to MY_CS:0, moves DS and
; SS:ESP over to the copy, then far-jumps so that CS == MY_CS too.
; FS is left at 0 and ES at MY_CS.
;
start:
		cld				; Copy upwards
		xor si,si			; Source offset 0 (loaded image)
		xor di,di			; Destination offset 0 (MY_CS)
		mov fs,si			; fs <- 0
		mov ax,MY_CS
		mov es,ax			; Destination segment
		movzx cx,byte [setup_sects]	; Still read through the loader's DS
		inc cx				; Add one for the boot sector
		shl cx,7			; 512-byte sectors -> dwords
		rep movsd
		mov ss,ax			; Stack lives in the new segment;
		mov esp,stack_end		; ESP loaded right after SS
		mov ds,ax
		jmp MY_CS:.relocated		; Reload CS, continue in the copy
.relocated:

;
; Copy the command line, if there is one
;
;
; Copies the NUL-terminated string at linear address [cmd_line_ptr] to
; ES:0 (the "cmdline" buffer), at most 496 bytes plus a terminating NUL.
; A null pointer yields an empty string.  Falls through to the code below.
; Uses EAX, CX, SI, DI and GS.
;
copy_cmdline:
		xor di,di		; Destination: bottom of our own segment
		mov eax,[cmd_line_ptr]
		test eax,eax
		jz .terminate		; No command line
		mov si,ax
		and si,0x000F		; si <- offset within the paragraph
		shr eax,4		; eax <- paragraph (segment) number
		mov gs,ax		; gs:si -> source string
		mov cx,496		; Max number of bytes
.next_char:
		gs lodsb
		test al,al
		jz .terminate		; Hit the source NUL
		stosb
		loop .next_char
.terminate:
		xor al,al
		stosb			; Always NUL-terminate the copy

;
; Now jump to 32-bit code
;
		sti
		call init32
;
; Once init32 comes back everything has been set up and the new boot
; sector has been loaded, so all that is left is to run it.
;
; The register values to hand over were poked into the b_* fields of
; the setup area; they are fetched through CS.
;
		mov edx,[cs:b_edx]
		movzx edi,word [cs:b_esdi]	; es:di <- b_esdi
		mov es,word [cs:b_esdi+2]

		cli
		xor esi,esi		; No partition table involved
		mov gs,si		; Make all the segments consistent
		mov fs,si
		mov ds,si
		lss sp,[cs:b_sssp]	; ss:sp <- b_sssp
		movzx esp,sp		; Clear the upper half of ESP
		jmp far [cs:b_csip]	; Enter the boot sector

;
; We enter protected mode, set up a flat 32-bit environment, run rep movsd
; and then exit.  IMPORTANT: This code assumes cs == MY_CS.
;
; This code is probably excessively anal-retentive in its handling of
; segments, but this stuff is painful enough as it is without having to rely
; on everything happening "as it ought to."
;
DummyTSS	equ  0x580		; Hopefully safe place in low memory

		section .data

	; desc base, limit, flags
	;
	; Emits one 8-byte segment descriptor as two dwords.
	;   %1 = 32-bit base address
	;   %2 = 20-bit limit
	;   %3 = flags: bits 0-7 become descriptor byte 5 (access byte),
	;        bits 12-15 become the high nibble of descriptor byte 6
%macro	desc 3
	dd (%2 & 0xffff) | ((%1 & 0xffff) << 16)
	dd (%1 & 0xff000000) | (%2 & 0xf0000) | ((%3 & 0xf0ff) << 8) | ((%1 & 0x00ff0000) >> 16)
%endmacro

		align 8, db 0
; GDT used by the protected-mode trampoline.  The otherwise unused null
; descriptor slot (selector 0) holds the limit/base operand for LGDT,
; padded to 8 bytes.  The base is a linear address, hence +CS_BASE.
; NOTE(review): .adj1 is not referenced in this file; presumably it marks
; a field to adjust if the real-mode code is relocated - confirm.
call32_gdt:	dw call32_gdt_size-1	; Null descriptor - contains GDT
.adj1:		dd call32_gdt+CS_BASE	; pointer for LGDT instruction
		dw 0

		; 0008: Dummy TSS to make Intel VT happy
		; Should never be actually accessed...
		; (call32_enter_pm rewrites its access byte to 89h before LTR)
		desc DummyTSS, 103, 0x8089

		; 0010: Code segment, use16, readable, dpl 0, base CS_BASE, 64K
		desc CS_BASE, 0xffff, 0x009b

		; 0018: Data segment, use16, read/write, dpl 0, base CS_BASE, 64K
		desc CS_BASE, 0xffff, 0x0093

		; 0020: Code segment, use32, readable, dpl 0, base 0, 4G
		desc 0, 0xfffff, 0xc09b

		; 0028: Data segment, use32, read/write, dpl 0, base 0, 4G
		desc 0, 0xfffff, 0xc093

call32_gdt_size:	equ $-call32_gdt

; NUL-terminated message printed by print_err when enable_a20 gives up
err_a20:	db 'ERROR: A20 gate not responding!',13,10,0

		section .bss
		alignb 4
Return		resd 1			; Return value (not referenced in this file)
SavedSP		resw 1			; Place to save SP (real-mode SP while in PM)
A20Tries	resb 1			; Retry countdown used by enable_a20

		section .data
		align 4, db 0
; NOTE(review): Target/Target_Seg are not referenced in this file.
Target		dd 0			; Target address
Target_Seg	dw 20h			; Target CS

; One of the A20_* method codes.  Stored with byte writes but loaded as a
; word to index A20List/A20DList, so the high byte must remain zero.
A20Type		dw 0			; Default = unknown

		section .text
		bits 16
;
; Routines to enable and disable (yuck) A20.  These routines are gathered
; from tips from a couple of sources, including the Linux kernel and
; http://www.x86.org/.  The need for the delay to be as large as given here
; is indicated by Donnie Barnes of RedHat, the problematic system being an
; IBM ThinkPad 760EL.
;
; We typically toggle A20 twice for every 64K transferred.
;
%define	io_delay	call _io_delay	; "io_delay" as a statement = short I/O pause
%define IO_DELAY_PORT	80h		; Invalid port (we hope!)
%define disable_wait	32		; How long to wait for a disable
					; (number of a20_test polls in a20d_snooze)

; Values of A20Type; each is also the index of the matching handler in
; A20List and A20DList, so the order here must match those tables.
%define A20_DUNNO	0		; A20 type unknown
%define A20_NONE	1		; A20 always on?
%define A20_BIOS	2		; A20 BIOS enable
%define A20_KBC		3		; A20 through KBC
%define A20_FAST	4		; A20 through port 92h

		align 2, db 0
; Jump tables indexed by A20Type: enable handlers, then disable handlers.
A20List		dw a20_dunno, a20_none, a20_bios, a20_kbc, a20_fast
A20DList	dw a20d_dunno, a20d_none, a20d_bios, a20d_kbc, a20d_fast
; Total number of word entries in BOTH tables (not referenced in this file)
a20_adjust_cnt	equ ($-A20List)/2

; slow_out: write AL to port DX, then delay.
; _io_delay: short pause made of two dummy writes of AL to IO_DELAY_PORT.
; Neither changes any register; the "io_delay" macro calls _io_delay.
slow_out:	out dx, al		; Fall through

_io_delay:	out IO_DELAY_PORT,al
		out IO_DELAY_PORT,al
		ret

;
; enable_a20: make sure the A20 line is enabled.
; All registers are preserved (pushad here, popad at a20_done).  On
; success it returns to the caller; if A20 still is not live after 255
; rounds of attempts it prints err_a20 and halts in "die".  The method
; being attempted is recorded in A20Type so that later calls, and
; disable_a20, go straight to it.
;
enable_a20:
		pushad
		mov byte [A20Tries],255 ; Times to try to make this work

try_enable_a20:

;
; Flush the caches
;
%if DO_WBINVD
		call try_wbinvd
%endif

;
; If the A20 type is known, jump straight to type
;
		mov bp,[A20Type]
		add bp,bp			; Convert to word offset
		; BP-based addressing defaults to SS; this assumes SS == DS
.adj4:		jmp word [bp+A20List]

;
; First, see if we are on a system with no A20 gate
;
; Unknown type and "no A20 gate" share one handler: just test whether A20
; is already live.  If not, fall through to the BIOS method.
a20_dunno:
a20_none:
		mov byte [A20Type], A20_NONE
		call a20_test
		jnz a20_done			; ZF = 0: A20 is enabled

;
; Next, try the BIOS (INT 15h AX=2401h)
;
; Ask the BIOS to enable A20; fall through to the KBC method if the test
; afterwards still fails.
a20_bios:
		mov byte [A20Type], A20_BIOS
		mov ax,2401h
		pushf				; Some BIOSes muck with IF
		int 15h
		popf

		call a20_test
		jnz a20_done			; ZF = 0: A20 is enabled

;
; Enable the keyboard controller A20 gate
;
; Enable A20 through the keyboard controller, then poll a20_test up to
; 65536 times (CX starts at 0) waiting for it to take effect.  Falls
; through to a20_fast if A20 never comes up.
a20_kbc:
		mov dl, 1			; Allow early exit
		call empty_8042
		jnz a20_done			; A20 live, no need to use KBC

		mov byte [A20Type], A20_KBC	; Starting KBC command sequence

		mov al,0D1h			; Write output port
		out 064h, al
		call empty_8042_uncond

		mov al,0DFh			; A20 on
		out 060h, al
		call empty_8042_uncond

		; Apparently the UHCI spec assumes that A20 toggle
		; ends with a null command (assumed to be for synchronization?)
		; Put it here to see if it helps anything...
		mov al,0FFh			; Null command
		out 064h, al
		call empty_8042_uncond

		; Verify that A20 actually is enabled.  Do that by
		; observing a word in low memory and the same word in
		; the HMA until they are no longer coherent.  Note that
		; we don't do the same check in the disable case, because
		; we don't want to *require* A20 masking (SYSLINUX should
		; work fine without it, if the BIOS does.)
.kbc_wait:	push cx
		xor cx,cx			; 0 => "loop" runs 65536 times
.kbc_wait_loop:
		call a20_test
		jnz a20_done_pop		; Success; a20_done_pop drops the saved CX
		loop .kbc_wait_loop

		pop cx
;
; Running out of options here.  Final attempt: enable the "fast A20 gate"
;
; Enable A20 through port 92h (set bit 1, keep bit 0 clear), then poll
; a20_test up to 65536 times.  If that fails too, count down A20Tries
; and start another round at try_enable_a20; once the count reaches zero,
; fall through to the error message.
a20_fast:
		mov byte [A20Type], A20_FAST	; Haven't used the KBC yet
		in al, 092h
		or al,02h
		and al,~01h			; Don't accidentally reset the machine!
		out 092h, al

.fast_wait:	push cx
		xor cx,cx			; 0 => "loop" runs 65536 times
.fast_wait_loop:
		call a20_test
		jnz a20_done_pop		; Success; a20_done_pop drops the saved CX
		loop .fast_wait_loop

		pop cx

;
; Oh bugger.  A20 is not responding.  Try frobbing it again; eventually give up
; and report failure to the user.
;

		dec byte [A20Tries]
		jnz try_enable_a20


		; Out of retries: show err_a20 on the console and stop.
		; print_err writes the NUL-terminated string at DS:SI one
		; character at a time, then drops into die; it never returns.
		mov si,err_a20
print_err:
		lodsb
		test al,al
		jz die			; Terminating NUL reached
		mov ah,0xe		; INT 10h function 0Eh, character in AL
		mov bx,7
		int 10h
		jmp print_err


die:
		sti			; Let interrupts wake us; we just halt again
.halt:		hlt
		jmp short .halt

;
; A20 unmasked, proceed...
;
; Common success exits of enable_a20.  a20_done_pop is for paths that
; still have CX pushed (the wait loops); a20_done undoes the pushad.
a20_done_pop:	pop cx
a20_done:	popad
		ret

;
; This routine tests if A20 is enabled (ZF = 0).  This routine
; must not destroy any register contents.
;
; Out:	ZF = 0 if A20 is enabled, ZF = 1 if it looks disabled.
;	EAX, CX, DS and ES are saved and restored; only the flags change.
;
; Method: compare the dword at 0000:A20Test with the dword at
; FFFF:A20Test+10h, which is exactly 1 MB higher and therefore the same
; memory while A20 is masked.  If they already differ, A20 is on.
; Otherwise write up to 32 successive new values to the low copy, each
; followed by an io_delay, and see whether the high copy follows; the
; original low value is restored before returning.
;

; This is the INT 1Fh vector, which on standard PCs is used by the
; BIOS when the screen is in graphics mode.  Even if it is, it points to
; data, not code, so it should be safe enough to fiddle with.
A20Test		equ (1Fh*4)

a20_test:
		push ds
		push es
		push cx
		push eax
		xor ax,ax
		mov ds,ax		; DS == 0
		dec ax
		mov es,ax		; ES == 0FFFFh
		mov cx,32		; Loop count
		mov eax,[A20Test]
		cmp eax,[es:A20Test+10h]
		jne .a20_done		; Different already: A20 on, ZF = 0
		push eax		; Remember the original low-memory value
.a20_wait:
		inc eax
		mov [A20Test],eax
		io_delay		; OUT does not change the flags
		cmp eax,[es:A20Test+10h]
		loopz .a20_wait		; Retry while still equal and CX != 0
		pop dword [A20Test]	; Restore original value
.a20_done:
		pop eax			; POP leaves ZF from the last CMP intact
		pop cx
		pop es
		pop ds
		ret

;
; disable_a20: undo enable_a20, using the method recorded in A20Type.
; All registers are preserved (pushad here, popad at a20d_none).
;
disable_a20:
		pushad
;
; Flush the caches
;
%if DO_WBINVD
		call try_wbinvd
%endif

		mov bp,[A20Type]
		add bp,bp			; Convert to word offset
		; BP-based addressing defaults to SS; this assumes SS == DS
.adj5:		jmp word [bp+A20DList]

; Disable A20 through the BIOS (INT 15h AX=2400h), then wait in a20d_snooze
a20d_bios:
		mov ax,2400h
		pushf				; Some BIOSes muck with IF
		int 15h
		popf
		jmp short a20d_snooze

;
; Disable the "fast A20 gate"
;
a20d_fast:
		in al, 092h
		and al,~03h			; Clear bit 1 (set by a20_fast) and bit 0
		out 092h, al
		jmp short a20d_snooze

;
; Disable the keyboard controller A20 gate
;
; Same command sequence as a20_kbc but writing 0DDh instead of 0DFh.
; Falls through to a20d_snooze.
a20d_kbc:
		call empty_8042_uncond

		mov al,0D1h
		out 064h, al		; Write output port
		call empty_8042_uncond

		mov al,0DDh		; A20 off
		out 060h, al
		call empty_8042_uncond

		mov al,0FFh		; Null command/synchronization
		out 064h, al
		call empty_8042_uncond

		; Give the disable some time to take effect: poll a20_test
		; up to disable_wait times, stopping as soon as it reports
		; A20 masked (ZF = 1).  Failure to disable is not an error.
a20d_snooze:
		push cx
		mov cx, disable_wait
.poll:		call a20_test
		loopnz .poll		; Again while A20 still on and CX != 0
		pop cx
		; Nothing to undo for unknown/none: just restore and return
a20d_dunno:
a20d_none:
		popad
		ret

;
; Drain the 8042 keyboard controller: keep reading port 60h while status
; bit 0 is set, then wait until status bit 1 is clear.
;
; empty_8042_uncond always runs to completion (it forces DL = 0).
; empty_8042 honours DL: if DL != 0 it returns at once, with ZF = 0, as
; soon as a20_test reports A20 enabled.  A normal completion returns
; with ZF = 1.  Clobbers AL (and DL for the _uncond entry).
;
empty_8042_uncond:
		xor dl,dl
empty_8042:
		call a20_test
		jz .poll		; A20 still masked: carry on
		test dl,dl
		jnz .out		; A20 live and early exit allowed
.poll:		io_delay
		in al, 064h		; Status port
		test al,1
		jz .check_busy
		io_delay
		in al, 060h		; Read input
		jmp short empty_8042
.check_busy:
		test al,2
		jnz empty_8042
		io_delay
.out:		ret

;
; Execute a WBINVD instruction if possible on this CPU
;
; NOTE(review): as written this executes WBINVD unconditionally; there is
; no CPU check.  Only assembled when DO_WBINVD is nonzero.
;
%if DO_WBINVD
try_wbinvd:
		wbinvd
		ret
%endif

		section .bss
		alignb 4
; Saved by call32_enter_rm and reloaded by call32_enter_pm
PMESP		resd 1			; Protected mode %esp

		section .idt nobits align=4096
		alignb 4096
; NOTE(review): pm_idt is not referenced in this file; presumably it is
; filled in and used by the 32-bit code - confirm.
pm_idt		resb 4096		; Protected-mode IDT, followed by interrupt stubs




; Linear address of the 32-bit program called by call32_call_start
; (same value as code32_start in the header)
pm_entry:	equ 0x100000

		section .rodata
		align 2, db 0
; LIDT operand loaded by call32_enter_rm before leaving protected mode:
; interrupt table at linear address 0
call32_rmidt:
		dw 0ffffh		; Limit
		dd 0			; Address

		section .data
		align 2, db 0		; Initialised section: pad with explicit zeros
;
; LIDT operand for the protected-mode IDT.  call32_enter_pm loads it and
; call32_enter_rm stores the current IDTR back here with SIDT, so after
; the first round trip it holds whatever the 32-bit code installed.
;
; The limit field is the offset of the LAST valid byte, so a table of
; 256 eight-byte gates has limit 8*256-1.
;
call32_pmidt:
		dw 8*256-1		; Limit
		dd 0			; Address (entered later)

		section .text
;
; This is the main entrypoint in this function
;
; Called in real mode (16-bit code).  Falls through into call32_enter_pm
; with BX = call32_call_start; control comes back to the caller through
; call32_done once the 32-bit program has returned.
;
init32:
		mov bx,call32_call_start	; Where to go in PM

;
; Enter protected mode.  BX contains the entry point relative to the
; real-mode CS.
;
; In:	BX = offset within the real-mode code segment to continue at
; Out:	does not return; jumps to (CS << 4) + BX in flat 32-bit mode with
;	  EBP = CS << 4 (linear base of the real-mode segment)
;	  DS = ES = SS = 28h, FS = GS = 0, ESP = [PMESP]
;	The real-mode SP is kept in SavedSP.  Interrupts are disabled and
;	DF is cleared.  EAX and EBX are clobbered.
;
call32_enter_pm:
		mov ax,cs
		mov ds,ax
		movzx ebp,ax
		shl ebp,4		; EBP <- CS_BASE
		movzx ebx,bx
		add ebx,ebp		; entry point += CS_BASE
		cli
		mov [SavedSP],sp
		cld
		call enable_a20
		mov byte [call32_gdt+8+5],89h	; Mark TSS unbusy
		o32 lgdt [call32_gdt]	; Set up GDT
		o32 lidt [call32_pmidt]	; Set up IDT
		mov eax,cr0
		or al,1
		mov cr0,eax		; Enter protected mode
		jmp 20h:strict dword .in_pm+CS_BASE
		; Address of the 6-byte offset:selector operand of the far
		; jump above; call32_call_start hands its linear address to
		; the 32-bit program.
.pm_jmp		equ $-6


		bits 32
.in_pm:
		xor eax,eax		; Available for future use...
		mov fs,eax
		mov gs,eax
		lldt ax			; No LDT

		mov al,28h		; Set up data segments
		mov es,eax
		mov ds,eax
		mov ss,eax

		mov al,08h		; Selector of the dummy TSS
		ltr ax

		; EBP still holds the real-mode segment base, and labels are
		; offsets from that base
		mov esp,[ebp+PMESP]	; Load protmode %esp if available
		jmp ebx			; Go to where we need to go

;
; This is invoked before first dispatch of the 32-bit code, in 32-bit mode
;
; Pushes eight dwords describing the real-mode environment and calls the
; 32-bit program at pm_entry.
; NOTE(review): these are presumably the arguments of the 32-bit entry
; code; their order is a contract with code outside this file.
;
call32_call_start:
		;
		; Set up a temporary stack in the bounce buffer;
		; start32.S will override this to point us to the real
		; high-memory stack.
		;
		mov esp, (BOUNCE_SEG << 4) + 0x10000

		; Labels are offsets from CS_BASE, so +CS_BASE gives the
		; linear address
		push dword call32_enter_rm.rm_jmp+CS_BASE
		push dword call32_enter_pm.pm_jmp+CS_BASE
		push dword stack_end		; RM size
		push dword call32_gdt+CS_BASE
		push dword call32_handle_interrupt+CS_BASE
		push dword CS_BASE		; Segment base
		push dword (BOUNCE_SEG << 4)	; Bounce buffer address
		push dword call32_syscall+CS_BASE ; Syscall entry point

		; The flat code segment has base 0 while this code is assembled
		; relative to CS_BASE, hence -CS_BASE to reach linear pm_entry
		call pm_entry-CS_BASE		; Run the program...

		; ... fall through to call32_exit ...

; Leave protected mode for good: switch to real mode and continue at
; call32_done, which returns to the caller of init32.
call32_exit:
		mov bx,call32_done	; Continue at call32_done once in real mode

;
; Return to real mode.
; In:	BX = real-mode offset to continue at (32-bit mode on entry)
; Out:	does not return; jumps to CS:BX in real mode with all segment
;	registers equal to CS and SP = [SavedSP].  The protected-mode ESP
;	is saved in PMESP and the current IDTR in call32_pmidt.
;	EAX and EBP are clobbered.
;
call32_enter_rm:
		; Careful here... the PM code may have relocated the
		; entire RM code, so we need to figure out exactly
		; where we are executing from.  If the PM code has
		; relocated us, it *will* have adjusted the GDT to
		; match, though.
		call .here
.here:		pop ebp
		sub ebp,.here		; EBP <- linear base this code runs at
		o32 sidt [ebp+call32_pmidt]
		cli
		cld
		mov [ebp+PMESP],esp	; Save exit %esp
		xor esp,esp		; Make sure the high bits are zero
		jmp 10h:.in_pm16	; Return to 16-bit mode first

		bits 16
.in_pm16:
		mov ax,18h		; Real-mode-like segment
		mov es,ax
		mov ds,ax
		mov ss,ax
		mov fs,ax
		mov gs,ax

		lidt [call32_rmidt]	; Real-mode IDT (rm needs no GDT)
		mov eax,cr0
		and al,~1
		mov cr0,eax
		jmp MY_CS:.in_rm
		; Address of the segment word of the far jump above;
		; call32_call_start hands its linear address to the 32-bit
		; program (presumably so it can be patched on relocation).
.rm_jmp		equ $-2

.in_rm:					; Back in real mode
		mov ax,cs
		mov ds,ax
		mov es,ax
		mov fs,ax
		mov gs,ax
		mov ss,ax
		mov sp,[SavedSP]	; Restore stack
		jmp bx			; Go to wherever we need to go...

; Real-mode continuation after the 32-bit program has returned.  SP was
; restored from SavedSP by call32_enter_rm, so RET returns to the caller
; of init32.
call32_done:
		call disable_a20
		sti
		ret

;
; 16-bit support code
;
		bits 16

;
; 16-bit interrupt-handling code
;
; In:	EDX = segment:offset of the handler (loaded from the IVT by
;	      call32_handle_interrupt)
; Builds a frame of flags / CS / .cont so that the handler's return comes
; back to .cont, invokes the handler with RETF, then re-enters protected
; mode at call32_int_resume.
call32_int_rm:
		pushf				; Flags on stack
		push cs				; Return segment
		push word .cont			; Return address
		push dword edx			; Segment:offset of IVT entry
		retf				; Invoke IVT routine
.cont:		; ... on resume ...
		mov bx,call32_int_resume
		jmp call32_enter_pm		; Go back to PM

;
; 16-bit system call handling code
;
; Entered in real mode with SP pointing at the frame built by
; call32_syscall: 44 bytes of registers (GS, FS, ES, DS, the pushad
; block, EFLAGS), then the target offset:segment, then a return frame of
; offset / segment / flags that leads to .return.
call32_sys_rm:
		pop gs
		pop fs
		pop es
		pop ds
		popad
		popfd
		retf				; Invoke routine
.return:
		; Capture the routine's results in the same 44-byte layout
		pushfd
		pushad
		push ds
		push es
		push fs
		push gs
		mov bx,call32_sys_resume
		jmp call32_enter_pm

;
; 32-bit support code
;
		bits 32

;
; This is invoked on getting an interrupt in protected mode.  At
; this point, we need to context-switch to real mode and invoke
; the interrupt routine.
;
; When this gets invoked, the registers are saved on the stack and
; AL contains the interrupt number (it is used below as the index into
; the real-mode interrupt vector table at linear address 0).
;
call32_handle_interrupt:
		movzx eax,al
		xor ebx,ebx		; Actually makes the code smaller
		mov edx,[ebx+eax*4]	; Get the segment:offset of the routine
		mov bx,call32_int_rm
		jmp call32_enter_rm	; Go to real mode

; Back in protected mode after the real-mode handler has run: restore the
; registers that were saved before call32_handle_interrupt was entered
; and return from the interrupt.
call32_int_resume:
		popad
		iret

;
; Syscall invocation.  We manifest a structure on the real-mode stack,
; containing the call32sys_t structure from <call32.h> as well as
; the following entries (from low to high address):
; - Target offset
; - Target segment
; - Return offset
; - Return segment (== real mode cs)
; - Return flags
;
; Called from 32-bit code.  After the pushfd/pushad below the stack holds,
; above the return address: the interrupt number at [esp+10*4], the
; pointer to the source register block at [esp+11*4] and (read later by
; call32_sys_resume) the pointer to the destination block at [esp+12*4].
;
call32_syscall:
		pushfd			; Save IF among other things...
		pushad			; We only need to save some, but...
		cld
		call .here
.here:		pop ebp
		sub ebp,.here		; EBP <- linear base this code runs at

		movzx edi,word [ebp+SavedSP]
		sub edi,54		; Allocate 54 bytes
		mov [ebp+SavedSP],di	; call32_enter_rm will load SP from here
		add edi,ebp		; Create linear address

		mov esi,[esp+11*4]	; Source regs
		xor ecx,ecx
		mov cl,11		; 44 bytes to copy
		rep movsd

		movzx eax,byte [esp+10*4] ; Interrupt number
		; ecx == 0 here; adding it to the EA makes the
		; encoding smaller
		mov eax,[ecx+eax*4]	; Get IVT entry
		stosd			; Save in stack frame
		mov ax,call32_sys_rm.return	; Return offset
		stosw				; Save in stack frame
		mov eax,ebp
		shr eax,4			; Return segment
		stosw				; Save in stack frame
		; EDI is now 52 bytes into the frame, so [edi-12] is the
		; last dword (EFLAGS) of the 44-byte register block
		mov eax,[edi-12]	; Return flags
		and eax,0x200cd7	; Mask (potentially) unsafe flags
		mov [edi-12],eax	; Primary flags entry
		stosw			; Return flags

		mov bx,call32_sys_rm
		jmp call32_enter_rm	; Go to real mode

		; Back in protected mode after a syscall.  The 44-byte block
		; of result registers sits on the real-mode stack at SavedSP,
		; and call32_enter_pm has left EBP pointing at the real-mode
		; base.  Copy the block to the caller's destination buffer
		; (skipped in effect when that pointer is NULL), drop it from
		; the real-mode stack and return to the 32-bit caller.
call32_sys_resume:
		mov edi,[esp+12*4]	; Dest regs
		movzx esi,word [ebp+SavedSP]
		add esi,ebp		; Linear address of the result block
		test edi,edi		; NULL pointer?
		jnz .copy_regs
		mov edi,esi		; Yes: harmless copy onto itself
.copy_regs:	mov ecx,11		; 11 dwords = 44 bytes
		rep movsd		; Copy register block

		add word [ebp+SavedSP],44	; Remove from stack

		popad			; Undo the pushad/pushfd of call32_syscall
		popfd
		ret			; Return to 32-bit program