/*--------------------------------------------------------------------*/
/*--- User-mode execve() for ELF executables           m_ume_elf.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2000-2012 Julian Seward 
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#if defined(VGO_linux)

#include "pub_core_basics.h"
#include "pub_core_vki.h"

#include "pub_core_aspacemgr.h"     // various mapping fns
#include "pub_core_debuglog.h"
#include "pub_core_libcassert.h"    // VG_(exit), vg_assert
#include "pub_core_libcbase.h"      // VG_(memcmp), etc
#include "pub_core_libcprint.h"
#include "pub_core_libcfile.h"      // VG_(open) et al
#include "pub_core_machine.h"       // VG_ELF_CLASS (XXX: which should be moved)
#include "pub_core_mallocfree.h"    // VG_(malloc), VG_(free)
#include "pub_core_syscall.h"       // VG_(strerror)
#include "pub_core_ume.h"           // self

#include "priv_ume.h"

/* --- !!! --- EXTERNAL HEADERS start --- !!! --- */
#define _GNU_SOURCE
#define _FILE_OFFSET_BITS 64
/* This is for ELF types etc, and also the AT_ constants. */
#include <elf.h>
/* --- !!! --- EXTERNAL HEADERS end --- !!! --- */


#if     VG_WORDSIZE == 8
#define ESZ(x)  Elf64_##x
#elif   VG_WORDSIZE == 4
#define ESZ(x)  Elf32_##x
#else
#error VG_WORDSIZE needs to ==4 or ==8
#endif

struct elfinfo
{
   ESZ(Ehdr)    e;
   ESZ(Phdr)    *p;
   Int          fd;
};

static void check_mmap(SysRes res, Addr base, SizeT len)
{
   if (sr_isError(res)) {
      VG_(printf)("valgrind: mmap(0x%llx, %lld) failed in UME "
                  "with error %lu (%s).\n", 
                  (ULong)base, (Long)len, 
                  sr_Err(res), VG_(strerror)(sr_Err(res)) );
      if (sr_Err(res) == VKI_EINVAL) {
         VG_(printf)("valgrind: this can be caused by executables with "
                     "very large text, data or bss segments.\n");
      }
      VG_(exit)(1);
   }
}

/*------------------------------------------------------------*/
/*--- Loading ELF files                                    ---*/
/*------------------------------------------------------------*/

static 
struct elfinfo *readelf(Int fd, const char *filename)
{
   SysRes sres;
   struct elfinfo *e = VG_(malloc)("ume.re.1", sizeof(*e));
   Int phsz;

   vg_assert(e);
   e->fd = fd;

   sres = VG_(pread)(fd, &e->e, sizeof(e->e), 0);
   if (sr_isError(sres) || sr_Res(sres) != sizeof(e->e)) {
      VG_(printf)("valgrind: %s: can't read ELF header: %s\n", 
                  filename, VG_(strerror)(sr_Err(sres)));
      goto bad;
   }

   if (VG_(memcmp)(&e->e.e_ident[0], ELFMAG, SELFMAG) != 0) {
      VG_(printf)("valgrind: %s: bad ELF magic number\n", filename);
      goto bad;
   }
   if (e->e.e_ident[EI_CLASS] != VG_ELF_CLASS) {
      VG_(printf)("valgrind: wrong ELF executable class "
                  "(eg. 32-bit instead of 64-bit)\n");
      goto bad;
   }
   if (e->e.e_ident[EI_DATA] != VG_ELF_DATA2XXX) {
      VG_(printf)("valgrind: executable has wrong endian-ness\n");
      goto bad;
   }
   if (!(e->e.e_type == ET_EXEC || e->e.e_type == ET_DYN)) {
      VG_(printf)("valgrind: this is not an executable\n");
      goto bad;
   }

   if (e->e.e_machine != VG_ELF_MACHINE) {
      VG_(printf)("valgrind: executable is not for "
                  "this architecture\n");
      goto bad;
   }

   if (e->e.e_phentsize != sizeof(ESZ(Phdr))) {
      VG_(printf)("valgrind: sizeof ELF Phdr wrong\n");
      goto bad;
   }

   phsz = sizeof(ESZ(Phdr)) * e->e.e_phnum;
   e->p = VG_(malloc)("ume.re.2", phsz);
   vg_assert(e->p);

   sres = VG_(pread)(fd, e->p, phsz, e->e.e_phoff);
   if (sr_isError(sres) || sr_Res(sres) != phsz) {
      VG_(printf)("valgrind: can't read phdr: %s\n", 
                  VG_(strerror)(sr_Err(sres)));
      VG_(free)(e->p);
      goto bad;
   }

   return e;

  bad:
   VG_(free)(e);
   return NULL;
}

/* Map an ELF file.  Returns the brk address. */
static
ESZ(Addr) mapelf(struct elfinfo *e, ESZ(Addr) base)
{
   Int    i;
   SysRes res;
   ESZ(Addr) elfbrk = 0;

   for (i = 0; i < e->e.e_phnum; i++) {
      ESZ(Phdr) *ph = &e->p[i];
      ESZ(Addr) addr, brkaddr;
      ESZ(Word) memsz;

      if (ph->p_type != PT_LOAD)
         continue;

      addr    = ph->p_vaddr+base;
      memsz   = ph->p_memsz;
      brkaddr = addr+memsz;

      if (brkaddr > elfbrk)
         elfbrk = brkaddr;
   }

   for (i = 0; i < e->e.e_phnum; i++) {
      ESZ(Phdr) *ph = &e->p[i];
      ESZ(Addr) addr, bss, brkaddr;
      ESZ(Off) off;
      ESZ(Word) filesz;
      ESZ(Word) memsz;
      unsigned prot = 0;

      if (ph->p_type != PT_LOAD)
         continue;

      if (ph->p_flags & PF_X) prot |= VKI_PROT_EXEC;
      if (ph->p_flags & PF_W) prot |= VKI_PROT_WRITE;
      if (ph->p_flags & PF_R) prot |= VKI_PROT_READ;

      addr    = ph->p_vaddr+base;
      off     = ph->p_offset;
      filesz  = ph->p_filesz;
      bss     = addr+filesz;
      memsz   = ph->p_memsz;
      brkaddr = addr+memsz;

      // Tom says: In the following, do what the Linux kernel does and only
      // map the pages that are required instead of rounding everything to
      // the specified alignment (ph->p_align).  (AMD64 doesn't work if you
      // use ph->p_align -- part of stage2's memory gets trashed somehow.)
      //
      // The condition handles the case of a zero-length segment.
      if (VG_PGROUNDUP(bss)-VG_PGROUNDDN(addr) > 0) {
         if (0) VG_(debugLog)(0,"ume","mmap_file_fixed_client #1\n");
         res = VG_(am_mmap_file_fixed_client)(
                  VG_PGROUNDDN(addr),
                  VG_PGROUNDUP(bss)-VG_PGROUNDDN(addr),
                  prot, /*VKI_MAP_FIXED|VKI_MAP_PRIVATE, */
                  e->fd, VG_PGROUNDDN(off)
               );
         if (0) VG_(am_show_nsegments)(0,"after #1");
         check_mmap(res, VG_PGROUNDDN(addr),
                         VG_PGROUNDUP(bss)-VG_PGROUNDDN(addr));
      }

      // if memsz > filesz, fill the remainder with zeroed pages
      if (memsz > filesz) {
         UInt bytes;

         bytes = VG_PGROUNDUP(brkaddr)-VG_PGROUNDUP(bss);
         if (bytes > 0) {
            if (0) VG_(debugLog)(0,"ume","mmap_anon_fixed_client #2\n");
            res = VG_(am_mmap_anon_fixed_client)(
                     VG_PGROUNDUP(bss), bytes,
                     prot
                  );
            if (0) VG_(am_show_nsegments)(0,"after #2");
            check_mmap(res, VG_PGROUNDUP(bss), bytes);
         }

         bytes = bss & (VKI_PAGE_SIZE - 1);

         // The 'prot' condition allows for a read-only bss
         if ((prot & VKI_PROT_WRITE) && (bytes > 0)) {
            bytes = VKI_PAGE_SIZE - bytes;
            VG_(memset)((char *)bss, 0, bytes);
         }
      }
   }

   return elfbrk;
}

Bool VG_(match_ELF)(Char *hdr, Int len)
{
   ESZ(Ehdr) *e = (ESZ(Ehdr) *)hdr;
   return (len > sizeof(*e)) && VG_(memcmp)(&e->e_ident[0], ELFMAG, SELFMAG) == 0;
}


/* load_ELF pulls an ELF executable into the address space, prepares
   it for execution, and writes info about it into INFO.  In
   particular it fills in .init_eip, which is the starting point.

   Returns zero on success, non-zero (a VKI_E.. value) on failure.

   The sequence of activities is roughly as follows:

   - use readelf() to extract program header info from the exe file.

   - scan the program header, collecting info (not sure what all those
     info-> fields are, or whether they are used, but still) and in
     particular looking out fo the PT_INTERP header, which describes
     the interpreter.  If such a field is found, the space needed to
     hold the interpreter is computed into interp_size.

   - map the executable in, by calling mapelf().  This maps in all
     loadable sections, and I _think_ also creates any .bss areas
     required.  mapelf() returns the address just beyond the end of
     the furthest-along mapping it creates.  The executable is mapped
     starting at EBASE, which is usually read from it (eg, 0x8048000
     etc) except if it's a PIE, in which case I'm not sure what
     happens.

     The returned address is recorded in info->brkbase as the start
     point of the brk (data) segment, as it is traditional to place
     the data segment just after the executable.  Neither load_ELF nor
     mapelf creates the brk segment, though: that is for the caller of
     load_ELF to attend to.

   - If the initial phdr scan didn't find any mention of an
     interpreter (interp == NULL), this must be a statically linked
     executable, and we're pretty much done.

   - Otherwise, we need to use mapelf() a second time to load the
     interpreter.  The interpreter can go anywhere, but mapelf() wants
     to be told a specific address to put it at.  So an advisory query
     is passed to aspacem, asking where it would put an anonymous
     client mapping of size INTERP_SIZE.  That address is then used
     as the mapping address for the interpreter.

   - The entry point in INFO is set to the interpreter's entry point,
     and we're done.  */
Int VG_(load_ELF)(Int fd, const HChar* name, /*MOD*/ExeInfo* info)
{
   SysRes sres;
   struct elfinfo *e;
   struct elfinfo *interp = NULL;
   ESZ(Addr) minaddr = ~0;      /* lowest mapped address */
   ESZ(Addr) maxaddr = 0;       /* highest mapped address */
   ESZ(Addr) interp_addr = 0;   /* interpreter (ld.so) address */
   ESZ(Word) interp_size = 0;   /* interpreter size */
   /* ESZ(Word) interp_align = VKI_PAGE_SIZE; */ /* UNUSED */
   Int i;
   void *entry;
   ESZ(Addr) ebase = 0;

   /* The difference between where the interpreter got mapped and
      where it asked to be mapped.  Needed for computing the ppc64 ELF
      entry point and initial tocptr (R2) value. */
   ESZ(Word) interp_offset = 0;

#ifdef HAVE_PIE
   ebase = info->exe_base;
#endif

   e = readelf(fd, name);

   if (e == NULL)
      return VKI_ENOEXEC;

   /* The kernel maps position-independent executables at TASK_SIZE*2/3;
      duplicate this behavior as close as we can. */
   if (e->e.e_type == ET_DYN && ebase == 0) {
      ebase = VG_PGROUNDDN(info->exe_base 
                           + (info->exe_end - info->exe_base) * 2 / 3);
      /* We really don't want to load PIEs at zero or too close.  It
         works, but it's unrobust (NULL pointer reads and writes
         become legit, which is really bad) and causes problems for
         exp-ptrcheck, which assumes all numbers below 1MB are
         nonpointers.  So, hackily, move it above 1MB. */
      /* Later .. is appears ppc32-linux tries to put [vdso] at 1MB,
         which totally screws things up, because nothing else can go
         there.  So bump the hacky load addess along by 0x8000, to
         0x108000. */
      if (ebase < 0x108000)
         ebase = 0x108000;
   }

   info->phnum = e->e.e_phnum;
   info->entry = e->e.e_entry + ebase;
   info->phdr = 0;

   for (i = 0; i < e->e.e_phnum; i++) {
      ESZ(Phdr) *ph = &e->p[i];

      switch(ph->p_type) {
      case PT_PHDR:
         info->phdr = ph->p_vaddr + ebase;
         break;

      case PT_LOAD:
         if (ph->p_vaddr < minaddr)
            minaddr = ph->p_vaddr;
         if (ph->p_vaddr+ph->p_memsz > maxaddr)
            maxaddr = ph->p_vaddr+ph->p_memsz;
         break;
                        
      case PT_INTERP: {
         HChar *buf = VG_(malloc)("ume.LE.1", ph->p_filesz+1);
         Int j;
         Int intfd;
         Int baseaddr_set;

         vg_assert(buf);
         VG_(pread)(fd, buf, ph->p_filesz, ph->p_offset);
         buf[ph->p_filesz] = '\0';

         sres = VG_(open)(buf, VKI_O_RDONLY, 0);
         if (sr_isError(sres)) {
            VG_(printf)("valgrind: m_ume.c: can't open interpreter\n");
            VG_(exit)(1);
         }
         intfd = sr_Res(sres);

         interp = readelf(intfd, buf);
         if (interp == NULL) {
            VG_(printf)("valgrind: m_ume.c: can't read interpreter\n");
            return 1;
         }
         VG_(free)(buf);

         baseaddr_set = 0;
         for (j = 0; j < interp->e.e_phnum; j++) {
            ESZ(Phdr) *iph = &interp->p[j];
            ESZ(Addr) end;

            if (iph->p_type != PT_LOAD || iph->p_memsz == 0)
               continue;
            
            if (!baseaddr_set) {
               interp_addr  = iph->p_vaddr;
               /* interp_align = iph->p_align; */ /* UNUSED */
               baseaddr_set = 1;
            }

            /* assumes that all segments in the interp are close */
            end = (iph->p_vaddr - interp_addr) + iph->p_memsz;

            if (end > interp_size)
               interp_size = end;
         }
         break;

      default:
         // do nothing
         break;
      }
      }
   }

   if (info->phdr == 0)
      info->phdr = minaddr + ebase + e->e.e_phoff;

   if (info->exe_base != info->exe_end) {
      if (minaddr >= maxaddr ||
          (minaddr + ebase < info->exe_base ||
           maxaddr + ebase > info->exe_end)) {
         VG_(printf)("Executable range %p-%p is outside the\n"
                     "acceptable range %p-%p\n",
                     (char *)minaddr + ebase, (char *)maxaddr + ebase,
                     (char *)info->exe_base,  (char *)info->exe_end);
         return VKI_ENOMEM;
      }
   }

   info->brkbase = mapelf(e, ebase);    /* map the executable */

   if (info->brkbase == 0)
      return VKI_ENOMEM;

   if (interp != NULL) {
      /* reserve a chunk of address space for interpreter */
      MapRequest mreq;
      Addr       advised;
      Bool       ok;

      /* Don't actually reserve the space.  Just get an advisory
         indicating where it would be allocated, and pass that to
         mapelf(), which in turn asks aspacem to do some fixed maps at
         the specified address.  This is a bit of hack, but it should
         work because there should be no intervening transactions with
         aspacem which could cause those fixed maps to fail.

         Placement policy is:

         if the interpreter asks to be loaded at zero
            ignore that and put it wherever we like (mappings at zero 
            are bad news)
         else
            try and put it where it asks for, but if that doesn't work,
            just put it anywhere.
      */
      if (interp_addr == 0) {
         mreq.rkind = MAny;
         mreq.start = 0;
         mreq.len   = interp_size;
      } else {
         mreq.rkind = MHint;
         mreq.start = interp_addr;
         mreq.len   = interp_size;
      }

      advised = VG_(am_get_advisory)( &mreq, True/*client*/, &ok );

      if (!ok) {
         /* bomb out */
         SysRes res = VG_(mk_SysRes_Error)(VKI_EINVAL);
         if (0) VG_(printf)("reserve for interp: failed\n");
         check_mmap(res, (Addr)interp_addr, interp_size);
         /*NOTREACHED*/
      }

      (void)mapelf(interp, (ESZ(Addr))advised - interp_addr);

      VG_(close)(interp->fd);

      entry = (void *)(advised - interp_addr + interp->e.e_entry);
      info->interp_base = (ESZ(Addr))advised;
      interp_offset = advised - interp_addr;

      VG_(free)(interp->p);
      VG_(free)(interp);
   } else
      entry = (void *)(ebase + e->e.e_entry);

   info->exe_base = minaddr + ebase;
   info->exe_end  = maxaddr + ebase;

#if defined(VGP_ppc64_linux)
   /* On PPC64, a func ptr is represented by a TOC entry ptr.  This
      TOC entry contains three words; the first word is the function
      address, the second word is the TOC ptr (r2), and the third word
      is the static chain value. */
   info->init_ip  = ((ULong*)entry)[0];
   info->init_toc = ((ULong*)entry)[1];
   info->init_ip  += interp_offset;
   info->init_toc += interp_offset;
#else
   info->init_ip  = (Addr)entry;
   info->init_toc = 0; /* meaningless on this platform */
   (void) interp_offset; /* stop gcc complaining it is unused */
#endif
   VG_(free)(e->p);
   VG_(free)(e);

   return 0;
}

#endif // defined(VGO_linux)

/*--------------------------------------------------------------------*/
/*--- end                                                          ---*/
/*--------------------------------------------------------------------*/