#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Dump functions called by static initializers in a Linux Release binary.

Usage example:
  tools/linux/dump-static-initializers.py out/Release/chrome

A brief overview of static initialization:
1) the compiler writes out, per object file, a function that contains
   the static initializers for that file.
2) the compiler also writes out a pointer to that function in a special
   section.
3) at link time, the linker concatenates the function pointer sections
   into a single list of all initializers.
4) at run time, on startup the binary runs all function pointers.

The functions in (1) all have mangled names of the form
  _GLOBAL__I_foobar.cc
Using objdump, we can disassemble those functions and dump all symbols that
they reference.
"""

import optparse
import re
import subprocess
import sys

# A map of symbol => informative text about it.
# main() looks each demangled symbol reference up in this table and, when it
# is present, prints the text in square brackets after the symbol.
NOTES = {
  '__cxa_atexit@plt': 'registers a dtor to run at exit',
  'std::__ioinit': '#includes <iostream>, use <ostream> instead',
}

# Determine whether this is a git checkout (as opposed to e.g. svn).
def _IsGitWorkspace():
  """Return True if `git rev-parse` succeeds in the current directory.

  A missing git executable is reported as "not a git workspace" instead of
  crashing the script while the module is still being loaded.
  """
  try:
    git = subprocess.Popen(['git', 'rev-parse'],
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  except OSError:
    # git could not be started at all, so this cannot be a usable checkout.
    return False
  # communicate() drains both pipes, so git can never stall on a full pipe
  # while we are waiting for it to exit.
  git.communicate()
  return git.returncode == 0

IS_GIT_WORKSPACE = _IsGitWorkspace()

class Demangler(object):
  """A wrapper around c++filt to provide a function to demangle symbols.

  One c++filt process is started on construction and reused for every call
  to Demangle().
  """
  def __init__(self):
    # universal_newlines=True opens the pipes in text mode, so Demangle()
    # exchanges str (not bytes) with c++filt on Python 3 as well.
    self.cppfilt = subprocess.Popen(['c++filt'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    universal_newlines=True)

  def Demangle(self, sym):
    """Given mangled symbol |sym|, return its demangled form.

    Writes |sym| as one line to c++filt and returns the next line it prints,
    with surrounding whitespace stripped.
    """
    self.cppfilt.stdin.write(sym + '\n')
    # Hand the line to c++filt right away; if it stayed in a write buffer the
    # readline() below would wait forever for an answer.
    self.cppfilt.stdin.flush()
    return self.cppfilt.stdout.readline().strip()

# Matches for example: "cert_logger.pb.cc", capturing "cert_logger".
protobuf_filename_re = re.compile(r'(.*)\.pb\.cc$')
def QualifyFilenameAsProto(filename):
  """Attempt to qualify a bare |filename| with a src-relative path, assuming it
  is a protoc-generated file.  If a single match is found, it is returned.
  Otherwise the original filename is returned.

  The lookup asks `git ls-files` for "*/<stem>.proto", so outside a git
  workspace (IS_GIT_WORKSPACE is false) |filename| is returned unchanged, as
  it is when |filename| does not end in ".pb.cc".
  """
  if not IS_GIT_WORKSPACE:
    return filename
  match = protobuf_filename_re.match(filename)
  if not match:
    return filename
  # group(1) is the captured stem as a string.  (groups(0) would be a 1-tuple
  # that only happens to work as the argument of the % formatting below.)
  basename = match.group(1)
  gitlsfiles = subprocess.Popen(
    ['git', 'ls-files', '--', '*/%s.proto' % basename],
    stdout=subprocess.PIPE, universal_newlines=True)
  # communicate() reads everything git prints and waits for it to exit, so no
  # child process or open pipe is left behind.
  output, _ = gitlsfiles.communicate()
  hits = output.splitlines()
  if len(hits) != 1:
    return filename  # No hit, or multiple hits; can't help.
  return hits[0].strip()

# Regex matching the substring of a symbol's demangled text representation most
# likely to appear in a source file.
# Example: "v8::internal::Builtins::InitBuiltinFunctionTable()" becomes
# "InitBuiltinFunctionTable", since the first (optional & non-capturing) group
# picks up any ::-qualification and the last fragment picks up a suffix that
# starts with an opener.
symbol_code_name_re = re.compile(r'^(?:[^(<[]*::)?([^:(<[]*).*?$')
def QualifyFilename(filename, symbol):
  """Given a bare filename and a symbol that occurs in it, attempt to qualify
  it with a src-relative path.  If more than one file matches, return the
  original filename.

  The search is `git grep -l` for the short name of |symbol| restricted to
  paths matching "*/<filename>", so outside a git workspace (IS_GIT_WORKSPACE
  is false) |filename| is returned unchanged.  It is also returned unchanged
  when nothing matches.
  """
  if not IS_GIT_WORKSPACE:
    return filename
  match = symbol_code_name_re.match(symbol)
  if not match:
    return filename
  symbol = match.group(1)
  # -F: the symbol is literal text, not a regular expression.
  # -e: the next argument is the pattern even if it starts with '-'.
  gitgrep = subprocess.Popen(
    ['git', 'grep', '-l', '-F', '-e', symbol, '--', '*/%s' % filename],
    stdout=subprocess.PIPE, universal_newlines=True)
  # communicate() reads everything git prints and waits for it to exit, so no
  # child process or open pipe is left behind.
  output, _ = gitgrep.communicate()
  hits = output.splitlines()
  if len(hits) != 1:
    return filename  # No candidate, or more than one; return bare filename.
  return hits[0].strip()

# Pattern for the nm output lines that describe static-initializer functions.
# test_ParseNmLine has sample lines.
nm_re = re.compile(r'(\S+) (\S+) t (?:_ZN12)?_GLOBAL__(?:sub_)?I_(.*)')
def ParseNmLine(line):
  """Parse a single line of nm output.

  Returns a (file, start, size) tuple when the line names a static
  initializer, with start and size converted from hexadecimal; returns None
  for any other line.
  """
  found = nm_re.match(line)
  if found is None:
    return None
  start_hex, size_hex, source_file = found.groups()
  start = int(start_hex, 16)
  size = int(size_hex, 16)
  return (source_file, start, size)


def test_ParseNmLine():
  """Check ParseNmLine against sample nm lines in both symbol spellings."""
  samples = [
    ('0000000001919920 0000000000000008 t '
     '_ZN12_GLOBAL__I_safe_browsing_service.cc',
     ('safe_browsing_service.cc', 26319136, 8)),
    ('00000000026b9eb0 0000000000000024 t '
     '_GLOBAL__sub_I_extension_specifics.pb.cc',
     ('extension_specifics.pb.cc', 40607408, 36)),
  ]
  for nm_line, expected in samples:
    parse = ParseNmLine(nm_line)
    assert parse == expected, parse

# The check is cheap, so it runs unconditionally whenever the module loads.
test_ParseNmLine()


def ParseNm(binary):
  """Given a binary, yield static initializers as (file, start, size) tuples.

  Runs `nm -S` on |binary| and passes each output line to ParseNmLine(),
  yielding only the lines it recognizes.
  """
  # universal_newlines=True makes nm's output arrive as str on Python 3 too,
  # which is what the str-based regex in ParseNmLine() needs.
  nm = subprocess.Popen(['nm', '-S', binary], stdout=subprocess.PIPE,
                        universal_newlines=True)
  try:
    for line in nm.stdout:
      parse = ParseNmLine(line)
      if parse:
        yield parse
  finally:
    # Runs when the output is exhausted and also when the caller abandons the
    # generator early: release the pipe and reap the child process.
    nm.stdout.close()
    nm.wait()

# Regex matching objdump output for the symbols we're interested in.
# Example line:
#     12354ab:  (disassembly, including <FunctionReference>)
disassembly_re = re.compile(r'^\s+[0-9a-f]+:.*<(\S+)>')
# Matches references to a static-initializer function itself, in every
# spelling that the nm parsing in this file accepts (optional "_ZN12" prefix,
# optional "sub_").
initializer_symbol_re = re.compile(r'(?:_ZN12)?_GLOBAL__(?:sub_)?I_')
def ExtractSymbolReferences(binary, start, end):
  """Given a span of addresses, returns symbol references from disassembly.

  Runs objdump on |binary| with |start| and |end| as --start-address and
  --stop-address, and collects every <symbol> mentioned in the disassembly.

  Returns:
    A sorted list of the distinct symbol names found.

  Raises:
    RuntimeError: the disassembly mentions
      __static_initialization_and_destruction.
  """
  cmd = ['objdump', binary, '--disassemble',
         '--start-address=0x%x' % start, '--stop-address=0x%x' % end]
  # universal_newlines=True makes the output arrive as str on Python 3 too.
  objdump = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             universal_newlines=True)

  refs = set()
  try:
    for line in objdump.stdout:
      if '__static_initialization_and_destruction' in line:
        raise RuntimeError('code mentions '
                           '__static_initialization_and_destruction; '
                           'did you accidentally run this on a Debug binary?')
      match = disassembly_re.search(line)
      if not match:
        continue
      ref = match.group(1)
      if ref.startswith(('.LC', '_DYNAMIC')):
        # Ignore these, they are uninformative.
        continue
      if initializer_symbol_re.match(ref):
        # Probably a relative jump within this function.
        continue
      refs.add(ref)
  finally:
    # Release the pipe and reap objdump on success and on error alike.
    objdump.stdout.close()
    objdump.wait()

  return sorted(refs)

def main():
  """Print the symbols referenced by each static initializer of a binary.

  Expects exactly one positional command-line argument, the path of the
  binary.  With -d/--diffable the files are sorted and every output line
  carries the file name.

  Returns:
    0 on success.  Bad command-line usage does not return: optparse prints
    the usage message and exits the process.
  """
  parser = optparse.OptionParser(usage='%prog [option] filename')
  parser.add_option('-d', '--diffable', dest='diffable',
                    action='store_true', default=False,
                    help='Prints the filename on each line, for more easily '
                         'diff-able output. (Used by sizes.py)')
  opts, args = parser.parse_args()
  if len(args) != 1:
    # parser.error() prints the message and exits, so nothing after it runs.
    parser.error('expected exactly one filename argument')
  binary = args[0]

  demangler = Demangler()
  file_count = 0
  initializer_count = 0

  files = ParseNm(binary)
  if opts.diffable:
    files = sorted(files)
  for filename, addr, size in files:
    file_count += 1
    ref_output = []

    qualified_filename = QualifyFilenameAsProto(filename)

    if size == 2:
      # gcc generates a two-byte 'repz retq' initializer when there is a
      # ctor even when the ctor is empty.  This is fixed in gcc 4.6, but
      # Android uses gcc 4.4.
      ref_output.append('[empty ctor, but it still has cost on gcc <4.6]')
    else:
      for ref in ExtractSymbolReferences(binary, addr, addr+size):
        initializer_count += 1

        ref = demangler.Demangle(ref)
        # Keep trying to qualify the file name until one symbol pins it down.
        if qualified_filename == filename:
          qualified_filename = QualifyFilename(filename, ref)

        note = ''
        if ref in NOTES:
          note = NOTES[ref]
        elif ref.endswith('_2eproto()'):
          note = 'protocol compiler bug: crbug.com/105626'

        if note:
          ref_output.append('%s [%s]' % (ref, note))
        else:
          ref_output.append(ref)

    # Every print() below takes a single string, so the output is the same
    # whether print is a statement (Python 2) or a function (Python 3).
    if opts.diffable:
      print('\n'.join('# ' + qualified_filename + ' ' + r for r in ref_output))
    else:
      print('%s (initializer offset 0x%x size 0x%x)' % (qualified_filename,
                                                        addr, size))
      print(''.join('  %s\n' % r for r in ref_output))

  summary = 'Found %d static initializers in %d files.' % (initializer_count,
                                                           file_count)
  if opts.diffable:
    summary = '# ' + summary
  print(summary)

  return 0

# When executed as a script, run main() and use its result as the exit status.
if __name__ == '__main__':
  sys.exit(main())