escaping.cc - Android社区 - https://www.androidos.net.cn/

#include "strings/escaping.h"

#include <cassert>

#include "android-base/logging.h"
#include "strings/ascii_ctype.h"

namespace dynamic_depth {

// ----------------------------------------------------------------------
// ptrdiff_t Base64Unescape() - base64 decoder
// ptrdiff_t Base64Escape() - base64 encoder
// ptrdiff_t WebSafeBase64Unescape() - Google's variation of base64 decoder
// ptrdiff_t WebSafeBase64Escape() - Google's variation of base64 encoder
//
// Check out
// http://tools.ietf.org/html/rfc2045 for formal description, but what we
// care about is that...
//   Take the encoded stuff in groups of 4 characters and turn each
//   character into a code 0 to 63 thus:
//           A-Z map to 0 to 25
//           a-z map to 26 to 51
//           0-9 map to 52 to 61
//           +(- for WebSafe) maps to 62
//           /(_ for WebSafe) maps to 63
//   There will be four numbers, all less than 64 which can be represented
//   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
//   Arrange the 6 digit binary numbers into three bytes as such:
//   aaaaaabb bbbbcccc ccdddddd
//   Equals signs (one or two) are used at the end of the encoded block to
//   indicate that the text was not an integer multiple of three bytes long.
// ----------------------------------------------------------------------

bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;
  int decode = 0;
  int state = 0;
  unsigned int ch = 0;
  unsigned int temp = 0;

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
#define GET_INPUT(label, remain)                          \
  label:                                                  \
  --szsrc;                                                \
  ch = *src++;                                            \
  decode = unbase64[ch];                                  \
  if (decode < 0) {                                       \
    if (ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                   \
    break;                                                \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = decode;
        GET_INPUT(second, 3);
        temp = (temp << 6) | decode;
        GET_INPUT(third, 2);
        temp = (temp << 6) | decode;
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | decode;
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
        decode = -1;
        ch = '\0';
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = temp;
      temp >>= 8;
      dest[destidx + 1] = temp;
      temp >>= 8;
      dest[destidx] = temp;
      destidx += 3;
    }
  } else {
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
        decode = -1;
        ch = '\0';
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != '\0' && ch != kPad64Equals && ch != kPad64Dot &&
      !ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (ascii_isspace(ch)) {
          continue;
        } else if (ch == '\0') {
          break;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | decode;
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = temp;
          temp >>= 8;
          dest[destidx + 1] = temp;
          temp >>= 8;
          dest[destidx] = temp;
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = temp;
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = temp;
        temp >>= 8;
        dest[destidx] = temp;
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is a google extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0 && *src) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}

// The arrays below were generated by the following code
// #include <sys/time.h>
// #include <stdlib.h>
// #include <string.h>
// main()
// {
//   static const char Base64[] =
//     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//   char* pos;
//   int idx, i, j;
//   printf("    ");
//   for (i = 0; i < 255; i += 8) {
//     for (j = i; j < i + 8; j++) {
//       pos = strchr(Base64, j);
//       if ((pos == NULL) || (j == 0))
//         idx = -1;
//       else
//         idx = pos - Base64;
//       if (idx == -1)
//         printf(" %2d,     ", idx);
//       else
//         printf(" %2d/*%c*/,", idx, j);
//     }
//     printf("\n    ");
//   }
// }
//
// where the value of "Base64[]" was replaced by one of the base-64 conversion
// tables from the functions below.
static const signed char kUnBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       62 /*+*/, -1,       -1,       -1,       63 /*/ */, 52 /*0*/,
    53 /*1*/, 54 /*2*/, 55 /*3*/, 56 /*4*/, 57 /*5*/, 58 /*6*/,  59 /*7*/,
    60 /*8*/, 61 /*9*/, -1,       -1,       -1,       -1,        -1,
    -1,       -1,       0 /*A*/,  1 /*B*/,  2 /*C*/,  3 /*D*/,   4 /*E*/,
    5 /*F*/,  6 /*G*/,  07 /*H*/, 8 /*I*/,  9 /*J*/,  10 /*K*/,  11 /*L*/,
    12 /*M*/, 13 /*N*/, 14 /*O*/, 15 /*P*/, 16 /*Q*/, 17 /*R*/,  18 /*S*/,
    19 /*T*/, 20 /*U*/, 21 /*V*/, 22 /*W*/, 23 /*X*/, 24 /*Y*/,  25 /*Z*/,
    -1,       -1,       -1,       -1,       -1,       -1,        26 /*a*/,
    27 /*b*/, 28 /*c*/, 29 /*d*/, 30 /*e*/, 31 /*f*/, 32 /*g*/,  33 /*h*/,
    34 /*i*/, 35 /*j*/, 36 /*k*/, 37 /*l*/, 38 /*m*/, 39 /*n*/,  40 /*o*/,
    41 /*p*/, 42 /*q*/, 43 /*r*/, 44 /*s*/, 45 /*t*/, 46 /*u*/,  47 /*v*/,
    48 /*w*/, 49 /*x*/, 50 /*y*/, 51 /*z*/, -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1,       -1,       -1,        -1,
    -1,       -1,       -1,       -1};
static const signed char kUnWebSafeBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       62 /*-*/, -1,       -1,       52 /*0*/,
    53 /*1*/, 54 /*2*/, 55 /*3*/, 56 /*4*/, 57 /*5*/, 58 /*6*/, 59 /*7*/,
    60 /*8*/, 61 /*9*/, -1,       -1,       -1,       -1,       -1,
    -1,       -1,       0 /*A*/,  1 /*B*/,  2 /*C*/,  3 /*D*/,  4 /*E*/,
    5 /*F*/,  6 /*G*/,  07 /*H*/, 8 /*I*/,  9 /*J*/,  10 /*K*/, 11 /*L*/,
    12 /*M*/, 13 /*N*/, 14 /*O*/, 15 /*P*/, 16 /*Q*/, 17 /*R*/, 18 /*S*/,
    19 /*T*/, 20 /*U*/, 21 /*V*/, 22 /*W*/, 23 /*X*/, 24 /*Y*/, 25 /*Z*/,
    -1,       -1,       -1,       -1,       63 /*_*/, -1,       26 /*a*/,
    27 /*b*/, 28 /*c*/, 29 /*d*/, 30 /*e*/, 31 /*f*/, 32 /*g*/, 33 /*h*/,
    34 /*i*/, 35 /*j*/, 36 /*k*/, 37 /*l*/, 38 /*m*/, 39 /*n*/, 40 /*o*/,
    41 /*p*/, 42 /*q*/, 43 /*r*/, 44 /*s*/, 45 /*t*/, 46 /*u*/, 47 /*v*/,
    48 /*w*/, 49 /*x*/, 50 /*y*/, 51 /*z*/, -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1};

static bool Base64UnescapeInternal(const char* src, size_t slen, string* dest,
                                   const signed char* unbase64) {
  // Determine the size of the output string.  Base64 encodes every 3 bytes into
  // 4 characters.  any leftover chars are added directly for good measure.
  // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
  const size_t dest_len = 3 * (slen / 4) + (slen % 4);

  dest->resize(dest_len);

  // We are getting the destination buffer by getting the beginning of the
  // string and converting it into a char *.
  size_t len;
  const bool ok =
      Base64UnescapeInternal(src, slen, dest->empty() ? NULL : &*dest->begin(),
                             dest_len, unbase64, &len);
  if (!ok) {
    dest->clear();
    return false;
  }

  // could be shorter if there was padding
  DCHECK_LE(len, dest_len);
  dest->erase(len);

  return true;
}

bool Base64Unescape(const string& src, string* dest) {
  return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
}

bool WebSafeBase64Unescape(const string& src, string* dest) {
  return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
}

// Base64Escape
//
// NOTE: We have to use an unsigned type for src because code built
// in the the /google tree treats characters as signed unless
// otherwised specified.
int Base64EscapeInternal(const unsigned char* src, int szsrc, char* dest,
                         int szdest, const char* base64, bool do_padding) {
  static const char kPad64 = '=';

  if (szsrc <= 0) return 0;

  char* cur_dest = dest;
  const unsigned char* cur_src = src;

  // Three bytes of data encodes to four characters of cyphertext.
  // So we can pump through three-byte chunks atomically.
  while (szsrc > 2) { /* keep going until we have less than 24 bits */
    if ((szdest -= 4) < 0) return 0;
    cur_dest[0] = base64[cur_src[0] >> 2];
    cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
    cur_dest[2] = base64[((cur_src[1] & 0x0f) << 2) + (cur_src[2] >> 6)];
    cur_dest[3] = base64[cur_src[2] & 0x3f];

    cur_dest += 4;
    cur_src += 3;
    szsrc -= 3;
  }

  /* now deal with the tail (<=2 bytes) */
  switch (szsrc) {
    case 0:
      // Nothing left; nothing more to do.
      break;
    case 1:
      // One byte left: this encodes to two characters, and (optionally)
      // two pad characters to round out the four-character cypherblock.
      if ((szdest -= 2) < 0) return 0;
      cur_dest[0] = base64[cur_src[0] >> 2];
      cur_dest[1] = base64[(cur_src[0] & 0x03) << 4];
      cur_dest += 2;
      if (do_padding) {
        if ((szdest -= 2) < 0) return 0;
        cur_dest[0] = kPad64;
        cur_dest[1] = kPad64;
        cur_dest += 2;
      }
      break;
    case 2:
      // Two bytes left: this encodes to three characters, and (optionally)
      // one pad character to round out the four-character cypherblock.
      if ((szdest -= 3) < 0) return 0;
      cur_dest[0] = base64[cur_src[0] >> 2];
      cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
      cur_dest[2] = base64[(cur_src[1] & 0x0f) << 2];
      cur_dest += 3;
      if (do_padding) {
        if ((szdest -= 1) < 0) return 0;
        cur_dest[0] = kPad64;
        cur_dest += 1;
      }
      break;
    default:
      // Should not be reached: blocks of 3 bytes are handled
      // in the while loop before this switch statement.
      CHECK(false) << "Logic problem? szsrc = " << szsrc;
      break;
  }
  return (cur_dest - dest);
}

static const char kBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Digit conversion.
static const char kHexTable[513] =
    "000102030405060708090a0b0c0d0e0f"
    "101112131415161718191a1b1c1d1e1f"
    "202122232425262728292a2b2c2d2e2f"
    "303132333435363738393a3b3c3d3e3f"
    "404142434445464748494a4b4c4d4e4f"
    "505152535455565758595a5b5c5d5e5f"
    "606162636465666768696a6b6c6d6e6f"
    "707172737475767778797a7b7c7d7e7f"
    "808182838485868788898a8b8c8d8e8f"
    "909192939495969798999a9b9c9d9e9f"
    "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
    "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
    "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
    "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
    "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
    "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";

size_t CalculateBase64EscapedLenInternal(size_t input_len, bool do_padding) {
  // Base64 encodes three bytes of input at a time. If the input is not
  // divisible by three, we pad as appropriate.
  //
  // (from http://tools.ietf.org/html/rfc3548)
  // Special processing is performed if fewer than 24 bits are available
  // at the end of the data being encoded.  A full encoding quantum is
  // always completed at the end of a quantity.  When fewer than 24 input
  // bits are available in an input group, zero bits are added (on the
  // right) to form an integral number of 6-bit groups.  Padding at the
  // end of the data is performed using the '=' character.  Since all base
  // 64 input is an integral number of octets, only the following cases
  // can arise:

  // Base64 encodes each three bytes of input into four bytes of output.
  size_t len = (input_len / 3) * 4;

  if (input_len % 3 == 0) {
    // (from http://tools.ietf.org/html/rfc3548)
    // (1) the final quantum of encoding input is an integral multiple of 24
    // bits; here, the final unit of encoded output will be an integral
    // multiple of 4 characters with no "=" padding,
  } else if (input_len % 3 == 1) {
    // (from http://tools.ietf.org/html/rfc3548)
    // (2) the final quantum of encoding input is exactly 8 bits; here, the
    // final unit of encoded output will be two characters followed by two
    // "=" padding characters, or
    len += 2;
    if (do_padding) {
      len += 2;
    }
  } else {  // (input_len % 3 == 2)
    // (from http://tools.ietf.org/html/rfc3548)
    // (3) the final quantum of encoding input is exactly 16 bits; here, the
    // final unit of encoded output will be three characters followed by one
    // "=" padding character.
    len += 3;
    if (do_padding) {
      len += 1;
    }
  }

  assert(len >= input_len);  // make sure we didn't overflow
  return len;
}

void Base64EscapeInternal(const unsigned char* src, size_t szsrc, string* dest,
                          bool do_padding, const char* base64_chars) {
  const size_t calc_escaped_size =
      CalculateBase64EscapedLenInternal(szsrc, do_padding);
  dest->resize(calc_escaped_size);
  const int escaped_len = Base64EscapeInternal(
      src, static_cast<int>(szsrc), dest->empty() ? NULL : &*dest->begin(),
      static_cast<int>(dest->size()), base64_chars, do_padding);
  DCHECK_EQ(calc_escaped_size, escaped_len);
  dest->erase(escaped_len);
}

void Base64Escape(const unsigned char* src, ptrdiff_t szsrc, string* dest,
                  bool do_padding) {
  if (szsrc < 0) return;
  Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
}

// This is a templated function so that T can be either a char* or a string.
template <typename T>
static void b2a_hex_t(const unsigned char* src, T dest, ptrdiff_t num) {
  auto dest_ptr = &dest[0];
  for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
    const char* hex_p = &kHexTable[*src_ptr * 2];
    std::copy(hex_p, hex_p + 2, dest_ptr);
  }
}

string b2a_hex(const char* b, ptrdiff_t len) {
  string result;
  result.resize(len << 1);
  b2a_hex_t<string&>(reinterpret_cast<const unsigned char*>(b), result, len);
  return result;
}

}  // namespace dynamic_depth