Java程序  |  332行  |  11.83 KB

/*
 * Copyright (C) 2013 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;

import junit.framework.TestCase;

import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;

/**
 * Unit tests for {@link Utf8}.
 *
 * @author Jon Perlow
 * @author Martin Buchholz
 * @author Clément Roux
 */
@GwtCompatible(emulated = true)
public class Utf8Test extends TestCase {
  public void testEncodedLength_validStrings() {
    assertEquals(0, Utf8.encodedLength(""));
    assertEquals(11, Utf8.encodedLength("Hello world"));
    assertEquals(8, Utf8.encodedLength("Résumé"));
    assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
        + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
        + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
        + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
        + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
        + "哈都拕人翻譯做好多話。"));
    // A surrogate pair
    assertEquals(4, Utf8.encodedLength(
        newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
  }

  @GwtIncompatible("StringBuilder.appendCodePoint()")
  public void testEncodedLength_validStrings2() {
    HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
    utf8Lengths.put(0x00, 1);
    utf8Lengths.put(0x7f, 1);
    utf8Lengths.put(0x80, 2);
    utf8Lengths.put(0x7ff, 2);
    utf8Lengths.put(0x800, 3);
    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
    utf8Lengths.put(Character.MAX_CODE_POINT, 4);

    Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
    StringBuilder sb = new StringBuilder();
    Random rnd = new Random();
    for (int trial = 0; trial < 100; trial++) {
      sb.setLength(0);
      int utf8Length = 0;
      for (int i = 0; i < 6; i++) {
        Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
        sb.appendCodePoint(randomCodePoint);
        utf8Length += utf8Lengths.get(randomCodePoint);
        if (utf8Length != Utf8.encodedLength(sb)) {
          StringBuilder repro = new StringBuilder();
          for (int j = 0; j < sb.length(); j++) {
            repro.append(" " + (int) sb.charAt(j));  // GWT compatible
          }
          assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
        }
      }
    }
  }

  public void testEncodedLength_invalidStrings() {
    testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
    testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
    testEncodedLengthFails(
        newString(
            Character.MIN_HIGH_SURROGATE,
            Character.MIN_HIGH_SURROGATE), 0);
  }

  private static void testEncodedLengthFails(String invalidString,
      int invalidCodePointIndex) {
    try {
      Utf8.encodedLength(invalidString);
      fail();
    } catch (IllegalArgumentException expected) {
      assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
          expected.getMessage());
    }
  }

  // 128 - [chars 0x0000 to 0x007f]
  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x007f - 0x0000 + 1;

  // 128
  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1920 [chars 0x0080 to 0x07FF]
  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x07FF - 0x0080 + 1;

  // 18,304
  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
      // Both bytes are one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
      // The possible number of two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 2048
  private static final long THREE_BYTE_SURROGATES = 2 * 1024;

  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;

  // 2,650,112
  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
      // One two byte character and a one byte character
      2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
       // Three byte characters
      THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1,048,576 [chars 0x10000L to 0x10FFFF]
  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x10FFFF - 0x10000L + 1;

  // 289,571,839
  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
      // One and three byte characters
      2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Two two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Permutations of one and two byte characters
      3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Four byte characters
      FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  /** Tests that round tripping of all two byte permutations work. */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_1Byte() {
    testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all two byte permutations work. */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_2Bytes() {
    testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all three byte permutations work. */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_3Bytes() {
    testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /**
   * Tests that round tripping of a sample of four byte permutations work.
   * All permutations are prohibitively expensive to test for automated runs.
   * This method tests specific four-byte cases.
   */
  public void testIsWellFormed_4BytesSamples() {
    // Valid 4 byte.
    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
    // Bad trailing bytes
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
    // Special cases for byte2
    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
  }

  /** Tests some hard-coded test cases. */
  public void testSomeSequences() {
    // Empty
    assertWellFormed();
    // One-byte characters, including control characters
    assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
    // Two-byte characters
    assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
    // Three-byte characters
    assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
    // Four-byte characters
    // "\u024B62\u024B62"
    assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
    // Mixed string
    // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
    assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
        0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
        0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
    // Not a valid string
    assertNotWellFormed(-1, 0, -1, 0);
  }

  public void testShardsHaveExpectedRoundTrippables() {
    // A sanity check.
    long actual = 0;
    for (long expected : generateFourByteShardsExpectedRunnables()) {
      actual += expected;
    }
    assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
  }

  private String newString(char... chars) {
    return new String(chars);
  }

  private byte[] toByteArray(int... bytes) {
    byte[] realBytes = new byte[bytes.length];
    for (int i = 0; i < bytes.length; i++) {
      realBytes[i] = (byte) bytes[i];
    }
    return realBytes;
  }

  private void assertWellFormed(int... bytes) {
    assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
  }

  private void assertNotWellFormed(int... bytes) {
    assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
  }

  private static long[] generateFourByteShardsExpectedRunnables() {
    long[] expected = new long[128];
    // 0-63 are all 5300224
    for (int i = 0; i <= 63; i++) {
      expected[i] = 5300224;
    }
    // 97-111 are all 2342912
    for (int i = 97; i <= 111; i++) {
     expected[i] = 2342912;
    }
    // 113-117 are all 1048576
    for (int i = 113; i <= 117; i++) {
      expected[i] = 1048576;
    }
    // One offs
    expected[112] = 786432;
    expected[118] = 786432;
    expected[119] = 1048576;
    expected[120] = 458752;
    expected[121] = 524288;
    expected[122] = 65536;
    // Anything not assigned was the default 0.
    return expected;
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes
   * specified.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   */
  @GwtIncompatible("java.nio.charset.Charset")
  private static void testBytes(int numBytes, long expectedCount) {
    testBytes(numBytes, expectedCount, 0, -1);
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes
   * specified. This overload is useful for debugging to get the loop to start
   * at a certain character.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   * @param start the starting bytes encoded as a long as big-endian
   * @param lim the limit of bytes to process encoded as a long as big-endian,
   *     or -1 to mean the max limit for numBytes
   */
  @GwtIncompatible("java.nio.charset.Charset")
  private static void testBytes(int numBytes, long expectedCount, long start,
      long lim) {
    byte[] bytes = new byte[numBytes];
    if (lim == -1) {
      lim = 1L << (numBytes * 8);
    }
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      long tmpByteChar = byteChar;
      for (int i = 0; i < numBytes; i++) {
        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
        tmpByteChar = tmpByteChar >> 8;
      }
      boolean isRoundTrippable = Utf8.isWellFormed(bytes);
      assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
      boolean bytesEqual;
      try {
        String s = new String(bytes, Charsets.UTF_8.name());
        byte[] bytesReencoded = s.getBytes(Charsets.UTF_8.name());
        bytesEqual = Arrays.equals(bytes, bytesReencoded);
      } catch (UnsupportedEncodingException e) {
        throw new AssertionError(e);
      }

      if (bytesEqual != isRoundTrippable) {
        fail();
      }
      if (isRoundTrippable) {
        countRoundTripped++;
      }
    }
    assertEquals(expectedCount, countRoundTripped);
  }
}