/*
* Copyright (C) 2011 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.common.base;
import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;
import java.util.Random;
/**
* Benchmark for the {@link Utf8} class.
*
*
* @author Martin Buchholz
*/
public class Utf8Benchmark {

  /**
   * Benchmark parameter describing how "wide" the generated text is. The wrapped
   * {@link #value} is used by {@code setUp} as the <em>exclusive</em> upper bound
   * of the randomly generated code points.
   */
  static class MaxCodePoint {
    /** Largest usable bound: one past {@link Character#MAX_CODE_POINT}. */
    private static final int MAX_BOUND = Character.MAX_CODE_POINT + 1;

    /** Exclusive upper bound of the code points to generate; always in [1, 0x110000]. */
    final int value;

    /**
     * Convert the input string to a code point. Accepts every numeral understood
     * by {@link Integer#decode} (regular decimal numerals, hex strings, ...), and
     * some symbolic names meaningful to humans, matched case-insensitively.
     *
     * @param userFriendly numeral or symbolic name
     * @return the decoded code point (not range-checked here)
     * @throws IllegalArgumentException if the input is neither a numeral nor a
     *     recognized symbolic name
     */
    private static int decode(String userFriendly) {
      try {
        return Integer.decode(userFriendly);
      } catch (NumberFormatException ignored) {
        // Not a numeral; try the symbolic names below.
      }
      if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
        // 1-byte UTF-8 sequences - "American" ASCII text
        return 0x80;
      } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
        // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
        // sequences - "Western European" text
        return 0x90;
      } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
        // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
        return 0x100;
      } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
        // Mostly 2-byte UTF-8 sequences - "European" text
        return 0x800;
      } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
        // Mostly 3-byte UTF-8 sequences - "Asian" text
        return Character.MIN_SUPPLEMENTARY_CODE_POINT;
      } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
        // Mostly 4-byte UTF-8 sequences - "rare exotic" text
        return Character.MAX_CODE_POINT;
      } else {
        throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
      }
    }

    /**
     * Static factory equivalent to {@link #MaxCodePoint(String)}.
     * NOTE(review): presumably the hook the benchmark framework uses to convert
     * {@code @Param} strings - confirm against the Caliper documentation.
     *
     * @throws IllegalArgumentException see {@link #MaxCodePoint(String)}
     */
    public static MaxCodePoint valueOf(String userFriendly) {
      return new MaxCodePoint(userFriendly);
    }

    /**
     * Decodes {@code userFriendly} and checks that the result is usable as a
     * code point bound.
     *
     * @param userFriendly numeral or symbolic name, see {@link #decode}
     * @throws IllegalArgumentException if the string cannot be decoded, or if the
     *     decoded bound is not in the range [1, 0x110000]
     */
    public MaxCodePoint(String userFriendly) {
      int decoded = decode(userFriendly);
      // Fail fast: a non-positive bound is rejected by Random.nextInt(int), and a
      // bound above 0x110000 can yield values StringBuilder.appendCodePoint
      // rejects. Both would otherwise only surface later, inside setUp().
      if (decoded <= 0 || decoded > MAX_BOUND) {
        throw new IllegalArgumentException(
            "Code point bound out of range [1, 0x110000]: " + userFriendly);
      }
      value = decoded;
    }
  }

  /**
   * Arbitrary encoded length the benchmarks do not expect to observe. Comparing
   * each result against it keeps the measured call from being optimized away.
   */
  private static final int UNEXPECTED_LENGTH = 1237482374;

  /**
   * The default values of maxCodePoint below provide pretty good
   * performance models of different kinds of common human text.
   * @see MaxCodePoint#decode
   */
  @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;

  /** Number of independent texts generated by {@link #setUp}. */
  @Param({"100"}) int stringCount;

  /**
   * Number of code points appended to each text. Supplementary code points take
   * two Java chars, so the resulting strings may be longer than this.
   */
  @Param({"16384"}) int charCount;

  private CharSequence[] seqs; // actually, all StringBuilders
  private String[] strings;
  private byte[][] byteArrays;

  /**
   * Compute arrays of valid unicode text, and store it in 3 forms:
   * byte arrays, Strings, and StringBuilders (in a CharSequence[] to
   * make it a little harder for the JVM).
   */
  @BeforeExperiment void setUp() {
    // Constant seed, so separate runs measure identical data.
    final long seed = 99;
    final Random rnd = new Random(seed);
    seqs = new CharSequence[stringCount];
    strings = new String[stringCount];
    byteArrays = new byte[stringCount][];
    for (int i = 0; i < stringCount; i++) {
      StringBuilder sb = new StringBuilder();
      for (int j = 0; j < charCount; j++) {
        sb.appendCodePoint(nextCodePoint(rnd));
      }
      seqs[i] = sb;
      strings[i] = sb.toString();
      byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
    }
  }

  /**
   * Draws values below {@code maxCodePoint.value} from {@code rnd} until one is
   * not in the surrogate range, and returns it.
   */
  private int nextCodePoint(Random rnd) {
    int codePoint;
    // discard illegal surrogate "codepoints"
    do {
      codePoint = rnd.nextInt(maxCodePoint.value);
    } while (isSurrogate(codePoint));
    return codePoint;
  }

  /**
   * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays
   * containing pseudo-randomly-generated codePoints less than {@code
   * maxCodePoint}. A constant seed is used, so separate runs perform
   * identical computations.
   *
   * @param reps number of passes over all generated byte arrays
   */
  @Benchmark void isWellFormed(int reps) {
    for (int i = 0; i < reps; i++) {
      for (byte[] byteArray : byteArrays) {
        if (!Utf8.isWellFormed(byteArray)) {
          throw new Error("unexpected invalid UTF-8");
        }
      }
    }
  }

  /**
   * Benchmarks {@link Utf8#encodedLength} on valid strings containing
   * pseudo-randomly-generated codePoints less than {@code
   * maxCodePoint}. A constant seed is used, so separate runs perform
   * identical computations.
   *
   * @param reps number of passes over all generated strings
   */
  @Benchmark void lengthOfString(int reps) {
    for (int i = 0; i < reps; i++) {
      for (String string : strings) {
        if (Utf8.encodedLength(string) == UNEXPECTED_LENGTH) {
          throw new Error("Unlikely! We're just defeating the optimizer!");
        }
      }
    }
  }

  /**
   * Benchmarks {@link Utf8#encodedLength} on valid StringBuilders containing
   * pseudo-randomly-generated codePoints less than {@code
   * maxCodePoint}. A constant seed is used, so separate runs perform
   * identical computations.
   *
   * @param reps number of passes over all generated StringBuilders
   */
  @Benchmark void lengthOfStringBuilder(int reps) {
    for (int i = 0; i < reps; i++) {
      for (CharSequence seq : seqs) {
        if (Utf8.encodedLength(seq) == UNEXPECTED_LENGTH) {
          throw new Error("Unlikely! We're just defeating the optimizer!");
        }
      }
    }
  }

  /**
   * Returns whether {@code c} lies in the surrogate range
   * [{@link Character#MIN_HIGH_SURROGATE}, {@link Character#MAX_LOW_SURROGATE}].
   * Character.isSurrogate was added in Java SE 7.
   */
  private static boolean isSurrogate(int c) {
    return (Character.MIN_HIGH_SURROGATE <= c &&
        c <= Character.MAX_LOW_SURROGATE);
  }
}