/* * Copyright (C) 2008 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * UTF-8 and Unicode string manipulation, plus java/lang/String convenience * functions. * * In most cases we populate the fields in the String object directly, * rather than going through an instance field lookup. */ #include "Dalvik.h" #include <stdlib.h> /* * Initialize string globals. * * This isn't part of the VM init sequence because it's hard to get the * timing right -- we need it to happen after java/lang/String has been * loaded, but before anybody wants to use a string. It's easiest to * just initialize it on first use. * * In some unusual circumstances (e.g. trying to throw an exception because * String implements java/lang/CharSequence, but CharSequence doesn't exist) * we can try to create an exception string internally before anything has * really tried to use String. In that case we basically self-destruct. */ static bool stringStartup() { if (gDvm.javaLangStringReady < 0) { LOGE("ERROR: reentrant string initialization\n"); assert(false); return false; } assert(gDvm.javaLangStringReady == 0); gDvm.javaLangStringReady = -1; if (gDvm.classJavaLangString == NULL) gDvm.classJavaLangString = dvmFindSystemClassNoInit("Ljava/lang/String;"); gDvm.offJavaLangString_value = dvmFindFieldOffset(gDvm.classJavaLangString, "value", "[C"); gDvm.offJavaLangString_count = dvmFindFieldOffset(gDvm.classJavaLangString, "count", "I"); gDvm.offJavaLangString_offset = dvmFindFieldOffset(gDvm.classJavaLangString, "offset", "I"); gDvm.offJavaLangString_hashCode = dvmFindFieldOffset(gDvm.classJavaLangString, "hashCode", "I"); if (gDvm.offJavaLangString_value < 0 || gDvm.offJavaLangString_count < 0 || gDvm.offJavaLangString_offset < 0 || gDvm.offJavaLangString_hashCode < 0) { LOGE("VM-required field missing from java/lang/String\n"); return false; } gDvm.javaLangStringReady = 1; return true; } /* * Discard heap-allocated storage. */ void dvmStringShutdown() { // currently unused } /* * Compute a hash code on a UTF-8 string, for use with internal hash tables. * * This may or may not yield the same results as the java/lang/String * computeHashCode() function. (To make sure this doesn't get abused, * I'm initializing the hash code to 1 so they *don't* match up.) * * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute * the hash with the result. That way, if something encoded the same * character in two different ways, the hash value would be the same. For * our purposes that isn't necessary. */ u4 dvmComputeUtf8Hash(const char* utf8Str) { u4 hash = 1; while (*utf8Str != '\0') hash = hash * 31 + *utf8Str++; return hash; } /* * Like "strlen", but for strings encoded with "modified" UTF-8. * * The value returned is the number of characters, which may or may not * be the same as the number of bytes. * * (If this needs optimizing, try: mask against 0xa0, shift right 5, * get increment {1-3} from table of 8 values.) */ int dvmUtf8Len(const char* utf8Str) { int ic, len = 0; while ((ic = *utf8Str++) != '\0') { len++; if ((ic & 0x80) != 0) { /* two- or three-byte encoding */ utf8Str++; if ((ic & 0x20) != 0) { /* three-byte encoding */ utf8Str++; } } } return len; } /* * Convert a "modified" UTF-8 string to UTF-16. */ void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str) { while (*utf8Str != '\0') *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str); } /* * Given a UTF-16 string, compute the length of the corresponding UTF-8 * string in bytes. */ static int utf16_utf8ByteLen(const u2* utf16Str, int len) { int utf8Len = 0; while (len--) { unsigned int uic = *utf16Str++; /* * The most common case is (uic > 0 && uic <= 0x7f). */ if (uic == 0 || uic > 0x7f) { if (uic > 0x07ff) utf8Len += 3; else /*(uic > 0x7f || uic == 0) */ utf8Len += 2; } else utf8Len++; } return utf8Len; } /* * Convert a UTF-16 string to UTF-8. * * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(), * not just "len". */ static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len) { assert(len >= 0); while (len--) { unsigned int uic = *utf16Str++; /* * The most common case is (uic > 0 && uic <= 0x7f). */ if (uic == 0 || uic > 0x7f) { if (uic > 0x07ff) { *utf8Str++ = (uic >> 12) | 0xe0; *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80; *utf8Str++ = (uic & 0x3f) | 0x80; } else /*(uic > 0x7f || uic == 0)*/ { *utf8Str++ = (uic >> 6) | 0xc0; *utf8Str++ = (uic & 0x3f) | 0x80; } } else { *utf8Str++ = uic; } } *utf8Str = '\0'; } /* * Use the java/lang/String.computeHashCode() algorithm. */ static inline u4 dvmComputeUtf16Hash(const u2* utf16Str, int len) { u4 hash = 0; while (len--) hash = hash * 31 + *utf16Str++; return hash; } u4 dvmComputeStringHash(StringObject* strObj) { ArrayObject* chars = (ArrayObject*) dvmGetFieldObject((Object*) strObj, gDvm.offJavaLangString_value); int offset, len; len = dvmGetFieldInt((Object*) strObj, gDvm.offJavaLangString_count); offset = dvmGetFieldInt((Object*) strObj, gDvm.offJavaLangString_offset); return dvmComputeUtf16Hash((u2*) chars->contents + offset, len); } /* * Create a new java/lang/String object, using the string data in "utf8Str". * * Note that "allocFlags" affects both of the allocations here. If you * use ALLOC_DONT_TRACK in a context where a GC could happen between the * two allocations, you could lose the array reference. * * Returns NULL and throws an exception on failure. */ StringObject* dvmCreateStringFromCstr(const char* utf8Str, int allocFlags) { assert(utf8Str != NULL); return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str), allocFlags); } /* * Create a java/lang/String from a C string, given its UTF-16 length * (number of UTF-16 code points). * * The caller must call dvmReleaseTrackedAlloc() on the return value or * use a non-default value for "allocFlags". It is never appropriate * to use ALLOC_DONT_TRACK with this function. * * Returns NULL and throws an exception on failure. */ StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str, u4 utf16Length, int allocFlags) { StringObject* newObj; ArrayObject* chars; u4 hashCode = 0; //LOGV("Creating String from '%s'\n", utf8Str); assert(allocFlags != ALLOC_DONT_TRACK); /* don't currently need */ assert(utf8Str != NULL); if (gDvm.javaLangStringReady <= 0) { if (!stringStartup()) return NULL; } /* init before alloc */ if (!dvmIsClassInitialized(gDvm.classJavaLangString) && !dvmInitClass(gDvm.classJavaLangString)) { return NULL; } newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString, allocFlags); if (newObj == NULL) return NULL; chars = dvmAllocPrimitiveArray('C', utf16Length, allocFlags); if (chars == NULL) { dvmReleaseTrackedAllocIFN((Object*) newObj, NULL, allocFlags); return NULL; } dvmConvertUtf8ToUtf16((u2*)chars->contents, utf8Str); hashCode = dvmComputeUtf16Hash((u2*) chars->contents, utf16Length); dvmSetFieldObject((Object*)newObj, gDvm.offJavaLangString_value, (Object*)chars); dvmReleaseTrackedAllocIFN((Object*) chars, NULL, allocFlags); dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_count, utf16Length); dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_hashCode, hashCode); /* leave offset set to zero */ /* debugging stuff */ //dvmDumpObject((Object*)newObj); //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, utf16Length * 2, // kHexDumpMem); /* caller may need to dvmReleaseTrackedAlloc(newObj) */ return newObj; } /* * Create a new java/lang/String object, using the Unicode data. */ StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len) { StringObject* newObj; ArrayObject* chars; u4 hashCode = 0; /* we allow a null pointer if the length is zero */ assert(len == 0 || unichars != NULL); if (gDvm.javaLangStringReady <= 0) { if (!stringStartup()) return NULL; } /* init before alloc */ if (!dvmIsClassInitialized(gDvm.classJavaLangString) && !dvmInitClass(gDvm.classJavaLangString)) { return NULL; } newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT); if (newObj == NULL) return NULL; chars = dvmAllocPrimitiveArray('C', len, ALLOC_DEFAULT); if (chars == NULL) { dvmReleaseTrackedAlloc((Object*) newObj, NULL); return NULL; } if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2)); hashCode = dvmComputeUtf16Hash((u2*) chars->contents, len); dvmSetFieldObject((Object*)newObj, gDvm.offJavaLangString_value, (Object*)chars); dvmReleaseTrackedAlloc((Object*) chars, NULL); dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_count, len); dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_hashCode, hashCode); /* leave offset set to zero */ /* debugging stuff */ //dvmDumpObject((Object*)newObj); //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, len*2, kHexDumpMem); /* caller must dvmReleaseTrackedAlloc(newObj) */ return newObj; } /* * Create a new C string from a java/lang/String object. * * Returns NULL if the object is NULL. */ char* dvmCreateCstrFromString(StringObject* jstr) { char* newStr; ArrayObject* chars; int len, byteLen, offset; const u2* data; assert(gDvm.javaLangStringReady > 0); if (jstr == NULL) return NULL; len = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_count); offset = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_offset); chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, gDvm.offJavaLangString_value); data = (const u2*) chars->contents + offset; assert(offset + len <= (int) chars->length); byteLen = utf16_utf8ByteLen(data, len); newStr = (char*) malloc(byteLen+1); if (newStr == NULL) return NULL; convertUtf16ToUtf8(newStr, data, len); return newStr; } /* * Create a UTF-8 C string from a region of a java/lang/String. (Used by * the JNI GetStringUTFRegion call.) */ void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len, char* buf) { const u2* data; data = dvmStringChars(jstr) + start; convertUtf16ToUtf8(buf, data, len); } /* * Compute the length, in modified UTF-8, of a java/lang/String object. * * Does not include the terminating null byte. */ int dvmStringUtf8ByteLen(StringObject* jstr) { ArrayObject* chars; int len, offset; const u2* data; assert(gDvm.javaLangStringReady > 0); if (jstr == NULL) return 0; // should we throw something? assert? len = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_count); offset = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_offset); chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, gDvm.offJavaLangString_value); data = (const u2*) chars->contents + offset; assert(offset + len <= (int) chars->length); return utf16_utf8ByteLen(data, len); } /* * Get the string's length. */ int dvmStringLen(StringObject* jstr) { return dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_count); } /* * Get the char[] object from the String. */ ArrayObject* dvmStringCharArray(StringObject* jstr) { return (ArrayObject*) dvmGetFieldObject((Object*) jstr, gDvm.offJavaLangString_value); } /* * Get the string's data. */ const u2* dvmStringChars(StringObject* jstr) { ArrayObject* chars; int offset; offset = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_offset); chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr, gDvm.offJavaLangString_value); return (const u2*) chars->contents + offset; } /* * Compare two String objects. * * This is a dvmHashTableLookup() callback. The function has already * compared their hash values; we need to do a full compare to ensure * that the strings really match. */ int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2) { const StringObject* strObj1 = (const StringObject*) vstrObj1; const StringObject* strObj2 = (const StringObject*) vstrObj2; ArrayObject* chars1; ArrayObject* chars2; int len1, len2, offset1, offset2; assert(gDvm.javaLangStringReady > 0); /* get offset and length into char array; all values are in 16-bit units */ len1 = dvmGetFieldInt((Object*) strObj1, gDvm.offJavaLangString_count); offset1 = dvmGetFieldInt((Object*) strObj1, gDvm.offJavaLangString_offset); len2 = dvmGetFieldInt((Object*) strObj2, gDvm.offJavaLangString_count); offset2 = dvmGetFieldInt((Object*) strObj2, gDvm.offJavaLangString_offset); if (len1 != len2) return len1 - len2; chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1, gDvm.offJavaLangString_value); chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2, gDvm.offJavaLangString_value); /* damage here actually indicates a broken java/lang/String */ assert(offset1 + len1 <= (int) chars1->length); assert(offset2 + len2 <= (int) chars2->length); return memcmp((const u2*) chars1->contents + offset1, (const u2*) chars2->contents + offset2, len1 * sizeof(u2)); }