/*
 * Copyright (C) 2005 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <utils/String8.h>

#include <utils/Log.h>
#include <utils/String16.h>
#include <utils/TextOutput.h>
#include <utils/threads.h>

#include <private/utils/Static.h>

#include <ctype.h>

/*
 * The functions that live outside the android namespace are defined below
 * it, because they use functions and constants from that namespace.
 */

// ---------------------------------------------------------------------------

namespace android {

// Applied as (c | kByteMark) & kByteMask to turn the low six bits of a
// code point into a UTF-8 continuation byte (10xxxxxx).
static const char32_t kByteMask = 0x000000BF;
static const char32_t kByteMark = 0x00000080;

// Surrogates aren't valid for UTF-32 characters, so define some
// constants that will let us screen them out.
static const char32_t kUnicodeSurrogateHighStart  = 0x0000D800;
static const char32_t kUnicodeSurrogateHighEnd    = 0x0000DBFF;
static const char32_t kUnicodeSurrogateLowStart   = 0x0000DC00;
static const char32_t kUnicodeSurrogateLowEnd     = 0x0000DFFF;
static const char32_t kUnicodeSurrogateStart      = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd        = kUnicodeSurrogateLowEnd;
static const char32_t kUnicodeMaxCodepoint        = 0x0010FFFF;

// Mask used to set appropriate bits in first byte of UTF-8 sequence,
// indexed by number of bytes in the sequence.
// 0xxxxxxx
// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
// 110yyyyx 10xxxxxx
// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
// 1110yyyy 10yxxxxx 10xxxxxx
// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
static const char32_t kFirstByteMark[] = {
    0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
};

// Separator used by resource paths. This is not platform dependent contrary
// to OS_PATH_SEPARATOR.
#define RES_PATH_SEPARATOR '/'

// Return the number of UTF-8 bytes required to encode srcChar, or 0 when
// srcChar is a surrogate or lies above kUnicodeMaxCodepoint.
static size_t utf32_to_utf8_bytes(char32_t srcChar)
{
    size_t bytesToWrite;

    // Figure out how many bytes the result will require.
    if (srcChar < 0x00000080) {
        bytesToWrite = 1;
    } else if (srcChar < 0x00000800) {
        bytesToWrite = 2;
    } else if (srcChar < 0x00010000) {
        if ((srcChar < kUnicodeSurrogateStart)
                || (srcChar > kUnicodeSurrogateEnd)) {
            bytesToWrite = 3;
        } else {
            // Surrogates are invalid UTF-32 characters.
            return 0;
        }
    }
    // Max code point for Unicode is 0x0010FFFF.
    else if (srcChar <= kUnicodeMaxCodepoint) {
        bytesToWrite = 4;
    } else {
        // Invalid UTF-32 character.
        return 0;
    }

    return bytesToWrite;
}

// Write out the source character to <dstP> as exactly <bytes> UTF-8 bytes.
// <bytes> is expected to come from utf32_to_utf8_bytes(); a value of 0
// matches no case below and therefore writes nothing.
static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
{
    // The sequence is filled in from its last byte back to its first.
    dstP += bytes;
    switch (bytes) {
        /* note: everything falls through. */
        case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
        case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
        case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
        case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]);
    }
}

// ---------------------------------------------------------------------------

// Shared buffer (and its data pointer) holding the single NUL byte that
// backs every empty String8. Set up by initialize_string8().
static SharedBuffer* gEmptyStringBuf = NULL;
static char* gEmptyString = NULL;

extern int gDarwinCantLoadAllObjects;
int gDarwinIsReallyAnnoying;

// Take a new reference on the shared empty-string buffer and return its data.
static inline char* getEmptyString()
{
    gEmptyStringBuf->acquire();
    return gEmptyString;
}

// Allocate the shared empty-string buffer used by getEmptyString().
void initialize_string8()
{
    // HACK: This dummy dependency forces linking libutils Static.cpp,
    // which is needed to initialize String8/String16 classes.
    // These variables are named for Darwin, but are needed elsewhere too,
    // including static linking on any platform.
    gDarwinIsReallyAnnoying = gDarwinCantLoadAllObjects;

    SharedBuffer* buf = SharedBuffer::alloc(1);
    char* str = (char*)buf->data();
    *str = 0;
    gEmptyStringBuf = buf;
    gEmptyString = str;
}

// Drop the reference taken by initialize_string8() and clear the globals.
void terminate_string8()
{
    SharedBuffer::bufferFromData(gEmptyString)->release();
    gEmptyStringBuf = NULL;
    gEmptyString = NULL;
}

// ---------------------------------------------------------------------------

// Copy <len> bytes from <in> into a new NUL-terminated shared buffer.
// Returns the shared empty string when len is 0, or NULL if the allocation
// fails.
static char* allocFromUTF8(const char* in, size_t len)
{
    if (len > 0) {
        SharedBuffer* buf = SharedBuffer::alloc(len+1);
        LOG_ASSERT(buf, "Unable to allocate shared buffer");
        if (buf) {
            char* str = (char*)buf->data();
            memcpy(str, in, len);
            str[len] = 0;
            return str;
        }
        return NULL;
    }

    return getEmptyString();
}

// Encode <len> code units of type T as UTF-8 in a new shared buffer. Each
// unit is converted on its own through utf32_to_utf8_bytes()/utf32_to_utf8(),
// so no surrogate pairing is done here. Returns the shared empty string when
// len is 0 or when the allocation fails.
template<typename T, typename L>
static char* allocFromUTF16OrUTF32(const T* in, L len)
{
    if (len == 0) return getEmptyString();

    // First pass: measure the encoded size.
    size_t bytes = 0;
    const T* end = in+len;
    const T* p = in;

    while (p < end) {
        bytes += utf32_to_utf8_bytes(*p);
        p++;
    }

    SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
    LOG_ASSERT(buf, "Unable to allocate shared buffer");
    if (buf) {
        // Second pass: emit the bytes.
        p = in;
        char* str = (char*)buf->data();
        char* d = str;
        while (p < end) {
            const T c = *p++;
            const size_t charLen = utf32_to_utf8_bytes(c);
            utf32_to_utf8((uint8_t*)d, c, charLen);
            d += charLen;
        }
        *d = 0;

        return str;
    }

    return getEmptyString();
}

// Encode <len> UTF-16 units as UTF-8 in a new shared buffer. Returns the
// shared empty string when len is 0 or when the allocation fails.
static char* allocFromUTF16(const char16_t* in, size_t len)
{
    if (len == 0) return getEmptyString();

    const size_t bytes = utf8_length_from_utf16(in, len);

    SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
    LOG_ASSERT(buf, "Unable to allocate shared buffer");
    if (buf) {
        char* str = (char*)buf->data();
        // bytes+1 leaves room for the terminating NUL.
        utf16_to_utf8(in, len, str, bytes+1);
        return str;
    }

    return getEmptyString();
}

// Encode <len> UTF-32 code points as UTF-8 in a new shared buffer.
static char* allocFromUTF32(const char32_t* in, size_t len)
{
    return allocFromUTF16OrUTF32<char32_t, size_t>(in, len);
}

// ---------------------------------------------------------------------------

String8::String8()
    : mString(getEmptyString())
{
}

String8::String8(const String8& o)
    : mString(o.mString)
{
    // Share the other string's buffer; just take another reference.
    SharedBuffer::bufferFromData(mString)->acquire();
}

String8::String8(const char* o)
    : mString(allocFromUTF8(o, strlen(o)))
{
    if (mString == NULL) {
        mString = getEmptyString();
    }
}

String8::String8(const char* o, size_t len)
    : mString(allocFromUTF8(o, len))
{
    if (mString == NULL) {
        mString = getEmptyString();
    }
}

String8::String8(const String16& o)
    : mString(allocFromUTF16(o.string(), o.size()))
{
}

String8::String8(const char16_t* o)
    : mString(allocFromUTF16(o, strlen16(o)))
{
}

String8::String8(const char16_t* o, size_t len)
    : mString(allocFromUTF16(o, len))
{
}

String8::String8(const char32_t* o)
    : mString(allocFromUTF32(o, strlen32(o)))
{
}

String8::String8(const char32_t* o, size_t len)
    : mString(allocFromUTF32(o, len))
{
}

String8::~String8()
{
    SharedBuffer::bufferFromData(mString)->release();
}

// Share <other>'s buffer. The acquire happens before the release so that
// assigning a string to itself never drops the last reference.
void String8::setTo(const String8& other)
{
    SharedBuffer::bufferFromData(other.mString)->acquire();
    SharedBuffer::bufferFromData(mString)->release();
    mString = other.mString;
}

// Replace the contents with the NUL-terminated string <other>.
// Returns NO_ERROR on success; on allocation failure the string becomes
// empty and NO_MEMORY is returned.
status_t String8::setTo(const char* other)
{
    // Build the new buffer before dropping the old one: <other> may point
    // into this string's own storage.
    char* newString = allocFromUTF8(other, strlen(other));
    SharedBuffer::bufferFromData(mString)->release();
    mString = newString;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}

// Replace the contents with the first <len> bytes of <other>.
// Returns NO_ERROR on success; on allocation failure the string becomes
// empty and NO_MEMORY is returned.
status_t String8::setTo(const char* other, size_t len)
{
    // Allocate first; <other> may alias the buffer being released.
    char* newString = allocFromUTF8(other, len);
    SharedBuffer::bufferFromData(mString)->release();
    mString = newString;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}

// Replace the contents with the UTF-8 encoding of <len> UTF-16 units.
// allocFromUTF16() hands back the shared empty string rather than NULL when
// its allocation fails, so the NO_MEMORY path is kept only as a safeguard.
status_t String8::setTo(const char16_t* other, size_t len)
{
    // Convert before releasing so the old buffer stays valid meanwhile.
    char* newString = allocFromUTF16(other, len);
    SharedBuffer::bufferFromData(mString)->release();
    mString = newString;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}

// Replace the contents with the UTF-8 encoding of <len> UTF-32 code points.
// allocFromUTF32() hands back the shared empty string rather than NULL when
// its allocation fails, so the NO_MEMORY path is kept only as a safeguard.
status_t String8::setTo(const char32_t* other, size_t len)
{
    // Convert before releasing so the old buffer stays valid meanwhile.
    char* newString = allocFromUTF32(other, len);
    SharedBuffer::bufferFromData(mString)->release();
    mString = newString;
    if (mString) return NO_ERROR;

    mString = getEmptyString();
    return NO_MEMORY;
}

// Append <other>. When this string is empty the buffer of <other> is simply
// shared instead of copied.
status_t String8::append(const String8& other)
{
    const size_t otherLen = other.bytes();
    if (bytes() == 0) {
        setTo(other);
        return NO_ERROR;
    } else if (otherLen == 0) {
        return NO_ERROR;
    }

    return real_append(other.string(), otherLen);
}

status_t String8::append(const char* other)
{
    return append(other, strlen(other));
}

status_t String8::append(const char* other, size_t otherLen)
{
    if (bytes() == 0) {
        return setTo(other, otherLen);
    } else if (otherLen == 0) {
        return NO_ERROR;
    }

    return real_append(other, otherLen);
}

// Grow the buffer and copy <otherLen> bytes of <other> onto the end.
// Returns NO_MEMORY (leaving the string untouched) if the resize fails.
// NOTE(review): if <other> points into this string's own buffer, the
// editResize() call may leave it dangling -- confirm callers never do this.
status_t String8::real_append(const char* other, size_t otherLen)
{
    const size_t myLen = bytes();

    SharedBuffer* buf = SharedBuffer::bufferFromData(mString)
        ->editResize(myLen+otherLen+1);
    if (buf) {
        char* str = (char*)buf->data();
        mString = str;
        str += myLen;
        memcpy(str, other, otherLen);
        str[otherLen] = '\0';
        return NO_ERROR;
    }
    return NO_MEMORY;
}

// Resize the buffer to hold <size> bytes plus a terminator and return a
// writable pointer to it, or NULL if the resize fails.
char* String8::lockBuffer(size_t size)
{
    SharedBuffer* buf = SharedBuffer::bufferFromData(mString)
        ->editResize(size+1);
    if (buf) {
        char* str = (char*)buf->data();
        mString = str;
        return str;
    }
    return NULL;
}

// Finish an edit, taking the new length from the NUL terminator.
void String8::unlockBuffer()
{
    unlockBuffer(strlen(mString));
}

// Finish an edit, fixing the length at <size> bytes and writing the
// terminator. Returns NO_MEMORY only if a needed resize fails; when the
// size is already correct there is nothing to do and NO_ERROR is returned.
status_t String8::unlockBuffer(size_t size)
{
    if (size != this->size()) {
        SharedBuffer* buf = SharedBuffer::bufferFromData(mString)
            ->editResize(size+1);
        if (!buf) {
            return NO_MEMORY;
        }

        char* str = (char*)buf->data();
        str[size] = 0;
        mString = str;
    }

    return NO_ERROR;
}

// Return the byte offset of the first occurrence of <other> at or after
// <start>, or -1 when there is none or <start> is past the end.
ssize_t String8::find(const char* other, size_t start) const
{
    size_t len = size();
    if (start >= len) {
        return -1;
    }
    const char* s = mString+start;
    const char* p = strstr(s, other);
    return p ? p-mString : -1;
}

void String8::toLower()
{
    toLower(0, size());
}

// Lower-case <length> bytes starting at <start>, clamped to the string.
void String8::toLower(size_t start, size_t length)
{
    const size_t len = size();
    if (start >= len) {
        return;
    }
    // start < len here, so the subtraction cannot wrap (start+length could).
    if (length > len-start) {
        length = len-start;
    }
    char* buf = lockBuffer(len);
    if (buf == NULL) {
        // Could not get a writable buffer; leave the string as it is.
        return;
    }
    buf += start;
    while (length > 0) {
        // tolower() requires a value representable as unsigned char.
        *buf = static_cast<char>(tolower(static_cast<unsigned char>(*buf)));
        buf++;
        length--;
    }
    unlockBuffer(len);
}

void String8::toUpper()
{
    toUpper(0, size());
}

// Upper-case <length> bytes starting at <start>, clamped to the string.
void String8::toUpper(size_t start, size_t length)
{
    const size_t len = size();
    if (start >= len) {
        return;
    }
    // start < len here, so the subtraction cannot wrap (start+length could).
    if (length > len-start) {
        length = len-start;
    }
    char* buf = lockBuffer(len);
    if (buf == NULL) {
        // Could not get a writable buffer; leave the string as it is.
        return;
    }
    buf += start;
    while (length > 0) {
        // toupper() requires a value representable as unsigned char.
        *buf = static_cast<char>(toupper(static_cast<unsigned char>(*buf)));
        buf++;
        length--;
    }
    unlockBuffer(len);
}

size_t String8::getUtf32Length() const
{
    return utf32_length(mString, length());
}

int32_t String8::getUtf32At(size_t index, size_t *next_index) const
{
    return utf32_at(mString, length(), index, next_index);
}

size_t String8::getUtf32(char32_t* dst, size_t dst_len) const
{
    return utf8_to_utf32(mString, length(), dst, dst_len);
}

TextOutput& operator<<(TextOutput& to, const String8& val)
{
    to << val.string();
    return to;
}

// ---------------------------------------------------------------------------
// Path functions

void String8::setPathName(const char* name)
{
    setPathName(name, strlen(name));
}

// Replace the contents with the first <len> bytes of <name>, dropping one
// trailing OS_PATH_SEPARATOR if present.
void String8::setPathName(const char* name, size_t len)
{
    char* buf = lockBuffer(len);
    if (buf == NULL) {
        // Resize failed; keep the current contents.
        return;
    }

    memcpy(buf, name, len);

    // remove trailing path separator, if present
    if (len > 0 && buf[len-1] == OS_PATH_SEPARATOR)
        len--;

    buf[len] = '\0';

    unlockBuffer(len);
}

// Return everything after the last OS_PATH_SEPARATOR, or the whole string
// when it contains no separator.
String8 String8::getPathLeaf(void) const
{
    const char* cp;
    const char*const buf = mString;

    cp = strrchr(buf, OS_PATH_SEPARATOR);
    if (cp == NULL)
        return String8(*this);
    else
        return String8(cp+1);
}

// Return everything before the last OS_PATH_SEPARATOR, or an empty string
// when it contains no separator.
String8 String8::getPathDir(void) const
{
    const char* cp;
    const char*const str = mString;

    cp = strrchr(str, OS_PATH_SEPARATOR);
    if (cp == NULL)
        return String8("");
    else
        return String8(str, cp - str);
}

// Return the first path component (ignoring one leading separator) and,
// if <outRemains> is non-NULL, store the rest of the path there.
String8 String8::walkPath(String8* outRemains) const
{
    const char* cp;
    const char*const str = mString;
    const char* buf = str;

    cp = strchr(buf, OS_PATH_SEPARATOR);
    if (cp == buf) {
        // don't include a leading '/'.
        buf = buf+1;
        cp = strchr(buf, OS_PATH_SEPARATOR);
    }

    if (cp == NULL) {
        String8 res = buf != str ? String8(buf) : *this;
        if (outRemains) *outRemains = String8("");
        return res;
    }

    String8 res(buf, cp-buf);
    if (outRemains) *outRemains = String8(cp+1);
    return res;
}

/*
 * Helper function for finding the start of an extension in a pathname.
 *
 * Returns a pointer inside mString, or NULL if no extension was found.
 */
char* String8::find_extension(void) const
{
    const char* lastSlash;
    const char* lastDot;
    const char* const str = mString;

    // only look at the filename
    lastSlash = strrchr(str, OS_PATH_SEPARATOR);
    if (lastSlash == NULL)
        lastSlash = str;
    else
        lastSlash++;

    // find the last dot
    lastDot = strrchr(lastSlash, '.');
    if (lastDot == NULL)
        return NULL;

    // looks good, ship it
    return const_cast<char*>(lastDot);
}

// Return the extension including its leading '.', or an empty string.
String8 String8::getPathExtension(void) const
{
    char* ext;

    ext = find_extension();
    if (ext != NULL)
        return String8(ext);
    else
        return String8("");
}

// Return the path with its extension (if any) removed.
String8 String8::getBasePath(void) const
{
    char* ext;
    const char* const str = mString;

    ext = find_extension();
    if (ext == NULL)
        return String8(*this);
    else
        return String8(str, ext - str);
}

// Append <name> as a further path component, inserting a separator when
// needed. A <name> that starts with OS_PATH_SEPARATOR replaces the whole
// path instead.
String8& String8::appendPath(const char* name)
{
    // TODO: The test below will fail for Win32 paths. Fix later or ignore.
    if (name[0] != OS_PATH_SEPARATOR) {
        if (*name == '\0') {
            // nothing to do
            return *this;
        }

        size_t len = length();
        if (len == 0) {
            // no existing filename, just use the new one
            setPathName(name);
            return *this;
        }

        // make room for oldPath + '/' + newPath
        const size_t newlen = strlen(name);

        char* buf = lockBuffer(len+1+newlen);
        if (buf == NULL) {
            // Resize failed; keep the current path.
            return *this;
        }

        // insert a '/' if needed
        if (buf[len-1] != OS_PATH_SEPARATOR)
            buf[len++] = OS_PATH_SEPARATOR;

        // newlen+1 also copies the terminating NUL of <name>.
        memcpy(buf+len, name, newlen+1);
        len += newlen;

        unlockBuffer(len);

        return *this;
    } else {
        setPathName(name);
        return *this;
    }
}

// Rewrite every OS_PATH_SEPARATOR as RES_PATH_SEPARATOR. Compiles to a
// no-op when the two separators are the same character.
String8& String8::convertToResPath()
{
#if OS_PATH_SEPARATOR != RES_PATH_SEPARATOR
    size_t len = length();
    if (len > 0) {
        char * buf = lockBuffer(len);
        if (buf != NULL) {
            for (char * end = buf + len; buf < end; ++buf) {
                if (*buf == OS_PATH_SEPARATOR)
                    *buf = RES_PATH_SEPARATOR;
            }
            unlockBuffer(len);
        }
    }
#endif
    return *this;
}

}  // namespace android

// ---------------------------------------------------------------------------

// Number of char32_t units before the terminating zero.
size_t strlen32(const char32_t *s)
{
    const char32_t *ss = s;
    while ( *ss )
        ss++;
    return ss-s;
}

// Like strlen32(), but examines at most <maxlen> units.
size_t strnlen32(const char32_t *s, size_t maxlen)
{
    const char32_t *ss = s;
    while ((maxlen > 0) && *ss) {
        ss++;
        maxlen--;
    }
    return ss-s;
}

// Validate the NUL-terminated UTF-8 string <src> and return its length in
// bytes. Returns 0 as soon as a malformed sequence, a lead byte announcing
// more than four bytes, or a value above the Unicode maximum is found.
size_t utf8_length(const char *src)
{
    const char *cur = src;
    size_t ret = 0;
    while (*cur != '\0') {
        const char first_char = *cur++;
        if ((first_char & 0x80) == 0) { // ASCII
            ret += 1;
            continue;
        }
        // (UTF-8's character must not be like 10xxxxxx,
        //  but 110xxxxx, 1110xxxx, ... or 1111110x)
        if ((first_char & 0x40) == 0) {
            return 0;
        }

        int32_t mask, to_ignore_mask;
        size_t num_to_read = 0;
        char32_t utf32 = 0;
        // One continuation byte is consumed per leading 1 bit after the
        // first in first_char.
        for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
             num_to_read < 5 && (first_char & mask);
             num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
            if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
                return 0;
            }
            // 0x3F == 00111111
            utf32 = (utf32 << 6) + (*cur++ & 0x3F);
        }
        // "first_char" must be (110xxxxx - 11110xxx)
        if (num_to_read == 5) {
            return 0;
        }
        // Merge in the payload bits of the lead byte.
        to_ignore_mask |= mask;
        utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
        if (utf32 > android::kUnicodeMaxCodepoint) {
            return 0;
        }

        ret += num_to_read;
    }
    return ret;
}

// Count the characters in the first <src_len> bytes of <src>, stepping from
// lead byte to lead byte. The sequence length is taken from the lead byte
// alone; continuation bytes are not validated.
size_t utf32_length(const char *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }
    size_t ret = 0;
    const char* cur;
    const char* end;
    size_t num_to_skip;
    for (cur = src, end = src + src_len, num_to_skip = 1;
         cur < end;
         cur += num_to_skip, ret++) {
        const char first_char = *cur;
        num_to_skip = 1;
        if ((first_char & 0x80) == 0) {  // ASCII
            continue;
        }
        int32_t mask;

        // Each leading 1 bit after the first adds one byte to skip.
        for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
        }
    }
    return ret;
}

// Number of UTF-8 bytes needed to encode <src_len> UTF-32 code points.
// Invalid code points contribute 0 bytes.
size_t utf8_length_from_utf32(const char32_t *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }
    size_t ret = 0;
    const char32_t *end = src + src_len;
    while (src < end) {
        ret += android::utf32_to_utf8_bytes(*src++);
    }
    return ret;
}

// Number of UTF-8 bytes needed to encode <src_len> UTF-16 units. A high
// surrogate immediately followed by a low surrogate counts as one 4-byte
// character; an unpaired surrogate contributes 0 bytes. This must stay in
// step with utf16_to_utf8(), which fills buffers sized by this function.
size_t utf8_length_from_utf16(const char16_t *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }
    size_t ret = 0;
    const char16_t* const end = src + src_len;
    while (src < end) {
        // Peek at the next unit without consuming it, so that a failed
        // match leaves it to be examined on its own in the next iteration.
        if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
                && (*(src + 1) & 0xFC00) == 0xDC00) {
            // surrogate pairs are always 4 bytes.
            ret += 4;
            src += 2;
        } else {
            ret += android::utf32_to_utf8_bytes((char32_t) *src++);
        }
    }
    return ret;
}

// Decode the UTF-8 sequence starting at <cur> and store the number of bytes
// consumed in <num_read>. No bounds checking is done here: the number of
// continuation bytes read depends only on the lead byte.
static int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const char first_char = *cur;
    if ((first_char & 0x80) == 0) { // ASCII
        *num_read = 1;
        return *cur;
    }
    cur++;
    char32_t mask, to_ignore_mask;
    size_t num_to_read = 0;
    char32_t utf32 = first_char;
    for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
         (first_char & mask);
         num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
        // 0x3F == 00111111
        utf32 = (utf32 << 6) + (*cur++ & 0x3F);
    }
    // Clear the length-marker bits of the lead byte, which have been
    // shifted up along with its payload.
    to_ignore_mask |= mask;
    utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));

    *num_read = num_to_read;
    return static_cast<int32_t>(utf32);
}

// Decode the character starting at byte <index> of <src>. Returns -1 when
// <index> is not below <src_len>. When the result is non-negative and
// <next_index> is non-NULL, the index just past the character is stored
// there.
int32_t utf32_at(const char *src, size_t src_len,
                 size_t index, size_t *next_index)
{
    if (index >= src_len) {
        return -1;
    }
    size_t dummy_index;
    if (next_index == NULL) {
        next_index = &dummy_index;
    }
    size_t num_read;
    int32_t ret = utf32_at_internal(src + index, &num_read);
    if (ret >= 0) {
        *next_index = index + num_read;
    }

    return ret;
}

// Decode up to <dst_len> characters from the first <src_len> bytes of <src>
// into <dst>. A terminating 0 is written only if room remains. Returns the
// number of characters stored, not counting the terminator.
size_t utf8_to_utf32(const char* src, size_t src_len,
                     char32_t* dst, size_t dst_len)
{
    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
        return 0;
    }

    const char* cur = src;
    const char* end = src + src_len;
    char32_t* cur_utf32 = dst;
    const char32_t* end_utf32 = dst + dst_len;
    while (cur_utf32 < end_utf32 && cur < end) {
        size_t num_read;
        *cur_utf32++ =
                static_cast<char32_t>(utf32_at_internal(cur, &num_read));
        cur += num_read;
    }
    if (cur_utf32 < end_utf32) {
        *cur_utf32 = 0;
    }
    return static_cast<size_t>(cur_utf32 - dst);
}

// Encode <src_len> UTF-32 code points as UTF-8 into <dst>, which holds
// <dst_len> bytes. Conversion stops before a character that would not fit.
// A terminating NUL is written only if room remains. Returns the number of
// bytes written, not counting the terminator.
size_t utf32_to_utf8(const char32_t* src, size_t src_len,
                     char* dst, size_t dst_len)
{
    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
        return 0;
    }
    const char32_t *cur_utf32 = src;
    const char32_t *end_utf32 = src + src_len;
    char *cur = dst;
    const char *end = dst + dst_len;
    while (cur_utf32 < end_utf32 && cur < end) {
        const size_t len = android::utf32_to_utf8_bytes(*cur_utf32);
        if (len > static_cast<size_t>(end - cur)) {
            // Never write past the end of the destination.
            break;
        }
        android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len);
        cur += len;
    }
    if (cur < end) {
        *cur = '\0';
    }
    return cur - dst;
}

// Encode <src_len> UTF-16 units as UTF-8 into <dst>, which holds <dst_len>
// bytes. Only a high surrogate immediately followed by a low surrogate is
// combined into one code point; every other unit is converted on its own,
// which means unpaired surrogates produce no output. Conversion stops
// before a character that would not fit. A terminating NUL is written only
// if room remains. Returns the number of bytes written, not counting the
// terminator.
size_t utf16_to_utf8(const char16_t* src, size_t src_len,
                     char* dst, size_t dst_len)
{
    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
        return 0;
    }
    const char16_t* cur_utf16 = src;
    const char16_t* const end_utf16 = src + src_len;
    char *cur = dst;
    const char* const end = dst + dst_len;
    while (cur_utf16 < end_utf16 && cur < end) {
        char32_t utf32;
        // surrogate pairs: the second unit must really be a low surrogate,
        // exactly as utf8_length_from_utf16() requires.
        if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16
                && (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) {
            utf32 = (*cur_utf16++ - 0xD800) << 10;
            utf32 |= *cur_utf16++ - 0xDC00;
            utf32 += 0x10000;
        } else {
            utf32 = (char32_t) *cur_utf16++;
        }
        const size_t len = android::utf32_to_utf8_bytes(utf32);
        if (len > static_cast<size_t>(end - cur)) {
            // Never write past the end of the destination.
            break;
        }
        android::utf32_to_utf8((uint8_t*)cur, utf32, len);
        cur += len;
    }
    if (cur < end) {
        *cur = '\0';
    }
    return cur - dst;
}