C++程序  |  199行  |  3.61 KB

/* Copyright 2016 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <stdlib.h>
#include <stdint.h>
#include <sys/types.h>

#ifdef CRAS_DBUS
#include <dbus/dbus.h>
#endif

#include "cras_utf8.h"
#include "cras_util.h"

/* The three bytes that encode the byte-order mark (U+FEFF) in UTF-8. */
static const uint8_t kUTF8ByteOrderMask[] = { 0xEF, 0xBB, 0xBF };

/* Inclusive range [min, max] of values permitted for a single byte of a
 * multi-byte sequence. An entry with min == 0 terminates a table of ranges.
 */
struct u8range {
	uint8_t min; /* Lowest accepted byte value. */
	uint8_t max; /* Highest accepted byte value. */
};
typedef struct u8range u8range_t;

/* Two-byte sequences (U+0080..U+07FF). */
static const u8range_t kUTF8TwoByteSeq[] = {
	{ 0xC2, 0xDF }, /* lead byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Three-byte sequences led by 0xE0 (U+0800..U+0FFF). The second byte starts
 * at 0xA0 so that overlong encodings are rejected.
 */
static const u8range_t kUTF8ByteSeqE0[] = {
	{ 0xE0, 0xE0 }, /* lead byte */
	{ 0xA0, 0xBF }, /* second byte, restricted */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Three-byte sequences led by 0xE1..0xEC (U+1000..U+CFFF). */
static const u8range_t kUTF8ByteSeqE1EC[] = {
	{ 0xE1, 0xEC }, /* lead byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Three-byte sequences led by 0xED (U+D000..U+D7FF). The second byte stops
 * at 0x9F so that the UTF-16 surrogate range U+D800..U+DFFF is rejected.
 */
static const u8range_t kUTF8ByteSeqED[] = {
	{ 0xED, 0xED }, /* lead byte */
	{ 0x80, 0x9F }, /* second byte, restricted */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Three-byte sequences led by 0xEE..0xEF (U+E000..U+FFFF). */
static const u8range_t kUTF8ByteSeqEEEF[] = {
	{ 0xEE, 0xEF }, /* lead byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Four-byte sequences led by 0xF0 (U+10000..U+3FFFF). The second byte starts
 * at 0x90 so that overlong encodings are rejected.
 */
static const u8range_t kUTF8ByteSeqF0[] = {
	{ 0xF0, 0xF0 }, /* lead byte */
	{ 0x90, 0xBF }, /* second byte, restricted */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Four-byte sequences led by 0xF1..0xF3 (U+40000..U+FFFFF). */
static const u8range_t kUTF8ByteSeqF1F3[] = {
	{ 0xF1, 0xF3 }, /* lead byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Four-byte sequences led by 0xF4 (U+100000..U+10FFFF). The second byte stops
 * at 0x8F so that code points above U+10FFFF are rejected.
 */
static const u8range_t kUTF8ByteSeqF4[] = {
	{ 0xF4, 0xF4 }, /* lead byte */
	{ 0x80, 0x8F }, /* second byte, restricted */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x80, 0xBF }, /* continuation byte */
	{ 0x00, 0x00 }, /* terminator */
};

/* Table holding only the terminator entry. It is the last element of
 * kUTF8Sequences, where its min == 0 lead range stops the sequence lookup.
 */
static const u8range_t kUTF8NullRange[] = {
	{ 0x00, 0x00 }, /* terminator */
};

/* One permitted multi-byte form: a terminated array of per-byte ranges whose
 * first element describes the lead byte.
 */
struct utf8seq {
	const u8range_t *ranges;
};
typedef struct utf8seq utf8seq_t;

/* Every multi-byte form, tried in order against a lead byte. The list ends
 * with an entry whose first range has min == 0.
 */
static const utf8seq_t kUTF8Sequences[] = {
	{ kUTF8TwoByteSeq },  /* lead 0xC2..0xDF */
	{ kUTF8ByteSeqE0 },   /* lead 0xE0 */
	{ kUTF8ByteSeqE1EC }, /* lead 0xE1..0xEC */
	{ kUTF8ByteSeqED },   /* lead 0xED */
	{ kUTF8ByteSeqEEEF }, /* lead 0xEE..0xEF */
	{ kUTF8ByteSeqF0 },   /* lead 0xF0 */
	{ kUTF8ByteSeqF1F3 }, /* lead 0xF1..0xF3 */
	{ kUTF8ByteSeqF4 },   /* lead 0xF4 */
	{ kUTF8NullRange },   /* end of list */
};

/* Checks whether a NUL-terminated string is well-formed UTF-8.
 *
 * Bytes below 0x80 are accepted as ASCII. Every other byte must start, or
 * continue, one of the multi-byte forms listed in kUTF8Sequences. A complete
 * byte-order mark at the very start of the string is accepted.
 *
 * Args:
 *    string  - The string to check. May be NULL, which is treated as invalid.
 *    bad_pos - Optional. If not NULL, receives the offset of the first
 *              offending byte. For a valid string this is the offset of the
 *              terminating NUL (i.e. the string length). For a string that
 *              ends in the middle of a sequence it is also the offset of the
 *              NUL. For a NULL string it is (size_t)-1.
 *
 * Returns:
 *    1 if the string is valid UTF-8, 0 otherwise.
 */
int valid_utf8_string(const char *string, size_t *bad_pos)
{
	const char *pos = string;
	const utf8seq_t *seq;
	const u8range_t *range = NULL;
	size_t bom_len;
	uint8_t byte;
	int ret = 1;

	if (!string) {
		/* No byte can be blamed. Report the same value the previous
		 * implementation produced in practice, without doing
		 * arithmetic on a NULL pointer. */
		if (bad_pos)
			*bad_pos = (size_t)-1;
		return 0;
	}

	/* Skip a leading byte-order mark, but only if all of its bytes are
	 * present. A truncated mark is left for the checks below, which
	 * reject it like any other incomplete sequence. The comparison
	 * stops at the terminating NUL because NUL never matches. */
	for (bom_len = 0; bom_len < ARRAY_SIZE(kUTF8ByteOrderMask);
	     bom_len++) {
		if ((uint8_t)string[bom_len] != kUTF8ByteOrderMask[bom_len])
			break;
	}
	if (bom_len == ARRAY_SIZE(kUTF8ByteOrderMask))
		pos += bom_len;

	while ((byte = (uint8_t)*(pos++))) {
		if (range && range->min != 0) {
			/* Inside a multi-byte sequence: the byte must fall
			 * in the range expected at this position. */
			if (byte < range->min || byte > range->max) {
				ret = 0;
				goto done;
			}
			range++;
			continue;
		}

		if (byte < 128) {
			/* Ascii character. */
			continue;
		}

		/* Start of a multi-byte sequence: find the form whose lead
		 * byte range contains this byte. */
		for (seq = kUTF8Sequences; seq->ranges->min != 0; seq++) {
			if (byte >= seq->ranges->min &&
			    byte <= seq->ranges->max)
				break;
		}

		if (seq->ranges->min == 0) {
			/* Not a valid lead byte. */
			ret = 0;
			goto done;
		}

		/* The following bytes are checked against the rest of the
		 * matched form. */
		range = seq->ranges + 1;
	}

	if (range && range->min != 0) {
		/* Stopped in the middle of a sequence. */
		ret = 0;
	}

done:
	/* pos is one past the last byte read, so step back by one to get
	 * the offset of the offending byte (or of the terminating NUL). */
	if (bad_pos)
		*bad_pos = (size_t)(pos - string - 1);
	return ret;
}

/* Reports whether |string| is valid UTF-8.
 *
 * When built with CRAS_DBUS the check is delegated to the DBus implementation
 * to ensure that the UTF-8 sequences match those expected by DBus. Otherwise
 * the validator in this file is used.
 *
 * Returns:
 *    1 if the string is valid, 0 otherwise.
 */
int is_utf8_string(const char *string)
{
#ifdef CRAS_DBUS
	return dbus_validate_utf8(string, NULL) ? 1 : 0;
#else
	return valid_utf8_string(string, NULL);
#endif
}