From f1121648d0762cf9bf4e5117bfc1008447fb4080 Mon Sep 17 00:00:00 2001
From: android
Date: Thu, 1 Apr 2010 11:46:35 -0700
Subject: [PATCH] Add ICU support for libxml.

This is derived from Jungshik's patch.  The encoding.c is a copy from Chrome's source,
which has one extra modification than Jungshik's patch.

Issue:2557315
Change-Id: I8e4c9e544660f3f943a15042756f7248d5afff8e
---
 Android.mk                  |    4 +-
 encoding.c                  |  248 +++++++++++++++++++++++++++++++++++++++++-
 include/libxml/encoding.h   |   29 +++++
 include/libxml/parser.h     |    3 +-
 include/libxml/xmlversion.h |   11 ++-
 parser.c                    |    9 ++
 xmlregexp.c                 |    2 +-
 7 files changed, 294 insertions(+), 12 deletions(-)

diff --git a/Android.mk b/Android.mk
index 3d0ede8..08bf11f 100644
--- a/Android.mk
+++ b/Android.mk
@@ -57,7 +57,7 @@ common_C_INCLUDES += \
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := $(common_SRC_FILES)
-LOCAL_C_INCLUDES += $(common_C_INCLUDES)
+LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common
 LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES)
 LOCAL_CFLAGS += -fvisibility=hidden
 
@@ -71,7 +71,7 @@ include $(BUILD_STATIC_LIBRARY)
 
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := $(common_SRC_FILES)
-LOCAL_C_INCLUDES += $(common_C_INCLUDES)
+LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common
 LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES)
 LOCAL_MODULE:= libxml2
 include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/encoding.c b/encoding.c
index e2df797..2abc32e 100644
--- a/encoding.c
+++ b/encoding.c
@@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
 static int xmlCharEncodingAliasesNb = 0;
 static int xmlCharEncodingAliasesMax = 0;
 
-#ifdef LIBXML_ICONV_ENABLED
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
 #if 0
 #define DEBUG_ENCODING  /* Define this to get encoding traces */
 #endif
@@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
                     NULL, 0, val, NULL, NULL, 0, 0, msg, val);
 }
 
+#ifdef LIBXML_ICU_ENABLED
+static uconv_t* 
+openIcuConverter(const char* name, int toUnicode)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
+  if (conv == NULL)
+    return NULL;
+
+  conv->uconv = ucnv_open(name, &status);
+  if (U_FAILURE(status))
+    goto error;
+
+  status = U_ZERO_ERROR;
+  if (toUnicode) {
+    ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, 
+                        NULL, NULL, NULL, &status);
+  }
+  else {
+    ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, 
+                        NULL, NULL, NULL, &status);
+  }
+  if (U_FAILURE(status))
+    goto error;
+
+  status = U_ZERO_ERROR;
+  conv->utf8 = ucnv_open("UTF-8", &status);
+  if (U_SUCCESS(status))
+    return conv;
+
+error:
+  if (conv->uconv) 
+    ucnv_close(conv->uconv);
+  xmlFree(conv);
+  return NULL;
+}
+
+static void
+closeIcuConverter(uconv_t *conv)
+{
+  if (conv != NULL) {
+    ucnv_close(conv->uconv);
+    ucnv_close(conv->utf8);
+    xmlFree(conv);
+  }
+}
+#endif /* LIBXML_ICU_ENABLED */
+
 /************************************************************************
  *									*
  *		Conversions To/From UTF8 encoding			*
@@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name,
 #ifdef LIBXML_ICONV_ENABLED
     handler->iconv_in = NULL;
     handler->iconv_out = NULL;
-#endif /* LIBXML_ICONV_ENABLED */
+#endif
+#ifdef LIBXML_ICU_ENABLED
+    handler->uconv_in = NULL;
+    handler->uconv_out = NULL;
+#endif
 
     /*
      * registers and returns the handler.
@@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) {
     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
     xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
 #endif /* LIBXML_OUTPUT_ENABLED */
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
 #ifdef LIBXML_ISO8859X_ENABLED
     xmlRegisterCharEncodingHandlersISO8859x ();
 #endif
@@ -1576,6 +1628,10 @@ xmlFindCharEncodingHandler(const char *name) {
     xmlCharEncodingHandlerPtr enc;
     iconv_t icv_in, icv_out;
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    xmlCharEncodingHandlerPtr enc;
+    uconv_t *ucv_in, *ucv_out;
+#endif /* LIBXML_ICU_ENABLED */
     char upper[100];
     int i;
 
@@ -1642,6 +1698,35 @@ xmlFindCharEncodingHandler(const char *name) {
 		    "iconv : problems with filters for '%s'\n", name);
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    /* check whether icu can handle this */
+    ucv_in = openIcuConverter(name, 1);
+    ucv_out = openIcuConverter(name, 0);
+    if (ucv_in != NULL && ucv_out != NULL) {
+	    enc = (xmlCharEncodingHandlerPtr)
+	          xmlMalloc(sizeof(xmlCharEncodingHandler));
+	    if (enc == NULL) {
+                closeIcuConverter(ucv_in);
+                closeIcuConverter(ucv_out);
+		return(NULL);
+	    }
+	    enc->name = xmlMemStrdup(name);
+	    enc->input = NULL;
+	    enc->output = NULL;
+	    enc->uconv_in = ucv_in;
+	    enc->uconv_out = ucv_out;
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+		    "Found ICU converter handler for encoding %s\n", name);
+#endif
+	    return enc;
+    } else if (ucv_in != NULL || ucv_out != NULL) {
+            closeIcuConverter(ucv_in);
+            closeIcuConverter(ucv_out);
+	    xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
+		    "ICU converter : problems with filters for '%s'\n", name);
+    }
+#endif /* LIBXML_ICU_ENABLED */
 
 #ifdef DEBUG_ENCODING
     xmlGenericError(xmlGenericErrorContext,
@@ -1732,6 +1817,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
 
 /************************************************************************
  *									*
+ *		ICU based generic conversion functions	         	*
+ *									*
+ ************************************************************************/
+
+#ifdef LIBXML_ICU_ENABLED
+/**
+ * xmlUconvWrapper:
+ * @cd: ICU uconverter data structure
+ * @toUnicode : non-zero if toUnicode. 0 otherwise.
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to an array of ISO Latin 1 chars
+ * @inlen:  the length of @in
+ *
+ * Returns 0 if success, or 
+ *     -1 by lack of space, or
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ *     -3 if there the last byte can't form a single output char.
+ *     
+ * The value of @inlen after return is the number of octets consumed
+ *     as the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of ocetes consumed.
+ */
+static int
+xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
+                const unsigned char *in, int *inlen) {
+    const char *ucv_in = (const char *) in;
+    char *ucv_out = (char *) out;
+    UErrorCode err = U_ZERO_ERROR;
+
+    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
+        if (outlen != NULL) *outlen = 0;
+        return(-1);
+    }
+
+    /* 
+     * TODO(jungshik)
+     * 1. is ucnv_convert(To|From)Algorithmic better?
+     * 2. had we better use an explicit pivot buffer?
+     * 3. error returned comes from 'fromUnicode' only even
+     *    when toUnicode is true !
+     */
+    if (toUnicode) {
+        /* encoding => UTF-16 => UTF-8 */
+        ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
+                       0, TRUE, &err);
+    } else {
+        /* UTF-8 => UTF-16 => encoding */
+        ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
+                       0, TRUE, &err);
+    }
+    *inlen = ucv_in - (const char*) in; 
+    *outlen = ucv_out - (char *) out;
+    if (U_SUCCESS(err))
+        return 0;
+    if (err == U_BUFFER_OVERFLOW_ERROR)
+        return -1;
+    if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
+        return -2;
+    /* if (err == U_TRUNCATED_CHAR_FOUND) */
+    return -3;
+}
+#endif /* LIBXML_ICU_ENABLED */
+
+/************************************************************************
+ *									*
  *		The real API used by libxml for on-the-fly conversion	*
  *									*
  ************************************************************************/
@@ -1794,6 +1948,16 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 	if (ret == -1) ret = -3;
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_in != NULL) {
+	ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+	                      &written, in->content, &toconv);
+	xmlBufferShrink(in, toconv);
+	out->use += written;
+	out->content[out->use] = 0;
+	if (ret == -1) ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
 #ifdef DEBUG_ENCODING
     switch (ret) {
         case 0:
@@ -1879,6 +2043,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
             ret = -3;
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_in != NULL) {
+        ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+                              &written, in->content, &toconv);
+        xmlBufferShrink(in, toconv);
+        out->use += written;
+        out->content[out->use] = 0;
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
     switch (ret) {
         case 0:
 #ifdef DEBUG_ENCODING
@@ -1979,6 +2154,15 @@ retry:
 	    out->content[out->use] = 0;
 	}
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+	else if (handler->uconv_out != NULL) {
+	    ret = xmlUconvWrapper(handler->uconv_out, 0,
+                              &out->content[out->use],
+ 				              &written, NULL, &toconv);
+	    out->use += written;
+	    out->content[out->use] = 0;
+	}
+#endif /* LIBXML_ICU_ENABLED */
 #ifdef DEBUG_ENCODING
 	xmlGenericError(xmlGenericErrorContext,
 		"initialized encoder\n");
@@ -2003,7 +2187,7 @@ retry:
 	    xmlBufferShrink(in, toconv);
 	    out->use += written;
 	    writtentot += written;
-	} 
+	}
 	out->content[out->use] = 0;
     }
 #ifdef LIBXML_ICONV_ENABLED
@@ -2025,6 +2209,26 @@ retry:
 	}
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_out != NULL) {
+	ret = xmlUconvWrapper(handler->uconv_out, 0,
+                              &out->content[out->use],
+	                      &written, in->content, &toconv);
+	xmlBufferShrink(in, toconv);
+	out->use += written;
+	writtentot += written;
+	out->content[out->use] = 0;
+	if (ret == -1) {
+	    if (written > 0) {
+		/*
+		 * Can be a limitation of iconv
+		 */
+		goto retry;
+	    }
+	    ret = -3;
+	}
+    }
+#endif /* LIBXML_ICU_ENABLED */
     else {
 	xmlEncodingErr(XML_I18N_NO_OUTPUT,
 		       "xmlCharEncOutFunc: no output function !\n", NULL);
@@ -2137,6 +2341,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
 	xmlFree(handler);
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
+	if (handler->name != NULL)
+	    xmlFree(handler->name);
+	handler->name = NULL;
+	if (handler->uconv_out != NULL) {
+	    closeIcuConverter(handler->uconv_out);
+	    handler->uconv_out = NULL;
+	}
+	if (handler->uconv_in != NULL) {
+	    closeIcuConverter(handler->uconv_in);
+	    handler->uconv_in = NULL;
+	}
+	xmlFree(handler);
+    }
+#endif
 #ifdef DEBUG_ENCODING
     if (ret)
         xmlGenericError(xmlGenericErrorContext,
@@ -2212,6 +2432,22 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
 		    cur += toconv;
 		} while (ret == -2);
 #endif
+#ifdef LIBXML_ICU_ENABLED
+	    } else if (handler->uconv_out != NULL) {
+	        do {
+		    toconv = in->end - cur;
+		    written = 32000;
+		    ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
+	                      &written, cur, &toconv);
+		    if (ret < 0) {
+		        if (written > 0)
+			    ret = -2;
+			else
+			    return(-1);
+		    }
+		    unused += written;
+		    cur += toconv;
+		} while (ret == -2);
             } else {
 	        /* could not find a converter */
 	        return(-1);
@@ -2223,8 +2459,9 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
     }
     return(in->consumed + (in->cur - in->base));
 }
+#endif
 
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
 #ifdef LIBXML_ISO8859X_ENABLED
 
 /**
@@ -3296,4 +3533,3 @@ xmlRegisterCharEncodingHandlersISO8859x (void) {
 
 #define bottom_encoding
 #include "elfgcchack.h"
-
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index c74b25f..c68ec10 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -26,6 +26,24 @@
 
 #ifdef LIBXML_ICONV_ENABLED
 #include <iconv.h>
+#else
+#ifdef LIBXML_ICU_ENABLED
+#include <unicode/ucnv.h>
+#if 0
+/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
+ * to prevent unwanted ICU symbols being exposed to users of libxml2.
+ * One particular case is Qt4 conflicting on UChar32.
+ */
+#include <stdint.h>
+struct UConverter;
+typedef struct UConverter UConverter;
+#ifdef _MSC_VER
+typedef wchar_t UChar;
+#else
+typedef uint16_t UChar;
+#endif
+#endif
+#endif
 #endif
 #ifdef __cplusplus
 extern "C" {
@@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
  * Block defining the handlers for non UTF-8 encodings.
  * If iconv is supported, there are two extra fields.
  */
+#ifdef LIBXML_ICU_ENABLED
+struct _uconv_t {
+  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
+  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
+};
+typedef struct _uconv_t uconv_t;
+#endif
 
 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
@@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
     iconv_t                    iconv_in;
     iconv_t                    iconv_out;
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    uconv_t                    *uconv_in;
+    uconv_t                    *uconv_out;
+#endif /* LIBXML_ICU_ENABLED */
 };
 
 #ifdef __cplusplus
diff --git a/include/libxml/parser.h b/include/libxml/parser.h
index 567addb..bd9de24 100644
--- a/include/libxml/parser.h
+++ b/include/libxml/parser.h
@@ -276,6 +276,7 @@ struct _xmlParserCtxt {
     int                nsNr;          /* the number of inherited namespaces */
     int                nsMax;         /* the size of the arrays */
     const xmlChar *   *nsTab;         /* the array of prefix/namespace name */
+    struct _xmlParserCtxt *nsParent;  /* parent context to inherit namespaces from * */
     int               *attallocs;     /* which attribute were allocated */
     void *            *pushTab;       /* array of data for push */
     xmlHashTablePtr    attsDefault;   /* defaulted attributes if any */
@@ -1213,6 +1214,7 @@ typedef enum {
     XML_WITH_DEBUG_MEM = 29,
     XML_WITH_DEBUG_RUN = 30,
     XML_WITH_ZLIB = 31,
+    XML_WITH_ICU = 32,
     XML_WITH_NONE = 99999 /* just to be sure of allocation size */
 } xmlFeature;
 
@@ -1223,4 +1225,3 @@ XMLPUBFUN int XMLCALL
 }
 #endif
 #endif /* __XML_PARSER_H__ */
-
diff --git a/include/libxml/xmlversion.h b/include/libxml/xmlversion.h
index a98e00c..fb2b8ca 100644
--- a/include/libxml/xmlversion.h
+++ b/include/libxml/xmlversion.h
@@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
 #endif
 
 /**
+ * LIBXML_ICU_ENABLED:
+ *
+ * Whether icu support is available
+ */
+#if 1
+#define LIBXML_ICU_ENABLED
+#endif
+
+/**
  * LIBXML_ISO8859X_ENABLED:
  *
  * Whether ISO-8859-* support is made available in case iconv is not
@@ -454,5 +463,3 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
 }
 #endif /* __cplusplus */
 #endif
-
-
diff --git a/parser.c b/parser.c
index 9db664f..306b84d 100644
--- a/parser.c
+++ b/parser.c
@@ -937,6 +937,12 @@ xmlHasFeature(xmlFeature feature)
 #else
             return(0);
 #endif
+        case XML_WITH_ICU:
+#ifdef LIBXML_ICU_ENABLED
+            return(1);
+#else
+            return(0);
+#endif
         default:
 	    break;
      }
@@ -8189,6 +8195,7 @@ xmlGetNamespace(xmlParserCtxtPtr ctxt, const xmlChar *prefix) {
 	        return(NULL);
 	    return(ctxt->nsTab[i + 1]);
 	}
+    if (ctxt->nsParent) return xmlGetNamespace(ctxt->nsParent, prefix);
     return(NULL);
 }
 
@@ -12538,6 +12545,8 @@ xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt,
     ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5);
     ctxt->str_xml_ns = xmlDictLookup(ctxt->dict, XML_XML_NAMESPACE, 36);
 
+    ctxt->nsParent = oldctxt;
+
     oldsax = ctxt->sax;
     ctxt->sax = oldctxt->sax;
     xmlDetectSAX2(ctxt);
diff --git a/xmlregexp.c b/xmlregexp.c
index 73598a5..4258a08 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -6401,7 +6401,7 @@ xmlExpHashNameComputeKey(const xmlChar *name) {
     if (name != NULL) {
 	value += 30 * (*name);
 	while ((ch = *name++) != 0) {
-	    value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
+	    value = value ^ ((value << 5) + (value >> 3) + (unsigned short)ch);
 	}
     }
     return (value);
-- 
1.7.0.1