/*
**********************************************************************
* Copyright (C) 2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* identifier_info.h
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/
#ifndef __IDENTIFIER_INFO_H__
#define __IDENTIFIER_INFO_H__
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/uspoof.h"
#include "uhash.h"
U_NAMESPACE_BEGIN
// Forward declaration: this header only refers to ScriptSet through pointers and references.
class ScriptSet;
// TODO(andy): review consistency of reference vs pointer arguments to the functions.
/**
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
* then setIdentifier. Available methods include:
* <ol>
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
* each of these.
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
* either Katakana or Hiragana.
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
* the identifier.
* <li>call getRestrictionLevel to see what the UTS36 restriction level is.
* </ol>
*
* This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
*/
class U_I18N_API IdentifierInfo : public UMemory {
  public:
    /**
     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
     *
     * @param status Errorcode, set if errors occur.
     * @internal
     */
    IdentifierInfo(UErrorCode &status);

    /**
     * Destructor
     */
    virtual ~IdentifierInfo();

  private:
    /*
     * Disallow copying for now. Can be added if there's a need.
     *
     * Both the copy constructor and the copy assignment operator are made
     * inaccessible. The compiler-generated versions would copy the raw member
     * pointers one by one, leaving two objects referring to the same data.
     * The assignment operator is declared only; no definition is provided.
     */
    IdentifierInfo(const IdentifierInfo &other);
    IdentifierInfo &operator=(const IdentifierInfo &other);

  public:
    /**
     * Set the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @param identifierProfile the characters that are to be allowed in the identifier
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);

    /**
     * Get the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @return The characters that are to be allowed in the identifier.
     * @internal
     */
    const UnicodeSet &getIdentifierProfile() const;

    /**
     * Set an identifier to analyze. Afterwards, call methods like getScripts()
     *
     * @param identifier the identifier to analyze
     * @param status Errorcode, set if errors occur.
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);

    /**
     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
     * and must not be deleted by the caller.
     *
     * @return the identifier that was analyzed.
     * @internal
     */
    const UnicodeString *getIdentifier() const;

    /**
     * Get the scripts found in the identifiers.
     *
     * @return the set of explicit scripts.
     * @internal
     */
    const ScriptSet *getScripts() const;

    /**
     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
     * the set consisting of those scripts will be returned.
     *
     * @return a uhash, with each key being of type (ScriptSet *).
     *         This is a set, not a map, so the value stored in the uhash is not relevant.
     *         (It is, in fact, 1).
     *         Ownership of the uhash and its contents remains with the IdentifierInfo object,
     *         and remains valid until a new identifier is set or until the object is deleted.
     * @internal
     */
    const UHashtable *getAlternates() const;

    /**
     * Get the representative characters (zeros) for the numerics found in the identifier.
     *
     * @return the set of representative zero characters, one for each decimal
     *         number system found in the identifier.
     * @internal
     */
    const UnicodeSet *getNumerics() const;

    /**
     * Find out which scripts are in common among the alternates.
     *
     * @return the set of scripts that are in common among the alternates.
     * @internal
     */
    const ScriptSet *getCommonAmongAlternates() const;

    /**
     * Get the number of scripts appearing in the identifier.
     *   Note: Common and Inherited scripts are omitted from the count.
     *   Note: Result may be high when the identifier contains characters
     *         with alternate scripts. The distinction between
     *         0, 1 and > 1 will remain valid, however.
     * @return the number of scripts.
     */
    int32_t getScriptCount() const;

#if !UCONFIG_NO_NORMALIZATION
    /**
     * Find the "tightest" restriction level that the identifier satisfies.
     * Only available when ICU is built with normalization support.
     *
     * @param status Errorcode, set if errors occur.
     * @return the restriction level.
     * @internal
     */
    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
#endif /*!UCONFIG_NO_NORMALIZATION */

    /**
     * Produce a string form of this object.
     * NOTE(review): the exact format is determined by the implementation, which
     * is not visible from this header; presumably intended for debugging.
     *
     * @return the string form, returned by value.
     */
    UnicodeString toString() const;

    /**
     * Produce a readable string of alternates.
     *
     * @param dest       the string that the display form is written to.
     *                   NOTE(review): presumably this is also the reference that
     *                   is returned; confirm against the implementation.
     * @param alternates a UHashtable of UScriptSets.
     *                   Keys only, no meaningful values in the UHash.
     * @param status     Errorcode, set if errors occur.
     * @return display form
     * @internal
     */
    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);

    /**
     * Static memory cleanup function.
     * @internal
     */
    static UBool cleanup();

  private:
    IdentifierInfo &clear();
    UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;

    // Per-object state, held through raw pointers.
    // NOTE(review): the mapping of each field to a public accessor below is
    // inferred from the names; confirm against the implementation.
    UnicodeString *fIdentifier;             // presumably backs getIdentifier()
    ScriptSet     *fRequiredScripts;        // presumably backs getScripts()
    UHashtable    *fScriptSetSet;           // presumably backs getAlternates()
    ScriptSet     *fCommonAmongAlternates;  // presumably backs getCommonAmongAlternates()
    UnicodeSet    *fNumerics;               // presumably backs getNumerics()
    UnicodeSet    *fIdentifierProfile;      // presumably backs getIdentifierProfile()

    // Class-wide sets shared by all instances.
    // NOTE(review): presumably released by cleanup(); confirm in the implementation.
    static UnicodeSet *ASCII;
    static ScriptSet  *JAPANESE;
    static ScriptSet  *CHINESE;
    static ScriptSet  *KOREAN;
    static ScriptSet  *CONFUSABLE_WITH_LATIN;
};
U_NAMESPACE_END
#endif // __IDENTIFIER_INFO_H__