/* * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * @file picoktab.c * * symbol tables needed at runtime * * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland * All rights reserved. * * History: * - 2009-04-20 -- initial version * */ #include "picoos.h" #include "picodbg.h" #include "picoknow.h" #include "picobase.h" #include "picoktab.h" #include "picodata.h" #ifdef __cplusplus extern "C" { #endif #if 0 } #endif /** @todo : the following would be better part of a knowledge base. * Make sure it is consistent with the phoneme symbol table used in the lingware */ /* PLANE_PHONEMES */ /* PLANE_POS */ /* PLANE_PB_STRENGTHS */ /* PLANE_ACCENTS */ /* PLANE_INTERN */ #define PICOKTAB_TMPID_PHONSTART '\x26' /* 38 '&' */ #define PICOKTAB_TMPID_PHONTERM '\x23' /* 35 '#' */ /* ************************************************************/ /* fixed ids */ /* ************************************************************/ static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this, picoos_Common common) { picoktab_FixedIds ids; PICODBG_DEBUG(("start")); if (NULL == this || NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } ids = (picoktab_FixedIds) this->subObj; ids->phonStartId = PICOKTAB_TMPID_PHONSTART; ids->phonTermId = PICOKTAB_TMPID_PHONTERM; return PICO_OK; } static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this, picoos_MemoryManager mm) { if (NULL != this) { picoos_deallocate(mm, (void *) &this->subObj); } return PICO_OK; } pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this, picoos_Common common) { if (NULL == this) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } this->subDeallocate = ktabIdsSubObjDeallocate; this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t)); if (NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); } return ktabIdsInitialize(this, common); } picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this) { return ((NULL == this) ? NULL : ((picoktab_FixedIds) this->subObj)); } picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm) { picoktab_FixedIds this = (picoktab_FixedIds) picoos_allocate(mm,sizeof(*this)); if (NULL != this) { /* initialize */ } return this; } void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this) { if (NULL != (*this)) { /* terminate */ picoos_deallocate(mm,(void *)this); } } /* ************************************************************/ /* Graphs */ /* ************************************************************/ /* overview binary file format for graphs kb: graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs NROFSENTRIES : 2 bytes, number of entries in offset table SIZEOFSENTRY : 1 byte, size of one entry in offset table ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET) OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph) graph = PROPSET FROM TO [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2] FROM : 1..4 unsigned bytes, UTF8 character without terminating 0 TO : 1..4 unsigned bytes, UTF8 character without terminating 0 PROPSET : 1 unsigned byte, least significant bit : has TO field next bit : has TOKENTYPE next bit : has TOKENSUBTYPE next bit : has VALUE next bit : has LOWERCASE next bit : has GRAPHSUBS1 next bit : has GRAPHSUBS2 next bit : has PUNC TOKENTYPE : 1 unsigned byte TOKENSUBTYPE : 1 unsigned byte VALUE : 1 unsigned byte LOWERCASE : 1..4 unsigned bytes, UTF8 character without terminating 0 GRAPHSUBS1 : 1..4 unsigned bytes, UTF8 character without terminating 0 GRAPHSUBS2 : 1..4 unsigned bytes, UTF8 character without terminating 0 PUNC : 1 unsigned byte */ static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop); #define KTAB_START_GRAPHS_NR_OFFSET 0 #define KTAB_START_GRAPHS_SIZE_OFFSET 2 #define KTAB_START_GRAPHS_OFFSET_TABLE 3 #define KTAB_START_GRAPHS_GRAPH_TABLE 0 /* bitmasks to extract the grapheme properties info from the property set */ #define KTAB_GRAPH_PROPSET_TO ((picoos_uint8)'\x01') #define KTAB_GRAPH_PROPSET_TOKENTYPE ((picoos_uint8)'\x02') #define KTAB_GRAPH_PROPSET_TOKENSUBTYPE ((picoos_uint8)'\x04') #define KTAB_GRAPH_PROPSET_VALUE ((picoos_uint8)'\x08') #define KTAB_GRAPH_PROPSET_LOWERCASE ((picoos_uint8)'\x010') #define KTAB_GRAPH_PROPSET_GRAPHSUBS1 ((picoos_uint8)'\x020') #define KTAB_GRAPH_PROPSET_GRAPHSUBS2 ((picoos_uint8)'\x040') #define KTAB_GRAPH_PROPSET_PUNCT ((picoos_uint8)'\x080') typedef struct ktabgraphs_subobj *ktabgraphs_SubObj; typedef struct ktabgraphs_subobj { picoos_uint16 nrOffset; picoos_uint16 sizeOffset; picoos_uint8 * offsetTable; picoos_uint8 * graphTable; } ktabgraphs_subobj_t; static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this, picoos_Common common) { ktabgraphs_subobj_t * ktabgraphs; PICODBG_DEBUG(("start")); if (NULL == this || NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } ktabgraphs = (ktabgraphs_subobj_t *) this->subObj; ktabgraphs->nrOffset = ((int)(this->base[KTAB_START_GRAPHS_NR_OFFSET])) + 256*(int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]); ktabgraphs->sizeOffset = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]); ktabgraphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]); ktabgraphs->graphTable = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]); return PICO_OK; } static pico_status_t ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this, picoos_MemoryManager mm) { if (NULL != this) { picoos_deallocate(mm, (void *) &this->subObj); } return PICO_OK; } pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this, picoos_Common common) { if (NULL == this) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } this->subDeallocate = ktabGraphsSubObjDeallocate; this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t)); if (NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); } return ktabGraphsInitialize(this, common); } picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) { if (NULL == this) { return NULL; } else { return (picoktab_Graphs) this->subObj; } } /* Graphs methods */ picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this, const picoos_uint8 *graph, const picoos_uint8 graphlenmax) { picoos_uint8 ui8App; picoos_uint32 graphsOffset; ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; ui8App = graphlenmax; /* avoid warning "var not used in this function"*/ graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph); return g->graphTable[graphsOffset + ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE)] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV; } static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str) { ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; picoos_uint32 i, l; i = 0; l = picobase_det_utf8_length(g->graphTable[graphsOffset+propOffset]); while (i<l) { str[i] = g->graphTable[graphsOffset+propOffset+i]; i++; } str[l] = 0; } static picoos_uint32 ktab_propOffset(const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop) /* Returns offset of property 'prop' inside the graph with offset 'graphsOffset' in graphs table; If the property is found, a value > 0 is returned otherwise 0 */ { picoos_uint32 n = 0; ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; if ((g->graphTable[graphsOffset] & prop) == prop) { n = n + 1; /* overread PROPSET field */ n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread FROM field */ if (prop > KTAB_GRAPH_PROPSET_TO) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TO) == KTAB_GRAPH_PROPSET_TO) { n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread TO field */ } } else { return n; } if (prop > KTAB_GRAPH_PROPSET_TOKENTYPE) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENTYPE) == KTAB_GRAPH_PROPSET_TOKENTYPE) { n = n + 1; /* overread TOKENTYPE field */ } } else { return n; } if (prop > KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) == KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { n = n + 1; /* overread stokentype field */ } } else { return n; } if (prop > KTAB_GRAPH_PROPSET_VALUE) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_VALUE) == KTAB_GRAPH_PROPSET_VALUE) { n = n + 1; /* overread value field */ } } else { return n; } if (prop > KTAB_GRAPH_PROPSET_LOWERCASE) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_LOWERCASE) == KTAB_GRAPH_PROPSET_LOWERCASE) { n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread lowercase field */ } } else { return n; } if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS1) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS1) == KTAB_GRAPH_PROPSET_GRAPHSUBS1) { n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs1 field */ } } else { return n; } if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS2) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS2) == KTAB_GRAPH_PROPSET_GRAPHSUBS2) { n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs2 field */ } } else { return n; } if (prop > KTAB_GRAPH_PROPSET_PUNCT) { if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_PUNCT) == KTAB_GRAPH_PROPSET_PUNCT) { n = n + 1; /* overread value field */ } } else { return n; } } return n; } picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph) { ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; picoos_int32 a, b, m; picoos_uint32 graphsOffset; picoos_uint32 propOffset; picobase_utf8char from; picobase_utf8char to; picoos_bool utfGEfrom; picoos_bool utfLEto; if (g->nrOffset > 0) { a = 0; b = g->nrOffset-1; do { m = (a+b) / 2; /* get offset to graph[m] */ if (g->sizeOffset == 1) { graphsOffset = g->offsetTable[g->sizeOffset*m]; } else { graphsOffset = g->offsetTable[g->sizeOffset*m ] + 256*g->offsetTable[g->sizeOffset*m + 1]; /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i %i", m, g->offsetTable[g->sizeOffset*m], g->offsetTable[g->sizeOffset*m + 1], graphsOffset)); */ } /* get FROM and TO field of graph[m] */ ktab_getStrProp(this, graphsOffset, 1, from); propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TO); if (propOffset > 0) { ktab_getStrProp(this, graphsOffset, propOffset, to); } else { picoos_strcpy((picoos_char *)to, (picoos_char *)from); } /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i '%s' '%s' '%s'", a, m, b, from, utf8graph, to)); */ utfGEfrom = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) >= 0; utfLEto = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) <= 0; if (utfGEfrom && utfLEto) { /* PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' found", utf8graph)); */ return graphsOffset; } if (!utfGEfrom) { b = m-1; } else if (!utfLEto) { a = m+1; } } while (a<=b); } PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph)); return 0; } picoos_bool picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType) { picoos_uint32 propOffset; ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE); if (propOffset > 0) { *stokenType = (picoos_uint8)(g->graphTable[graphsOffset+propOffset]); return TRUE; } else { return FALSE; } } picoos_bool picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType) { picoos_uint32 propOffset; ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE); if (propOffset > 0) { *stokenSubType = (picoos_int8)(g->graphTable[graphsOffset+propOffset]); return TRUE; } else { return FALSE; } } picoos_bool picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value) { picoos_uint32 propOffset; ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE); if (propOffset > 0) { *value = (picoos_uint32)(g->graphTable[graphsOffset+propOffset]); return TRUE; } else { return FALSE; } } picoos_bool picoktab_getIntPropPunct (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2) { picoos_uint32 propOffset; ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT); if (propOffset > 0) { if (g->graphTable[graphsOffset+propOffset] == 2) { *info1 = PICODATA_ITEMINFO1_PUNC_SENTEND; } else { *info1 = PICODATA_ITEMINFO1_PUNC_PHRASEEND; } if (g->graphTable[graphsOffset+1] == '.') { *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T; } else if (g->graphTable[graphsOffset+1] == '?') { *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q; } else if (g->graphTable[graphsOffset+1] == '!') { *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E; } else { *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE; } return TRUE; } else { return FALSE; } } picoos_bool picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase) { picoos_uint32 propOffset; propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE); if (propOffset > 0) { ktab_getStrProp(this, graphsOffset, propOffset, lowercase); return TRUE; } else { return FALSE; } } picoos_bool picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1) { picoos_uint32 propOffset; propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1); if (propOffset > 0) { ktab_getStrProp(this, graphsOffset, propOffset, graphsubs1); return TRUE; } else { return FALSE; } } picoos_bool picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2) { picoos_uint32 propOffset; propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS2); if (propOffset > 0) { ktab_getStrProp(this, graphsOffset, propOffset, graphsubs2); return TRUE; } else { return FALSE; } } /* *****************************************************************/ /* used for tools */ static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to) { picoos_uint32 l; l = picobase_det_utf8_length(**pos); while (l>0) { *(to++) = *((*pos)++); l--; } *to = 0; } picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this) { ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; return g->nrOffset; } void picoktab_graphsGetGraphInfo(const picoktab_Graphs this, picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to, picoos_uint8 * propset, picoos_uint8 * stokenType, picoos_uint8 * stokenSubType, picoos_uint8 * value, picoos_uchar * lowercase, picoos_uchar * graphsubs1, picoos_uchar * graphsubs2, picoos_uint8 * punct) { ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; picoos_uint32 graphsOffset; picoos_uint8 * pos; /* calculate offset of graph[graphIndex] */ if (g->sizeOffset == 1) { graphsOffset = g->offsetTable[graphIndex]; } else { graphsOffset = g->offsetTable[2 * graphIndex] + (g->offsetTable[2 * graphIndex + 1] << 8); } pos = &(g->graphTable[graphsOffset]); *propset = *pos; pos++; /* advance to FROM */ ktab_getUtf8(&pos, from); /* get FROM and advance */ if ((*propset) & KTAB_GRAPH_PROPSET_TO) { ktab_getUtf8(&pos, to); /* get TO and advance */ } else { picoos_strcpy((picoos_char *)to, (picoos_char *)from); } if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) { (*stokenType) = *(pos++); /* get TOKENTYPE and advance */ } else { (*stokenType) = -1; } if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { (*stokenSubType) = *(pos++); /* get TOKENSUBTYPE and advance */ } else { (*stokenSubType) = -1; } if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) { (*value) = *(pos++); /* get VALUE and advance */ } else { (*value) = -1; } if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) { ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */ } else { lowercase[0] = NULLC; } if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) { ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */ } else { graphsubs1[0] = NULLC; } if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) { ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */ } else { graphsubs2[0] = NULLC; } if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) { (*punct) = *(pos++); /* get PUNCT and advance */ } else { (*punct) = -1; } } /* ************************************************************/ /* Phones */ /* ************************************************************/ /* overview binary file format for phones kb: phones-kb = specids propertytable specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1 RESERVE1 RESERVE1 RESERVE1 propertytable = {PHONEPROP2}=256 PRIMSTRESSID1: one byte, ID of primary stress SECSTRESSID1: one byte, ID of secondary stress SYLLBOUNDID1: one byte, ID of syllable boundary PAUSEID1: one byte, ID of pause RESERVE1: reserved for future use PHONEPROP2: one byte, max. of 256 phones directly access this table to check a property for a phone; binary properties encoded (1 bit per prop) least significant bit: vowel next bit: diphth next bit: glott next bit: nonsyllvowel next bit: syllcons 3 bits spare */ #define KTAB_START_SPECIDS 0 #define KTAB_IND_PRIMSTRESS 0 #define KTAB_IND_SECSTRESS 1 #define KTAB_IND_SYLLBOUND 2 #define KTAB_IND_PAUSE 3 #define KTAB_IND_WORDBOUND 4 #define KTAB_START_PROPS 8 typedef struct ktabphones_subobj *ktabphones_SubObj; typedef struct ktabphones_subobj { picoos_uint8 *specids; picoos_uint8 *props; } ktabphones_subobj_t; /* bitmasks to extract the property info from props */ #define KTAB_PPROP_VOWEL '\x01' #define KTAB_PPROP_DIPHTH '\x02' #define KTAB_PPROP_GLOTT '\x04' #define KTAB_PPROP_NONSYLLVOWEL '\x08' #define KTAB_PPROP_SYLLCONS '\x10' static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this, picoos_Common common) { ktabphones_subobj_t * ktabphones; PICODBG_DEBUG(("start")); if (NULL == this || NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } ktabphones = (ktabphones_subobj_t *) this->subObj; ktabphones->specids = &(this->base[KTAB_START_SPECIDS]); ktabphones->props = &(this->base[KTAB_START_PROPS]); return PICO_OK; } static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this, picoos_MemoryManager mm) { if (NULL != this) { picoos_deallocate(mm, (void *) &this->subObj); } return PICO_OK; } pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this, picoos_Common common) { if (NULL == this) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } this->subDeallocate = ktabPhonesSubObjDeallocate; this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t)); if (NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); } return ktabPhonesInitialize(this, common); } picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) { if (NULL == this) { return NULL; } else { return (picoktab_Phones) this->subObj; } } /* Phones methods */ picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this, const picoos_uint8 ch) { return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]); } picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this, const picoos_uint8 ch) { return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]); } picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this, const picoos_uint8 ch) { return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]); } picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this, const picoos_uint8 ch) { return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]); } picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this, const picoos_uint8 ch) { return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]); } picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this, const picoos_uint8 ch) { picoos_uint8 props; props = ((ktabphones_SubObj)this)->props[ch]; return (((KTAB_PPROP_VOWEL & props) && !(KTAB_PPROP_NONSYLLVOWEL & props)) || (KTAB_PPROP_SYLLCONS & props)); } picoos_bool picoktab_isPrimstress(const picoktab_Phones this, const picoos_uint8 ch) { return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]); } picoos_bool picoktab_isSecstress(const picoktab_Phones this, const picoos_uint8 ch) { return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]); } picoos_bool picoktab_isSyllbound(const picoktab_Phones this, const picoos_uint8 ch) { return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]); } picoos_bool picoktab_isWordbound(const picoktab_Phones this, const picoos_uint8 ch) { return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]); } picoos_bool picoktab_isPause(const picoktab_Phones this, const picoos_uint8 ch) { return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]); } picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) { return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]; } picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) { return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]; } picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) { return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]; } picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) { return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]; } picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) { return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]; } /* ************************************************************/ /* Pos */ /* ************************************************************/ /* overview binary file format for pos kb: pos-kb = header posids header = {COUNT2 OFFS2}=8 posids = {POSID1 {PARTID1}0:8}1: where POSID1 is the value of the (combined) part-of-speech symbol, and {PARTID1} are the symbol values of its components (empty if it is not a combined symbol). The {PARTID1} list is sorted. Part-of-speech symbols with equal number of components are grouped together. The header contains information about these groups: COUNT2 specifies the number of elements in the group, and OFFS2 specifies the offset (relative to the beginning of the kb) where the group data starts, i.e.: 25 32 -> 25 not-combined elements, starting at offset 32 44 57 -> 44 elements composed of 2 symbols, starting at offset 57 23 189 -> 23 elements composed of 3 symbols, starting at offset 189 ... Currently, each symbol may be composed of up to 8 other symbols. Therefore, the header has 8 entries, too. The header starts with the unique POS list, and then in increasing order, 2 symbols, 3 symbols,... Zur Anschauung die ge-printf-te Version: 25 32 44 57 23 189 12 281 4 341 1 365 0 0 0 0 33 | 34 | 35 | 60 | etc. 36 | 35 60 50 | 35 95 51 | 35 97 58 | 35 120 59 | 35 131 61 | 60 75 63 | 60 95 64 | 60 97 etc. 42 | 35 60 117 44 | 35 60 131 45 | 35 73 97 48 | 35 84 97 54 | 35 97 131 56 | 35 113 120 57 | 35 117 120 62 | 60 84 122 etc. */ typedef struct ktabpos_subobj *ktabpos_SubObj; typedef struct ktabpos_subobj { picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB]; picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB]; } ktabpos_subobj_t; static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this, picoos_Common common) { ktabpos_subobj_t *ktabpos; picoos_uint16 osprev; picoos_uint16 os, pos; picoos_uint8 i; PICODBG_DEBUG(("start")); if (NULL == this || NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } ktabpos = (ktabpos_subobj_t *)this->subObj; os = 0; for (i = 0, pos = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++, pos += 4) { ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 | this->base[pos]; if (ktabpos->nrcomb[i] > 0) { osprev = os; os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2]; ktabpos->nrcombstart[i] = &(this->base[os]); PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos, ktabpos->nrcomb[i], osprev, os)); if (osprev >= os) { /* cannot be, in a valid kb */ return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT, NULL, NULL); } } else { if (i == 0) { /* cannot be, in a valid kb */ return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT, NULL, NULL); } ktabpos->nrcombstart[i] = NULL; } } return PICO_OK; } static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this, picoos_MemoryManager mm) { if (NULL != this) { picoos_deallocate(mm, (void *) &this->subObj); } return PICO_OK; } pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this, picoos_Common common) { if (NULL == this) { return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL); } this->subDeallocate = ktabPosSubObjDeallocate; this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t)); if (NULL == this->subObj) { return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); } return ktabPosInitialize(this, common); } picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) { if (NULL == this) { return NULL; } else { return (picoktab_Pos) this->subObj; } } /* Pos methods */ static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1, const picoos_uint8 *grp2, picoos_uint8 len) { /* if both, grp1 and grp2 would be sorted in ascending order we could implement a function picoktab_comparePosGroup in a similar manner as strcmp */ picoos_uint16 i, j, equal; equal = 1; i = 0; while (equal && (i < len)) { /* search grp1[i] in grp2 */ j = 0; while ((j < len) && (grp1[i] != grp2[j])) { j++; } equal = (j < len); i++; } return equal; } picoos_bool picoktab_isUniquePos(const picoktab_Pos this, const picoos_uint8 pos) { ktabpos_subobj_t *ktabpos; picoos_uint16 i; /* speed-up possible with e.g. binary search */ ktabpos = (ktabpos_subobj_t *)this; PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0])); i = 0; while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) { PICODBG_TRACE(("compare with pos %d at position %d", ktabpos->nrcombstart[0][i], pos, i)); i++; } return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i])); } picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this, const picoos_uint8 pos, const picoos_uint8 posgroup) { ktabpos_subobj_t *ktabpos; picoos_uint8 *grp; picoos_uint16 i, j, n, s, grplen; picoos_uint8 *e; picoos_uint8 found; ktabpos = (ktabpos_subobj_t *) this; grp = NULL; found = FALSE; grplen = 0; /* currently, a linear search is required to find 'posgroup'; the knowledge base should be extended to allow for a faster search */ /* treat case i==0, grplen==0, ie. pos == posgroup */ if (pos == posgroup) { found = TRUE; } i = 1; while ((grp == NULL) && (i < PICOKTAB_MAXNRPOS_IN_COMB)) { n = ktabpos->nrcomb[i]; /* number of entries */ e = ktabpos->nrcombstart[i]; /* ptr to first entry */ s = i + 2; /* size of an entry in bytes */ /* was with while starting at 0: s = i > 0 ? i + 2 : 1; */ j = 0; while ((grp == NULL) && (j < n)) { if (posgroup == e[0]) { grp = e + 1; grplen = s - 1; } e += s; j++; } i++; } /* test if 'pos' is contained in the components of 'posgroup' */ if (grp != NULL) { for (i = 0; !found && (i < grplen); i++) { if (pos == grp[i]) { found = TRUE; } } /* just a way to test picoktab_getPosGroup */ /* PICODBG_ASSERT(picoktab_getPosGroup(this, grp, grplen) == posgroup); */ } return found; } picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this, const picoos_uint8 *poslist, const picoos_uint8 poslistlen) { picoos_uint8 poscomb; ktabpos_subobj_t *ktabpos; picoos_uint16 i, j, n, s; picoos_uint8 *e; ktabpos = (ktabpos_subobj_t *) this; poscomb = 0; if ((poslistlen > 0) && (poslistlen <= PICOKTAB_MAXNRPOS_IN_COMB)) { i = poslistlen - 1; if (i > 0) { n = ktabpos->nrcomb[i]; /* number of entries */ e = ktabpos->nrcombstart[i]; /* ptr to first entry */ s = i + 2; /* size of an entry in bytes */ j = 0; while (!poscomb && (j < n)) { if (ktab_isEqualPosGroup(poslist, e + 1, poslistlen)) { poscomb = *e; } e += s; j++; } if (!poscomb) { /* combination not found; shouldn't occur if lingware OK! */ /* contingency solution: take first */ PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0])); poscomb = poslist[0]; } } else { /* not a composed POS */ poscomb = poslist[0]; } } return poscomb; } #ifdef __cplusplus } #endif /* end */