/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <assert.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "../include/dictbuilder.h" #include "../include/dicttrie.h" #include "../include/mystdlib.h" #include "../include/ngram.h" #include "../include/searchutility.h" #include "../include/spellingtable.h" #include "../include/spellingtrie.h" #include "../include/splparser.h" #include "../include/utf16reader.h" namespace ime_pinyin { #ifdef ___BUILD_MODEL___ static const size_t kReadBufLen = 512; static const size_t kSplTableHashLen = 2000; // Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by // frequencies. int cmp_scis_hz_splid_freq(const void* p1, const void* p2) { const SingleCharItem *s1, *s2; s1 = static_cast<const SingleCharItem*>(p1); s2 = static_cast<const SingleCharItem*>(p2); if (s1->hz < s2->hz) return -1; if (s1->hz > s2->hz) return 1; if (s1->splid.half_splid < s2->splid.half_splid) return -1; if (s1->splid.half_splid > s2->splid.half_splid) return 1; if (s1->splid.full_splid < s2->splid.full_splid) return -1; if (s1->splid.full_splid > s2->splid.full_splid) return 1; if (s1->freq > s2->freq) return -1; if (s1->freq < s2->freq) return 1; return 0; } int cmp_scis_hz_splid(const void* p1, const void* p2) { const SingleCharItem *s1, *s2; s1 = static_cast<const SingleCharItem*>(p1); s2 = static_cast<const SingleCharItem*>(p2); if (s1->hz < s2->hz) return -1; if (s1->hz > s2->hz) return 1; if (s1->splid.half_splid < s2->splid.half_splid) return -1; if (s1->splid.half_splid > s2->splid.half_splid) return 1; if (s1->splid.full_splid < s2->splid.full_splid) return -1; if (s1->splid.full_splid > s2->splid.full_splid) return 1; return 0; } int cmp_lemma_entry_hzs(const void* p1, const void* p2) { size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); if (size1 < size2) return -1; else if (size1 > size2) return 1; return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, ((const LemmaEntry*)p2)->hanzi_str); } int compare_char16(const void* p1, const void* p2) { if (*((const char16*)p1) < *((const char16*)p2)) return -1; if (*((const char16*)p1) > *((const char16*)p2)) return 1; return 0; } int compare_py(const void* p1, const void* p2) { int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, ((const LemmaEntry*)p2)->spl_idx_arr); if (0 != ret) return ret; return static_cast<int>(((const LemmaEntry*)p2)->freq) - static_cast<int>(((const LemmaEntry*)p1)->freq); } // First hanzi, if the same, then Pinyin int cmp_lemma_entry_hzspys(const void* p1, const void* p2) { size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); if (size1 < size2) return -1; else if (size1 > size2) return 1; int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, ((const LemmaEntry*)p2)->hanzi_str); if (0 != ret) return ret; ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, ((const LemmaEntry*)p2)->spl_idx_arr); return ret; } int compare_splid2(const void* p1, const void* p2) { int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, ((const LemmaEntry*)p2)->spl_idx_arr); return ret; } DictBuilder::DictBuilder() { lemma_arr_ = NULL; lemma_num_ = 0; scis_ = NULL; scis_num_ = 0; lma_nodes_le0_ = NULL; lma_nodes_ge1_ = NULL; lma_nds_used_num_le0_ = 0; lma_nds_used_num_ge1_ = 0; homo_idx_buf_ = NULL; homo_idx_num_eq1_ = 0; homo_idx_num_gt1_ = 0; top_lmas_ = NULL; top_lmas_num_ = 0; spl_table_ = NULL; spl_parser_ = NULL; } DictBuilder::~DictBuilder() { free_resource(); } bool DictBuilder::alloc_resource(size_t lma_num) { if (0 == lma_num) return false; free_resource(); lemma_num_ = lma_num; lemma_arr_ = new LemmaEntry[lemma_num_]; top_lmas_num_ = 0; top_lmas_ = new LemmaEntry[kTopScoreLemmaNum]; // New the scis_ buffer to the possible maximum size. scis_num_ = lemma_num_ * kMaxLemmaSize; scis_ = new SingleCharItem[scis_num_]; // The root and first level nodes is less than kMaxSpellingNum + 1 lma_nds_used_num_le0_ = 0; lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1]; // Other nodes is less than lemma_num lma_nds_used_num_ge1_ = 0; lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_]; homo_idx_buf_ = new LemmaIdType[lemma_num_]; spl_table_ = new SpellingTable(); spl_parser_ = new SpellingParser(); if (NULL == lemma_arr_ || NULL == top_lmas_ || NULL == scis_ || NULL == spl_table_ || NULL == spl_parser_ || NULL == lma_nodes_le0_ || NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) { free_resource(); return false; } memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_); memset(scis_, 0, sizeof(SingleCharItem) * scis_num_); memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1)); memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_); memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_); spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true); return true; } char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) { if (NULL == fn_validhzs || NULL == num) return NULL; *num = 0; FILE *fp = fopen(fn_validhzs, "rb"); if (NULL == fp) return NULL; char16 utf16header; if (fread(&utf16header, sizeof(char16), 1, fp) != 1 || 0xfeff != utf16header) { fclose(fp); return NULL; } fseek(fp, 0, SEEK_END); *num = ftell(fp) / sizeof(char16); assert(*num >= 1); *num -= 1; char16 *hzs = new char16[*num]; if (NULL == hzs) { fclose(fp); return NULL; } fseek(fp, 2, SEEK_SET); if (fread(hzs, sizeof(char16), *num, fp) != *num) { fclose(fp); delete [] hzs; return NULL; } fclose(fp); myqsort(hzs, *num, sizeof(char16), compare_char16); return hzs; } bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz) { if (NULL == hzs) return false; char16 *found; found = static_cast<char16*>( mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16)); if (NULL == found) return false; assert(*found == hz); return true; } // The caller makes sure that the parameters are valid. bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len, const char16 *str, size_t str_len) { if (NULL == hzs || NULL == str) return false; for (size_t pos = 0; pos < str_len; pos++) { if (!hz_in_hanzis_list(hzs, hzs_len, str[pos])) return false; } return true; } void DictBuilder::get_top_lemmas() { top_lmas_num_ = 0; if (NULL == lemma_arr_) return; for (size_t pos = 0; pos < lemma_num_; pos++) { if (0 == top_lmas_num_) { top_lmas_[0] = lemma_arr_[pos]; top_lmas_num_ = 1; continue; } if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) { if (kTopScoreLemmaNum > top_lmas_num_) top_lmas_num_ += 1; size_t move_pos; for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) { top_lmas_[move_pos] = top_lmas_[move_pos - 1]; if (0 == move_pos - 1 || (move_pos - 1 > 0 && top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) { break; } } assert(move_pos > 0); top_lmas_[move_pos - 1] = lemma_arr_[pos]; } else if (kTopScoreLemmaNum > top_lmas_num_) { top_lmas_[top_lmas_num_] = lemma_arr_[pos]; top_lmas_num_ += 1; } } if (kPrintDebug0) { printf("\n------Top Lemmas------------------\n"); for (size_t pos = 0; pos < top_lmas_num_; pos++) { printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz, top_lmas_[pos].freq); } } } void DictBuilder::free_resource() { if (NULL != lemma_arr_) delete [] lemma_arr_; if (NULL != scis_) delete [] scis_; if (NULL != lma_nodes_le0_) delete [] lma_nodes_le0_; if (NULL != lma_nodes_ge1_) delete [] lma_nodes_ge1_; if (NULL != homo_idx_buf_) delete [] homo_idx_buf_; if (NULL != spl_table_) delete spl_table_; if (NULL != spl_parser_) delete spl_parser_; lemma_arr_ = NULL; scis_ = NULL; lma_nodes_le0_ = NULL; lma_nodes_ge1_ = NULL; homo_idx_buf_ = NULL; spl_table_ = NULL; spl_parser_ = NULL; lemma_num_ = 0; lma_nds_used_num_le0_ = 0; lma_nds_used_num_ge1_ = 0; homo_idx_num_eq1_ = 0; homo_idx_num_gt1_ = 0; } size_t DictBuilder::read_raw_dict(const char* fn_raw, const char *fn_validhzs, size_t max_item) { if (NULL == fn_raw) return 0; Utf16Reader utf16_reader; if (!utf16_reader.open(fn_raw, kReadBufLen * 10)) return false; char16 read_buf[kReadBufLen]; // Read the number of lemmas in the file size_t lemma_num = 240000; // allocate resource required if (!alloc_resource(lemma_num)) { utf16_reader.close(); } // Read the valid Hanzi list. char16 *valid_hzs = NULL; size_t valid_hzs_num = 0; valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num); // Begin reading the lemma entries for (size_t i = 0; i < max_item; i++) { // read next entry if (!utf16_reader.readline(read_buf, kReadBufLen)) { lemma_num = i; break; } size_t token_size; char16 *token; char16 *to_tokenize = read_buf; // Get the Hanzi string token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); if (NULL == token) { free_resource(); utf16_reader.close(); return false; } size_t lemma_size = utf16_strlen(token); if (lemma_size > kMaxLemmaSize) { i--; continue; } if (lemma_size > 4) { i--; continue; } // Copy to the lemma entry utf16_strcpy(lemma_arr_[i].hanzi_str, token); lemma_arr_[i].hz_str_len = token_size; // Get the freq string token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); if (NULL == token) { free_resource(); utf16_reader.close(); return false; } lemma_arr_[i].freq = utf16_atof(token); if (lemma_size > 1 && lemma_arr_[i].freq < 60) { i--; continue; } // Get GBK mark, if no valid Hanzi list available, all items which contains // GBK characters will be discarded. Otherwise, all items which contains // characters outside of the valid Hanzi list will be discarded. token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); assert(NULL != token); int gbk_flag = utf16_atoi(token); if (NULL == valid_hzs || 0 == valid_hzs_num) { if (0 != gbk_flag) { i--; continue; } } else { if (!str_in_hanzis_list(valid_hzs, valid_hzs_num, lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) { i--; continue; } } // Get spelling String bool spelling_not_support = false; for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; hz_pos++) { // Get a Pinyin token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); if (NULL == token) { free_resource(); utf16_reader.close(); return false; } assert(utf16_strlen(token) <= kMaxPinyinSize); utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token); format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]); // Put the pinyin to the spelling table if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos], lemma_arr_[i].freq)) { spelling_not_support = true; break; } } // The whole line must have been parsed fully, otherwise discard this one. token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); if (spelling_not_support || NULL != token) { i--; continue; } } delete [] valid_hzs; utf16_reader.close(); printf("read succesfully, lemma num: %d\n", lemma_num); return lemma_num; } bool DictBuilder::build_dict(const char *fn_raw, const char *fn_validhzs, DictTrie *dict_trie) { if (NULL == fn_raw || NULL == dict_trie) return false; lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000); if (0 == lemma_num_) return false; // Arrange the spelling table, and build a spelling tree // The size of an spelling. '\0' is included. If the spelling table is // initialized to calculate the spelling scores, the last char in the // spelling string will be score, and it is also included in spl_item_size. size_t spl_item_size; size_t spl_num; const char* spl_buf; spl_buf = spl_table_->arrange(&spl_item_size, &spl_num); if (NULL == spl_buf) { free_resource(); return false; } SpellingTrie &spl_trie = SpellingTrie::get_instance(); if (!spl_trie.construct(spl_buf, spl_item_size, spl_num, spl_table_->get_score_amplifier(), spl_table_->get_average_score())) { free_resource(); return false; } printf("spelling tree construct successfully.\n"); // Convert the spelling string to idxs for (size_t i = 0; i < lemma_num_; i++) { for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; hz_pos++) { uint16 spl_idxs[2]; uint16 spl_start_pos[3]; bool is_pre = true; int spl_idx_num = spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos], strlen(lemma_arr_[i].pinyin_str[hz_pos]), spl_idxs, spl_start_pos, 2, is_pre); assert(1 == spl_idx_num); if (spl_trie.is_half_id(spl_idxs[0])) { uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs); assert(0 != num); } lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0]; } } // Sort the lemma items according to the hanzi, and give each unique item a // id sort_lemmas_by_hz(); scis_num_ = build_scis(); // Construct the dict list dict_trie->dict_list_ = new DictList(); bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_, lemma_arr_, lemma_num_); assert(dl_success); // Construct the NGram information NGram& ngram = NGram::get_instance(); ngram.build_unigram(lemma_arr_, lemma_num_, lemma_arr_[lemma_num_ - 1].idx_by_hz + 1); // sort the lemma items according to the spelling idx string myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py); get_top_lemmas(); #ifdef ___DO_STATISTICS___ stat_init(); #endif lma_nds_used_num_le0_ = 1; // The root node bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_), lemma_arr_, 0, lemma_num_, 0); if (!dt_success) { free_resource(); return false; } #ifdef ___DO_STATISTICS___ stat_print(); #endif // Move the node data and homo data to the DictTrie dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_]; dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_]; size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_; dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize]; assert(NULL != dict_trie->root_); assert(NULL != dict_trie->lma_idx_buf_); dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_; dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_; dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize; dict_trie->top_lmas_num_ = top_lmas_num_; memcpy(dict_trie->root_, lma_nodes_le0_, sizeof(LmaNodeLE0) * lma_nds_used_num_le0_); memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_, sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_); for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) { id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, homo_idx_buf_[pos]); } for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_; pos < lma_idx_num; pos++) { LemmaIdType idx = top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz; id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx); } if (kPrintDebug0) { printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_); printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_); printf("top_lmas_num_: %d\n", top_lmas_num_); } free_resource(); if (kPrintDebug0) { printf("Building dict succeds\n"); } return dt_success; } void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) { if (NULL == buf) return; for (size_t pos = 0; pos < kLemmaIdSize; pos++) { (buf)[pos] = (unsigned char)(id >> (pos * 8)); } } void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) { node->son_1st_off_l = static_cast<uint16>(offset); node->son_1st_off_h = static_cast<unsigned char>(offset >> 16); } void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) { node->homo_idx_buf_off_l = static_cast<uint16>(offset); node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16); } // All spelling strings will be converted to upper case, except that // spellings started with "ZH"/"CH"/"SH" will be converted to // "Zh"/"Ch"/"Sh" void DictBuilder::format_spelling_str(char *spl_str) { if (NULL == spl_str) return; uint16 pos = 0; while ('\0' != spl_str[pos]) { if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z') spl_str[pos] = spl_str[pos] - 'a' + 'A'; if (1 == pos && 'H' == spl_str[pos]) { if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) { spl_str[pos] = 'h'; } } pos++; } } LemmaIdType DictBuilder::sort_lemmas_by_hz() { if (NULL == lemma_arr_ || 0 == lemma_num_) return 0; myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs); lemma_arr_[0].idx_by_hz = 1; LemmaIdType idx_max = 1; for (size_t i = 1; i < lemma_num_; i++) { if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) { idx_max++; lemma_arr_[i].idx_by_hz = idx_max; } else { idx_max++; lemma_arr_[i].idx_by_hz = idx_max; } } return idx_max + 1; } size_t DictBuilder::build_scis() { if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_) return 0; SpellingTrie &spl_trie = SpellingTrie::get_instance(); // This first one is blank, because id 0 is invalid. scis_[0].freq = 0; scis_[0].hz = 0; scis_[0].splid.full_splid = 0; scis_[0].splid.half_splid = 0; scis_num_ = 1; // Copy the hanzis to the buffer for (size_t pos = 0; pos < lemma_num_; pos++) { size_t hz_num = lemma_arr_[pos].hz_str_len; for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos]; scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; scis_[scis_num_].splid.half_splid = spl_trie.full_to_half(scis_[scis_num_].splid.full_splid); if (1 == hz_num) scis_[scis_num_].freq = lemma_arr_[pos].freq; else scis_[scis_num_].freq = 0.000001; scis_num_++; } } myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq); // Remove repeated items size_t unique_scis_num = 1; for (size_t pos = 1; pos < scis_num_; pos++) { if (scis_[pos].hz == scis_[pos - 1].hz && scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid) continue; scis_[unique_scis_num] = scis_[pos]; scis_[unique_scis_num].splid.half_splid = spl_trie.full_to_half(scis_[pos].splid.full_splid); unique_scis_num++; } scis_num_ = unique_scis_num; // Update the lemma list. for (size_t pos = 0; pos < lemma_num_; pos++) { size_t hz_num = lemma_arr_[pos].hz_str_len; for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { SingleCharItem key; key.hz = lemma_arr_[pos].hanzi_str[hzpos]; key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid); SingleCharItem *found; found = static_cast<SingleCharItem*>(mybsearch(&key, scis_, unique_scis_num, sizeof(SingleCharItem), cmp_scis_hz_splid)); assert(found); lemma_arr_[pos].hanzi_scis_ids[hzpos] = static_cast<uint16>(found - scis_); lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid; } } return scis_num_; } bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr, size_t item_start, size_t item_end, size_t level) { if (level >= kMaxLemmaSize || item_end <= item_start) return false; // 1. Scan for how many sons size_t parent_son_num = 0; // LemmaNode *son_1st = NULL; // parent.num_of_son = 0; LemmaEntry *lma_last_start = lemma_arr_ + item_start; uint16 spl_idx_node = lma_last_start->spl_idx_arr[level]; // Scan for how many sons to be allocaed for (size_t i = item_start + 1; i< item_end; i++) { LemmaEntry *lma_current = lemma_arr + i; uint16 spl_idx_current = lma_current->spl_idx_arr[level]; if (spl_idx_current != spl_idx_node) { parent_son_num++; spl_idx_node = spl_idx_current; } } parent_son_num++; #ifdef ___DO_STATISTICS___ // Use to indicate whether all nodes of this layer have no son. bool allson_noson = true; assert(level < kMaxLemmaSize); if (parent_son_num > max_sonbuf_len_[level]) max_sonbuf_len_[level] = parent_son_num; total_son_num_[level] += parent_son_num; total_sonbuf_num_[level] += 1; if (parent_son_num == 1) sonbufs_num1_++; else sonbufs_numgt1_++; total_lma_node_num_ += parent_son_num; #endif // 2. Update the parent's information // Update the parent's son list; LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used. if (0 == level) { // the parent is root (static_cast<LmaNodeLE0*>(parent))->son_1st_off = lma_nds_used_num_le0_; son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_; lma_nds_used_num_le0_ += parent_son_num; assert(parent_son_num <= 65535); (static_cast<LmaNodeLE0*>(parent))->num_of_son = static_cast<uint16>(parent_son_num); } else if (1 == level) { // the parent is a son of root (static_cast<LmaNodeLE0*>(parent))->son_1st_off = lma_nds_used_num_ge1_; son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; lma_nds_used_num_ge1_ += parent_son_num; assert(parent_son_num <= 65535); (static_cast<LmaNodeLE0*>(parent))->num_of_son = static_cast<uint16>(parent_son_num); } else { set_son_offset((static_cast<LmaNodeGE1*>(parent)), lma_nds_used_num_ge1_); son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; lma_nds_used_num_ge1_ += parent_son_num; assert(parent_son_num <= 255); (static_cast<LmaNodeGE1*>(parent))->num_of_son = (unsigned char)parent_son_num; } // 3. Now begin to construct the son one by one size_t son_pos = 0; lma_last_start = lemma_arr_ + item_start; spl_idx_node = lma_last_start->spl_idx_arr[level]; size_t homo_num = 0; if (lma_last_start->spl_idx_arr[level + 1] == 0) homo_num = 1; size_t item_start_next = item_start; for (size_t i = item_start + 1; i < item_end; i++) { LemmaEntry* lma_current = lemma_arr_ + i; uint16 spl_idx_current = lma_current->spl_idx_arr[level]; if (spl_idx_current == spl_idx_node) { if (lma_current->spl_idx_arr[level + 1] == 0) homo_num++; } else { // Construct a node LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid LmaNodeGE1 *node_cur_ge1 = NULL; if (0 == level) { node_cur_le0 = son_1st_le0 + son_pos; node_cur_le0->spl_idx = spl_idx_node; node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; node_cur_le0->son_1st_off = 0; homo_idx_num_eq1_ += homo_num; } else { node_cur_ge1 = son_1st_ge1 + son_pos; node_cur_ge1->spl_idx = spl_idx_node; set_homo_id_buf_offset(node_cur_ge1, (homo_idx_num_eq1_ + homo_idx_num_gt1_)); set_son_offset(node_cur_ge1, 0); homo_idx_num_gt1_ += homo_num; } if (homo_num > 0) { LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + homo_idx_num_gt1_ - homo_num; if (0 == level) { assert(homo_num <= 65535); node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); } else { assert(homo_num <= 255); node_cur_ge1->num_of_homo = (unsigned char)homo_num; } for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz; } #ifdef ___DO_STATISTICS___ if (homo_num > max_homobuf_len_[level]) max_homobuf_len_[level] = homo_num; total_homo_num_[level] += homo_num; #endif } if (i - item_start_next > homo_num) { void *next_parent; if (0 == level) next_parent = static_cast<void*>(node_cur_le0); else next_parent = static_cast<void*>(node_cur_ge1); construct_subset(next_parent, lemma_arr, item_start_next + homo_num, i, level + 1); #ifdef ___DO_STATISTICS___ total_node_hasson_[level] += 1; allson_noson = false; #endif } // for the next son lma_last_start = lma_current; spl_idx_node = spl_idx_current; item_start_next = i; homo_num = 0; if (lma_current->spl_idx_arr[level + 1] == 0) homo_num = 1; son_pos++; } } // 4. The last one to construct LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid LmaNodeGE1 *node_cur_ge1 = NULL; if (0 == level) { node_cur_le0 = son_1st_le0 + son_pos; node_cur_le0->spl_idx = spl_idx_node; node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; node_cur_le0->son_1st_off = 0; homo_idx_num_eq1_ += homo_num; } else { node_cur_ge1 = son_1st_ge1 + son_pos; node_cur_ge1->spl_idx = spl_idx_node; set_homo_id_buf_offset(node_cur_ge1, (homo_idx_num_eq1_ + homo_idx_num_gt1_)); set_son_offset(node_cur_ge1, 0); homo_idx_num_gt1_ += homo_num; } if (homo_num > 0) { LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + homo_idx_num_gt1_ - homo_num; if (0 == level) { assert(homo_num <= 65535); node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); } else { assert(homo_num <= 255); node_cur_ge1->num_of_homo = (unsigned char)homo_num; } for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz; } #ifdef ___DO_STATISTICS___ if (homo_num > max_homobuf_len_[level]) max_homobuf_len_[level] = homo_num; total_homo_num_[level] += homo_num; #endif } if (item_end - item_start_next > homo_num) { void *next_parent; if (0 == level) next_parent = static_cast<void*>(node_cur_le0); else next_parent = static_cast<void*>(node_cur_ge1); construct_subset(next_parent, lemma_arr, item_start_next + homo_num, item_end, level + 1); #ifdef ___DO_STATISTICS___ total_node_hasson_[level] += 1; allson_noson = false; #endif } #ifdef ___DO_STATISTICS___ if (allson_noson) { total_sonbuf_allnoson_[level] += 1; total_node_in_sonbuf_allnoson_[level] += parent_son_num; } #endif assert(son_pos + 1 == parent_son_num); return true; } #ifdef ___DO_STATISTICS___ void DictBuilder::stat_init() { memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize); memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize); memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize); memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize); sonbufs_num1_ = 0; sonbufs_numgt1_ = 0; total_lma_node_num_ = 0; } void DictBuilder::stat_print() { printf("\n------------STAT INFO-------------\n"); printf("[root is layer -1]\n"); printf(".. max_sonbuf_len per layer(from layer 0):\n "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", max_sonbuf_len_[i]); printf("-, \n"); printf(".. max_homobuf_len per layer:\n -, "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", max_homobuf_len_[i]); printf("\n"); printf(".. total_son_num per layer:\n "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", total_son_num_[i]); printf("-, \n"); printf(".. total_node_hasson per layer:\n 1, "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", total_node_hasson_[i]); printf("\n"); printf(".. total_sonbuf_num per layer:\n "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", total_sonbuf_num_[i]); printf("-, \n"); printf(".. total_sonbuf_allnoson per layer:\n "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", total_sonbuf_allnoson_[i]); printf("-, \n"); printf(".. total_node_in_sonbuf_allnoson per layer:\n "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", total_node_in_sonbuf_allnoson_[i]); printf("-, \n"); printf(".. total_homo_num per layer:\n 0, "); for (size_t i = 0; i < kMaxLemmaSize; i++) printf("%d, ", total_homo_num_[i]); printf("\n"); printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_); printf(".. son buf allocation number with more than 1 son: %d\n", sonbufs_numgt1_); printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1); } #endif // ___DO_STATISTICS___ #endif // ___BUILD_MODEL___ } // namespace ime_pinyin