/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "../include/userdict.h" #include "../include/splparser.h" #include "../include/ngram.h" #include <stdio.h> #include <string.h> #include <stdlib.h> #include <cutils/log.h> #include <unistd.h> #include <fcntl.h> #include <sys/stat.h> #include <assert.h> #include <ctype.h> #include <sys/types.h> #include <sys/time.h> #include <time.h> #include <pthread.h> #include <math.h> namespace ime_pinyin { #ifdef ___DEBUG_PERF___ static uint64 _ellapse_ = 0; static struct timeval _tv_start_, _tv_end_; #define DEBUG_PERF_BEGIN \ do { \ gettimeofday(&_tv_start_, NULL); \ } while(0) #define DEBUG_PERF_END \ do { \ gettimeofday(&_tv_end_, NULL); \ _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \ (_tv_end_.tv_usec - _tv_start_.tv_usec); \ } while(0) #define LOGD_PERF(message) \ ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_); #else #define DEBUG_PERF_BEGIN #define DEBUG_PERF_END #define LOGD_PERF(message) #endif // XXX File load and write are thread-safe by g_mutex_ static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER; static struct timeval g_last_update_ = {0, 0}; inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) { return (4 + info->lemma_size + (info->lemma_count << 3) #ifdef ___PREDICT_ENABLED___ + (info->lemma_count << 2) #endif #ifdef ___SYNC_ENABLED___ + (info->sync_count << 2) #endif + sizeof(*info)); } inline LmaScoreType UserDict::translate_score(int raw_score) { // 1) ori_freq: original user frequency uint32 ori_freq = extract_score_freq(raw_score); // 2) lmt_off: lmt index (week offset for example) uint64 lmt_off = ((raw_score & 0xffff0000) >> 16); if (kUserDictLMTBitWidth < 16) { uint64 mask = ~(1 << kUserDictLMTBitWidth); lmt_off &= mask; } // 3) now_off: current time index (current week offset for example) // assuming load_time_ is around current time uint64 now_off = load_time_.tv_sec; now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity; now_off = (now_off << (64 - kUserDictLMTBitWidth)); now_off = (now_off >> (64 - kUserDictLMTBitWidth)); // 4) factor: decide expand-factor int delta = now_off - lmt_off; if (delta > 4) delta = 4; int factor = 80 - (delta << 4); double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_); return (LmaScoreType)(log((double)factor * (double)ori_freq / tf) * NGram::kLogValueAmplifier); } inline int UserDict::extract_score_freq(int raw_score) { // Frequence stored in lowest 16 bits int freq = (raw_score & 0x0000ffff); return freq; } inline uint64 UserDict::extract_score_lmt(int raw_score) { uint64 lmt = ((raw_score & 0xffff0000) >> 16); if (kUserDictLMTBitWidth < 16) { uint64 mask = ~(1 << kUserDictLMTBitWidth); lmt &= mask; } lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince; return lmt; } inline int UserDict::build_score(uint64 lmt, int freq) { lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity; lmt = (lmt << (64 - kUserDictLMTBitWidth)); lmt = (lmt >> (64 - kUserDictLMTBitWidth)); uint16 lmt16 = (uint16)lmt; int s = freq; s &= 0x0000ffff; s = (lmt16 << 16) | s; return s; } inline int64 UserDict::utf16le_atoll(uint16 *s, int len) { int64 ret = 0; if (len <= 0) return ret; int flag = 1; const uint16 * endp = s + len; if (*s == '-') { flag = -1; s++; } else if (*s == '+') { s++; } while (*s >= '0' && *s <= '9' && s < endp) { ret += ret * 10 + (*s) - '0'; s++; } return ret * flag; } inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) { if (!s || size <= 0) return 0; uint16 *endp = s + size; int ret_len = 0; if (v < 0) { *(s++) = '-'; ++ret_len; v *= -1; } uint16 *b = s; while (s < endp && v != 0) { *(s++) = '0' + (v % 10); v = v / 10; ++ret_len; } if (v != 0) return 0; --s; while (b < s) { *b = *s; ++b, --s; } return ret_len; } inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) { offset &= kUserDictOffsetMask; lemmas_[offset] |= flag; } inline char UserDict::get_lemma_flag(uint32 offset) { offset &= kUserDictOffsetMask; return (char)(lemmas_[offset]); } inline char UserDict::get_lemma_nchar(uint32 offset) { offset &= kUserDictOffsetMask; return (char)(lemmas_[offset + 1]); } inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) { offset &= kUserDictOffsetMask; return (uint16 *)(lemmas_ + offset + 2); } inline uint16 * UserDict::get_lemma_word(uint32 offset) { offset &= kUserDictOffsetMask; uint8 nchar = get_lemma_nchar(offset); return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1)); } inline LemmaIdType UserDict::get_max_lemma_id() { // When a lemma is deleted, we don't not claim its id back for // simplicity and performance return start_id_ + dict_info_.lemma_count - 1; } inline bool UserDict::is_valid_lemma_id(LemmaIdType id) { if (id >= start_id_ && id <= get_max_lemma_id()) return true; return false; } inline bool UserDict::is_valid_state() { if (state_ == USER_DICT_NONE) return false; return true; } UserDict::UserDict() : start_id_(0), version_(0), lemmas_(NULL), offsets_(NULL), scores_(NULL), ids_(NULL), #ifdef ___PREDICT_ENABLED___ predicts_(NULL), #endif #ifdef ___SYNC_ENABLED___ syncs_(NULL), sync_count_size_(0), #endif offsets_by_id_(NULL), lemma_count_left_(0), lemma_size_left_(0), dict_file_(NULL), state_(USER_DICT_NONE) { memset(&dict_info_, 0, sizeof(dict_info_)); memset(&load_time_, 0, sizeof(load_time_)); #ifdef ___CACHE_ENABLED___ cache_init(); #endif } UserDict::~UserDict() { close_dict(); } bool UserDict::load_dict(const char *file_name, LemmaIdType start_id, LemmaIdType end_id) { #ifdef ___DEBUG_PERF___ DEBUG_PERF_BEGIN; #endif dict_file_ = strdup(file_name); if (!dict_file_) return false; start_id_ = start_id; if (false == validate(file_name) && false == reset(file_name)) { goto error; } if (false == load(file_name, start_id)) { goto error; } state_ = USER_DICT_SYNC; gettimeofday(&load_time_, NULL); #ifdef ___DEBUG_PERF___ DEBUG_PERF_END; LOGD_PERF("load_dict"); #endif return true; error: free((void*)dict_file_); start_id_ = 0; return false; } bool UserDict::close_dict() { if (state_ == USER_DICT_NONE) return true; if (state_ == USER_DICT_SYNC) goto out; // If dictionary is written back by others, // we can not simply write back here // To do a safe flush, we have to discard all newly added // lemmas and try to reload dict file. pthread_mutex_lock(&g_mutex_); if (load_time_.tv_sec > g_last_update_.tv_sec || (load_time_.tv_sec == g_last_update_.tv_sec && load_time_.tv_usec > g_last_update_.tv_usec)) { write_back(); gettimeofday(&g_last_update_, NULL); } pthread_mutex_unlock(&g_mutex_); out: free((void*)dict_file_); free(lemmas_); free(offsets_); free(offsets_by_id_); free(scores_); free(ids_); #ifdef ___PREDICT_ENABLED___ free(predicts_); #endif version_ = 0; dict_file_ = NULL; lemmas_ = NULL; #ifdef ___SYNC_ENABLED___ syncs_ = NULL; sync_count_size_ = 0; #endif offsets_ = NULL; offsets_by_id_ = NULL; scores_ = NULL; ids_ = NULL; #ifdef ___PREDICT_ENABLED___ predicts_ = NULL; #endif memset(&dict_info_, 0, sizeof(dict_info_)); lemma_count_left_ = 0; lemma_size_left_ = 0; state_ = USER_DICT_NONE; return true; } size_t UserDict::number_of_lemmas() { return dict_info_.lemma_count; } void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { return; } MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle, const DictExtPara *dep, LmaPsbItem *lpi_items, size_t lpi_max, size_t *lpi_num) { if (is_valid_state() == false) return 0; bool need_extend = false; #ifdef ___DEBUG_PERF___ DEBUG_PERF_BEGIN; #endif *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1, lpi_items, lpi_max, &need_extend); #ifdef ___DEBUG_PERF___ DEBUG_PERF_END; LOGD_PERF("extend_dict"); #endif return ((*lpi_num > 0 || need_extend) ? 1 : 0); } int UserDict::is_fuzzy_prefix_spell_id( const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { if (len1 < searchable->splids_len) return 0; SpellingTrie &spl_trie = SpellingTrie::get_instance(); uint32 i = 0; for (i = 0; i < searchable->splids_len; i++) { const char py1 = *spl_trie.get_spelling_str(id1[i]); uint16 off = 8 * (i % 4); const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); if (py1 == py2) continue; return 0; } return 1; } int UserDict::fuzzy_compare_spell_id( const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { if (len1 < searchable->splids_len) return -1; if (len1 > searchable->splids_len) return 1; SpellingTrie &spl_trie = SpellingTrie::get_instance(); uint32 i = 0; for (i = 0; i < len1; i++) { const char py1 = *spl_trie.get_spelling_str(id1[i]); uint16 off = 8 * (i % 4); const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); if (py1 == py2) continue; if (py1 > py2) return 1; return -1; } return 0; } bool UserDict::is_prefix_spell_id( const uint16 * fullids, uint16 fulllen, const UserDictSearchable *searchable) { if (fulllen < searchable->splids_len) return false; uint32 i = 0; for (; i < searchable->splids_len; i++) { uint16 start_id = searchable->splid_start[i]; uint16 count = searchable->splid_count[i]; if (fullids[i] >= start_id && fullids[i] < start_id + count) continue; else return false; } return true; } bool UserDict::equal_spell_id( const uint16 * fullids, uint16 fulllen, const UserDictSearchable *searchable) { if (fulllen != searchable->splids_len) return false; uint32 i = 0; for (; i < fulllen; i++) { uint16 start_id = searchable->splid_start[i]; uint16 count = searchable->splid_count[i]; if (fullids[i] >= start_id && fullids[i] < start_id + count) continue; else return false; } return true; } int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) { int32 begin = 0; int32 end = dict_info_.lemma_count - 1; int32 middle = -1; int32 first_prefix = middle; int32 last_matched = middle; while (begin <= end) { middle = (begin + end) >> 1; uint32 offset = offsets_[middle]; uint8 nchar = get_lemma_nchar(offset); const uint16 * splids = get_lemma_spell_ids(offset); int cmp = fuzzy_compare_spell_id(splids, nchar, searchable); int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable); if (pre) first_prefix = middle; if (cmp < 0) { begin = middle + 1; } else if (cmp > 0) { end = middle - 1; } else { end = middle - 1; last_matched = middle; } } return first_prefix; } void UserDict::prepare_locate(UserDictSearchable *searchable, const uint16 *splid_str, uint16 splid_str_len) { searchable->splids_len = splid_str_len; memset(searchable->signature, 0, sizeof(searchable->signature)); SpellingTrie &spl_trie = SpellingTrie::get_instance(); uint32 i = 0; for (; i < splid_str_len; i++) { if (spl_trie.is_half_id(splid_str[i])) { searchable->splid_count[i] = spl_trie.half_to_full(splid_str[i], &(searchable->splid_start[i])); } else { searchable->splid_count[i] = 1; searchable->splid_start[i] = splid_str[i]; } const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]); searchable->signature[i>>2] |= (py << (8 * (i % 4))); } } size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max) { return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL); } size_t UserDict::_get_lpis(const uint16 *splid_str, uint16 splid_str_len, LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend) { bool tmp_extend; if (!need_extend) need_extend = &tmp_extend; *need_extend = false; if (is_valid_state() == false) return 0; if (lpi_max <= 0) return 0; if (0 == pthread_mutex_trylock(&g_mutex_)) { if (load_time_.tv_sec < g_last_update_.tv_sec || (load_time_.tv_sec == g_last_update_.tv_sec && load_time_.tv_usec < g_last_update_.tv_usec)) { // Others updated disk file, have to reload pthread_mutex_unlock(&g_mutex_); flush_cache(); } else { pthread_mutex_unlock(&g_mutex_); } } else { } UserDictSearchable searchable; prepare_locate(&searchable, splid_str, splid_str_len); uint32 max_off = dict_info_.lemma_count; #ifdef ___CACHE_ENABLED___ int32 middle; uint32 start, count; bool cached = cache_hit(&searchable, &start, &count); if (cached) { middle = start; max_off = start + count; } else { middle = locate_first_in_offsets(&searchable); start = middle; } #else int32 middle = locate_first_in_offsets(&searchable); #endif if (middle == -1) { #ifdef ___CACHE_ENABLED___ if (!cached) cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0); #endif return 0; } size_t lpi_current = 0; bool fuzzy_break = false; bool prefix_break = false; while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) { if (lpi_current >= lpi_max) break; uint32 offset = offsets_[middle]; // Ignore deleted lemmas if (offset & kUserDictOffsetFlagRemove) { middle++; continue; } uint8 nchar = get_lemma_nchar(offset); uint16 * splids = get_lemma_spell_ids(offset); #ifdef ___CACHE_ENABLED___ if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { #else if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { #endif fuzzy_break = true; } if (prefix_break == false) { if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) { if (*need_extend == false && is_prefix_spell_id(splids, nchar, &searchable)) { *need_extend = true; } } else { prefix_break = true; } } if (equal_spell_id(splids, nchar, &searchable) == true) { lpi_items[lpi_current].psb = translate_score(scores_[middle]); lpi_items[lpi_current].id = ids_[middle]; lpi_items[lpi_current].lma_len = nchar; lpi_current++; } middle++; } #ifdef ___CACHE_ENABLED___ if (!cached) { count = middle - start; cache_push(USER_DICT_CACHE, &searchable, start, count); } #endif return lpi_current; } uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf, uint16 str_max) { if (is_valid_state() == false) return 0; if (is_valid_lemma_id(id_lemma) == false) return 0; uint32 offset = offsets_by_id_[id_lemma - start_id_]; uint8 nchar = get_lemma_nchar(offset); char16 * str = get_lemma_word(offset); uint16 m = nchar < str_max -1 ? nchar : str_max - 1; int i = 0; for (; i < m; i++) { str_buf[i] = str[i]; } str_buf[i] = 0; return m; } uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, uint16 splids_max, bool arg_valid) { if (is_valid_lemma_id(id_lemma) == false) return 0; uint32 offset = offsets_by_id_[id_lemma - start_id_]; uint8 nchar = get_lemma_nchar(offset); const uint16 * ids = get_lemma_spell_ids(offset); int i = 0; for (; i < nchar && i < splids_max; i++) splids[i] = ids[i]; return i; } size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len, NPredictItem *npre_items, size_t npre_max, size_t b4_used) { uint32 new_added = 0; #ifdef ___PREDICT_ENABLED___ int32 end = dict_info_.lemma_count - 1; int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len); if (j == -1) return 0; while (j <= end) { uint32 offset = predicts_[j]; // Ignore deleted lemmas if (offset & kUserDictOffsetFlagRemove) { j++; continue; } uint32 nchar = get_lemma_nchar(offset); uint16 * words = get_lemma_word(offset); uint16 * splids = get_lemma_spell_ids(offset); if (nchar <= hzs_len) { j++; continue; } if (memcmp(words, last_hzs, hzs_len << 1) == 0) { if (new_added >= npre_max) { return new_added; } uint32 cpy_len = (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1)) - (hzs_len << 1); npre_items[new_added].his_len = hzs_len; npre_items[new_added].psb = get_lemma_score(words, splids, nchar); memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len); if ((cpy_len >> 1) < kMaxPredictSize) { npre_items[new_added].pre_hzs[cpy_len >> 1] = 0; } new_added++; } else { break; } j++; } #endif return new_added; } int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[], uint16 lemma_len) { int32 max_off = dict_info_.lemma_count; UserDictSearchable searchable; prepare_locate(&searchable, splid_str, lemma_len); #ifdef ___CACHE_ENABLED___ int32 off; uint32 start, count; bool cached = load_cache(&searchable, &start, &count); if (cached) { off = start; max_off = start + count; } else { off = locate_first_in_offsets(&searchable); start = off; } #else int32 off = locate_first_in_offsets(&searchable); #endif if (off == -1) { return off; } while (off < max_off) { uint32 offset = offsets_[off]; if (offset & kUserDictOffsetFlagRemove) { off++; continue; } uint16 * splids = get_lemma_spell_ids(offset); #ifdef ___CACHE_ENABLED___ if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) break; #else if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) break; #endif if (equal_spell_id(splids, lemma_len, &searchable) == true) { uint16 * str = get_lemma_word(offset); uint32 i = 0; for (i = 0; i < lemma_len; i++) { if (str[i] == lemma_str[i]) continue; break; } if (i < lemma_len) { off++; continue; } #ifdef ___CACHE_ENABLED___ // No need to save_cache here, since current function is invoked by // put_lemma. It's rarely possible for a user input same lemma twice. // That means first time user type a new lemma, it is newly added into // user dictionary, then it's possible that user type the same lemma // again. // Another reason save_cache can not be invoked here is this function // aborts when lemma is found, and it never knows the count. #endif return off; } off++; } return -1; } #ifdef ___PREDICT_ENABLED___ uint32 UserDict::locate_where_to_insert_in_predicts( const uint16 * words, int lemma_len) { int32 begin = 0; int32 end = dict_info_.lemma_count - 1; int32 middle = end; uint32 last_matched = middle; while (begin <= end) { middle = (begin + end) >> 1; uint32 offset = offsets_[middle]; uint8 nchar = get_lemma_nchar(offset); const uint16 * ws = get_lemma_word(offset); uint32 minl = nchar < lemma_len ? nchar : lemma_len; uint32 k = 0; int cmp = 0; for (; k < minl; k++) { if (ws[k] < words[k]) { cmp = -1; break; } else if (ws[k] > words[k]) { cmp = 1; break; } } if (cmp == 0) { if (nchar < lemma_len) cmp = -1; else if (nchar > lemma_len) cmp = 1; } if (cmp < 0) { begin = middle + 1; last_matched = middle; } else if (cmp > 0) { end = middle - 1; } else { end = middle - 1; last_matched = middle; } } return last_matched; } int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) { int32 begin = 0; int32 end = dict_info_.lemma_count - 1; int32 middle = -1; int32 last_matched = middle; while (begin <= end) { middle = (begin + end) >> 1; uint32 offset = offsets_[middle]; uint8 nchar = get_lemma_nchar(offset); const uint16 * ws = get_lemma_word(offset); uint32 minl = nchar < lemma_len ? nchar : lemma_len; uint32 k = 0; int cmp = 0; for (; k < minl; k++) { if (ws[k] < words[k]) { cmp = -1; break; } else if (ws[k] > words[k]) { cmp = 1; break; } } if (cmp == 0) { if (nchar >= lemma_len) last_matched = middle; if (nchar < lemma_len) cmp = -1; else if (nchar > lemma_len) cmp = 1; } if (cmp < 0) { begin = middle + 1; } else if (cmp > 0) { end = middle - 1; } else { end = middle - 1; } } return last_matched; } #endif LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { int32 off = locate_in_offsets(lemma_str, splids, lemma_len); if (off == -1) { return 0; } return ids_[off]; } LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) { if (is_valid_state() == false) return 0; if (is_valid_lemma_id(lemma_id) == false) return 0; return translate_score(_get_lemma_score(lemma_id)); } LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { if (is_valid_state() == false) return 0; return translate_score(_get_lemma_score(lemma_str, splids, lemma_len)); } int UserDict::_get_lemma_score(LemmaIdType lemma_id) { if (is_valid_state() == false) return 0; if (is_valid_lemma_id(lemma_id) == false) return 0; uint32 offset = offsets_by_id_[lemma_id - start_id_]; uint32 nchar = get_lemma_nchar(offset); uint16 * spl = get_lemma_spell_ids(offset); uint16 * wrd = get_lemma_word(offset); int32 off = locate_in_offsets(wrd, spl, nchar); if (off == -1) { return 0; } return scores_[off]; } int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len) { if (is_valid_state() == false) return 0; int32 off = locate_in_offsets(lemma_str, splids, lemma_len); if (off == -1) { return 0; } return scores_[off]; } #ifdef ___SYNC_ENABLED___ void UserDict::remove_lemma_from_sync_list(uint32 offset) { offset &= kUserDictOffsetMask; uint32 i = 0; for (; i < dict_info_.sync_count; i++) { unsigned int off = (syncs_[i] & kUserDictOffsetMask); if (off == offset) break; } if (i < dict_info_.sync_count) { syncs_[i] = syncs_[dict_info_.sync_count - 1]; dict_info_.sync_count--; } } #endif #ifdef ___PREDICT_ENABLED___ void UserDict::remove_lemma_from_predict_list(uint32 offset) { offset &= kUserDictOffsetMask; uint32 i = 0; for (; i < dict_info_.lemma_count; i++) { unsigned int off = (predicts_[i] & kUserDictOffsetMask); if (off == offset) { predicts_[i] |= kUserDictOffsetFlagRemove; break; } } } #endif bool UserDict::remove_lemma_by_offset_index(int offset_index) { if (is_valid_state() == false) return 0; int32 off = offset_index; if (off == -1) { return false; } uint32 offset = offsets_[off]; uint32 nchar = get_lemma_nchar(offset); offsets_[off] |= kUserDictOffsetFlagRemove; #ifdef ___SYNC_ENABLED___ // Remove corresponding sync item remove_lemma_from_sync_list(offset); #endif #ifdef ___PREDICT_ENABLED___ remove_lemma_from_predict_list(offset); #endif dict_info_.free_count++; dict_info_.free_size += (2 + (nchar << 2)); if (state_ < USER_DICT_OFFSET_DIRTY) state_ = USER_DICT_OFFSET_DIRTY; return true; } bool UserDict::remove_lemma(LemmaIdType lemma_id) { if (is_valid_state() == false) return 0; if (is_valid_lemma_id(lemma_id) == false) return false; uint32 offset = offsets_by_id_[lemma_id - start_id_]; uint32 nchar = get_lemma_nchar(offset); uint16 * spl = get_lemma_spell_ids(offset); uint16 * wrd = get_lemma_word(offset); int32 off = locate_in_offsets(wrd, spl, nchar); return remove_lemma_by_offset_index(off); } void UserDict::flush_cache() { LemmaIdType start_id = start_id_; const char * file = strdup(dict_file_); if (!file) return; close_dict(); load_dict(file, start_id, kUserDictIdEnd); free((void*)file); #ifdef ___CACHE_ENABLED___ cache_init(); #endif return; } bool UserDict::reset(const char *file) { FILE *fp = fopen(file, "w+"); if (!fp) { return false; } uint32 version = kUserDictVersion; size_t wred = fwrite(&version, 1, 4, fp); UserDictInfo info; memset(&info, 0, sizeof(info)); // By default, no limitation for lemma count and size // thereby, reclaim_ratio is never used wred += fwrite(&info, 1, sizeof(info), fp); if (wred != sizeof(info) + sizeof(version)) { fclose(fp); unlink(file); return false; } fclose(fp); return true; } bool UserDict::validate(const char *file) { // b is ignored in POSIX compatible os including Linux // while b is important flag for Windows to specify binary mode FILE *fp = fopen(file, "rb"); if (!fp) { return false; } size_t size; size_t readed; uint32 version; UserDictInfo dict_info; // validate int err = fseek(fp, 0, SEEK_END); if (err) { goto error; } size = ftell(fp); if (size < 4 + sizeof(dict_info)) { goto error; } err = fseek(fp, 0, SEEK_SET); if (err) { goto error; } readed = fread(&version, 1, sizeof(version), fp); if (readed < sizeof(version)) { goto error; } if (version != kUserDictVersion) { goto error; } err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); if (err) { goto error; } readed = fread(&dict_info, 1, sizeof(dict_info), fp); if (readed != sizeof(dict_info)) { goto error; } if (size != get_dict_file_size(&dict_info)) { goto error; } fclose(fp); return true; error: fclose(fp); return false; } bool UserDict::load(const char *file, LemmaIdType start_id) { if (0 != pthread_mutex_trylock(&g_mutex_)) { return false; } // b is ignored in POSIX compatible os including Linux // while b is important flag for Windows to specify binary mode FILE *fp = fopen(file, "rb"); if (!fp) { pthread_mutex_unlock(&g_mutex_); return false; } size_t readed, toread; UserDictInfo dict_info; uint8 *lemmas = NULL; uint32 *offsets = NULL; #ifdef ___SYNC_ENABLED___ uint32 *syncs = NULL; #endif uint32 *scores = NULL; uint32 *ids = NULL; uint32 *offsets_by_id = NULL; #ifdef ___PREDICT_ENABLED___ uint32 *predicts = NULL; #endif size_t i; int err; err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); if (err) goto error; readed = fread(&dict_info, 1, sizeof(dict_info), fp); if (readed != sizeof(dict_info)) goto error; lemmas = (uint8 *)malloc( dict_info.lemma_size + (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)))); if (!lemmas) goto error; offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); if (!offsets) goto error; #ifdef ___PREDICT_ENABLED___ predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); if (!predicts) goto error; #endif #ifdef ___SYNC_ENABLED___ syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2); if (!syncs) goto error; #endif scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); if (!scores) goto error; ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); if (!ids) goto error; offsets_by_id = (uint32 *)malloc( (dict_info.lemma_count + kUserDictPreAlloc) << 2); if (!offsets_by_id) goto error; err = fseek(fp, 4, SEEK_SET); if (err) goto error; readed = 0; while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) { readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp); } if (readed < dict_info.lemma_size) goto error; toread = (dict_info.lemma_count << 2); readed = 0; while (readed < toread && !ferror(fp) && !feof(fp)) { readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp); } if (readed < toread) goto error; #ifdef ___PREDICT_ENABLED___ toread = (dict_info.lemma_count << 2); readed = 0; while (readed < toread && !ferror(fp) && !feof(fp)) { readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp); } if (readed < toread) goto error; #endif readed = 0; while (readed < toread && !ferror(fp) && !feof(fp)) { readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp); } if (readed < toread) goto error; #ifdef ___SYNC_ENABLED___ toread = (dict_info.sync_count << 2); readed = 0; while (readed < toread && !ferror(fp) && !feof(fp)) { readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp); } if (readed < toread) goto error; #endif for (i = 0; i < dict_info.lemma_count; i++) { ids[i] = start_id + i; offsets_by_id[i] = offsets[i]; } lemmas_ = lemmas; offsets_ = offsets; #ifdef ___SYNC_ENABLED___ syncs_ = syncs; sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc; #endif offsets_by_id_ = offsets_by_id; scores_ = scores; ids_ = ids; #ifdef ___PREDICT_ENABLED___ predicts_ = predicts; #endif lemma_count_left_ = kUserDictPreAlloc; lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)); memcpy(&dict_info_, &dict_info, sizeof(dict_info)); state_ = USER_DICT_SYNC; fclose(fp); pthread_mutex_unlock(&g_mutex_); return true; error: if (lemmas) free(lemmas); if (offsets) free(offsets); #ifdef ___SYNC_ENABLED___ if (syncs) free(syncs); #endif if (scores) free(scores); if (ids) free(ids); if (offsets_by_id) free(offsets_by_id); #ifdef ___PREDICT_ENABLED___ if (predicts) free(predicts); #endif fclose(fp); pthread_mutex_unlock(&g_mutex_); return false; } void UserDict::write_back() { // XXX write back is only allowed from close_dict due to thread-safe sake if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC) return; int fd = open(dict_file_, O_WRONLY); if (fd == -1) return; switch (state_) { case USER_DICT_DEFRAGMENTED: write_back_all(fd); break; case USER_DICT_LEMMA_DIRTY: write_back_lemma(fd); break; case USER_DICT_OFFSET_DIRTY: write_back_offset(fd); break; case USER_DICT_SCORE_DIRTY: write_back_score(fd); break; #ifdef ___SYNC_ENABLED___ case USER_DICT_SYNC_DIRTY: write_back_sync(fd); break; #endif default: break; } // It seems truncate is not need on Linux, Windows except Mac // I am doing it here anyway for safety. off_t cur = lseek(fd, 0, SEEK_CUR); ftruncate(fd, cur); close(fd); state_ = USER_DICT_SYNC; } #ifdef ___SYNC_ENABLED___ void UserDict::write_back_sync(int fd) { int err = lseek(fd, 4 + dict_info_.lemma_size + (dict_info_.lemma_count << 3) #ifdef ___PREDICT_ENABLED___ + (dict_info_.lemma_count << 2) #endif , SEEK_SET); if (err == -1) return; write(fd, syncs_, dict_info_.sync_count << 2); write(fd, &dict_info_, sizeof(dict_info_)); } #endif void UserDict::write_back_offset(int fd) { int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET); if (err == -1) return; write(fd, offsets_, dict_info_.lemma_count << 2); #ifdef ___PREDICT_ENABLED___ write(fd, predicts_, dict_info_.lemma_count << 2); #endif write(fd, scores_, dict_info_.lemma_count << 2); #ifdef ___SYNC_ENABLED___ write(fd, syncs_, dict_info_.sync_count << 2); #endif write(fd, &dict_info_, sizeof(dict_info_)); } void UserDict::write_back_score(int fd) { int err = lseek(fd, 4 + dict_info_.lemma_size + (dict_info_.lemma_count << 2) #ifdef ___PREDICT_ENABLED___ + (dict_info_.lemma_count << 2) #endif , SEEK_SET); if (err == -1) return; write(fd, scores_, dict_info_.lemma_count << 2); #ifdef ___SYNC_ENABLED___ write(fd, syncs_, dict_info_.sync_count << 2); #endif write(fd, &dict_info_, sizeof(dict_info_)); } void UserDict::write_back_lemma(int fd) { int err = lseek(fd, 4, SEEK_SET); if (err == -1) return; // New lemmas are always appended, no need to write whole lemma block size_t need_write = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_; err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR); if (err == -1) return; write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write); write(fd, offsets_, dict_info_.lemma_count << 2); #ifdef ___PREDICT_ENABLED___ write(fd, predicts_, dict_info_.lemma_count << 2); #endif write(fd, scores_, dict_info_.lemma_count << 2); #ifdef ___SYNC_ENABLED___ write(fd, syncs_, dict_info_.sync_count << 2); #endif write(fd, &dict_info_, sizeof(dict_info_)); } void UserDict::write_back_all(int fd) { // XXX lemma_size is handled differently in writeall // and writelemma. I update lemma_size and lemma_count in different // places for these two cases. Should fix it to make it consistent. int err = lseek(fd, 4, SEEK_SET); if (err == -1) return; write(fd, lemmas_, dict_info_.lemma_size); write(fd, offsets_, dict_info_.lemma_count << 2); #ifdef ___PREDICT_ENABLED___ write(fd, predicts_, dict_info_.lemma_count << 2); #endif write(fd, scores_, dict_info_.lemma_count << 2); #ifdef ___SYNC_ENABLED___ write(fd, syncs_, dict_info_.sync_count << 2); #endif write(fd, &dict_info_, sizeof(dict_info_)); } #ifdef ___CACHE_ENABLED___ bool UserDict::load_cache(UserDictSearchable *searchable, uint32 *offset, uint32 *length) { UserDictCache *cache = &caches_[searchable->splids_len - 1]; if (cache->head == cache->tail) return false; uint16 j, sig_len = kMaxLemmaSize / 4; uint16 i = cache->head; while (1) { j = 0; for (; j < sig_len; j++) { if (cache->signatures[i][j] != searchable->signature[j]) break; } if (j < sig_len) { i++; if (i >= kUserDictCacheSize) i -= kUserDictCacheSize; if (i == cache->tail) break; continue; } *offset = cache->offsets[i]; *length = cache->lengths[i]; return true; } return false; } void UserDict::save_cache(UserDictSearchable *searchable, uint32 offset, uint32 length) { UserDictCache *cache = &caches_[searchable->splids_len - 1]; uint16 next = cache->tail; cache->offsets[next] = offset; cache->lengths[next] = length; uint16 sig_len = kMaxLemmaSize / 4; uint16 j = 0; for (; j < sig_len; j++) { cache->signatures[next][j] = searchable->signature[j]; } if (++next >= kUserDictCacheSize) { next -= kUserDictCacheSize; } if (next == cache->head) { cache->head++; if (cache->head >= kUserDictCacheSize) { cache->head -= kUserDictCacheSize; } } cache->tail = next; } void UserDict::reset_cache() { memset(caches_, 0, sizeof(caches_)); } bool UserDict::load_miss_cache(UserDictSearchable *searchable) { UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; if (cache->head == cache->tail) return false; uint16 j, sig_len = kMaxLemmaSize / 4; uint16 i = cache->head; while (1) { j = 0; for (; j < sig_len; j++) { if (cache->signatures[i][j] != searchable->signature[j]) break; } if (j < sig_len) { i++; if (i >= kUserDictMissCacheSize) i -= kUserDictMissCacheSize; if (i == cache->tail) break; continue; } return true; } return false; } void UserDict::save_miss_cache(UserDictSearchable *searchable) { UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; uint16 next = cache->tail; uint16 sig_len = kMaxLemmaSize / 4; uint16 j = 0; for (; j < sig_len; j++) { cache->signatures[next][j] = searchable->signature[j]; } if (++next >= kUserDictMissCacheSize) { next -= kUserDictMissCacheSize; } if (next == cache->head) { cache->head++; if (cache->head >= kUserDictMissCacheSize) { cache->head -= kUserDictMissCacheSize; } } cache->tail = next; } void UserDict::reset_miss_cache() { memset(miss_caches_, 0, sizeof(miss_caches_)); } void UserDict::cache_init() { reset_cache(); reset_miss_cache(); } bool UserDict::cache_hit(UserDictSearchable *searchable, uint32 *offset, uint32 *length) { bool hit = load_miss_cache(searchable); if (hit) { *offset = 0; *length = 0; return true; } hit = load_cache(searchable, offset, length); if (hit) { return true; } return false; } void UserDict::cache_push(UserDictCacheType type, UserDictSearchable *searchable, uint32 offset, uint32 length) { switch (type) { case USER_DICT_MISS_CACHE: save_miss_cache(searchable); break; case USER_DICT_CACHE: save_cache(searchable, offset, length); break; default: break; } } #endif void UserDict::defragment(void) { #ifdef ___DEBUG_PERF___ DEBUG_PERF_BEGIN; #endif if (is_valid_state() == false) return; // Fixup offsets_, set REMOVE flag to lemma's flag if needed size_t first_freed = 0; size_t first_inuse = 0; while (first_freed < dict_info_.lemma_count) { // Find first freed offset while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 && first_freed < dict_info_.lemma_count) { first_freed++; } if (first_freed < dict_info_.lemma_count) { // Save REMOVE flag to lemma flag int off = offsets_[first_freed]; set_lemma_flag(off, kUserDictLemmaFlagRemove); } else { break; } // Find first inuse offse after first_freed first_inuse = first_freed + 1; while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) && (first_inuse < dict_info_.lemma_count)) { // Save REMOVE flag to lemma flag int off = offsets_[first_inuse]; set_lemma_flag(off, kUserDictLemmaFlagRemove); first_inuse++; } if (first_inuse >= dict_info_.lemma_count) { break; } // Swap offsets_ int tmp = offsets_[first_inuse]; offsets_[first_inuse] = offsets_[first_freed]; offsets_[first_freed] = tmp; // Move scores_, no need to swap tmp = scores_[first_inuse]; scores_[first_inuse] = scores_[first_freed]; scores_[first_freed] = tmp; // Swap ids_ LemmaIdType tmpid = ids_[first_inuse]; ids_[first_inuse] = ids_[first_freed]; ids_[first_freed] = tmpid; // Go on first_freed++; } #ifdef ___PREDICT_ENABLED___ // Fixup predicts_ first_freed = 0; first_inuse = 0; while (first_freed < dict_info_.lemma_count) { // Find first freed offset while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 && first_freed < dict_info_.lemma_count) { first_freed++; } if (first_freed >= dict_info_.lemma_count) break; // Find first inuse offse after first_freed first_inuse = first_freed + 1; while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove) && (first_inuse < dict_info_.lemma_count)) { first_inuse++; } if (first_inuse >= dict_info_.lemma_count) { break; } // Swap offsets_ int tmp = predicts_[first_inuse]; predicts_[first_inuse] = predicts_[first_freed]; predicts_[first_freed] = tmp; // Go on first_freed++; } #endif dict_info_.lemma_count = first_freed; // Fixup lemmas_ size_t begin = 0; size_t end = 0; size_t dst = 0; int total_size = dict_info_.lemma_size + lemma_size_left_; int total_count = dict_info_.lemma_count + lemma_count_left_; size_t real_size = total_size - lemma_size_left_; while (dst < real_size) { unsigned char flag = get_lemma_flag(dst); unsigned char nchr = get_lemma_nchar(dst); if ((flag & kUserDictLemmaFlagRemove) == 0) { dst += nchr * 4 + 2; continue; } break; } if (dst >= real_size) return; end = dst; while (end < real_size) { begin = end + get_lemma_nchar(end) * 4 + 2; repeat: // not used any more if (begin >= real_size) break; unsigned char flag = get_lemma_flag(begin); unsigned char nchr = get_lemma_nchar(begin); if (flag & kUserDictLemmaFlagRemove) { begin += nchr * 4 + 2; goto repeat; } end = begin + nchr * 4 + 2; while (end < real_size) { unsigned char eflag = get_lemma_flag(end); unsigned char enchr = get_lemma_nchar(end); if ((eflag & kUserDictLemmaFlagRemove) == 0) { end += enchr * 4 + 2; continue; } break; } memmove(lemmas_ + dst, lemmas_ + begin, end - begin); for (size_t j = 0; j < dict_info_.lemma_count; j++) { if (offsets_[j] >= begin && offsets_[j] < end) { offsets_[j] -= (begin - dst); offsets_by_id_[ids_[j] - start_id_] = offsets_[j]; } #ifdef ___PREDICT_ENABLED___ if (predicts_[j] >= begin && predicts_[j] < end) { predicts_[j] -= (begin - dst); } #endif } #ifdef ___SYNC_ENABLED___ for (size_t j = 0; j < dict_info_.sync_count; j++) { if (syncs_[j] >= begin && syncs_[j] < end) { syncs_[j] -= (begin - dst); } } #endif dst += (end - begin); } dict_info_.free_count = 0; dict_info_.free_size = 0; dict_info_.lemma_size = dst; lemma_size_left_ = total_size - dict_info_.lemma_size; lemma_count_left_ = total_count - dict_info_.lemma_count; // XXX Without following code, // offsets_by_id_ is not reordered. // That's to say, all removed lemmas' ids are not collected back. // There may not be room for addition of new lemmas due to // offsests_by_id_ reason, although lemma_size_left_ is fixed. // By default, we do want defrag as fast as possible, because // during defrag procedure, other peers can not write new lemmas // to user dictionary file. // XXX If write-back is invoked immediately after // this defragment, no need to fix up following in-mem data. for (uint32 i = 0; i < dict_info_.lemma_count; i++) { ids_[i] = start_id_ + i; offsets_by_id_[i] = offsets_[i]; } state_ = USER_DICT_DEFRAGMENTED; #ifdef ___DEBUG_PERF___ DEBUG_PERF_END; LOGD_PERF("defragment"); #endif } #ifdef ___SYNC_ENABLED___ void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) { if (is_valid_state() == false) return; if (end > dict_info_.sync_count) end = dict_info_.sync_count; memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2); dict_info_.sync_count -= (end - start); if (state_ < USER_DICT_SYNC_DIRTY) state_ = USER_DICT_SYNC_DIRTY; } int UserDict::get_sync_count() { if (is_valid_state() == false) return 0; return dict_info_.sync_count; } LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { int again = 0; begin: LemmaIdType id; uint32 * syncs_bak = syncs_; syncs_ = NULL; id = _put_lemma(lemma_str, splids, lemma_len, count, lmt); syncs_ = syncs_bak; if (id == 0 && again == 0) { if ((dict_info_.limit_lemma_count > 0 && dict_info_.lemma_count >= dict_info_.limit_lemma_count) || (dict_info_.limit_lemma_size > 0 && dict_info_.lemma_size + (2 + (lemma_len << 2)) > dict_info_.limit_lemma_size)) { // XXX Always reclaim and defrag in sync code path // sync thread is background thread and ok with heavy work reclaim(); defragment(); flush_cache(); again = 1; goto begin; } } return id; } int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) { int newly_added = 0; SpellingParser * spl_parser = new SpellingParser(); if (!spl_parser) { return 0; } #ifdef ___DEBUG_PERF___ DEBUG_PERF_BEGIN; #endif char16 *ptr = lemmas; // Extract pinyin,words,frequence,last_mod_time char16 * p = ptr, * py16 = ptr; char16 * hz16 = NULL; int py16_len = 0; uint16 splid[kMaxLemmaSize]; int splid_len = 0; int hz16_len = 0; char16 * fr16 = NULL; int fr16_len = 0; while (p - ptr < len) { // Pinyin py16 = p; splid_len = 0; while (*p != 0x2c && (p - ptr) < len) { if (*p == 0x20) splid_len++; p++; } splid_len++; if (p - ptr == len) break; py16_len = p - py16; if (kMaxLemmaSize < splid_len) { break; } bool is_pre; int splidl = spl_parser->splstr16_to_idxs_f( py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre); if (splidl != splid_len) break; // Phrase hz16 = ++p; while (*p != 0x2c && (p - ptr) < len) { p++; } hz16_len = p - hz16; if (hz16_len != splid_len) break; // Frequency fr16 = ++p; fr16_len = 0; while (*p != 0x2c && (p - ptr) < len) { p++; } fr16_len = p - fr16; uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len); // Last modified time fr16 = ++p; fr16_len = 0; while (*p != 0x3b && (p - ptr) < len) { p++; } fr16_len = p - fr16; uint64 last_mod = utf16le_atoll(fr16, fr16_len); put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod); newly_added++; p++; } #ifdef ___DEBUG_PERF___ DEBUG_PERF_END; LOGD_PERF("put_lemmas_no_sync_from_utf16le_string"); #endif return newly_added; } int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning( char16 * str, int size, int * count) { int len = 0; *count = 0; int left_len = size; if (is_valid_state() == false) return len; SpellingTrie * spl_trie = &SpellingTrie::get_instance(); if (!spl_trie) { return 0; } uint32 i; for (i = 0; i < dict_info_.sync_count; i++) { int offset = syncs_[i]; uint32 nchar = get_lemma_nchar(offset); uint16 *spl = get_lemma_spell_ids(offset); uint16 *wrd = get_lemma_word(offset); int score = _get_lemma_score(wrd, spl, nchar); static char score_temp[32], *pscore_temp = score_temp; static char16 temp[256], *ptemp = temp; pscore_temp = score_temp; ptemp = temp; uint32 j; // Add pinyin for (j = 0; j < nchar; j++) { int ret_len = spl_trie->get_spelling_str16( spl[j], ptemp, temp + sizeof(temp) - ptemp); if (ret_len <= 0) break; ptemp += ret_len; if (ptemp < temp + sizeof(temp) - 1) { *(ptemp++) = ' '; } else { j = 0; break; } } if (j < nchar) { continue; } ptemp--; if (ptemp < temp + sizeof(temp) - 1) { *(ptemp++) = ','; } else { continue; } // Add phrase for (j = 0; j < nchar; j++) { if (ptemp < temp + sizeof(temp) - 1) { *(ptemp++) = wrd[j]; } else { break; } } if (j < nchar) { continue; } if (ptemp < temp + sizeof(temp) - 1) { *(ptemp++) = ','; } else { continue; } // Add frequency uint32 intf = extract_score_freq(score); int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp); if (ret_len <= 0) continue; ptemp += ret_len; if (ptemp < temp + sizeof(temp) - 1) { *(ptemp++) = ','; } else { continue; } // Add last modified time uint64 last_mod = extract_score_lmt(score); ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp); if (ret_len <= 0) continue; ptemp += ret_len; if (ptemp < temp + sizeof(temp) - 1) { *(ptemp++) = ';'; } else { continue; } // Write to string int need_len = ptemp - temp; if (need_len > left_len) break; memcpy(str + len, temp, need_len * 2); left_len -= need_len; len += need_len; (*count)++; } if (len > 0) { if (state_ < USER_DICT_SYNC_DIRTY) state_ = USER_DICT_SYNC_DIRTY; } return len; } #endif bool UserDict::state(UserDictStat * stat) { if (is_valid_state() == false) return false; if (!stat) return false; stat->version = version_; stat->file_name = dict_file_; stat->load_time.tv_sec = load_time_.tv_sec; stat->load_time.tv_usec = load_time_.tv_usec; pthread_mutex_lock(&g_mutex_); stat->last_update.tv_sec = g_last_update_.tv_sec; stat->last_update.tv_usec = g_last_update_.tv_usec; pthread_mutex_unlock(&g_mutex_); stat->disk_size = get_dict_file_size(&dict_info_); stat->lemma_count = dict_info_.lemma_count; stat->lemma_size = dict_info_.lemma_size; stat->delete_count = dict_info_.free_count; stat->delete_size = dict_info_.free_size; #ifdef ___SYNC_ENABLED___ stat->sync_count = dict_info_.sync_count; #endif stat->limit_lemma_count = dict_info_.limit_lemma_count; stat->limit_lemma_size = dict_info_.limit_lemma_size; stat->reclaim_ratio = dict_info_.reclaim_ratio; return true; } void UserDict::set_limit(uint32 max_lemma_count, uint32 max_lemma_size, uint32 reclaim_ratio) { dict_info_.limit_lemma_count = max_lemma_count; dict_info_.limit_lemma_size = max_lemma_size; if (reclaim_ratio > 100) reclaim_ratio = 100; dict_info_.reclaim_ratio = reclaim_ratio; } void UserDict::reclaim() { if (is_valid_state() == false) return; switch (dict_info_.reclaim_ratio) { case 0: return; case 100: // TODO: CLEAR to be implemented assert(false); return; default: break; } // XXX Reclaim is only based on count, not size uint32 count = dict_info_.lemma_count; int rc = count * dict_info_.reclaim_ratio / 100; UserDictScoreOffsetPair * score_offset_pairs = NULL; score_offset_pairs = (UserDictScoreOffsetPair *)malloc( sizeof(UserDictScoreOffsetPair) * rc); if (score_offset_pairs == NULL) { return; } for (int i = 0; i < rc; i++) { int s = scores_[i]; score_offset_pairs[i].score = s; score_offset_pairs[i].offset_index = i; } for (int i = (rc + 1) / 2; i >= 0; i--) shift_down(score_offset_pairs, i, rc); for (uint32 i = rc; i < dict_info_.lemma_count; i++) { int s = scores_[i]; if (s < score_offset_pairs[0].score) { score_offset_pairs[0].score = s; score_offset_pairs[0].offset_index = i; shift_down(score_offset_pairs, 0, rc); } } for (int i = 0; i < rc; i++) { int off = score_offset_pairs[i].offset_index; remove_lemma_by_offset_index(off); } if (rc > 0) { if (state_ < USER_DICT_OFFSET_DIRTY) state_ = USER_DICT_OFFSET_DIRTY; } free(score_offset_pairs); } inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) { int s = sop[i].score; int p = sop[i].offset_index; sop[i].score = sop[j].score; sop[i].offset_index = sop[j].offset_index; sop[j].score = s; sop[j].offset_index = p; } void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) { int par = i; while (par < n) { int left = par * 2 + 1; int right = left + 1; if (left >= n && right >= n) break; if (right >= n) { if (sop[left].score > sop[par].score) { swap(sop, left, par); par = left; continue; } } else if (sop[left].score > sop[right].score && sop[left].score > sop[par].score) { swap(sop, left, par); par = left; continue; } else if (sop[right].score > sop[left].score && sop[right].score > sop[par].score) { swap(sop, right, par); par = right; continue; } break; } } LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count) { return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL)); } LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { #ifdef ___DEBUG_PERF___ DEBUG_PERF_BEGIN; #endif if (is_valid_state() == false) return 0; int32 off = locate_in_offsets(lemma_str, splids, lemma_len); if (off != -1) { int delta_score = count - scores_[off]; dict_info_.total_nfreq += delta_score; scores_[off] = build_score(lmt, count); if (state_ < USER_DICT_SCORE_DIRTY) state_ = USER_DICT_SCORE_DIRTY; #ifdef ___DEBUG_PERF___ DEBUG_PERF_END; LOGD_PERF("_put_lemma(update)"); #endif return ids_[off]; } else { if ((dict_info_.limit_lemma_count > 0 && dict_info_.lemma_count >= dict_info_.limit_lemma_count) || (dict_info_.limit_lemma_size > 0 && dict_info_.lemma_size + (2 + (lemma_len << 2)) > dict_info_.limit_lemma_size)) { // XXX Don't defragment here, it's too time-consuming. return 0; } int flushed = 0; if (lemma_count_left_ == 0 || lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) { // XXX When there is no space for new lemma, we flush to disk // flush_cache() may be called by upper user // and better place shoule be found instead of here flush_cache(); flushed = 1; // Or simply return and do nothing // return 0; } #ifdef ___DEBUG_PERF___ DEBUG_PERF_END; LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)"); #endif LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt); #ifdef ___SYNC_ENABLED___ if (syncs_ && id != 0) { queue_lemma_for_sync(id); } #endif return id; } return 0; } #ifdef ___SYNC_ENABLED___ void UserDict::queue_lemma_for_sync(LemmaIdType id) { if (dict_info_.sync_count < sync_count_size_) { syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; } else { uint32 * syncs = (uint32*)realloc( syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2); if (syncs) { sync_count_size_ += kUserDictPreAlloc; syncs_ = syncs; syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; } } } #endif LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count, bool selected) { #ifdef ___DEBUG_PERF___ DEBUG_PERF_BEGIN; #endif if (is_valid_state() == false) return 0; if (is_valid_lemma_id(lemma_id) == false) return 0; uint32 offset = offsets_by_id_[lemma_id - start_id_]; uint8 lemma_len = get_lemma_nchar(offset); char16 * lemma_str = get_lemma_word(offset); uint16 * splids = get_lemma_spell_ids(offset); int32 off = locate_in_offsets(lemma_str, splids, lemma_len); if (off != -1) { int score = scores_[off]; int count = extract_score_freq(score); uint64 lmt = extract_score_lmt(score); if (count + delta_count > kUserDictMaxFrequency || count + delta_count < count) { delta_count = kUserDictMaxFrequency - count; } count += delta_count; dict_info_.total_nfreq += delta_count; if (selected) { lmt = time(NULL); } scores_[off] = build_score(lmt, count); if (state_ < USER_DICT_SCORE_DIRTY) state_ = USER_DICT_SCORE_DIRTY; #ifdef ___DEBUG_PERF___ DEBUG_PERF_END; LOGD_PERF("update_lemma"); #endif #ifdef ___SYNC_ENABLED___ queue_lemma_for_sync(ids_[off]); #endif return ids_[off]; } return 0; } size_t UserDict::get_total_lemma_count() { return dict_info_.total_nfreq; } void UserDict::set_total_lemma_count_of_others(size_t count) { total_other_nfreq_ = count; } LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[], uint16 lemma_len, uint16 count, uint64 lmt) { LemmaIdType id = get_max_lemma_id() + 1; size_t offset = dict_info_.lemma_size; if (offset > kUserDictOffsetMask) return 0; lemmas_[offset] = 0; lemmas_[offset + 1] = (uint8)lemma_len; for (size_t i = 0; i < lemma_len; i++) { *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i]; *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)]) = lemma_str[i]; } uint32 off = dict_info_.lemma_count; offsets_[off] = offset; scores_[off] = build_score(lmt, count); ids_[off] = id; #ifdef ___PREDICT_ENABLED___ predicts_[off] = offset; #endif offsets_by_id_[id - start_id_] = offset; dict_info_.lemma_count++; dict_info_.lemma_size += (2 + (lemma_len << 2)); lemma_count_left_--; lemma_size_left_ -= (2 + (lemma_len << 2)); // Sort UserDictSearchable searchable; prepare_locate(&searchable, splids, lemma_len); size_t i = 0; while (i < off) { offset = offsets_[i]; uint32 nchar = get_lemma_nchar(offset); uint16 * spl = get_lemma_spell_ids(offset); if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable)) break; i++; } if (i != off) { uint32 temp = offsets_[off]; memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2); offsets_[i] = temp; temp = scores_[off]; memmove(scores_ + i + 1, scores_ + i, (off - i) << 2); scores_[i] = temp; temp = ids_[off]; memmove(ids_ + i + 1, ids_ + i, (off - i) << 2); ids_[i] = temp; } #ifdef ___PREDICT_ENABLED___ uint32 j = 0; uint16 * words_new = get_lemma_word(predicts_[off]); j = locate_where_to_insert_in_predicts(words_new, lemma_len); if (j != off) { uint32 temp = predicts_[off]; memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2); predicts_[j] = temp; } #endif if (state_ < USER_DICT_LEMMA_DIRTY) state_ = USER_DICT_LEMMA_DIRTY; #ifdef ___CACHE_ENABLED___ cache_init(); #endif dict_info_.total_nfreq += count; return id; } }