/* * Copyright (C) 2008,2009 OMRON SOFTWARE Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package jp.co.omronsoft.openwnn.JAJP; import jp.co.omronsoft.openwnn.*; import java.util.*; /** * The penWnn Clause Converter class for Japanese IME. * * @author Copyright (C) 2009 OMRON SOFTWARE CO., LTD. All Rights Reserved. */ public class OpenWnnClauseConverterJAJP { /** Score(frequency value) of word in the learning dictionary */ private static final int FREQ_LEARN = 600; /** Score(frequency value) of word in the user dictionary */ private static final int FREQ_USER = 500; /** Maximum limit length of input */ public static final int MAX_INPUT_LENGTH = 50; /** search cache for unique independent words (jiritsugo) */ private HashMap<String, ArrayList<WnnWord>> mIndepWordBag; /** search cache for all independent words (jiritsugo) */ private HashMap<String, ArrayList<WnnWord>> mAllIndepWordBag; /** search cache for ancillary words (fuzokugo) */ private HashMap<String, ArrayList<WnnWord>> mFzkPatterns; /** connect matrix for generating a clause */ private byte[][] mConnectMatrix; /** dictionaries */ private WnnDictionary mDictionary; /** candidates of conversion */ private LinkedList mConvertResult; /** work area for consecutive clause conversion */ private WnnSentence[] mSentenceBuffer; /** part of speech (default) */ private WnnPOS mPosDefault; /** part of speech (end of clause/not end of sentence) */ private WnnPOS mPosEndOfClause1; /** part of speech (end of clause/any place) */ private WnnPOS mPosEndOfClause2; /** part of speech (end of sentence) */ private WnnPOS mPosEndOfClause3; /** cost value of a clause */ private static final int CLAUSE_COST = -1000; /** The candidate filter */ private CandidateFilter mFilter = null; /** * Constructor */ public OpenWnnClauseConverterJAJP() { mIndepWordBag = new HashMap<String, ArrayList<WnnWord>>(); mAllIndepWordBag = new HashMap<String, ArrayList<WnnWord>>(); mFzkPatterns = new HashMap(); mConvertResult = new LinkedList(); mSentenceBuffer = new WnnSentence[MAX_INPUT_LENGTH]; } /** * Set the dictionary * * @param dict The dictionary for phrase conversion */ public void setDictionary(WnnDictionary dict) { /* get connect matrix */ mConnectMatrix = dict.getConnectMatrix(); /* clear dictionary settings */ mDictionary = dict; dict.clearDictionary(); dict.clearApproxPattern(); /* clear work areas */ mIndepWordBag.clear(); mAllIndepWordBag.clear(); mFzkPatterns.clear(); /* get part of speech tags */ mPosDefault = dict.getPOS(WnnDictionary.POS_TYPE_MEISI); mPosEndOfClause1 = dict.getPOS(WnnDictionary.POS_TYPE_V1); mPosEndOfClause2 = dict.getPOS(WnnDictionary.POS_TYPE_V2); mPosEndOfClause3 = dict.getPOS(WnnDictionary.POS_TYPE_V3); } /** * Set the candidate filter * * @param filter The candidate filter */ public void setFilter(CandidateFilter filter) { mFilter = filter; } /** * Kana-to-Kanji conversion (single clause). * <br> * This method execute single clause conversion. * * @param input The input string * @return The candidates of conversion; {@code null} if an error occurs. */ public Iterator convert(String input) { /* do nothing if no dictionary is specified */ if (mConnectMatrix == null || mDictionary == null) { return null; } /* do nothing if the length of input exceeds the limit */ if (input.length() > MAX_INPUT_LENGTH) { return null; } /* clear the candidates list */ mConvertResult.clear(); /* try single clause conversion */ if (!singleClauseConvert(mConvertResult, input, mPosEndOfClause2, true)) { return null; } return mConvertResult.iterator(); } /** * Consecutive clause conversion. * * @param input The input string * @return The result of consecutive clause conversion; {@code null} if fail. */ public WnnSentence consecutiveClauseConvert(String input) { LinkedList clauses = new LinkedList(); /* clear the cache which is not matched */ for (int i = 0; i < input.length(); i++) { mSentenceBuffer[i] = null; } WnnSentence[] sentence = mSentenceBuffer; /* consecutive clause conversion */ for (int start = 0; start < input.length(); start++) { if (start != 0 && sentence[start-1] == null) { continue; } /* limit the length of a clause */ int end = input.length(); if (end > start + 20) { end = start + 20; } /* make clauses */ for ( ; end > start; end--) { int idx = end - 1; /* cutting a branch */ if (sentence[idx] != null) { if (start != 0) { if (sentence[idx].frequency > sentence[start-1].frequency + CLAUSE_COST + FREQ_LEARN) { /* there may be no way to be the best sequence from the 'start' */ break; } } else { if (sentence[idx].frequency > CLAUSE_COST + FREQ_LEARN) { /* there may be no way to be the best sequence from the 'start' */ break; } } } String key = input.substring(start, end); clauses.clear(); WnnClause bestClause = null; if (end == input.length()) { /* get the clause which can be the end of the sentence */ singleClauseConvert(clauses, key, mPosEndOfClause1, false); } else { /* get the clause which is not the end of the sentence */ singleClauseConvert(clauses, key, mPosEndOfClause3, false); } if (clauses.isEmpty()) { bestClause = defaultClause(key); } else { bestClause = (WnnClause)clauses.get(0); } /* make a sub-sentence */ WnnSentence ws; if (start == 0) { ws = new WnnSentence(key, bestClause); } else { ws = new WnnSentence(sentence[start-1], bestClause); } ws.frequency += CLAUSE_COST; /* update the best sub-sentence on the cache buffer */ if (sentence[idx] == null || (sentence[idx].frequency < ws.frequency)) { sentence[idx] = ws; } } } /* return the result of the consecutive clause conversion */ if (sentence[input.length() - 1] != null) { return sentence[input.length() - 1]; } return null; } /** * Consecutive clause conversion. * * @param resultList Where to store the result * @param input Input string * @return {@code true} if success; {@code false} if fail. */ private boolean consecutiveClauseConvert(LinkedList resultList, String input) { WnnSentence sentence = consecutiveClauseConvert(input); /* set the result of the consecutive clause conversion on the top of the list */ if (sentence != null) { resultList.add(0, sentence); return true; } return false; } /** * Single clause conversion. * * @param clauseList Where to store the results * @param input Input string * @param terminal Part of speech tag at the terminal * @param all Get all candidates or not * @return {@code true} if success; {@code false} if fail. */ private boolean singleClauseConvert(LinkedList clauseList, String input, WnnPOS terminal, boolean all) { boolean ret = false; /* get clauses without ancillary word */ ArrayList<WnnWord> stems = getIndependentWords(input, all); if (stems != null && (!stems.isEmpty())) { Iterator<WnnWord> stemsi = stems.iterator(); while (stemsi.hasNext()) { WnnWord stem = stemsi.next(); if (addClause(clauseList, input, stem, null, terminal, all)) { ret = true; } } } /* get clauses with ancillary word */ int max = CLAUSE_COST * 2; for (int split = 1; split < input.length(); split++) { /* get ancillary patterns */ String str = input.substring(split); ArrayList<WnnWord> fzks = getAncillaryPattern(str); if (fzks == null || fzks.isEmpty()) { continue; } /* get candidates of stem in a clause */ str = input.substring(0, split); stems = getIndependentWords(str, all); if (stems == null || stems.isEmpty()) { if (mDictionary.searchWord(WnnDictionary.SEARCH_PREFIX, WnnDictionary.ORDER_BY_FREQUENCY, str) <= 0) { break; } else { continue; } } /* make clauses */ Iterator<WnnWord> stemsi = stems.iterator(); while (stemsi.hasNext()) { WnnWord stem = stemsi.next(); if (all || stem.frequency > max) { Iterator<WnnWord> fzksi = fzks.iterator(); while (fzksi.hasNext()) { WnnWord fzk = fzksi.next(); if (addClause(clauseList, input, stem, fzk, terminal, all)) { ret = true; max = stem.frequency; } } } } } return ret; } /** * Add valid clause to the candidates list. * * @param clauseList Where to store the results * @param input Input string * @param stem Stem of the clause (a independent word) * @param fzk Ancillary pattern * @param terminal Part of speech tag at the terminal * @param all Get all candidates or not * @return {@code true} if add the clause to the list; {@code false} if not. */ private boolean addClause(LinkedList<WnnClause> clauseList, String input, WnnWord stem, WnnWord fzk, WnnPOS terminal, boolean all) { WnnClause clause = null; /* check if the part of speech is valid */ if (fzk == null) { if (connectible(stem.partOfSpeech.right, terminal.left)) { clause = new WnnClause(input, stem); } } else { if (connectible(stem.partOfSpeech.right, fzk.partOfSpeech.left) && connectible(fzk.partOfSpeech.right, terminal.left)) { clause = new WnnClause(input, stem, fzk); } } if (clause == null) { return false; } if (mFilter != null && !mFilter.isAllowed(clause)) { return false; } /* store to the list */ if (clauseList.isEmpty()) { /* add if the list is empty */ clauseList.add(0, clause); return true; } else { if (!all) { /* reserve only the best clause */ WnnClause best = (WnnClause)clauseList.get(0); if (best.frequency < clause.frequency) { clauseList.set(0, clause); return true; } } else { /* reserve all clauses */ Iterator clauseListi = clauseList.iterator(); int index = 0; while (clauseListi.hasNext()) { WnnClause clausei = (WnnClause)clauseListi.next(); if (clausei.frequency < clause.frequency) { break; } index++; } clauseList.add(index, clause); return true; } } return false; } /** * Check the part-of-speeches are connectable. * * @param right Right attribute of the preceding word/clause * @param left Left attribute of the following word/clause * @return {@code true} if there are connectable; {@code false} if otherwise */ private boolean connectible(int right, int left) { try { if (mConnectMatrix[left][right] != 0) { return true; } } catch (Exception ex) { } return false; } /** * Get all exact matched ancillary words(Fuzokugo) list. * * @param input Search key * @return List of ancillary words */ private ArrayList<WnnWord> getAncillaryPattern(String input) { if (input.length() == 0) { return null; } HashMap<String,ArrayList<WnnWord>> fzkPat = mFzkPatterns; ArrayList<WnnWord> fzks = fzkPat.get(input); if (fzks != null) { return fzks; } /* set dictionaries */ WnnDictionary dict = mDictionary; dict.clearDictionary(); dict.clearApproxPattern(); dict.setDictionary(6, 400, 500); for (int start = input.length() - 1; start >= 0; start--) { String key = input.substring(start); fzks = fzkPat.get(key); if (fzks != null) { continue; } fzks = new ArrayList<WnnWord>(); mFzkPatterns.put(key, fzks); /* search ancillary words */ dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, key); WnnWord word; while ((word = dict.getNextWord()) != null) { fzks.add(word); } /* concatenate sequence of ancillary words */ for (int end = input.length() - 1; end > start; end--) { ArrayList<WnnWord> followFzks = fzkPat.get(input.substring(end)); if (followFzks == null || followFzks.isEmpty()) { continue; } dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input.substring(start, end)); while ((word = dict.getNextWord()) != null) { Iterator<WnnWord> followFzksi = followFzks.iterator(); while (followFzksi.hasNext()) { WnnWord follow = followFzksi.next(); if (connectible(word.partOfSpeech.right, follow.partOfSpeech.left)) { fzks.add(new WnnWord(key, key, new WnnPOS(word.partOfSpeech.left, follow.partOfSpeech.right))); } } } } } return fzks; } /** * Get all exact matched independent words(Jiritsugo) list. * * @param input Search key * @param all {@code true} if list all words; {@code false} if list words which has an unique part of speech tag. * @return List of words; {@code null} if {@code input.length() == 0}. */ private ArrayList<WnnWord> getIndependentWords(String input, boolean all) { if (input.length() == 0) { return null; } ArrayList<WnnWord> words = (all)? mAllIndepWordBag.get(input) : mIndepWordBag.get(input); if (words == null) { /* set dictionaries */ WnnDictionary dict = mDictionary; dict.clearDictionary(); dict.clearApproxPattern(); dict.setDictionary(4, 0, 10); dict.setDictionary(5, 400, 500); dict.setDictionary(WnnDictionary.INDEX_USER_DICTIONARY, FREQ_USER, FREQ_USER); dict.setDictionary(WnnDictionary.INDEX_LEARN_DICTIONARY, FREQ_LEARN, FREQ_LEARN); words = new ArrayList<WnnWord>(); WnnWord word; if (all) { mAllIndepWordBag.put(input, words); dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input); /* store all words */ while ((word = dict.getNextWord()) != null) { if (input.equals(word.stroke)) { words.add(word); } } } else { mIndepWordBag.put(input, words); dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input); /* store a word which has an unique part of speech tag */ while ((word = dict.getNextWord()) != null) { if (input.equals(word.stroke)) { Iterator<WnnWord> list = words.iterator(); boolean found = false; while (list.hasNext()) { WnnWord w = (WnnWord)list.next(); if (w.partOfSpeech.right == word.partOfSpeech.right) { found = true; break; } } if (!found) { words.add(word); } if (word.frequency < 400) { break; } } } } addAutoGeneratedCandidates(input, words, all); } return words; } /** * Add some words not including in the dictionary. * <br> * This method adds some words which are not in the dictionary. * * @param input Input string * @param wordList List to store words * @param all Get all candidates or not */ private void addAutoGeneratedCandidates(String input, ArrayList wordList, boolean all) { wordList.add(new WnnWord(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length())); } /** * Get a default clause. * <br> * This method generates a clause which has a string same as input * and the default part-of-speech tag. * * @param input Input string * @return Default clause */ private WnnClause defaultClause(String input) { return (new WnnClause(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length())); } }