/*
* Copyright (C) 2008,2009 OMRON SOFTWARE Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package jp.co.omronsoft.openwnn.JAJP;
import jp.co.omronsoft.openwnn.*;
import java.util.*;
/**
* The penWnn Clause Converter class for Japanese IME.
*
* @author Copyright (C) 2009 OMRON SOFTWARE CO., LTD. All Rights Reserved.
*/
public class OpenWnnClauseConverterJAJP {
/** Score(frequency value) of word in the learning dictionary */
private static final int FREQ_LEARN = 600;
/** Score(frequency value) of word in the user dictionary */
private static final int FREQ_USER = 500;
/** Maximum limit length of input */
public static final int MAX_INPUT_LENGTH = 50;
/** search cache for unique independent words (jiritsugo) */
private HashMap<String, ArrayList<WnnWord>> mIndepWordBag;
/** search cache for all independent words (jiritsugo) */
private HashMap<String, ArrayList<WnnWord>> mAllIndepWordBag;
/** search cache for ancillary words (fuzokugo) */
private HashMap<String, ArrayList<WnnWord>> mFzkPatterns;
/** connect matrix for generating a clause */
private byte[][] mConnectMatrix;
/** dictionaries */
private WnnDictionary mDictionary;
/** candidates of conversion */
private LinkedList mConvertResult;
/** work area for consecutive clause conversion */
private WnnSentence[] mSentenceBuffer;
/** part of speech (default) */
private WnnPOS mPosDefault;
/** part of speech (end of clause/not end of sentence) */
private WnnPOS mPosEndOfClause1;
/** part of speech (end of clause/any place) */
private WnnPOS mPosEndOfClause2;
/** part of speech (end of sentence) */
private WnnPOS mPosEndOfClause3;
/** cost value of a clause */
private static final int CLAUSE_COST = -1000;
/** The candidate filter */
private CandidateFilter mFilter = null;
/**
* Constructor
*/
public OpenWnnClauseConverterJAJP() {
mIndepWordBag = new HashMap<String, ArrayList<WnnWord>>();
mAllIndepWordBag = new HashMap<String, ArrayList<WnnWord>>();
mFzkPatterns = new HashMap();
mConvertResult = new LinkedList();
mSentenceBuffer = new WnnSentence[MAX_INPUT_LENGTH];
}
/**
* Set the dictionary
*
* @param dict The dictionary for phrase conversion
*/
public void setDictionary(WnnDictionary dict) {
/* get connect matrix */
mConnectMatrix = dict.getConnectMatrix();
/* clear dictionary settings */
mDictionary = dict;
dict.clearDictionary();
dict.clearApproxPattern();
/* clear work areas */
mIndepWordBag.clear();
mAllIndepWordBag.clear();
mFzkPatterns.clear();
/* get part of speech tags */
mPosDefault = dict.getPOS(WnnDictionary.POS_TYPE_MEISI);
mPosEndOfClause1 = dict.getPOS(WnnDictionary.POS_TYPE_V1);
mPosEndOfClause2 = dict.getPOS(WnnDictionary.POS_TYPE_V2);
mPosEndOfClause3 = dict.getPOS(WnnDictionary.POS_TYPE_V3);
}
/**
* Set the candidate filter
*
* @param filter The candidate filter
*/
public void setFilter(CandidateFilter filter) {
mFilter = filter;
}
/**
* Kana-to-Kanji conversion (single clause).
* <br>
* This method execute single clause conversion.
*
* @param input The input string
* @return The candidates of conversion; {@code null} if an error occurs.
*/
public Iterator convert(String input) {
/* do nothing if no dictionary is specified */
if (mConnectMatrix == null || mDictionary == null) {
return null;
}
/* do nothing if the length of input exceeds the limit */
if (input.length() > MAX_INPUT_LENGTH) {
return null;
}
/* clear the candidates list */
mConvertResult.clear();
/* try single clause conversion */
if (!singleClauseConvert(mConvertResult, input, mPosEndOfClause2, true)) {
return null;
}
return mConvertResult.iterator();
}
/**
* Consecutive clause conversion.
*
* @param input The input string
* @return The result of consecutive clause conversion; {@code null} if fail.
*/
public WnnSentence consecutiveClauseConvert(String input) {
LinkedList clauses = new LinkedList();
/* clear the cache which is not matched */
for (int i = 0; i < input.length(); i++) {
mSentenceBuffer[i] = null;
}
WnnSentence[] sentence = mSentenceBuffer;
/* consecutive clause conversion */
for (int start = 0; start < input.length(); start++) {
if (start != 0 && sentence[start-1] == null) {
continue;
}
/* limit the length of a clause */
int end = input.length();
if (end > start + 20) {
end = start + 20;
}
/* make clauses */
for ( ; end > start; end--) {
int idx = end - 1;
/* cutting a branch */
if (sentence[idx] != null) {
if (start != 0) {
if (sentence[idx].frequency > sentence[start-1].frequency + CLAUSE_COST + FREQ_LEARN) {
/* there may be no way to be the best sequence from the 'start' */
break;
}
} else {
if (sentence[idx].frequency > CLAUSE_COST + FREQ_LEARN) {
/* there may be no way to be the best sequence from the 'start' */
break;
}
}
}
String key = input.substring(start, end);
clauses.clear();
WnnClause bestClause = null;
if (end == input.length()) {
/* get the clause which can be the end of the sentence */
singleClauseConvert(clauses, key, mPosEndOfClause1, false);
} else {
/* get the clause which is not the end of the sentence */
singleClauseConvert(clauses, key, mPosEndOfClause3, false);
}
if (clauses.isEmpty()) {
bestClause = defaultClause(key);
} else {
bestClause = (WnnClause)clauses.get(0);
}
/* make a sub-sentence */
WnnSentence ws;
if (start == 0) {
ws = new WnnSentence(key, bestClause);
} else {
ws = new WnnSentence(sentence[start-1], bestClause);
}
ws.frequency += CLAUSE_COST;
/* update the best sub-sentence on the cache buffer */
if (sentence[idx] == null || (sentence[idx].frequency < ws.frequency)) {
sentence[idx] = ws;
}
}
}
/* return the result of the consecutive clause conversion */
if (sentence[input.length() - 1] != null) {
return sentence[input.length() - 1];
}
return null;
}
/**
* Consecutive clause conversion.
*
* @param resultList Where to store the result
* @param input Input string
* @return {@code true} if success; {@code false} if fail.
*/
private boolean consecutiveClauseConvert(LinkedList resultList, String input) {
WnnSentence sentence = consecutiveClauseConvert(input);
/* set the result of the consecutive clause conversion on the top of the list */
if (sentence != null) {
resultList.add(0, sentence);
return true;
}
return false;
}
/**
* Single clause conversion.
*
* @param clauseList Where to store the results
* @param input Input string
* @param terminal Part of speech tag at the terminal
* @param all Get all candidates or not
* @return {@code true} if success; {@code false} if fail.
*/
private boolean singleClauseConvert(LinkedList clauseList, String input, WnnPOS terminal, boolean all) {
boolean ret = false;
/* get clauses without ancillary word */
ArrayList<WnnWord> stems = getIndependentWords(input, all);
if (stems != null && (!stems.isEmpty())) {
Iterator<WnnWord> stemsi = stems.iterator();
while (stemsi.hasNext()) {
WnnWord stem = stemsi.next();
if (addClause(clauseList, input, stem, null, terminal, all)) {
ret = true;
}
}
}
/* get clauses with ancillary word */
int max = CLAUSE_COST * 2;
for (int split = 1; split < input.length(); split++) {
/* get ancillary patterns */
String str = input.substring(split);
ArrayList<WnnWord> fzks = getAncillaryPattern(str);
if (fzks == null || fzks.isEmpty()) {
continue;
}
/* get candidates of stem in a clause */
str = input.substring(0, split);
stems = getIndependentWords(str, all);
if (stems == null || stems.isEmpty()) {
if (mDictionary.searchWord(WnnDictionary.SEARCH_PREFIX, WnnDictionary.ORDER_BY_FREQUENCY, str) <= 0) {
break;
} else {
continue;
}
}
/* make clauses */
Iterator<WnnWord> stemsi = stems.iterator();
while (stemsi.hasNext()) {
WnnWord stem = stemsi.next();
if (all || stem.frequency > max) {
Iterator<WnnWord> fzksi = fzks.iterator();
while (fzksi.hasNext()) {
WnnWord fzk = fzksi.next();
if (addClause(clauseList, input, stem, fzk, terminal, all)) {
ret = true;
max = stem.frequency;
}
}
}
}
}
return ret;
}
/**
* Add valid clause to the candidates list.
*
* @param clauseList Where to store the results
* @param input Input string
* @param stem Stem of the clause (a independent word)
* @param fzk Ancillary pattern
* @param terminal Part of speech tag at the terminal
* @param all Get all candidates or not
* @return {@code true} if add the clause to the list; {@code false} if not.
*/
private boolean addClause(LinkedList<WnnClause> clauseList, String input, WnnWord stem, WnnWord fzk,
WnnPOS terminal, boolean all) {
WnnClause clause = null;
/* check if the part of speech is valid */
if (fzk == null) {
if (connectible(stem.partOfSpeech.right, terminal.left)) {
clause = new WnnClause(input, stem);
}
} else {
if (connectible(stem.partOfSpeech.right, fzk.partOfSpeech.left)
&& connectible(fzk.partOfSpeech.right, terminal.left)) {
clause = new WnnClause(input, stem, fzk);
}
}
if (clause == null) {
return false;
}
if (mFilter != null && !mFilter.isAllowed(clause)) {
return false;
}
/* store to the list */
if (clauseList.isEmpty()) {
/* add if the list is empty */
clauseList.add(0, clause);
return true;
} else {
if (!all) {
/* reserve only the best clause */
WnnClause best = (WnnClause)clauseList.get(0);
if (best.frequency < clause.frequency) {
clauseList.set(0, clause);
return true;
}
} else {
/* reserve all clauses */
Iterator clauseListi = clauseList.iterator();
int index = 0;
while (clauseListi.hasNext()) {
WnnClause clausei = (WnnClause)clauseListi.next();
if (clausei.frequency < clause.frequency) {
break;
}
index++;
}
clauseList.add(index, clause);
return true;
}
}
return false;
}
/**
* Check the part-of-speeches are connectable.
*
* @param right Right attribute of the preceding word/clause
* @param left Left attribute of the following word/clause
* @return {@code true} if there are connectable; {@code false} if otherwise
*/
private boolean connectible(int right, int left) {
try {
if (mConnectMatrix[left][right] != 0) {
return true;
}
} catch (Exception ex) {
}
return false;
}
/**
* Get all exact matched ancillary words(Fuzokugo) list.
*
* @param input Search key
* @return List of ancillary words
*/
private ArrayList<WnnWord> getAncillaryPattern(String input) {
if (input.length() == 0) {
return null;
}
HashMap<String,ArrayList<WnnWord>> fzkPat = mFzkPatterns;
ArrayList<WnnWord> fzks = fzkPat.get(input);
if (fzks != null) {
return fzks;
}
/* set dictionaries */
WnnDictionary dict = mDictionary;
dict.clearDictionary();
dict.clearApproxPattern();
dict.setDictionary(6, 400, 500);
for (int start = input.length() - 1; start >= 0; start--) {
String key = input.substring(start);
fzks = fzkPat.get(key);
if (fzks != null) {
continue;
}
fzks = new ArrayList<WnnWord>();
mFzkPatterns.put(key, fzks);
/* search ancillary words */
dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, key);
WnnWord word;
while ((word = dict.getNextWord()) != null) {
fzks.add(word);
}
/* concatenate sequence of ancillary words */
for (int end = input.length() - 1; end > start; end--) {
ArrayList<WnnWord> followFzks = fzkPat.get(input.substring(end));
if (followFzks == null || followFzks.isEmpty()) {
continue;
}
dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input.substring(start, end));
while ((word = dict.getNextWord()) != null) {
Iterator<WnnWord> followFzksi = followFzks.iterator();
while (followFzksi.hasNext()) {
WnnWord follow = followFzksi.next();
if (connectible(word.partOfSpeech.right, follow.partOfSpeech.left)) {
fzks.add(new WnnWord(key, key, new WnnPOS(word.partOfSpeech.left, follow.partOfSpeech.right)));
}
}
}
}
}
return fzks;
}
/**
* Get all exact matched independent words(Jiritsugo) list.
*
* @param input Search key
* @param all {@code true} if list all words; {@code false} if list words which has an unique part of speech tag.
* @return List of words; {@code null} if {@code input.length() == 0}.
*/
private ArrayList<WnnWord> getIndependentWords(String input, boolean all) {
if (input.length() == 0) {
return null;
}
ArrayList<WnnWord> words = (all)? mAllIndepWordBag.get(input) : mIndepWordBag.get(input);
if (words == null) {
/* set dictionaries */
WnnDictionary dict = mDictionary;
dict.clearDictionary();
dict.clearApproxPattern();
dict.setDictionary(4, 0, 10);
dict.setDictionary(5, 400, 500);
dict.setDictionary(WnnDictionary.INDEX_USER_DICTIONARY, FREQ_USER, FREQ_USER);
dict.setDictionary(WnnDictionary.INDEX_LEARN_DICTIONARY, FREQ_LEARN, FREQ_LEARN);
words = new ArrayList<WnnWord>();
WnnWord word;
if (all) {
mAllIndepWordBag.put(input, words);
dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
/* store all words */
while ((word = dict.getNextWord()) != null) {
if (input.equals(word.stroke)) {
words.add(word);
}
}
} else {
mIndepWordBag.put(input, words);
dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
/* store a word which has an unique part of speech tag */
while ((word = dict.getNextWord()) != null) {
if (input.equals(word.stroke)) {
Iterator<WnnWord> list = words.iterator();
boolean found = false;
while (list.hasNext()) {
WnnWord w = (WnnWord)list.next();
if (w.partOfSpeech.right == word.partOfSpeech.right) {
found = true;
break;
}
}
if (!found) {
words.add(word);
}
if (word.frequency < 400) {
break;
}
}
}
}
addAutoGeneratedCandidates(input, words, all);
}
return words;
}
/**
* Add some words not including in the dictionary.
* <br>
* This method adds some words which are not in the dictionary.
*
* @param input Input string
* @param wordList List to store words
* @param all Get all candidates or not
*/
private void addAutoGeneratedCandidates(String input, ArrayList wordList, boolean all) {
wordList.add(new WnnWord(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
}
/**
* Get a default clause.
* <br>
* This method generates a clause which has a string same as input
* and the default part-of-speech tag.
*
* @param input Input string
* @return Default clause
*/
private WnnClause defaultClause(String input) {
return (new WnnClause(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
}
}