// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#include "core/fpdftext/cpdf_textpagefind.h"
#include <cwchar>
#include <cwctype>
#include <vector>
#include "core/fpdftext/cpdf_textpage.h"
#include "core/fxcrt/fx_string.h"
#include "core/fxcrt/fx_system.h"
#include "third_party/base/stl_util.h"
namespace {

// Reports whether |curChar| lies outside all of the character ranges listed
// below. ExtractFindWhat() splits a character for which this returns true out
// of its surrounding word into a search token of its own, and FindNext()
// consults it when two consecutive tokens are found directly adjacent.
//
// Returns false for code points below 255, for 8467 (U+2113), and for every
// inclusive range in |kExcludedRanges|; returns true for anything else.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodePointRange {
    wchar_t first;
    wchar_t last;
  };
  static constexpr CodePointRange kExcludedRanges[] = {
      {0x0600, 0x06FF},  // Arabic
      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
      {0x0400, 0x04FF},  // Cyrillic
      {0x0500, 0x052F},  // Cyrillic Supplement
      {0xA640, 0xA69F},  // Cyrillic Extended-B
      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
      {0x2000, 0x206F},  // General Punctuation
  };

  // Note the strict comparison: code point 255 itself is not excluded here.
  if (curChar < 255 || curChar == 8467)
    return false;

  for (const CodePointRange& range : kExcludedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}

}  // namespace
// Creates a finder bound to |pTextPage|. The page text is cached in
// |m_strText| and |m_CharIndex| is filled with (start, count) pairs: each pair
// describes one run of consecutive characters whose flag is
// FPDFTEXT_CHAR_NORMAL or FPDFTEXT_CHAR_GENERATED.
//
// |pTextPage| is dereferenced unconditionally, so it must not be null.
CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
    : m_pTextPage(pTextPage),
      m_flags(0),
      m_bMatchCase(false),
      m_bMatchWholeWord(false),
      m_resStart(0),
      m_resEnd(-1),
      m_IsFind(false) {
  m_strText = m_pTextPage->GetAllPageText();

  const int charCount = pTextPage->CountChars();
  // Seed the first pair with start index 0; its count is appended when the
  // first qualifying character is seen.
  if (charCount)
    m_CharIndex.push_back(0);

  for (int charPos = 0; charPos < charCount; ++charPos) {
    FPDF_CHAR_INFO charInfo;
    pTextPage->GetCharInfo(charPos, &charInfo);

    const bool isTextChar = charInfo.m_Flag == FPDFTEXT_CHAR_NORMAL ||
                            charInfo.m_Flag == FPDFTEXT_CHAR_GENERATED;
    // An odd size means the last entry is a start that has no count yet.
    const bool hasOpenStart =
        pdfium::CollectionSize<int>(m_CharIndex) % 2 != 0;

    if (isTextChar) {
      if (hasOpenStart) {
        // First character of the run: begin counting.
        m_CharIndex.push_back(1);
      } else if (!m_CharIndex.empty()) {
        // Run continues: bump its count.
        m_CharIndex.back() += 1;
      }
    } else if (hasOpenStart) {
      // No run has begun at the pending start; move it past this character.
      m_CharIndex.back() = charPos + 1;
    } else {
      // The current run ended; the next one can start after this character.
      m_CharIndex.push_back(charPos + 1);
    }
  }

  // Drop a trailing start that never received a count.
  if (pdfium::CollectionSize<int>(m_CharIndex) % 2)
    m_CharIndex.pop_back();
}
// The destructor performs no explicit cleanup of its own.
CPDF_TextPageFind::~CPDF_TextPageFind() = default;
// Converts |index|, an offset into the page text, to a character index by
// forwarding to m_pTextPage->CharIndexFromTextIndex().
int CPDF_TextPageFind::GetCharIndex(int index) const {
  const int charIndex = m_pTextPage->CharIndexFromTextIndex(index);
  return charIndex;
}
bool CPDF_TextPageFind::FindFirst(const WideString& findwhat,
int flags,
Optional<size_t> startPos) {
if (!m_pTextPage)
return false;
if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
m_strText = m_pTextPage->GetAllPageText();
WideString findwhatStr = findwhat;
m_findWhat = findwhatStr;
m_flags = flags;
m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
if (m_strText.IsEmpty()) {
m_IsFind = false;
return true;
}
size_t len = findwhatStr.GetLength();
if (!m_bMatchCase) {
findwhatStr.MakeLower();
m_strText.MakeLower();
}
m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
m_findNextStart = startPos;
if (!startPos.has_value()) {
if (!m_strText.IsEmpty())
m_findPreStart = m_strText.GetLength() - 1;
} else {
m_findPreStart = startPos;
}
m_csFindWhatArray.clear();
size_t i = 0;
for (i = 0; i < len; ++i)
if (findwhatStr[i] != ' ')
break;
if (i < len)
ExtractFindWhat(findwhatStr);
else
m_csFindWhatArray.push_back(findwhatStr);
if (m_csFindWhatArray.empty())
return false;
m_IsFind = true;
m_resStart = 0;
m_resEnd = -1;
return true;
}
// Searches |m_strText| forward from |m_findNextStart| for the next position
// where all tokens in |m_csFindWhatArray| occur in order.
//
// Between two consecutive tokens only line feed, space, carriage return or
// character 160 may appear. An empty first token means the query began with a
// space; an empty last token means it ended with one, and the character after
// the match must then be one of those whitespace characters.
//
// On success, stores the match in |m_resStart| / |m_resEnd| (text offsets),
// fills |m_resArray| with the match rectangles, advances |m_findNextStart| and
// |m_findPreStart|, and returns true. Returns false when nothing (more) is
// found or when the search has not been set up.
bool CPDF_TextPageFind::FindNext() {
  if (!m_pTextPage)
    return false;

  m_resArray.clear();
  if (!m_findNextStart.has_value())
    return false;

  if (m_strText.IsEmpty()) {
    m_IsFind = false;
    return m_IsFind;
  }

  size_t strLen = m_strText.GetLength();
  if (m_findNextStart.value() > strLen - 1) {
    m_IsFind = false;
    return m_IsFind;
  }

  int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
  if (nCount == 0) {
    // No tokens to look for. Bailing out here also keeps the back() call
    // after the loop well defined.
    m_IsFind = false;
    return m_IsFind;
  }

  Optional<size_t> nResultPos = 0;
  size_t nStartPos = m_findNextStart.value();
  bool bSpaceStart = false;
  for (int iWord = 0; iWord < nCount; iWord++) {
    WideString csWord = m_csFindWhatArray[iWord];
    if (csWord.IsEmpty()) {
      if (iWord == nCount - 1) {
        // Trailing empty token: the match must be followed by whitespace.
        // If the preceding tokens matched at the very end of the text there
        // is no following character to inspect and no later position to
        // retry from, so the search is over.
        if (nStartPos >= strLen) {
          m_IsFind = false;
          return m_IsFind;
        }
        wchar_t strInsert = m_strText[nStartPos];
        if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
            strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
          nResultPos = nStartPos + 1;
          break;
        }
        // When the empty token is the only token (an empty query), restarting
        // would re-examine the same position forever; report "not found".
        if (iWord == 0) {
          m_IsFind = false;
          return m_IsFind;
        }
        // Otherwise restart matching from the first token at |nStartPos|.
        iWord = -1;
      } else if (iWord == 0) {
        // Leading empty token: the query started with a space.
        bSpaceStart = true;
      }
      continue;
    }

    nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
    if (!nResultPos.has_value()) {
      m_IsFind = false;
      return m_IsFind;
    }

    size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
    if (iWord == 0)
      m_resStart = nResultPos.value();

    bool bMatch = true;
    if (iWord != 0 && !bSpaceStart) {
      size_t PreResEndPos = nStartPos;
      int curChar = csWord[0];
      WideString lastWord = m_csFindWhatArray[iWord - 1];
      int lastChar = lastWord[lastWord.GetLength() - 1];
      // Two tokens found directly adjacent only count as a match when at
      // least one of the touching characters is one that ExtractFindWhat()
      // splits out without requiring a space.
      if (nStartPos == nResultPos.value() &&
          !(IsIgnoreSpaceCharacter(lastChar) ||
            IsIgnoreSpaceCharacter(curChar))) {
        bMatch = false;
      }
      // Everything between the previous token and this one must be
      // whitespace.
      for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
        wchar_t strInsert = m_strText[d];
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = false;
          break;
        }
      }
    } else if (bSpaceStart) {
      // The query began with a space: the character before this token must
      // be whitespace, and that whitespace becomes part of the result.
      if (nResultPos.value() > 0) {
        wchar_t strInsert = m_strText[nResultPos.value() - 1];
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = false;
          m_resStart = nResultPos.value();
        } else {
          m_resStart = nResultPos.value() - 1;
        }
      }
    }

    if (m_bMatchWholeWord && bMatch) {
      bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
    }

    nStartPos = endIndex + 1;
    if (!bMatch) {
      // Restart from the first token, just past where the failed attempt
      // began. With a leading empty token the first real token is entry 1.
      iWord = -1;
      if (bSpaceStart)
        nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
      else
        nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
    }
  }

  m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
  m_IsFind = true;

  int resStart = GetCharIndex(m_resStart);
  int resEnd = GetCharIndex(m_resEnd);
  m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);

  // With FPDFTEXT_CONSECUTIVE the next search may overlap this result;
  // otherwise it resumes after (or, for FindPrev(), before) the whole match.
  // NOTE(review): the "- 1" values can be -1 here and are stored into an
  // Optional<size_t>; FindPrev() appears to rely on the resulting wraparound.
  if (m_flags & FPDFTEXT_CONSECUTIVE) {
    m_findNextStart = m_resStart + 1;
    m_findPreStart = m_resEnd - 1;
  } else {
    m_findNextStart = m_resEnd + 1;
    m_findPreStart = m_resStart - 1;
  }
  return m_IsFind;
}
bool CPDF_TextPageFind::FindPrev() {
if (!m_pTextPage)
return false;
m_resArray.clear();
if (m_strText.IsEmpty() || !m_findPreStart.has_value()) {
m_IsFind = false;
return m_IsFind;
}
CPDF_TextPageFind findEngine(m_pTextPage.Get());
bool ret = findEngine.FindFirst(m_findWhat, m_flags, Optional<size_t>(0));
if (!ret) {
m_IsFind = false;
return m_IsFind;
}
int order = -1;
int MatchedCount = 0;
while (ret) {
ret = findEngine.FindNext();
if (ret) {
int order1 = findEngine.GetCurOrder();
int MatchedCount1 = findEngine.GetMatchedCount();
int temp = order1 + MatchedCount1;
if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
break;
order = order1;
MatchedCount = MatchedCount1;
}
}
if (order == -1) {
m_IsFind = false;
return m_IsFind;
}
m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
m_IsFind = true;
m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
if (m_flags & FPDFTEXT_CONSECUTIVE) {
m_findNextStart = m_resStart + 1;
m_findPreStart = m_resEnd - 1;
} else {
m_findNextStart = m_resEnd + 1;
m_findPreStart = m_resStart - 1;
}
return m_IsFind;
}
// Splits |findwhat| into search tokens and appends them to
// |m_csFindWhatArray|.
//
// The query is first cut at spaces via ExtractSubString(); an empty piece is
// recorded as an empty token. Within each piece, every character for which
// IsIgnoreSpaceCharacter() returns true becomes a one-character token of its
// own, with the text before it (if any) emitted as a separate token.
void CPDF_TextPageFind::ExtractFindWhat(const WideString& findwhat) {
  if (findwhat.IsEmpty())
    return;

  for (int index = 0;; ++index) {
    Optional<WideString> word =
        ExtractSubString(findwhat.c_str(), index, TEXT_SPACE_CHAR);
    if (!word)
      break;

    if (word->IsEmpty()) {
      m_csFindWhatArray.push_back(L"");
      continue;
    }

    size_t pos = 0;
    while (pos < word->GetLength()) {
      const wchar_t curChar = (*word)[pos];
      // Ordinary characters stay inside the current token, as does a
      // non-leading 0x2019 that would otherwise be split out.
      const bool keepInToken =
          !IsIgnoreSpaceCharacter(curChar) || (pos > 0 && curChar == 0x2019);
      if (keepInToken) {
        ++pos;
        continue;
      }

      // Emit whatever precedes the character, then the character itself.
      if (pos > 0)
        m_csFindWhatArray.push_back(word->Left(pos));
      m_csFindWhatArray.push_back(word->Mid(pos, 1));

      const size_t remaining = word->GetLength() - pos - 1;
      if (remaining == 0) {
        // The piece is fully consumed; nothing is left to flush below.
        word->clear();
        break;
      }

      // Continue scanning with the part that follows the character.
      word.emplace(word->Right(remaining));
      pos = 0;
    }

    if (!word->IsEmpty())
      m_csFindWhatArray.push_back(word.value());
  }
}
// Decides whether the text at [|startPos|, |endPos|] (inclusive) in
// |csPageText| stands on its own as a whole word.
//
// Returns false for an inverted or empty range. A single character above 255
// is always accepted. Otherwise the match is rejected when the character
// immediately before or after it is a digit, lies in 'A'..'z', or lies
// strictly between 0xfb00 and 0xfb06. A missing neighbour (start or end of
// the text) is treated as character 0, which never rejects.
bool CPDF_TextPageFind::IsMatchWholeWord(const WideString& csPageText,
                                         size_t startPos,
                                         size_t endPos) {
  if (startPos > endPos)
    return false;

  const size_t char_count = endPos - startPos + 1;
  if (char_count == 0)
    return false;
  if (char_count == 1 && csPageText[startPos] > 255)
    return true;

  wchar_t char_left = L'\0';
  if (startPos >= 1)
    char_left = csPageText[startPos - 1];

  wchar_t char_right = L'\0';
  if (startPos + char_count < csPageText.GetLength())
    char_right = csPageText[startPos + char_count];

  // True when |c| next to the match means the match is part of a longer
  // word. The 'A'..'z' span is contiguous, so it also covers the six
  // characters that sit between 'Z' and 'a'.
  const auto is_word_char = [](wchar_t c) -> bool {
    return (c >= 'A' && c <= 'z') || (c > 0xfb00 && c < 0xfb06) ||
           std::iswdigit(c);
  };

  return !is_word_char(char_left) && !is_word_char(char_right);
}
// Returns the |iSubString|-th (zero-based) piece of |lpszFullString| when the
// string is cut at |chSep|. Runs of consecutive separators after a piece are
// skipped as one. The returned piece may be empty.
//
// Returns an empty Optional when |lpszFullString| is null, when the string
// has fewer pieces than requested, or when the piece length does not fit in a
// non-negative int.
Optional<WideString> CPDF_TextPageFind::ExtractSubString(
    const wchar_t* lpszFullString,
    int iSubString,
    wchar_t chSep) {
  if (!lpszFullString)
    return {};

  const wchar_t* cursor = lpszFullString;
  for (; iSubString != 0; --iSubString) {
    cursor = std::wcschr(cursor, chSep);
    if (!cursor)
      return {};
    // Step over the separator just found and any that directly follow it.
    do {
      ++cursor;
    } while (*cursor == chSep);
  }

  // The piece runs to the next separator, or to the end of the string.
  const wchar_t* pieceEnd = std::wcschr(cursor, chSep);
  int nLen;
  if (pieceEnd)
    nLen = static_cast<int>(pieceEnd - cursor);
  else
    nLen = static_cast<int>(wcslen(cursor));

  if (nLen < 0)
    return {};

  return {WideString(cursor, static_cast<size_t>(nLen))};
}
int CPDF_TextPageFind::GetCurOrder() const {
return GetCharIndex(m_resStart);
}
int CPDF_TextPageFind::GetMatchedCount() const {
int resStart = GetCharIndex(m_resStart);
int resEnd = GetCharIndex(m_resEnd);
return resEnd - resStart + 1;
}