// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/history/text_database_manager.h"
#include "base/compiler_specific.h"
#include "base/file_util.h"
#include "base/metrics/histogram.h"
#include "base/logging.h"
#include "base/message_loop.h"
#include "base/string_util.h"
#include "base/utf_string_conversions.h"
#include "chrome/browser/history/history_publisher.h"
#include "chrome/browser/history/visit_database.h"
#include "content/common/mru_cache.h"
using base::Time;
using base::TimeDelta;
using base::TimeTicks;
namespace history {
namespace {
// The number of database files we will be attached to at once.
const int kCacheDBSize = 5;
std::string ConvertStringForIndexer(const string16& input) {
// TODO(evanm): other transformations here?
return UTF16ToUTF8(CollapseWhitespace(input, false));
}
// Data older than this will be committed to the full text index even if we
// haven't gotten a title and/or body.
const int kExpirationSec = 20;
} // namespace
// TextDatabaseManager::ChangeSet ----------------------------------------------
// ChangeSet has no setup or teardown work of its own: both the constructor
// and the destructor have empty bodies.
TextDatabaseManager::ChangeSet::ChangeSet() {}
TextDatabaseManager::ChangeSet::~ChangeSet() {}
// TextDatabaseManager::PageInfo -----------------------------------------------
// Captures the URL id, visit id and visit time of a page. The title and body
// are not touched here; they are supplied later through set_title() and
// set_body().
TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
VisitID visit_id,
Time visit_time)
: url_id_(url_id),
visit_id_(visit_id),
visit_time_(visit_time) {
// Stamp the creation time; Expired() measures this entry's age against it.
added_time_ = TimeTicks::Now();
}
// Nothing to release explicitly.
TextDatabaseManager::PageInfo::~PageInfo() {}
// Stores |ttl| as the page title. An empty |ttl| is stored as a single space
// so that a title which has been set is never empty.
void TextDatabaseManager::PageInfo::set_title(const string16& ttl) {
  if (!ttl.empty()) {
    title_ = ttl;
    return;
  }
  // Make the title nonempty when we set it for EverybodySet.
  title_ = ASCIIToUTF16(" ");
}
// Stores |bdy| as the page body. An empty |bdy| is stored as a single space
// so that a body which has been set is never empty.
void TextDatabaseManager::PageInfo::set_body(const string16& bdy) {
  if (!bdy.empty()) {
    body_ = bdy;
    return;
  }
  // Make the body nonempty when we set it for EverybodySet.
  body_ = ASCIIToUTF16(" ");
}
// Returns true when, as of |now|, strictly more than kExpirationSec seconds
// have passed since this entry was created.
bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
  const TimeDelta age = now - added_time_;
  const TimeDelta limit = TimeDelta::FromSeconds(kExpirationSec);
  return age > limit;
}
// TextDatabaseManager ---------------------------------------------------------
// Sets up a manager for the text databases stored under |dir|. The
// |url_database| and |visit_database| pointers are stored for later lookups.
// The constructor body is empty: no database is opened and no flush is
// scheduled until Init() is called.
TextDatabaseManager::TextDatabaseManager(const FilePath& dir,
URLDatabase* url_database,
VisitDatabase* visit_database)
: dir_(dir),
url_database_(url_database),
visit_database_(visit_database),
// Both caches are created with NO_AUTO_EVICT; this file removes entries
// explicitly (Erase() on pending changes, ShrinkToSize() on open databases).
recent_changes_(RecentChangeList::NO_AUTO_EVICT),
transaction_nesting_(0),
db_cache_(DBCache::NO_AUTO_EVICT),
// The on-disk database list is loaded lazily by InitDBList().
present_databases_loaded_(false),
ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)),
// Stays NULL until Init() supplies a publisher.
history_publisher_(NULL) {
}
// If a transaction is still open, unwinds one level of it on the way out.
// CommitTransaction() only commits the databases when that brings the nesting
// count down to zero.
TextDatabaseManager::~TextDatabaseManager() {
  if (transaction_nesting_ != 0)
    CommitTransaction();
}
// static
// Maps |time| to the identifier of the database covering its UTC year and
// month.
TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
  Time::Exploded parts;
  time.UTCExplode(&parts);

  // Year and month are packed into one 6-digit number (200801 for January,
  // 2008). The month is 1-based.
  const int year_component = parts.year * 100;
  return year_component + parts.month;
}
// static
// Inverse of TimeToID(): converts a YYYYMM identifier into the UTC time at
// the start of the first day of that month.
Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
  Time::Exploded exploded;
  // Zero every field so hour/minute/second/millisecond mean midnight.
  memset(&exploded, 0, sizeof(Time::Exploded));
  exploded.year = id / 100;
  exploded.month = id % 100;
  // Days of the month are 1-based. Leaving the zero written by memset() would
  // not describe a real date, and the resulting time would not map back to
  // |id| through TimeToID().
  exploded.day_of_month = 1;
  return Time::FromUTCExploded(exploded);
}
// Stores |history_publisher| and schedules the first flush of expired pending
// changes. |history_publisher| may be NULL: AddPageData() checks the stored
// pointer before publishing. Always returns true.
bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
history_publisher_ = history_publisher;
// Start checking recent changes and committing them.
ScheduleFlushOldChanges();
return true;
}
// Opens one more level of (possibly nested) transaction. Individual databases
// join the transaction lazily in GetDB() when they are first opened for
// writing.
void TextDatabaseManager::BeginTransaction() {
  ++transaction_nesting_;
}
void TextDatabaseManager::CommitTransaction() {
DCHECK(transaction_nesting_);
transaction_nesting_--;
if (transaction_nesting_)
return; // Still more nesting of transactions before committing.
// Commit all databases with open transactions on them.
for (DBIdentSet::const_iterator i = open_transactions_.begin();
i != open_transactions_.end(); ++i) {
DBCache::iterator iter = db_cache_.Get(*i);
if (iter == db_cache_.end()) {
NOTREACHED() << "All open transactions should be cached.";
continue;
}
iter->second->CommitTransaction();
}
open_transactions_.clear();
// Now that the transaction is over, we can expire old connections.
db_cache_.ShrinkToSize(kCacheDBSize);
}
void TextDatabaseManager::InitDBList() {
if (present_databases_loaded_)
return;
present_databases_loaded_ = true;
// Find files on disk matching our pattern so we can quickly test for them.
FilePath::StringType filepattern(TextDatabase::file_base());
filepattern.append(FILE_PATH_LITERAL("*"));
file_util::FileEnumerator enumerator(
dir_, false, file_util::FileEnumerator::FILES, filepattern);
FilePath cur_file;
while (!(cur_file = enumerator.Next()).empty()) {
// Convert to the number representing this file.
TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
if (id) // Will be 0 on error.
present_databases_.insert(id);
}
}
void TextDatabaseManager::AddPageURL(const GURL& url,
URLID url_id,
VisitID visit_id,
Time time) {
// Delete any existing page info.
RecentChangeList::iterator found = recent_changes_.Peek(url);
if (found != recent_changes_.end())
recent_changes_.Erase(found);
// Just save this info for later. We will save it when it expires or when all
// the data is complete.
recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
}
void TextDatabaseManager::AddPageTitle(const GURL& url,
const string16& title) {
RecentChangeList::iterator found = recent_changes_.Peek(url);
if (found == recent_changes_.end()) {
// This page is not in our cache of recent pages. This is very much an edge
// case as normally a title will come in <20 seconds after the page commits,
// and TabContents will avoid spamming us with >1 title per page. However,
// it could come up if your connection is unhappy, and we don't want to
// miss anything.
//
// To solve this problem, we'll just associate the most recent visit with
// the new title and index that using the regular code path.
URLRow url_row;
if (!url_database_->GetRowForURL(url, &url_row))
return; // URL is unknown, give up.
VisitRow visit;
if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
return; // No recent visit, give up.
if (visit.is_indexed) {
// If this page was already indexed, we could have a body that came in
// first and we don't want to overwrite it. We could go query for the
// current body, or have a special setter for only the title, but this is
// not worth it for this edge case.
//
// It will be almost impossible for the title to take longer than
// kExpirationSec yet we got a body in less than that time, since the
// title should always come in first.
return;
}
AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
title, string16());
return; // We don't know about this page, give up.
}
PageInfo& info = found->second;
if (info.has_body()) {
// This info is complete, write to the database.
AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
title, info.body());
recent_changes_.Erase(found);
return;
}
info.set_title(title);
}
void TextDatabaseManager::AddPageContents(const GURL& url,
const string16& body) {
RecentChangeList::iterator found = recent_changes_.Peek(url);
if (found == recent_changes_.end()) {
// This page is not in our cache of recent pages. This means that the page
// took more than kExpirationSec to load. Often, this will be the result of
// a very slow iframe or other resource on the page that makes us think its
// still loading.
//
// As a fallback, set the most recent visit's contents using the input, and
// use the last set title in the URL table as the title to index.
URLRow url_row;
if (!url_database_->GetRowForURL(url, &url_row))
return; // URL is unknown, give up.
VisitRow visit;
if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
return; // No recent visit, give up.
// Use the title from the URL row as the title for the indexing.
AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
url_row.title(), body);
return;
}
PageInfo& info = found->second;
if (info.has_title()) {
// This info is complete, write to the database.
AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
info.title(), body);
recent_changes_.Erase(found);
return;
}
info.set_body(body);
}
// Writes the title and body of one page visit to the full text index.
//
// |url| / |url_id| identify the page and |visit_time| selects the database
// that receives the data. Any other visit of the same URL that is currently
// marked as indexed has its index data deleted and its flag cleared first.
// When |visit_id| is nonzero the matching visit row is marked as indexed; a
// zero |visit_id| skips the visit database update.
//
// Returns false if the target database cannot be opened, if a nonzero
// |visit_id| is not among the URL's visits, or if updating that visit row
// fails; otherwise returns the result of the database write. The history
// publisher, when present, is notified whether or not the write succeeded.
bool TextDatabaseManager::AddPageData(const GURL& url,
                                      URLID url_id,
                                      VisitID visit_id,
                                      Time visit_time,
                                      const string16& title,
                                      const string16& body) {
  TextDatabase* db = GetDBForTime(visit_time, true);
  if (!db)
    return false;

  TimeTicks beginning_time = TimeTicks::Now();

  // First delete any recently-indexed data for this page. This will delete
  // anything in the main database, but we don't bother looking through the
  // archived database.
  VisitVector visits;
  visit_database_->GetVisitsForURL(url_id, &visits);
  size_t our_visit_row_index = visits.size();
  for (size_t i = 0; i < visits.size(); i++) {
    // While we're going through all the visits, also find our row so we can
    // avoid another DB query.
    if (visits[i].visit_id == visit_id) {
      our_visit_row_index = i;
    } else if (visits[i].is_indexed) {
      visits[i].is_indexed = false;
      visit_database_->UpdateVisitRow(visits[i]);
      DeletePageData(visits[i].visit_time, url, NULL);
    }
  }

  // DeletePageData() goes through GetDB(), which shrinks |db_cache_| whenever
  // no transaction is open. Look the target database up again instead of
  // relying on the pointer obtained before the deletions still being cached.
  // NOTE(review): this matters only if the cache destroys the databases it
  // evicts -- confirm against the DBCache definition.
  db = GetDBForTime(visit_time, true);
  if (!db)
    return false;

  if (visit_id) {
    // We're supposed to update the visit database.
    if (our_visit_row_index >= visits.size()) {
      NOTREACHED() << "We should always have found a visit when given an ID.";
      return false;
    }

    DCHECK(visit_time == visits[our_visit_row_index].visit_time);

    // Update the visit database to reference our addition.
    visits[our_visit_row_index].is_indexed = true;
    if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index]))
      return false;
  }

  // Now index the data.
  std::string url_str = URLDatabase::GURLToDatabaseURL(url);
  bool success = db->AddPageData(visit_time, url_str,
                                 ConvertStringForIndexer(title),
                                 ConvertStringForIndexer(body));

  UMA_HISTOGRAM_TIMES("History.AddFTSData",
                      TimeTicks::Now() - beginning_time);

  if (history_publisher_)
    history_publisher_->PublishPageContent(visit_time, url, title, body);

  return success;
}
// Removes the indexed data for |url| at |time| from the database covering
// that time. Does nothing if that database cannot be opened. When
// |change_set| is non-NULL, the database is recorded in it before the
// deletion so it can be optimized later.
void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
                                         ChangeSet* change_set) {
  TextDatabase::DBIdent db_ident = TimeToID(time);

  // We want to open the database for writing, but only if it exists. To
  // achieve this, we check whether it exists by saying we're not going to
  // write to it (avoiding the autocreation code normally called when writing)
  // and then access it for writing only if it succeeds.
  if (!GetDB(db_ident, false))
    return;

  TextDatabase* db = GetDB(db_ident, true);
  if (!db)
    return;  // The file may have changed or something.

  if (change_set)
    change_set->Add(db_ident);

  db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
}
// Drops pending (not yet indexed) pages whose visit time falls in
// [|begin|, |end|). A null |end| means no upper bound. When |restrict_urls|
// is non-empty, only pages whose URL is in that set are dropped.
void TextDatabaseManager::DeleteFromUncommitted(
    const std::set<GURL>& restrict_urls, Time begin, Time end) {
  // The list has the most recent item at the beginning. There won't normally
  // be very many items, so a brute-force walk is fine.
  RecentChangeList::iterator cur = recent_changes_.begin();

  // Skip the leading entries that are at or after |end|.
  if (!end.is_null()) {
    for (; cur != recent_changes_.end(); ++cur) {
      if (cur->second.visit_time() < end)
        break;
    }
  }

  // Delete up to the oldest entry we were asked to delete. Note that if
  // |begin| is_null, it will be less than or equal to any other time.
  const bool delete_every_url = restrict_urls.empty();
  while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
    const bool selected =
        delete_every_url ||
        restrict_urls.find(cur->first) != restrict_urls.end();
    if (selected)
      cur = recent_changes_.Erase(cur);
    else
      ++cur;
  }
}
void TextDatabaseManager::DeleteAll() {
DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction.";
InitDBList();
// Close all open databases.
db_cache_.Clear();
// Now go through and delete all the files.
for (DBIdentSet::iterator i = present_databases_.begin();
i != present_databases_.end(); ++i) {
FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
file_util::Delete(file_name, false);
}
}
void TextDatabaseManager::OptimizeChangedDatabases(
const ChangeSet& change_set) {
for (ChangeSet::DBSet::const_iterator i =
change_set.changed_databases_.begin();
i != change_set.changed_databases_.end(); ++i) {
// We want to open the database for writing, but only if it exists. To
// achieve this, we check whether it exists by saying we're not going to
// write to it (avoiding the autocreation code normally called when writing)
// and then access it for writing only if it succeeds.
TextDatabase* db = GetDB(*i, false);
if (!db)
continue;
db = GetDB(*i, true);
if (!db)
continue; // The file may have changed or something.
db->Optimize();
}
}
// Runs |query| against the text databases covering the time range given in
// |options|, newest database first, appending matches to |results| (which is
// cleared first). A nonzero options.max_count caps the total number of
// results. |first_time_searched| is filled in by the databases searched, or
// set to options.begin_time when no database was searched.
void TextDatabaseManager::GetTextMatches(
    const string16& query,
    const QueryOptions& options,
    std::vector<TextDatabase::Match>* results,
    Time* first_time_searched) {
  results->clear();

  InitDBList();
  if (present_databases_.empty()) {
    // Nothing to search.
    *first_time_searched = options.begin_time;
    return;
  }

  // Get the query into the proper format for the individual DBs.
  string16 fts_query16;
  query_parser_.ParseQuery(query, &fts_query16);
  const std::string fts_query = UTF16ToUTF8(fts_query16);

  // Work on a copy of the options: the max count is adjusted for each call to
  // an individual database.
  QueryOptions db_options(options);

  // Compute the oldest and newest identifiers that could encompass the input
  // time range. A null bound falls back to the oldest/newest database present.
  TextDatabase::DBIdent oldest_ident;
  if (options.begin_time.is_null())
    oldest_ident = *present_databases_.begin();
  else
    oldest_ident = TimeToID(options.begin_time);

  TextDatabase::DBIdent newest_ident;
  if (options.end_time.is_null())
    newest_ident = *present_databases_.rbegin();
  else
    newest_ident = TimeToID(options.end_time);

  // Iterate over the databases from the most recent backwards.
  bool searched_any = false;
  TextDatabase::URLSet found_urls;
  for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
       i != present_databases_.rend();
       ++i) {
    // TODO(brettw) allow canceling the query in the middle.
    // if (canceled_or_something)
    //   break;

    // This code is stupid, we just loop until we find the correct starting
    // time range rather than search in an intelligent way. Users will have a
    // few dozen files at most, so this should not be an issue.
    const TextDatabase::DBIdent ident = *i;
    if (ident > newest_ident)
      continue;  // Haven't gotten to the time range yet.
    if (ident < oldest_ident)
      break;  // Covered all the time range.

    TextDatabase* cur_db = GetDB(ident, false);
    if (!cur_db)
      continue;

    // Adjust the max count according to how many results we've already got.
    if (options.max_count) {
      const int already_found = static_cast<int>(results->size());
      db_options.max_count = options.max_count - already_found;
    }

    // Since we are going backwards in time, it is always OK to pass the
    // current first_time_searched, since it will always be smaller than
    // any previous set.
    cur_db->GetTextMatches(fts_query, db_options,
                           results, &found_urls, first_time_searched);
    searched_any = true;

    DCHECK(options.max_count == 0 ||
           static_cast<int>(results->size()) <= options.max_count);
    if (options.max_count &&
        static_cast<int>(results->size()) >= options.max_count)
      break;  // Got the max number of results.
  }

  // When there were no databases in the range, we need to fix up the min time.
  if (!searched_any)
    *first_time_searched = options.begin_time;
}
// Returns the database with identifier |id|, taking it from the cache when
// possible and opening it otherwise. Returns NULL if a database that has to
// be opened fails to initialize. While a transaction is open, a database
// requested with |for_writing| set is enrolled in that transaction.
TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
                                         bool for_writing) {
  DBCache::iterator cached = db_cache_.Get(id);
  if (cached != db_cache_.end()) {
    // If we currently have an open transaction, that database is not yet part
    // of the transaction, and the database will be written to, it needs to be
    // part of our transaction.
    const bool must_join_transaction =
        transaction_nesting_ && for_writing &&
        open_transactions_.find(id) == open_transactions_.end();
    if (must_join_transaction) {
      cached->second->BeginTransaction();
      open_transactions_.insert(id);
    }
    return cached->second;
  }

  // Not cached: need to make the database.
  TextDatabase* opened = new TextDatabase(dir_, id, for_writing);
  if (!opened->Init()) {
    delete opened;
    return NULL;
  }
  db_cache_.Put(id, opened);
  present_databases_.insert(id);

  if (transaction_nesting_) {
    // With a transaction open, a database that will be written to needs to be
    // part of it. The cache is left alone until the transaction ends.
    if (for_writing) {
      opened->BeginTransaction();
      open_transactions_.insert(id);
    }
  } else {
    // When no transaction is open, allow this new one to kick out an old one.
    db_cache_.ShrinkToSize(kCacheDBSize);
  }

  return opened;
}
// Convenience wrapper around GetDB(): looks up the database whose identifier
// corresponds to |time|, passing |create_if_necessary| through as GetDB()'s
// for-writing flag.
TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
                                                bool create_if_necessary) {
  const TextDatabase::DBIdent ident = TimeToID(time);
  return GetDB(ident, create_if_necessary);
}
void TextDatabaseManager::ScheduleFlushOldChanges() {
factory_.RevokeAll();
MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod(
&TextDatabaseManager::FlushOldChanges),
kExpirationSec * Time::kMillisecondsPerSecond);
}
void TextDatabaseManager::FlushOldChanges() {
FlushOldChangesForTime(TimeTicks::Now());
}
// Indexes and removes every pending change that has expired as of |now|, then
// schedules the next flush.
void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
  // The end of the list is the oldest, so start there and keep committing
  // entries until one turns out to be too new.
  RecentChangeList::reverse_iterator oldest = recent_changes_.rbegin();
  while (oldest != recent_changes_.rend()) {
    PageInfo& info = oldest->second;
    if (!info.Expired(now))
      break;

    // Index the entry with whatever title and body it has collected so far.
    AddPageData(oldest->first, info.url_id(), info.visit_id(),
                info.visit_time(), info.title(), info.body());
    oldest = recent_changes_.Erase(oldest);
  }

  ScheduleFlushOldChanges();
}
} // namespace history