// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/history/text_database_manager.h"

#include "base/compiler_specific.h"
#include "base/file_util.h"
#include "base/metrics/histogram.h"
#include "base/logging.h"
#include "base/message_loop.h"
#include "base/string_util.h"
#include "base/utf_string_conversions.h"
#include "chrome/browser/history/history_publisher.h"
#include "chrome/browser/history/visit_database.h"
#include "content/common/mru_cache.h"

using base::Time;
using base::TimeDelta;
using base::TimeTicks;

namespace history {

namespace {

// The number of database files we will be attached to at once.
const int kCacheDBSize = 5;

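// Normalizes a UTF-16 page string for the full text indexer: collapses runs
// of whitespace and converts the result to UTF-8.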
std::string ConvertStringForIndexer(const string16& input) {
  // TODO(evanm): other transformations here?
  return UTF16ToUTF8(CollapseWhitespace(input, false));
}

// Data older than this will be committed to the full text index even if we
// haven't gotten a title and/or body.
const int kExpirationSec = 20;

}  // namespace

// TextDatabaseManager::ChangeSet ----------------------------------------------

TextDatabaseManager::ChangeSet::ChangeSet() {}

TextDatabaseManager::ChangeSet::~ChangeSet() {}

// TextDatabaseManager::PageInfo -----------------------------------------------

TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
                                        VisitID visit_id,
                                        Time visit_time)
    : url_id_(url_id),
      visit_id_(visit_id),
      visit_time_(visit_time) {
  added_time_ = TimeTicks::Now();
}

TextDatabaseManager::PageInfo::~PageInfo() {}

void TextDatabaseManager::PageInfo::set_title(const string16& ttl) {
  if (ttl.empty())  // Keep the title nonempty so has_title() works.
    title_ = ASCIIToUTF16(" ");
  else
    title_ = ttl;
}

void TextDatabaseManager::PageInfo::set_body(const string16& bdy) {
  if (bdy.empty())  // Keep the body nonempty so has_body() works.
    body_ = ASCIIToUTF16(" ");
  else
    body_ = bdy;
}

bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
  return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec);
}

// TextDatabaseManager ---------------------------------------------------------

TextDatabaseManager::TextDatabaseManager(const FilePath& dir,
                                         URLDatabase* url_database,
                                         VisitDatabase* visit_database)
    : dir_(dir),
      url_database_(url_database),
      visit_database_(visit_database),
      recent_changes_(RecentChangeList::NO_AUTO_EVICT),
      transaction_nesting_(0),
      db_cache_(DBCache::NO_AUTO_EVICT),
      present_databases_loaded_(false),
      ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)),
      history_publisher_(NULL) {
}

TextDatabaseManager::~TextDatabaseManager() {
  if (transaction_nesting_)
    CommitTransaction();
}

// static
TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
  Time::Exploded exploded;
  time.UTCExplode(&exploded);

  // We combine the month and year into a 6-digit number (200801 for
  // January, 2008). The month is 1-based.
  return exploded.year * 100 + exploded.month;
}

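// Converts a database identifier back to a Time. This is the inverse of
// TimeToID() at month granularity: only the year and month of the returned
// Time are meaningful, the remaining exploded fields are left zero.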
// static
Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
  Time::Exploded exploded;
  memset(&exploded, 0, sizeof(Time::Exploded));
  exploded.year = id / 100;
  exploded.month = id % 100;
  return Time::FromUTCExploded(exploded);
}

bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
  history_publisher_ = history_publisher;

  // Start checking recent changes and committing them.
  ScheduleFlushOldChanges();
  return true;
}

void TextDatabaseManager::BeginTransaction() {
  transaction_nesting_++;
}

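// Note that only the outermost CommitTransaction() call actually commits the
// attached databases; nested calls just decrement the nesting count set up by
// BeginTransaction().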
void TextDatabaseManager::CommitTransaction() {
  DCHECK(transaction_nesting_);
  transaction_nesting_--;
  if (transaction_nesting_)
    return;  // Still more nesting of transactions before committing.

  // Commit all databases with open transactions on them.
  for (DBIdentSet::const_iterator i = open_transactions_.begin();
       i != open_transactions_.end(); ++i) {
    DBCache::iterator iter = db_cache_.Get(*i);
    if (iter == db_cache_.end()) {
      NOTREACHED() << "All open transactions should be cached.";
      continue;
    }
    iter->second->CommitTransaction();
  }
  open_transactions_.clear();

  // Now that the transaction is over, we can expire old connections.
  db_cache_.ShrinkToSize(kCacheDBSize);
}

void TextDatabaseManager::InitDBList() {
  if (present_databases_loaded_)
    return;

  present_databases_loaded_ = true;

  // Find files on disk matching our pattern so we can quickly test for them.
  FilePath::StringType filepattern(TextDatabase::file_base());
  filepattern.append(FILE_PATH_LITERAL("*"));
  file_util::FileEnumerator enumerator(
      dir_, false, file_util::FileEnumerator::FILES, filepattern);
  FilePath cur_file;
  while (!(cur_file = enumerator.Next()).empty()) {
    // Convert to the number representing this file.
    TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
    if (id)  // Will be 0 on error.
      present_databases_.insert(id);
  }
}

void TextDatabaseManager::AddPageURL(const GURL& url,
                                     URLID url_id,
                                     VisitID visit_id,
                                     Time time) {
  // Delete any existing page info.
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found != recent_changes_.end())
    recent_changes_.Erase(found);

  // Just save this info for later. We will save it when it expires or when all
  // the data is complete.
  recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
}

void TextDatabaseManager::AddPageTitle(const GURL& url,
                                       const string16& title) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This is very much an edge
    // case as normally a title will come in <20 seconds after the page commits,
    // and TabContents will avoid spamming us with >1 title per page. However,
    // it could come up if your connection is unhappy, and we don't want to
    // miss anything.
    //
    // To solve this problem, we'll just associate the most recent visit with
    // the new title and index that using the regular code path.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    if (visit.is_indexed) {
      // If this page was already indexed, we could have a body that came in
      // first and we don't want to overwrite it. We could go query for the
      // current body, or have a special setter for only the title, but this is
      // not worth it for this edge case.
      //
      // It is almost impossible for the title to take longer than
      // kExpirationSec while the body arrives within that time, since the
      // title should always come in first.
      return;
    }

    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                title, string16());
    return;  // Indexed the title against the most recent visit; nothing to cache.
  }

  PageInfo& info = found->second;
  if (info.has_body()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                title, info.body());
    recent_changes_.Erase(found);
    return;
  }

  info.set_title(title);
}

void TextDatabaseManager::AddPageContents(const GURL& url,
                                          const string16& body) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This means that the page
    // took more than kExpirationSec to load. Often, this will be the result of
    // a very slow iframe or other resource on the page that makes us think it
    // is still loading.
    //
    // As a fallback, set the most recent visit's contents using the input, and
    // use the last set title in the URL table as the title to index.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    // Use the title from the URL row as the title for the indexing.
    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                url_row.title(), body);
    return;
  }

  PageInfo& info = found->second;
  if (info.has_title()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                info.title(), body);
    recent_changes_.Erase(found);
    return;
  }

  info.set_body(body);
}

bool TextDatabaseManager::AddPageData(const GURL& url,
                                      URLID url_id,
                                      VisitID visit_id,
                                      Time visit_time,
                                      const string16& title,
                                      const string16& body) {
  TextDatabase* db = GetDBForTime(visit_time, true);
  if (!db)
    return false;

  TimeTicks beginning_time = TimeTicks::Now();

  // First delete any recently-indexed data for this page. This will delete
  // anything in the main database, but we don't bother looking through the
  // archived database.
  VisitVector visits;
  visit_database_->GetVisitsForURL(url_id, &visits);
  size_t our_visit_row_index = visits.size();
  for (size_t i = 0; i < visits.size(); i++) {
    // While we're going through all the visits, also find our row so we can
    // avoid another DB query.
    if (visits[i].visit_id == visit_id) {
      our_visit_row_index = i;
    } else if (visits[i].is_indexed) {
      visits[i].is_indexed = false;
      visit_database_->UpdateVisitRow(visits[i]);
      DeletePageData(visits[i].visit_time, url, NULL);
    }
  }

  if (visit_id) {
    // We're supposed to update the visit database.
    if (our_visit_row_index >= visits.size()) {
      NOTREACHED() << "We should always have found a visit when given an ID.";
      return false;
    }

    DCHECK(visit_time == visits[our_visit_row_index].visit_time);

    // Update the visit database to reference our addition.
    visits[our_visit_row_index].is_indexed = true;
    if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index]))
      return false;
  }

  // Now index the data.
  std::string url_str = URLDatabase::GURLToDatabaseURL(url);
  bool success = db->AddPageData(visit_time, url_str,
                                 ConvertStringForIndexer(title),
                                 ConvertStringForIndexer(body));

  UMA_HISTOGRAM_TIMES("History.AddFTSData",
                      TimeTicks::Now() - beginning_time);

  if (history_publisher_)
    history_publisher_->PublishPageContent(visit_time, url, title, body);

  return success;
}

void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
                                         ChangeSet* change_set) {
  TextDatabase::DBIdent db_ident = TimeToID(time);

  // We want to open the database for writing, but only if it exists. To
  // achieve this, we check whether it exists by saying we're not going to
  // write to it (avoiding the autocreation code normally called when writing)
  // and then access it for writing only if it succeeds.
  TextDatabase* db = GetDB(db_ident, false);
  if (!db)
    return;
  db = GetDB(db_ident, true);

  if (change_set)
    change_set->Add(db_ident);

  db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
}

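// Removes uncommitted entries whose visit times fall in [begin, end). A null
// |end| means "no upper bound" and a null |begin| means "no lower bound"; if
// |restrict_urls| is non-empty, only entries for those URLs are removed.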
void TextDatabaseManager::DeleteFromUncommitted(
    const std::set<GURL>& restrict_urls, Time begin, Time end) {
  // First find the beginning of the range to delete. Recall that the list
  // has the most recent item at the beginning. There won't normally be very
  // many items, so a brute-force search is fine.
  RecentChangeList::iterator cur = recent_changes_.begin();
  if (!end.is_null()) {
    // Walk from the beginning of the list backwards in time to find the newest
    // entry that should be deleted.
    while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
      ++cur;
  }

  // Now delete all visits up to the oldest one we were supposed to delete.
  // Note that if begin is_null, it will be less than or equal to any other
  // time.
  if (restrict_urls.empty()) {
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
      cur = recent_changes_.Erase(cur);
  } else {
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
      if (restrict_urls.find(cur->first) != restrict_urls.end())
        cur = recent_changes_.Erase(cur);
      else
        ++cur;
    }
  }
}

void TextDatabaseManager::DeleteAll() {
  DCHECK_EQ(0, transaction_nesting_) << "Calling DeleteAll() in a transaction.";

  InitDBList();

  // Close all open databases.
  db_cache_.Clear();

  // Now go through and delete all the files.
  for (DBIdentSet::iterator i = present_databases_.begin();
       i != present_databases_.end(); ++i) {
    FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
    file_util::Delete(file_name, false);
  }
}

void TextDatabaseManager::OptimizeChangedDatabases(
    const ChangeSet& change_set) {
  for (ChangeSet::DBSet::const_iterator i =
           change_set.changed_databases_.begin();
       i != change_set.changed_databases_.end(); ++i) {
    // We want to open the database for writing, but only if it exists. To
    // achieve this, we check whether it exists by saying we're not going to
    // write to it (avoiding the autocreation code normally called when writing)
    // and then access it for writing only if it succeeds.
    TextDatabase* db = GetDB(*i, false);
    if (!db)
      continue;
    db = GetDB(*i, true);
    if (!db)
      continue;  // The file may have changed or something.
    db->Optimize();
  }
}

void TextDatabaseManager::GetTextMatches(
    const string16& query,
    const QueryOptions& options,
    std::vector<TextDatabase::Match>* results,
    Time* first_time_searched) {
  results->clear();

  InitDBList();
  if (present_databases_.empty()) {
    // Nothing to search.
    *first_time_searched = options.begin_time;
    return;
  }

  // Get the query into the proper format for the individual DBs.
  string16 fts_query16;
  query_parser_.ParseQuery(query, &fts_query16);
  std::string fts_query = UTF16ToUTF8(fts_query16);

  // Need a copy of the options so we can modify the max count for each call
  // to the individual databases.
  QueryOptions cur_options(options);

  // Compute the minimum and maximum values for the identifiers that could
  // encompass the input time range.
  TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
      *present_databases_.begin() :
      TimeToID(options.begin_time);
  TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
      *present_databases_.rbegin() :
      TimeToID(options.end_time);
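  // For example, a query range spanning December 2007 through February 2008
  // maps to idents 200712 through 200802, so only the per-month databases
  // whose idents fall inside [min_ident, max_ident] are searched below.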

  // Iterate over the databases from the most recent backwards.
  bool checked_one = false;
  TextDatabase::URLSet found_urls;
  for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
       i != present_databases_.rend();
       ++i) {
    // TODO(brettw) allow canceling the query in the middle.
    // if (canceled_or_something)
    //   break;

    // This code is not clever: we just loop until we find the correct starting
    // time range rather than searching in an intelligent way. Users will have
    // a few dozen files at most, so this should not be an issue.
    if (*i > max_ident)
      continue;  // Haven't gotten to the time range yet.
    if (*i < min_ident)
      break;  // Covered all the time range.

    TextDatabase* cur_db = GetDB(*i, false);
    if (!cur_db)
      continue;

    // Adjust the max count according to how many results we've already got.
    if (options.max_count) {
      cur_options.max_count = options.max_count -
          static_cast<int>(results->size());
    }

    // Since we are going backwards in time, it is always OK to pass the
    // current first_time_searched, since it will always be smaller than any
    // previously set value.
    cur_db->GetTextMatches(fts_query, cur_options,
                           results, &found_urls, first_time_searched);
    checked_one = true;

    DCHECK(options.max_count == 0 ||
           static_cast<int>(results->size()) <= options.max_count);
    if (options.max_count &&
        static_cast<int>(results->size()) >= options.max_count)
      break;  // Got the max number of results.
  }

  // When there were no databases in the range, we need to fix up the min time.
  if (!checked_one)
    *first_time_searched = options.begin_time;
}

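// Returns the database for |id|, opening it (and creating the file when
// |for_writing| is set) and caching the connection as needed. If a transaction
// is open and the database will be written to, it is also enlisted in that
// transaction. Returns NULL if the database cannot be opened.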
TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
                                         bool for_writing) {
  DBCache::iterator found_db = db_cache_.Get(id);
  if (found_db != db_cache_.end()) {
    if (transaction_nesting_ && for_writing &&
        open_transactions_.find(id) == open_transactions_.end()) {
      // If we currently have an open transaction, this database is not yet
      // part of that transaction, and the caller is going to write to it,
      // then the database needs to be enlisted in our transaction.
      found_db->second->BeginTransaction();
      open_transactions_.insert(id);
    }
    return found_db->second;
  }

  // Need to make the database.
  TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
  if (!new_db->Init()) {
    delete new_db;
    return NULL;
  }
  db_cache_.Put(id, new_db);
  present_databases_.insert(id);

  if (transaction_nesting_ && for_writing) {
    // If we currently have an open transaction and the new database will be
    // written to, it needs to be part of our transaction.
    new_db->BeginTransaction();
    open_transactions_.insert(id);
  }

  // When no transaction is open, allow this new one to kick out an old one.
  if (!transaction_nesting_)
    db_cache_.ShrinkToSize(kCacheDBSize);

  return new_db;
}

TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
                                                bool create_if_necessary) {
  return GetDB(TimeToID(time), create_if_necessary);
}

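// Schedules the next FlushOldChanges() call. Since FlushOldChangesForTime()
// reschedules at the end, expired entries are flushed roughly every
// kExpirationSec seconds once Init() starts the cycle.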
void TextDatabaseManager::ScheduleFlushOldChanges() {
  factory_.RevokeAll();
  MessageLoop::current()->PostDelayedTask(
      FROM_HERE,
      factory_.NewRunnableMethod(&TextDatabaseManager::FlushOldChanges),
      kExpirationSec * Time::kMillisecondsPerSecond);
}

void TextDatabaseManager::FlushOldChanges() {
  FlushOldChangesForTime(TimeTicks::Now());
}

void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
  // The end of the list is the oldest, so we just start from there committing
  // things until we get something too new.
  RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
  while (i != recent_changes_.rend() && i->second.Expired(now)) {
    AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
                i->second.visit_time(), i->second.title(), i->second.body());
    i = recent_changes_.Erase(i);
  }

  ScheduleFlushOldChanges();
}

}  // namespace history