/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is the Mork Reader. * * The Initial Developer of the Original Code is * Google Inc. * Portions created by the Initial Developer are Copyright (C) 2006 * the Initial Developer. All Rights Reserved. * * Contributor(s): * Brian Ryner <bryner@brianryner.com> (original author) * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ // Source: // http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp // This file has been converted to google style. #include "chrome/browser/importer/mork_reader.h" #include <algorithm> #include "base/file_path.h" #include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/message_loop.h" #include "base/string_number_conversions.h" #include "base/string_util.h" #include "chrome/browser/history/history_types.h" #include "chrome/browser/importer/firefox_importer_utils.h" #include "chrome/browser/importer/importer_bridge.h" namespace { // Convert a hex character (0-9, A-F) to its corresponding byte value. // Returns -1 if the character is invalid. inline int HexCharToInt(char c) { if ('0' <= c && c <= '9') return c - '0'; if ('A' <= c && c <= 'F') return c - 'A' + 10; return -1; } // Unescape a Mork value. Mork uses $xx escaping to encode non-ASCII // characters. Additionally, '$' and '\' are backslash-escaped. // The result of the unescape is in returned. std::string MorkUnescape(const std::string& input) { // We optimize for speed over space here -- size the result buffer to // the size of the source, which is an upper bound on the size of the // unescaped string. std::string result; size_t input_length = input.size(); result.reserve(input_length); for (size_t i = 0; i < input_length; i++) { char c = input[i]; if (c == '\\') { // Escaped literal, slip the backslash, append the next character. i++; if (i < input_length) result.push_back(input[i]); } else if (c == '$') { // Dollar sign denotes a hex character. if (i < input_length - 2) { // Would be nice to use ToInteger() here, but it currently // requires a null-terminated string. int first = HexCharToInt(input[++i]); int second = HexCharToInt(input[++i]); if (first >= 0 && second >= 0) result.push_back((first << 4) | second); } } else { // Regular character, just append. result.push_back(input[i]); } } return result; } } // namespace MorkReader::MorkReader() { } MorkReader::~MorkReader() { // Need to delete all the pointers to vectors we have in the table. for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i) delete i->second; } bool MorkReader::Read(const FilePath& path) { stream_.open(path.value().c_str()); if (!stream_.is_open()) return false; std::string line; if (!ReadLine(&line) || line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0) return false; // Unexpected file format. IndexMap column_map; while (ReadLine(&line)) { // Trim off leading spaces size_t idx = 0; size_t len = line.size(); while (idx < len && line[idx] == ' ') ++idx; if (idx >= len) continue; // Look at the line to figure out what section type this is if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) { // Column map. We begin by creating a hash of column id to column name. StringMap column_name_map; ParseMap(line, idx, &column_name_map); // Now that we have the list of columns, we put them into a flat array. // Rows will have value arrays of the same size, with indexes that // correspond to the columns array. As we insert each column into the // array, we also make an entry in columnMap so that we can look up the // index given the column id. columns_.reserve(column_name_map.size()); for (StringMap::const_iterator i = column_name_map.begin(); i != column_name_map.end(); ++i) { column_map[i->first] = static_cast<int>(columns_.size()); MorkColumn col(i->first, i->second); columns_.push_back(col); } } else if (StartsWithASCII(&line[idx], "<(", true)) { // Value map. ParseMap(line, idx, &value_map_); } else if (line[idx] == '{' || line[idx] == '[') { // Table / table row. ParseTable(line, idx, &column_map); } else { // Don't know, hopefully don't care. } } return true; } // Parses a key/value map of the form // <(k1=v1)(k2=v2)...> bool MorkReader::ParseMap(const std::string& first_line, size_t start_index, StringMap* map) { // If the first line is the a=c line (column map), just skip over it. std::string line(first_line); if (StartsWithASCII(line, "< <(a=c)>", true)) ReadLine(&line); std::string key; do { size_t idx = start_index; size_t len = line.size(); size_t token_start; while (idx < len) { switch (line[idx++]) { case '(': // Beginning of a key/value pair. if (!key.empty()) { DLOG(WARNING) << "unterminated key/value pair?"; key.clear(); } token_start = idx; while (idx < len && line[idx] != '=') ++idx; key.assign(&line[token_start], idx - token_start); break; case '=': { // Beginning of the value. if (key.empty()) { DLOG(WARNING) << "stray value"; break; } token_start = idx; while (idx < len && line[idx] != ')') { if (line[idx] == '\\') ++idx; // Skip escaped ')' characters. ++idx; } size_t token_end = std::min(idx, len); ++idx; std::string value = MorkUnescape( std::string(&line[token_start], token_end - token_start)); (*map)[key] = value; key.clear(); break; } case '>': // End of the map. DLOG_IF(WARNING, key.empty()) << "map terminates inside of key/value pair"; return true; } } // We should start reading the next line at the beginning. start_index = 0; } while (ReadLine(&line)); // We ran out of lines and the map never terminated. This probably indicates // a parsing error. DLOG(WARNING) << "didn't find end of key/value map"; return false; } // Parses a table row of the form [123(^45^67)..] // (row id 123 has the value with id 67 for the column with id 45). // A '^' prefix for a column or value references an entry in the column or // value map. '=' is used as the separator when the value is a literal. void MorkReader::ParseTable(const std::string& first_line, size_t start_index, const IndexMap* column_map) { std::string line(first_line); // Column index of the cell we're parsing, minus one if invalid. int column_index = -1; // Points to the current row we're parsing inside of the |table_|, will be // NULL if we're not inside a row. ColumnDataList* current_row = NULL; bool in_meta_row = false; do { size_t idx = start_index; size_t len = line.size(); while (idx < len) { switch (line[idx++]) { case '{': // This marks the beginning of a table section. There's a lot of // junk before the first row that looks like cell values but isn't. // Skip to the first '['. while (idx < len && line[idx] != '[') { if (line[idx] == '{') { in_meta_row = true; // The meta row is enclosed in { } } else if (line[idx] == '}') { in_meta_row = false; } ++idx; } break; case '[': { // Start of a new row. Consume the row id, up to the first '('. // Row edits also have a table namespace, separated from the row id // by a colon. We don't make use of the namespace, but we need to // make sure not to consider it part of the row id. if (current_row) { DLOG(WARNING) << "unterminated row?"; current_row = NULL; } // Check for a '-' at the start of the id. This signifies that // if the row already exists, we should delete all columns from it // before adding the new values. bool cut_columns; if (idx < len && line[idx] == '-') { cut_columns = true; ++idx; } else { cut_columns = false; } // Locate the range of the ID. size_t token_start = idx; // Index of the first char of the token. while (idx < len && line[idx] != '(' && line[idx] != ']' && line[idx] != ':') { ++idx; } size_t token_end = idx; // Index of the char following the token. while (idx < len && line[idx] != '(' && line[idx] != ']') { ++idx; } if (in_meta_row) { // Need to create the meta row. meta_row_.resize(columns_.size()); current_row = &meta_row_; } else { // Find or create the regular row for this. IDString row_id(&line[token_start], token_end - token_start); RowMap::iterator found_row = table_.find(row_id); if (found_row == table_.end()) { // We don't already have this row, create a new one for it. current_row = new ColumnDataList(columns_.size()); table_[row_id] = current_row; } else { // The row already exists and we're adding/replacing things. current_row = found_row->second; } } if (cut_columns) { for (size_t i = 0; i < current_row->size(); ++i) (*current_row)[i].clear(); } break; } case ']': // We're done with the row. current_row = NULL; in_meta_row = false; break; case '(': { if (!current_row) { DLOG(WARNING) << "cell value outside of row"; break; } bool column_is_atom; if (line[idx] == '^') { column_is_atom = true; ++idx; // This is not part of the column id, advance past it. } else { column_is_atom = false; } size_t token_start = idx; while (idx < len && line[idx] != '^' && line[idx] != '=') { if (line[idx] == '\\') ++idx; // Skip escaped characters. ++idx; } size_t token_end = std::min(idx, len); IDString column; if (column_is_atom) column.assign(&line[token_start], token_end - token_start); else column = MorkUnescape(line.substr(token_start, token_end - token_start)); IndexMap::const_iterator found_column = column_map->find(column); if (found_column == column_map->end()) { DLOG(WARNING) << "Column not in column map, discarding it"; column_index = -1; } else { column_index = found_column->second; } break; } case '=': case '^': { if (column_index == -1) { DLOG(WARNING) << "stray ^ or = marker"; break; } bool value_is_atom = (line[idx - 1] == '^'); size_t token_start = idx - 1; // Include the '=' or '^' marker. while (idx < len && line[idx] != ')') { if (line[idx] == '\\') ++idx; // Skip escaped characters. ++idx; } size_t token_end = std::min(idx, len); ++idx; if (value_is_atom) { (*current_row)[column_index].assign(&line[token_start], token_end - token_start); } else { (*current_row)[column_index] = MorkUnescape(line.substr(token_start, token_end - token_start)); } column_index = -1; } break; } } // Start parsing the next line at the beginning. start_index = 0; } while (current_row && ReadLine(&line)); } bool MorkReader::ReadLine(std::string* line) { line->resize(256); std::getline(stream_, *line); if (stream_.eof() || stream_.bad()) return false; while (!line->empty() && (*line)[line->size() - 1] == '\\') { // There is a continuation for this line. Read it and append. std::string new_line; std::getline(stream_, new_line); if (stream_.eof()) return false; line->erase(line->size() - 1); line->append(new_line); } return true; } void MorkReader::NormalizeValue(std::string* value) const { if (value->empty()) return; MorkReader::StringMap::const_iterator i; switch (value->at(0)) { case '^': // Hex ID, lookup the name for it in the |value_map_|. i = value_map_.find(value->substr(1)); if (i == value_map_.end()) value->clear(); else *value = i->second; break; case '=': // Just use the literal after the equals sign. value->erase(value->begin()); break; default: // Anything else is invalid. value->clear(); break; } } // Source: // http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp // Columns for entry (non-meta) history rows enum { kURLColumn, kNameColumn, kVisitCountColumn, kHiddenColumn, kTypedColumn, kLastVisitColumn, kColumnCount // Keep me last. }; static const char * const gColumnNames[] = { "URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate" }; struct TableReadClosure { explicit TableReadClosure(const MorkReader& r) : reader(r), swap_bytes(false), byte_order_column(-1) { for (int i = 0; i < kColumnCount; ++i) column_indexes[i] = -1; } // Backpointers to the reader and history we're operating on. const MorkReader& reader; // Whether we need to swap bytes (file format is other-endian). bool swap_bytes; // Indexes of the columns that we care about. int column_indexes[kColumnCount]; int byte_order_column; }; void AddToHistory(MorkReader::ColumnDataList* column_values, const TableReadClosure& data, std::vector<history::URLRow>* rows) { std::string values[kColumnCount]; for (size_t i = 0; i < kColumnCount; ++i) { if (data.column_indexes[i] != -1) { values[i] = column_values->at(data.column_indexes[i]); data.reader.NormalizeValue(&values[i]); // Do not import hidden records. if (i == kHiddenColumn && values[i] == "1") return; } } GURL url(values[kURLColumn]); if (CanImportURL(url)) { history::URLRow row(url); string16 title; if (data.swap_bytes) { base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE, base::OnStringConversionError::SKIP, &title); } else { base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE, base::OnStringConversionError::SKIP, &title); } row.set_title(title); int count = atoi(values[kVisitCountColumn].c_str()); if (count == 0) count = 1; row.set_visit_count(count); int64 date; base::StringToInt64(values[kLastVisitColumn], &date); if (date != 0) row.set_last_visit(base::Time::FromTimeT(date / 1000000)); bool is_typed = (values[kTypedColumn] == "1"); if (is_typed) row.set_typed_count(1); rows->push_back(row); } } // It sets up the file stream and loops over the lines in the file to // parse them, then adds the resulting row set to history. void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) { MorkReader reader; reader.Read(file); // Gather up the column ids so we don't need to find them on each row TableReadClosure data(reader); const MorkReader::MorkColumnList& columns = reader.columns(); for (size_t i = 0; i < columns.size(); ++i) { for (int j = 0; j < kColumnCount; ++j) if (columns[i].name == gColumnNames[j]) { data.column_indexes[j] = static_cast<int>(i); break; } if (columns[i].name == "ByteOrder") data.byte_order_column = static_cast<int>(i); } // Determine the byte order from the table's meta-row. const MorkReader::ColumnDataList& meta_row = reader.meta_row(); if (!meta_row.empty() && data.byte_order_column != -1) { std::string byte_order = meta_row[data.byte_order_column]; if (!byte_order.empty()) { // Note whether the file uses a non-native byte ordering. // If it does, we'll have to swap bytes for PRUnichar values. // "BE" and "LE" are the only recognized values, anything // else is garbage and the file will be treated as native-endian // (no swapping). std::string byte_order_value(byte_order); reader.NormalizeValue(&byte_order_value); data.swap_bytes = (byte_order_value == "BE"); } } std::vector<history::URLRow> rows; for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i) AddToHistory(i->second, data, &rows); if (!rows.empty()) bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED); }