普通文本  |  330行  |  8.67 KB

/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "common/fml-parser.h"

#include <ctype.h>
#include <string>

#include "util/base/logging.h"
#include "util/strings/numbers.h"

namespace libtextclassifier {
namespace nlp_core {

namespace {
// Returns true iff character c can appear at the beginning of an identifier:
// a letter (as classified by isalpha), '_' or '/'.
inline bool IsValidCharAtStartOfIdentifier(char c) {
  // The <ctype.h> classifiers have undefined behavior for negative arguments
  // other than EOF, and plain char may be signed, so convert through
  // unsigned char before classifying.
  return isalpha(static_cast<unsigned char>(c)) || (c == '_') || (c == '/');
}

// Returns true iff character c can appear inside an identifier: a letter or
// digit (as classified by isalnum), '_', '-' or '/'.
inline bool IsValidCharInsideIdentifier(char c) {
  // The <ctype.h> classifiers have undefined behavior for negative arguments
  // other than EOF, and plain char may be signed, so convert through
  // unsigned char before classifying.
  return isalnum(static_cast<unsigned char>(c)) || (c == '_') || (c == '-') ||
         (c == '/');
}

// Returns true iff character c can appear at the beginning of a number: a
// decimal digit or a leading '+' / '-' sign.
inline bool IsValidCharAtStartOfNumber(char c) {
  // The <ctype.h> classifiers have undefined behavior for negative arguments
  // other than EOF, and plain char may be signed, so convert through
  // unsigned char before classifying.
  return isdigit(static_cast<unsigned char>(c)) || (c == '+') || (c == '-');
}

// Returns true iff character c can appear inside a number: a decimal digit or
// '.'.
inline bool IsValidCharInsideNumber(char c) {
  // The <ctype.h> classifiers have undefined behavior for negative arguments
  // other than EOF, and plain char may be signed, so convert through
  // unsigned char before classifying.
  return isdigit(static_cast<unsigned char>(c)) || (c == '.');
}
}  // namespace

// Resets the parser state to the beginning of a copy of |source| and reads
// the first input item.  Returns the result of that first NextItem() call.
bool FMLParser::Initialize(const std::string &source) {
  // Store the input; all position iterators below point into source_.
  source_ = source;

  // The read position, the current line and the current item all start at the
  // first character.
  current_ = source_.begin();
  line_start_ = current_;
  item_start_ = current_;

  // Line numbering starts at 1.
  item_line_number_ = 1;
  line_number_ = 1;

  // Read first input item.
  return NextItem();
}

// Logs |error_message| at ERROR severity, prefixed with the line number and
// 1-based position of the current item and followed by the text of the
// current line up to the read position.
void FMLParser::ReportError(const std::string &error_message) {
  // Part of the current line that lies before the read position.
  const std::string consumed_text(line_start_, current_);

  // Offset of the current item from the start of the line, counted from 1.
  const int column = static_cast<int>(item_start_ - line_start_ + 1);

  TC_LOG(ERROR) << "Error in feature model, line " << item_line_number_
                << ", position " << column << ": " << error_message
                << "\n    " << consumed_text << " <--HERE";
}

// Moves the read position forward by one character.  When the character being
// left is a line break, the line counter is incremented and the character
// that follows becomes the start of the new line.
void FMLParser::Next() {
  // Inspect the character before stepping past it.
  const bool leaving_line_break = (CurrentChar() == '\n');

  ++current_;

  if (leaving_line_break) {
    ++line_number_;
    line_start_ = current_;
  }
}

// Reads the next input item, skipping any whitespace and '#' comments in
// front of it.  The kind of item is stored in item_type_:
//   END     - end of input reached;
//   NUMBER  - a digit, '+' or '-' followed by any run of digits and '.';
//             the text is stored in item_text_ (it is not validated here);
//   STRING  - a double-quoted string; item_text_ holds the text between the
//             quotes;
//   NAME    - an identifier; the text is stored in item_text_;
//   otherwise the single character that was read (item_text_ is untouched).
// Returns false, after reporting an error, only when a string is not
// terminated before the end of input; returns true in every other case.
bool FMLParser::NextItem() {
  // Skip white space and comments.
  while (!eos()) {
    if (CurrentChar() == '#') {
      // A comment runs up to (not including) the end of the line.
      while (!eos() && CurrentChar() != '\n') Next();
    } else if (isspace(static_cast<unsigned char>(CurrentChar()))) {
      // Skip whitespace.  The cast is required because the <ctype.h>
      // classifiers have undefined behavior for negative arguments.
      while (!eos() && isspace(static_cast<unsigned char>(CurrentChar()))) {
        Next();
      }
    } else {
      break;
    }
  }

  // Record start position for next item.
  item_start_ = current_;
  item_line_number_ = line_number_;

  // Check for end of input.
  if (eos()) {
    item_type_ = END;
    return true;
  }

  // Parse number.
  if (IsValidCharAtStartOfNumber(CurrentChar())) {
    std::string::iterator start = current_;
    Next();
    while (!eos() && IsValidCharInsideNumber(CurrentChar())) Next();
    item_text_.assign(start, current_);
    item_type_ = NUMBER;
    return true;
  }

  // Parse quoted string.
  if (CurrentChar() == '"') {
    Next();  // Skip the opening quote.
    std::string::iterator start = current_;
    // Test eos() before CurrentChar() so that the character at the end of the
    // input is never read.
    while (!eos() && CurrentChar() != '"') Next();
    if (eos()) {
      ReportError("Unterminated string");
      return false;
    }
    item_text_.assign(start, current_);
    item_type_ = STRING;
    Next();  // Skip the closing quote.
    return true;
  }

  // Parse identifier name.
  if (IsValidCharAtStartOfIdentifier(CurrentChar())) {
    std::string::iterator start = current_;
    while (!eos() && IsValidCharInsideIdentifier(CurrentChar())) {
      Next();
    }
    item_text_.assign(start, current_);
    item_type_ = NAME;
    return true;
  }

  // Single character item.  Convert through unsigned char so that the stored
  // type is never negative, even where plain char is signed.
  // NOTE(review): the non-character item types are declared outside this
  // file; presumably they are below END, in which case a negative byte could
  // otherwise be mistaken for one of them -- confirm against the header.
  item_type_ = static_cast<unsigned char>(CurrentChar());
  Next();
  return true;
}

bool FMLParser::Parse(const std::string &source,
                      FeatureExtractorDescriptor *result) {
  // Initialize parser.
  if (!Initialize(source)) {
    return false;
  }

  while (item_type_ != END) {
    // Current item should be a feature name.
    if (item_type_ != NAME) {
      ReportError("Feature type name expected");
      return false;
    }
    std::string name = item_text_;
    if (!NextItem()) {
      return false;
    }

    // Parse feature.
    FeatureFunctionDescriptor *descriptor = result->add_feature();
    descriptor->set_type(name);
    if (!ParseFeature(descriptor)) {
      return false;
    }
  }

  return true;
}

// Parses what follows a feature type name and stores it in |result|:
//   - an optional parenthesized, comma-separated parameter list;
//   - an optional ':' followed by the feature name (a name or a string);
//   - optional sub-features, given either as '.' followed by one sub-feature
//     or as a '{' ... '}' block containing any number of sub-features.
// Sub-features are added to |result| and parsed recursively.  Returns false
// as soon as reading an item fails or a syntax error has been reported.
bool FMLParser::ParseFeature(FeatureFunctionDescriptor *result) {
  // Optional parameter list.
  if (item_type_ == '(') {
    // The first pass skips '(' and every later pass skips a ',' separator;
    // each pass then parses the parameter that follows.
    do {
      if (!NextItem()) return false;
      if (!ParseParameter(result)) return false;
    } while (item_type_ == ',');

    if (item_type_ != ')') {
      ReportError(") expected");
      return false;
    }
    if (!NextItem()) return false;
  }

  // Optional feature name.
  if (item_type_ == ':') {
    if (!NextItem()) return false;

    const bool is_name_or_string =
        (item_type_ == NAME) || (item_type_ == STRING);
    if (!is_name_or_string) {
      ReportError("Feature name expected");
      return false;
    }

    const std::string feature_name = item_text_;
    if (!NextItem()) return false;
    result->set_name(feature_name);
  }

  // Dotted form: exactly one sub-feature follows the '.'.
  if (item_type_ == '.') {
    if (!NextItem()) return false;
    if (item_type_ != NAME) {
      ReportError("Feature type name expected");
      return false;
    }
    const std::string child_type = item_text_;
    if (!NextItem()) return false;

    FeatureFunctionDescriptor *child = result->add_feature();
    child->set_type(child_type);
    return ParseFeature(child);
  }

  // Block form: sub-features are listed until the closing '}'.
  if (item_type_ == '{') {
    if (!NextItem()) return false;
    while (item_type_ != '}') {
      if (item_type_ != NAME) {
        ReportError("Feature type name expected");
        return false;
      }
      const std::string child_type = item_text_;
      if (!NextItem()) return false;

      FeatureFunctionDescriptor *child = result->add_feature();
      child->set_type(child_type);
      if (!ParseFeature(child)) return false;
    }
    // Step past the closing '}'.
    return NextItem();
  }

  // No sub-features.
  return true;
}

// Parses a single entry of a parameter list into |result|.  A number becomes
// the feature's argument; "name = value" is added as a named parameter whose
// value is the text of the item after '='.  Anything else is a syntax error.
// Returns false as soon as reading an item fails or an error has been
// reported.
bool FMLParser::ParseParameter(FeatureFunctionDescriptor *result) {
  // Numeric argument.
  if (item_type_ == NUMBER) {
    int32 argument;
    if (!ParseInt32(item_text_.c_str(), &argument)) {
      ReportError("Unable to parse number");
      return false;
    }
    if (!NextItem()) return false;

    result->set_argument(argument);
    return true;
  }

  // The only other accepted form starts with a parameter name.
  if (item_type_ != NAME) {
    ReportError("Syntax error in parameter list");
    return false;
  }

  const std::string parameter_name = item_text_;
  if (!NextItem()) return false;

  if (item_type_ != '=') {
    ReportError("= expected");
    return false;
  }
  if (!NextItem()) return false;

  // The end of input and single-character items are not accepted as values.
  if (item_type_ >= END) {
    ReportError("Parameter value expected");
    return false;
  }
  const std::string parameter_value = item_text_;
  if (!NextItem()) return false;

  // Record the name/value pair on the feature.
  Parameter *entry = result->add_parameter();
  entry->set_name(parameter_name);
  entry->set_value(parameter_value);
  return true;
}

// Appends the FML text for |function| itself (not its sub-features) to
// |output|: the type, followed -- when there is a non-zero argument or at
// least one parameter -- by a parenthesized, comma-separated list holding the
// argument (if non-zero) and then each parameter as name="value".
void ToFMLFunction(const FeatureFunctionDescriptor &function,
                   std::string *output) {
  output->append(function.type());

  // A zero argument is not written out.
  const bool has_argument = (function.argument() != 0);
  if (!has_argument && !(function.parameter_size() > 0)) return;

  output->append("(");
  if (has_argument) {
    output->append(IntToString(function.argument()));
  }
  for (int i = 0; i < function.parameter_size(); ++i) {
    // Every entry except the first one in the list is preceded by a comma.
    if (has_argument || i > 0) output->append(",");
    output->append(function.parameter(i).name());
    output->append("=\"");
    output->append(function.parameter(i).value());
    output->append("\"");
  }
  output->append(")");
}

// Appends the FML text for |function| and, recursively, its sub-features to
// |output|.  A single sub-feature is written in dotted form; two or more are
// written as a space-separated list between " { " and " } ".
void ToFML(const FeatureFunctionDescriptor &function, std::string *output) {
  ToFMLFunction(function, output);

  // Nothing more to write without sub-features.
  if (function.feature_size() < 1) return;

  if (function.feature_size() == 1) {
    output->append(".");
    ToFML(function.feature(0), output);
    return;
  }

  // Two or more sub-features: the first one is written without a leading
  // separator, every later one after a single space.
  output->append(" { ");
  ToFML(function.feature(0), output);
  for (int i = 1; i < function.feature_size(); ++i) {
    output->append(" ");
    ToFML(function.feature(i), output);
  }
  output->append(" } ");
}

// Appends the FML text of every top-level feature of |extractor| to |output|,
// each one followed by a newline.
void ToFML(const FeatureExtractorDescriptor &extractor, std::string *output) {
  int index = 0;
  while (index < extractor.feature_size()) {
    ToFML(extractor.feature(index), output);
    output->push_back('\n');
    ++index;
  }
}

}  // namespace nlp_core
}  // namespace libtextclassifier