C++程序  |  112行  |  4.26 KB

/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_DATETIME_EXTRACTOR_H_
#define LIBTEXTCLASSIFIER_DATETIME_EXTRACTOR_H_

#include <string>
#include <unordered_map>
#include <vector>

#include "model_generated.h"
#include "types.h"
#include "util/strings/stringpiece.h"
#include "util/utf8/unicodetext.h"
#include "util/utf8/unilib.h"

namespace libtextclassifier2 {

struct CompiledRule {
  // The compiled regular expression.
  std::unique_ptr<const UniLib::RegexPattern> compiled_regex;

  // The uncompiled pattern and information about the pattern groups.
  const DatetimeModelPattern_::Regex* regex;

  // DatetimeModelPattern which 'regex' is part of and comes from.
  const DatetimeModelPattern* pattern;
};

// A helper class for DatetimeParser that extracts structured data
// (DateParseDate) from the current match of the passed RegexMatcher.
class DatetimeExtractor {
 public:
  DatetimeExtractor(
      const CompiledRule& rule, const UniLib::RegexMatcher& matcher,
      int locale_id, const UniLib& unilib,
      const std::vector<std::unique_ptr<const UniLib::RegexPattern>>&
          extractor_rules,
      const std::unordered_map<DatetimeExtractorType,
                               std::unordered_map<int, int>>&
          type_and_locale_to_extractor_rule)
      : rule_(rule),
        matcher_(matcher),
        locale_id_(locale_id),
        unilib_(unilib),
        rules_(extractor_rules),
        type_and_locale_to_rule_(type_and_locale_to_extractor_rule) {}
  bool Extract(DateParseData* result, CodepointSpan* result_span) const;

 private:
  bool RuleIdForType(DatetimeExtractorType type, int* rule_id) const;

  // Returns true if the rule for given extractor matched. If it matched,
  // match_result will contain the first group of the rule (if match_result not
  // nullptr).
  bool ExtractType(const UnicodeText& input,
                   DatetimeExtractorType extractor_type,
                   UnicodeText* match_result = nullptr) const;

  bool GroupTextFromMatch(int group_id, UnicodeText* result) const;

  // Updates the span to include the current match for the given group.
  bool UpdateMatchSpan(int group_id, CodepointSpan* span) const;

  // Returns true if any of the extractors from 'mapping' matched. If it did,
  // will fill 'result' with the associated value from 'mapping'.
  template <typename T>
  bool MapInput(const UnicodeText& input,
                const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
                T* result) const;

  bool ParseDigits(const UnicodeText& input, int* parsed_digits) const;
  bool ParseWrittenNumber(const UnicodeText& input, int* parsed_number) const;
  bool ParseYear(const UnicodeText& input, int* parsed_year) const;
  bool ParseMonth(const UnicodeText& input, int* parsed_month) const;
  bool ParseAMPM(const UnicodeText& input, int* parsed_ampm) const;
  bool ParseRelation(const UnicodeText& input,
                     DateParseData::Relation* parsed_relation) const;
  bool ParseRelationDistance(const UnicodeText& input,
                             int* parsed_distance) const;
  bool ParseTimeUnit(const UnicodeText& input, int* parsed_time_unit) const;
  bool ParseRelationType(
      const UnicodeText& input,
      DateParseData::RelationType* parsed_relation_type) const;
  bool ParseWeekday(const UnicodeText& input, int* parsed_weekday) const;

  const CompiledRule& rule_;
  const UniLib::RegexMatcher& matcher_;
  int locale_id_;
  const UniLib& unilib_;
  const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& rules_;
  const std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>>&
      type_and_locale_to_rule_;
};

}  // namespace libtextclassifier2

#endif  // LIBTEXTCLASSIFIER_DATETIME_EXTRACTOR_H_