llvm/flang/lib/parser/token-parsers.h

#ifndef FORTRAN_PARSER_TOKEN_PARSERS_H_
#define FORTRAN_PARSER_TOKEN_PARSERS_H_

// These parsers are driven by the Fortran grammar (grammar.h) to consume
// the prescanned character stream and recognize context-sensitive tokens.

#include "basic-parsers.h"
#include "characters.h"
#include "idioms.h"
#include "provenance.h"
#include <cstddef>
#include <cstring>
#include <functional>
#include <limits>
#include <list>
#include <optional>
#include <string>

namespace Fortran {
namespace parser {

// Parser that accepts exactly one character satisfying a predicate.
// On success the character is consumed and its location is the result;
// otherwise the stored message is emitted and the parse fails.
class CharPredicateGuard {
public:
  using resultType = const char *;
  constexpr CharPredicateGuard(const CharPredicateGuard &) = default;
  constexpr CharPredicateGuard(bool (*f)(char), MessageFixedText m)
    : predicate_{f}, messageText_{m} {}
  std::optional<const char *> Parse(ParseState *state) const {
    const std::optional<const char *> here{state->PeekAtNextChar()};
    if (here.has_value() && predicate_(**here)) {
      // Only consume the character once it is known to be acceptable.
      state->UncheckedAdvance();
      return here;
    }
    state->Say(messageText_);
    return {};
  }

private:
  bool (*const predicate_)(char);  // test applied to the next character
  const MessageFixedText messageText_;  // emitted when the test fails
};

constexpr auto letter =
    CharPredicateGuard{IsLetter, "expected letter"_err_en_US};
constexpr auto digit =
    CharPredicateGuard{IsDecimalDigit, "expected digit"_err_en_US};

// "xyz"_ch matches one instance of the characters x, y, or z without skipping
// any spaces before or after.  The parser returns the location of the character
// on success.
class AnyOfChar {
public:
  using resultType = const char *;
  constexpr AnyOfChar(const AnyOfChar &) = default;
  constexpr AnyOfChar(const char *chars, std::size_t n)
    : chars_{chars}, bytes_{n} {}
  // Consumes the next character if it equals (after lower-casing the
  // candidate) any member of the set; otherwise emits an "expected"
  // message naming the set and fails.
  std::optional<const char *> Parse(ParseState *state) const {
    const std::optional<const char *> here{state->PeekAtNextChar()};
    if (here.has_value()) {
      const char actual{**here};
      std::size_t index{0};
      // The candidate set ends at bytes_ characters or at a NUL,
      // whichever comes first.
      while (index < bytes_ && chars_[index] != '\0') {
        if (actual == ToLowerCaseLetter(chars_[index])) {
          state->UncheckedAdvance();
          return here;
        }
        ++index;
      }
    }
    state->Say(MessageExpectedText{chars_, bytes_});
    return {};
  }

private:
  const char *const chars_;  // the set of acceptable characters
  const std::size_t bytes_{std::numeric_limits<std::size_t>::max()};
};

// User-defined literal: "xyz"_ch builds an AnyOfChar over the literal's
// characters.
constexpr AnyOfChar operator""_ch(const char str[], std::size_t n) {
  return {str, n};
}

// Skips over optional spaces.  Always succeeds.
constexpr struct Space {
  using resultType = Success;
  constexpr Space() {}
  // Advances past every consecutive ' ' at the current position.
  static std::optional<Success> Parse(ParseState *state) {
    for (;;) {
      const std::optional<const char *> here{state->PeekAtNextChar()};
      if (!here.has_value() || **here != ' ') {
        break;
      }
      state->UncheckedAdvance();
    }
    return {Success{}};
  }
} space;

// Records that a space required by free form source is missing.
// Nothing is consumed.  In fixed form this is a no-op.  In free form the
// conformance-violation flag is always set, and a message is emitted only
// when warnings on nonstandard usage are enabled.  The message uses the
// plain _en_US literal rather than _err_en_US: a missing space is a
// warning and is not necessary for recognition, so it must not be
// reported with error severity.
static inline void MissingSpace(ParseState *state) {
  if (!state->inFixedForm()) {
    state->set_anyConformanceViolation();
    if (state->warnOnNonstandardUsage()) {
      state->Say("expected space"_en_US);
    }
  }
}

// Skips any spaces at the current position.  When there is no space and
// the next character is legal in an identifier, notes the missing space
// via MissingSpace().  Always succeeds.
constexpr struct SpaceCheck {
  using resultType = Success;
  constexpr SpaceCheck() {}
  static std::optional<Success> Parse(ParseState *state) {
    const std::optional<const char *> here{state->PeekAtNextChar()};
    if (!here.has_value()) {
      return {Success{}};
    }
    const char next{**here};
    if (next == ' ') {
      state->UncheckedAdvance();
      return space.Parse(state);
    }
    if (IsLegalInIdentifier(next)) {
      MissingSpace(state);
    }
    return {Success{}};
  }
} spaceCheck;

// Matches a token string.  Spaces in the token string denote where
// spaces may appear in the source; they can be made mandatory for
// some free form keyword sequences.  Missing mandatory spaces in free
// form elicit a warning; they are not necessary for recognition.
// Spaces before and after the token are also skipped.
//
// Token strings appear in the grammar as C++ user-defined literals
// like "BIND ( C )"_tok and "SYNC ALL"_sptok.  The _tok suffix is implied
// when a string literal appears before the sequencing operator >> or
// after the sequencing operator /.
class TokenStringMatch {
public:
  using resultType = Success;
  constexpr TokenStringMatch(const TokenStringMatch &) = default;
  // str/n: the token text and its length; mandatory: whether blanks in the
  // token text denote spaces that free form source is required to have.
  constexpr TokenStringMatch(const char *str, std::size_t n, bool mandatory)
    : str_{str}, bytes_{n}, mandatoryFreeFormSpace_{mandatory} {}
  // Variant for NUL-terminated token text; bytes_ keeps its "unbounded"
  // default and the terminating NUL ends the match loop.
  constexpr TokenStringMatch(const char *str, bool mandatory)
    : str_{str}, mandatoryFreeFormSpace_{mandatory} {}
  // Skips leading spaces, matches the token text against the source, then
  // skips trailing spaces.  Fails without a result when the source runs
  // out; fails with an "expected" message at the token's start on a
  // mismatch.
  std::optional<Success> Parse(ParseState *state) const {
    space.Parse(state);
    const char *start{state->GetLocation()};
    const char *p{str_};
    // Holds a source character that has been consumed but not yet matched
    // against the token text; empty means the next one must be fetched.
    std::optional<const char *> at;
    for (std::size_t j{0}; j < bytes_ && *p != '\0'; ++j, ++p) {
      const auto spaceSkipping{*p == ' '};
      if (spaceSkipping) {
        if (j + 1 == bytes_ || p[1] == ' ' || p[1] == '\0') {
          continue;  // redundant; ignore
        }
      }
      if (!at.has_value()) {
        at = nextCh.Parse(state);
        if (!at.has_value()) {
          return {};
        }
      }
      if (spaceSkipping) {
        if (**at == ' ') {
          at = nextCh.Parse(state);
          if (!at.has_value()) {
            return {};
          }
        } else if (mandatoryFreeFormSpace_) {
          MissingSpace(state);
        }
        // 'at' remains full for next iteration
      } else if (**at == ToLowerCaseLetter(*p)) {
        at.reset();
      } else {
        state->Say(start, MessageExpectedText{str_, bytes_});
        return {};
      }
    }
    // A token ending in an identifier character must be followed by a
    // space check.  Guard against an empty token string, for which p[-1]
    // would read before the start of the text.
    if (p != str_ && IsLegalInIdentifier(p[-1])) {
      return spaceCheck.Parse(state);
    }
    return space.Parse(state);
  }

private:
  const char *const str_;
  const std::size_t bytes_{std::numeric_limits<std::size_t>::max()};
  const bool mandatoryFreeFormSpace_;
};

// "..."_tok: token match in which embedded spaces are optional.
constexpr TokenStringMatch operator""_tok(const char str[], std::size_t n) {
  return {str, n, false};
}

// "..."_sptok: token match in which embedded spaces are mandatory in
// free form source.
constexpr TokenStringMatch operator""_sptok(const char str[], std::size_t n) {
  return {str, n, true};
}

template<class PA, std::enable_if_t<std::is_class<PA>::value, int> = 0>
inline constexpr SequenceParser<TokenStringMatch, PA> operator>>(
    const char *str, const PA &p) {
  return SequenceParser<TokenStringMatch, PA>{TokenStringMatch{str, false}, p};
}

template<class PA, std::enable_if_t<std::is_class<PA>::value, int> = 0>
inline constexpr InvertedSequenceParser<PA, TokenStringMatch> operator/(
    const PA &p, const char *str) {
  return InvertedSequenceParser<PA, TokenStringMatch>{
      p, TokenStringMatch{str, false}};
}

template<class PA>
inline constexpr SequenceParser<TokenStringMatch,
    InvertedSequenceParser<PA, TokenStringMatch>>
parenthesized(const PA &p) {
  return "(" >> p / ")";
}

template<class PA>
inline constexpr SequenceParser<TokenStringMatch,
    InvertedSequenceParser<PA, TokenStringMatch>>
bracketed(const PA &p) {
  return "[" >> p / "]";
}

// Quoted character literal constants.
// Parses one (possibly backslash-escaped) character of a quoted character
// literal.  The result carries the character value and whether it came from
// an escape sequence, so that callers can tell an escaped quote from a
// closing quote.
struct CharLiteralChar {
  struct Result {
    Result(char c, bool esc) : ch{c}, wasEscaped{esc} {}
    static Result Bare(char c) { return Result{c, false}; }
    static Result Escaped(char c) { return Result{c, true}; }
    char ch;  // the decoded character value
    bool wasEscaped;  // true when ch was produced by a backslash escape
  };
  using resultType = Result;
  static std::optional<Result> Parse(ParseState *state) {
    // Remember where this character began for use in diagnostics.
    auto at = state->GetLocation();
    std::optional<const char *> och{nextCh.Parse(state)};
    if (!och.has_value()) {
      return {};
    }
    char ch{**och};
    // A newline inside the literal means the closing quote is missing.
    if (ch == '\n') {
      state->Say(at, "unclosed character constant"_err_en_US);
      return {};
    }
    if (ch != '\\') {
      return {Result::Bare(ch)};
    }
    // A backslash was seen: fetch the character that follows it.
    if (!(och = nextCh.Parse(state)).has_value()) {
      return {};
    }
    ch = **och;
    if (ch == '\n') {
      state->Say(at, "unclosed character constant"_err_en_US);
      return {};
    }
    // Escapes known to BackslashEscapeValue() map directly to a value.
    if (std::optional<char> escChar{BackslashEscapeValue(ch)}) {
      return {Result::Escaped(*escChar)};
    }
    if (IsOctalDigit(ch)) {
      // Octal escape: a leading digit above 3 admits one more digit,
      // otherwise up to two more may follow.  Fewer digits are accepted.
      ch -= '0';
      for (int j = (ch > 3 ? 1 : 2); j-- > 0;) {
        static constexpr auto octalDigit =
            CharPredicateGuard{IsOctalDigit, "expected octal digit"_en_US};
        och = octalDigit.Parse(state);
        if (och.has_value()) {
          ch = 8 * ch + **och - '0';
        } else {
          break;
        }
      }
    } else if (ch == 'x' || ch == 'X') {
      // Hexadecimal escape: up to two hexadecimal digits follow the x.
      ch = 0;
      for (int j = 0; j++ < 2;) {
        static constexpr auto hexDigit = CharPredicateGuard{
            IsHexadecimalDigit, "expected hexadecimal digit"_en_US};
        och = hexDigit.Parse(state);
        if (och.has_value()) {
          ch = 16 * ch + HexadecimalDigitValue(**och);
        } else {
          break;
        }
      }
    } else {
      // Unrecognized escape: emit a message but still yield the character
      // after the backslash, marked as escaped.
      state->Say(at, "bad escaped character"_en_US);
    }
    return {Result::Escaped(ch)};
  }
};

template<char quote> struct CharLiteral {
  using resultType = std::string;
  static std::optional<std::string> Parse(ParseState *state) {
    std::string str;
    static constexpr auto nextch = attempt(CharLiteralChar{});
    static char q{quote};
    while (std::optional<CharLiteralChar::Result> ch{nextch.Parse(state)}) {
      if (ch->ch == quote && !ch->wasEscaped) {
        static constexpr auto doubled = attempt(AnyOfChar{&q, 1});
        if (!doubled.Parse(state).has_value()) {
          return {str};
        }
      }
      str += ch->ch;
    }
    return {};
  }
};

// Decides whether a nonstandard extension may be accepted.  Under strict
// conformance it may not, and nothing else happens.  Otherwise the
// conformance-violation flag is set, a message is emitted if warnings on
// nonstandard usage are enabled, and the extension is accepted.
static bool IsNonstandardUsageOk(ParseState *state) {
  const bool permitted{!state->strictConformance()};
  if (permitted) {
    state->set_anyConformanceViolation();
    if (state->warnOnNonstandardUsage()) {
      state->Say("nonstandard usage"_en_US);
    }
  }
  return permitted;
}

// Parse "BOZ" binary literal quoted constants.
// As extensions, support X as an alternate hexadecimal marker, and allow
// BOZX markers to appear as suffixes.
// Yields the literal's value as an unsigned 64-bit integer.  Fails when the
// text is not a BOZ literal, when it has no digits, when a digit is too
// large for the base, or when the value does not fit in 64 bits.
struct BOZLiteral {
  using resultType = std::uint64_t;
  static std::optional<std::uint64_t> Parse(ParseState *state) {
    // Bits per digit for the base; remains empty until a base marker is seen.
    std::optional<int> shift;
    // Recognizes a base marker and records its bits-per-digit in 'shift'.
    auto baseChar = [&shift](char ch) -> bool {
      switch (ch) {
      case 'b': shift = 1; return true;
      case 'o': shift = 3; return true;
      case 'z': shift = 4; return true;
      case 'x': shift = 4; return true;
      default: return false;
      }
    };

    space.Parse(state);
    const char *start{state->GetLocation()};
    std::optional<const char *> at{nextCh.Parse(state)};
    if (!at.has_value()) {
      return {};
    }
    // A leading X marker is an extension that must be permitted.
    if (**at == 'x' && !IsNonstandardUsageOk(state)) {
      return {};
    }
    // Optional prefix base marker; if present, move on to the quote.
    if (baseChar(**at)) {
      at = nextCh.Parse(state);
      if (!at.has_value()) {
        return {};
      }
    }

    // The digits must be enclosed in matching single or double quotes.
    char quote = **at;
    if (quote != '\'' && quote != '"') {
      return {};
    }

    // Collect the digits up to the closing quote, ignoring blanks.  Any
    // hexadecimal digit is accepted here; validity for the actual base is
    // checked below once the base is known.
    std::string content;
    while (true) {
      at = nextCh.Parse(state);
      if (!at.has_value()) {
        return {};
      }
      if (**at == quote) {
        break;
      }
      if (**at == ' ') {
        continue;
      }
      if (!IsHexadecimalDigit(**at)) {
        return {};
      }
      content += **at;
    }

    if (!shift) {
      // extension: base allowed to appear as suffix, too
      if (!IsNonstandardUsageOk(state) || !(at = nextCh.Parse(state)) ||
          !baseChar(**at)) {
        return {};
      }
      spaceCheck.Parse(state);
    }

    if (content.empty()) {
      state->Say(start, "no digit in BOZ literal"_err_en_US);
      return {};
    }

    // Accumulate the value, *shift bits per digit.
    std::uint64_t value{0};
    for (auto digit : content) {
      digit = HexadecimalDigitValue(digit);
      // A digit with bits at or above position *shift is too large for the
      // base.
      if ((digit >> *shift) > 0) {
        state->Say(start, "bad digit in BOZ literal"_err_en_US);
        return {};
      }
      // Detect bits shifted out of the 64-bit value by shifting back.
      std::uint64_t was{value};
      value <<= *shift;
      if ((value >> *shift) != was) {
        state->Say(start, "excessive digits in BOZ literal"_err_en_US);
        return {};
      }
      value |= digit;
    }
    return {value};
  }
};

// Unsigned decimal digit string; no space skipping
// Parses one or more consecutive decimal digits into an unsigned 64-bit
// value.  Fails only when no digit is present.  On overflow an error is
// reported at the first digit, but the (wrapped) value is still returned.
struct DigitString {
  using resultType = std::uint64_t;
  static std::optional<std::uint64_t> Parse(ParseState *state) {
    static constexpr auto oneDigit = attempt(digit);
    constexpr std::uint64_t largest{
        std::numeric_limits<std::uint64_t>::max()};
    const std::optional<const char *> leading{oneDigit.Parse(state)};
    if (!leading.has_value()) {
      return {};
    }
    std::uint64_t total = **leading - '0';
    bool wrapped{false};
    for (auto following{oneDigit.Parse(state)}; following;
         following = oneDigit.Parse(state)) {
      // Would multiplying by ten exceed the representable range?
      wrapped = wrapped || total > largest / 10;
      total *= 10;
      const int addend = **following - '0';
      // Would adding the new digit exceed the representable range?
      wrapped = wrapped || total > largest - addend;
      total += addend;
    }
    if (wrapped) {
      state->Say(*leading, "overflow in decimal literal"_err_en_US);
    }
    return {total};
  }
};

// Consumes a run of one or more decimal digits without computing a value.
// Fails, consuming nothing, when the next character is not a digit.
constexpr struct SkipDigitString {
  using resultType = Success;
  static std::optional<Success> Parse(ParseState *state) {
    const std::optional<const char *> first{state->PeekAtNextChar()};
    if (!first.has_value() || !IsDecimalDigit(**first)) {
      return {};
    }
    state->UncheckedAdvance();
    for (;;) {
      const std::optional<const char *> more{state->PeekAtNextChar()};
      if (!more.has_value() || !IsDecimalDigit(**more)) {
        break;
      }
      state->UncheckedAdvance();
    }
    return {Success{}};
  }
} skipDigitString;

// Parses an unsigned digit string and yields it as a signed 64-bit value.
// A value above the signed maximum is reported as an error at the start
// of the digits, but a (converted) value is still returned.
struct DigitStringAsPositive {
  using resultType = std::int64_t;
  static std::optional<std::int64_t> Parse(ParseState *state) {
    Location begin{state->GetLocation()};
    const std::optional<std::uint64_t> magnitude{DigitString{}.Parse(state)};
    if (magnitude.has_value()) {
      if (*magnitude > std::numeric_limits<std::int64_t>::max()) {
        state->Say(begin, "overflow in positive decimal literal"_err_en_US);
      }
      std::int64_t result = *magnitude;
      return {result};
    }
    return {};
  }
};

// Parses an optionally signed ('+' or '-') decimal digit string into a
// signed 64-bit value; no space skipping.  Fails at end of input or when
// no digit follows the optional sign.  A magnitude outside the signed
// range is reported as an error at the sign position, but a value is
// still returned.
struct SignedDigitString {
  using resultType = std::int64_t;
  static std::optional<std::int64_t> Parse(ParseState *state) {
    std::optional<const char *> sign{state->PeekAtNextChar()};
    if (!sign.has_value()) {
      return {};
    }
    bool negate{**sign == '-'};
    if (negate || **sign == '+') {
      state->UncheckedAdvance();
    }
    std::optional<std::uint64_t> x{DigitString{}.Parse(state)};
    if (!x.has_value()) {
      return {};
    }
    // Largest permitted magnitude: INT64_MAX, or one more when negative.
    std::uint64_t limit{std::numeric_limits<std::int64_t>::max()};
    if (negate) {
      ++limit;
    }
    if (*x > limit) {
      state->Say(*sign, "overflow in signed decimal literal"_err_en_US);
    }
    std::int64_t value = *x;
    // The most negative value is already its own converted magnitude;
    // negating it would be signed overflow (undefined behavior).
    if (negate && value != std::numeric_limits<std::int64_t>::min()) {
      value = -value;
    }
    return {value};
  }
};

// Legacy feature: Hollerith literal constants
// Parses nHccc...: a positive decimal count n, the letter 'h', and then
// exactly n characters, which are returned as the literal's content.
// Under the EUC_JP and UTF8 encodings one character may occupy several
// bytes, all of which are copied into the result.
struct HollerithLiteral {
  using resultType = std::string;
  static std::optional<std::string> Parse(ParseState *state) {
    space.Parse(state);
    const char *start{state->GetLocation()};
    std::optional<std::uint64_t> charCount{DigitString{}.Parse(state)};
    if (!charCount || *charCount < 1) {
      return {};
    }
    std::optional<const char *> h{letter.Parse(state)};
    if (!h || **h != 'h') {
      return {};
    }
    std::string content;
    for (auto j = *charCount; j-- > 0;) {
      // Number of bytes in the next character; stays 1 for other encodings.
      int bytes{1};
      const char *p{state->GetLocation()};
      if (state->encoding() == Encoding::EUC_JP) {
        std::optional<int> chBytes{EUC_JPCharacterBytes(p)};
        if (!chBytes.has_value()) {
          state->Say(start, "bad EUC_JP characters in Hollerith"_err_en_US);
          return {};
        }
        bytes = *chBytes;
      } else if (state->encoding() == Encoding::UTF8) {
        std::optional<int> chBytes{UTF8CharacterBytes(p)};
        if (!chBytes.has_value()) {
          state->Say(start, "bad UTF-8 characters in Hollerith"_err_en_US);
          return {};
        }
        bytes = *chBytes;
      }
      if (bytes == 1) {
        std::optional<const char *> at{nextCh.Parse(state)};
        // isprint() has undefined behavior for negative arguments other
        // than EOF, so the character is widened via unsigned char first.
        if (!at.has_value() || !isprint(static_cast<unsigned char>(**at))) {
          state->Say(
              start, "insufficient or bad characters in Hollerith"_err_en_US);
          return {};
        }
        content += **at;
      } else {
        // Multi-byte character
        while (bytes-- > 0) {
          std::optional<const char *> byte{nextCh.Parse(state)};
          CHECK(byte.has_value());
          content += **byte;
        }
      }
    }
    return {content};
  }
};

// Succeeds, consuming nothing, only when the parse state is at the end of
// its input.  The object is constexpr, like the other parser objects in
// this header, so that each translation unit including the header gets its
// own constant rather than a multiply-defined mutable global, and so that
// it can be combined into constexpr parsers.
constexpr struct ConsumedAllInputParser {
  using resultType = Success;
  constexpr ConsumedAllInputParser() {}
  static std::optional<Success> Parse(ParseState *state) {
    if (state->IsAtEnd()) {
      return {Success{}};
    }
    return {};
  }
} consumedAllInput;

// Consumes characters up to and including the first occurrence of `goal`.
// Fails when the input is exhausted before `goal` is found.
template<char goal> struct SkipPast {
  using resultType = Success;
  constexpr SkipPast() {}
  constexpr SkipPast(const SkipPast &) {}
  static std::optional<Success> Parse(ParseState *state) {
    for (;;) {
      const std::optional<const char *> consumed{state->GetNextChar()};
      if (!consumed.has_value()) {
        return {};
      }
      if (**consumed == goal) {
        return {Success{}};
      }
    }
  }
};

// Consumes characters up to, but not including, the first occurrence of
// `goal`.  Fails when the input is exhausted before `goal` is found.
template<char goal> struct SkipTo {
  using resultType = Success;
  constexpr SkipTo() {}
  constexpr SkipTo(const SkipTo &) {}
  static std::optional<Success> Parse(ParseState *state) {
    for (;;) {
      const std::optional<const char *> upcoming{state->PeekAtNextChar()};
      if (!upcoming.has_value()) {
        return {};
      }
      if (**upcoming == goal) {
        return {Success{}};
      }
      state->UncheckedAdvance();
    }
  }
};

// A common idiom in the Fortran grammar is an optional item (usually
// a nonempty comma-separated list) that, if present, must follow a comma
// and precede a doubled colon.  When the item is absent, the comma must
// not appear, and the doubled colons are optional.
//   [[, xyz] ::]     is  optionalBeforeColons(xyz)
//   [[, xyz]... ::]  is  optionalBeforeColons(nonemptyList(xyz))
template<typename PA> inline constexpr auto optionalBeforeColons(const PA &p) {
  return "," >> construct<std::optional<typename PA::resultType>>{}(p) / "::" ||
      ("::"_tok || !","_tok) >> defaulted(cut >> maybe(p));
}
// List form: ", p [, p]... ::" or, when "::" is present or no comma
// follows, a defaulted nonempty list of p.
template<typename PA>
inline constexpr auto optionalListBeforeColons(const PA &p) {
  return ("," >> (nonemptyList(p) / "::")) ||
      (("::"_tok || !","_tok) >> defaulted(cut >> nonemptyList(p)));
}

// Compiler directives can switch the parser between fixed and free form.
// Skips newlines and "!dir$ fixed" / "!dir$ free" directive lines,
// switching the parse state's source form as each directive is consumed.
// Stops at the first character that begins anything else.  Always succeeds.
constexpr struct FormDirectivesAndEmptyLines {
  using resultType = Success;
  static std::optional<Success> Parse(ParseState *state) {
    static const char fixedDirective[] = "!dir$ fixed\n";
    static const char freeDirective[] = "!dir$ free\n";
    // Lengths exclude the terminating NUL.
    static constexpr std::size_t fixedLength{sizeof fixedDirective - 1};
    static constexpr std::size_t freeLength{sizeof freeDirective - 1};
    for (;;) {
      const std::optional<const char *> at{state->PeekAtNextChar()};
      if (!at.has_value()) {
        break;
      }
      if (**at == '\n') {
        state->UncheckedAdvance();
        continue;
      }
      if (**at != '!') {
        break;
      }
      // Only compare when enough input remains to hold the directive.
      const std::size_t remain{state->BytesRemaining()};
      if (remain >= fixedLength &&
          std::memcmp(*at, fixedDirective, fixedLength) == 0) {
        state->set_inFixedForm(true).UncheckedAdvance(fixedLength);
      } else if (remain >= freeLength &&
          std::memcmp(*at, freeDirective, freeLength) == 0) {
        state->set_inFixedForm(false).UncheckedAdvance(freeLength);
      } else {
        break;
      }
    }
    return {Success{}};
  }
} skipEmptyLines;
}  // namespace parser
}  // namespace Fortran
#endif  // FORTRAN_PARSER_TOKEN_PARSERS_H_