// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef FORTRAN_PARSER_TOKEN_PARSERS_H_ #define FORTRAN_PARSER_TOKEN_PARSERS_H_ // These parsers are driven by the Fortran grammar (grammar.h) to consume // the prescanned character stream and recognize context-sensitive tokens. #include "basic-parsers.h" #include "char-set.h" #include "characters.h" #include "instrumented-parser.h" #include "provenance.h" #include "type-parsers.h" #include "../common/idioms.h" #include #include #include #include #include #include #include #include namespace Fortran::parser { // "xyz"_ch matches one instance of the characters x, y, or z without skipping // any spaces before or after. The parser returns the location of the character // on success. class AnyOfChars { public: using resultType = const char *; constexpr AnyOfChars(const AnyOfChars &) = default; constexpr AnyOfChars(SetOfChars set) : set_{set} {} std::optional Parse(ParseState &state) const { if (std::optional at{state.PeekAtNextChar()}) { if (set_.Has(**at)) { state.UncheckedAdvance(); state.set_anyTokenMatched(); return at; } } state.Say(MessageExpectedText{set_}); return std::nullopt; } private: const SetOfChars set_; }; constexpr AnyOfChars operator""_ch(const char str[], std::size_t n) { return AnyOfChars{SetOfChars(str, n)}; } constexpr auto letter{"abcdefghijklmnopqrstuvwxyz"_ch}; constexpr auto digit{"0123456789"_ch}; // Skips over optional spaces. Always succeeds. constexpr struct Space { using resultType = Success; constexpr Space() {} static std::optional Parse(ParseState &state) { while (std::optional p{state.PeekAtNextChar()}) { if (**p != ' ') { break; } state.UncheckedAdvance(); } return {Success{}}; } } space; // Skips a space that in free form requires a warning if it precedes a // character that could begin an identifier or keyword. Always succeeds. inline void MissingSpace(ParseState &state) { if (!state.inFixedForm()) { state.Nonstandard( LanguageFeature::OptionalFreeFormSpace, "missing space"_en_US); } } constexpr struct SpaceCheck { using resultType = Success; constexpr SpaceCheck() {} static std::optional Parse(ParseState &state) { if (std::optional p{state.PeekAtNextChar()}) { char ch{**p}; if (ch == ' ') { state.UncheckedAdvance(); return space.Parse(state); } if (IsLegalInIdentifier(ch)) { MissingSpace(state); } } return {Success{}}; } } spaceCheck; // Matches a token string. Spaces in the token string denote where // spaces may appear in the source; they can be made mandatory for // some free form keyword sequences. Missing mandatory spaces in free // form elicit a warning; they are not necessary for recognition. // Spaces before and after the token are also skipped. // // Token strings appear in the grammar as C++ user-defined literals // like "BIND ( C )"_tok and "SYNC ALL"_sptok. The _tok suffix is implied // when a string literal appears before the sequencing operator >> or // after the sequencing operator /. The literal "..."_id parses a // token that cannot be a prefix of a longer identifier. template class TokenStringMatch { public: using resultType = Success; constexpr TokenStringMatch(const TokenStringMatch &) = default; constexpr TokenStringMatch(const char *str, std::size_t n) : str_{str}, bytes_{n} {} explicit constexpr TokenStringMatch(const char *str) : str_{str} {} std::optional Parse(ParseState &state) const { space.Parse(state); const char *start{state.GetLocation()}; const char *p{str_}; std::optional at; // initially empty for (std::size_t j{0}; j < bytes_ && *p != '\0'; ++j, ++p) { bool spaceSkipping{*p == ' '}; if (spaceSkipping) { if (j + 1 == bytes_ || p[1] == ' ' || p[1] == '\0') { continue; // redundant; ignore } } if (!at.has_value()) { at = nextCh.Parse(state); if (!at.has_value()) { return std::nullopt; } } if (spaceSkipping) { if (**at == ' ') { at = nextCh.Parse(state); if (!at.has_value()) { return std::nullopt; } } else if constexpr (MandatoryFreeFormSpace) { MissingSpace(state); } // 'at' remains full for next iteration } else if (**at == ToLowerCaseLetter(*p)) { at.reset(); } else { state.Say(start, MessageExpectedText{str_, bytes_}); return std::nullopt; } } if constexpr (MustBeComplete) { if (auto after{state.PeekAtNextChar()}) { if (IsLegalInIdentifier(**after)) { state.Say(start, MessageExpectedText{str_, bytes_}); return std::nullopt; } } } state.set_anyTokenMatched(); if (IsLegalInIdentifier(p[-1])) { return spaceCheck.Parse(state); } else { return space.Parse(state); } } private: const char *const str_; const std::size_t bytes_{std::string::npos}; }; constexpr TokenStringMatch<> operator""_tok(const char str[], std::size_t n) { return {str, n}; } constexpr TokenStringMatch operator""_sptok( const char str[], std::size_t n) { return {str, n}; } constexpr TokenStringMatch operator""_id( const char str[], std::size_t n) { return {str, n}; } template inline constexpr std::enable_if_t, SequenceParser, PA>> operator>>(const char *str, const PA &p) { return SequenceParser, PA>{TokenStringMatch<>{str}, p}; } template inline constexpr std::enable_if_t, FollowParser>> operator/(const PA &p, const char *str) { return FollowParser>{p, TokenStringMatch<>{str}}; } template inline constexpr auto parenthesized(const PA &p) { return "(" >> p / ")"; } template inline constexpr auto bracketed(const PA &p) { return "[" >> p / "]"; } // Quoted character literal constants. struct CharLiteralChar { using resultType = std::pair; static std::optional Parse(ParseState &state) { auto at{state.GetLocation()}; if (std::optional cp{nextCh.Parse(state)}) { char ch{**cp}; if (ch == '\n') { state.Say(CharBlock{at, state.GetLocation()}, "Unclosed character constant"_err_en_US); return std::nullopt; } if (ch == '\\') { // Most escape sequences in character literals are processed later, // but we have to look for quotes here so that doubled quotes work. if (std::optional next{state.PeekAtNextChar()}) { char escaped{**next}; if (escaped == '\'' || escaped == '"' || escaped == '\\') { state.UncheckedAdvance(); return std::make_pair(escaped, true); } } } return std::make_pair(ch, false); } return std::nullopt; } }; template struct CharLiteral { using resultType = std::string; static std::optional Parse(ParseState &state) { std::string str; static constexpr auto nextch{attempt(CharLiteralChar{})}; while (auto ch{nextch.Parse(state)}) { if (ch->second) { str += '\\'; } else if (ch->first == quote) { static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})}; if (!doubled.Parse(state).has_value()) { return str; } } str += ch->first; } return std::nullopt; } }; // Parse "BOZ" binary literal quoted constants. // As extensions, support X as an alternate hexadecimal marker, and allow // BOZX markers to appear as suffixes. struct BOZLiteral { using resultType = std::string; static std::optional Parse(ParseState &state) { char base{'\0'}; auto baseChar{[&base](char ch) -> bool { switch (ch) { case 'b': case 'o': case 'z': base = ch; return true; case 'x': base = 'z'; return true; default: return false; } }}; space.Parse(state); const char *start{state.GetLocation()}; std::optional at{nextCh.Parse(state)}; if (!at.has_value()) { return std::nullopt; } if (**at == 'x' && !state.IsNonstandardOk( LanguageFeature::BOZExtensions, "nonstandard BOZ literal"_en_US)) { return std::nullopt; } if (baseChar(**at)) { at = nextCh.Parse(state); if (!at.has_value()) { return std::nullopt; } } char quote = **at; if (quote != '\'' && quote != '"') { return std::nullopt; } std::string content; while (true) { at = nextCh.Parse(state); if (!at.has_value()) { return std::nullopt; } if (**at == quote) { break; } if (**at == ' ') { continue; } if (!IsHexadecimalDigit(**at)) { return std::nullopt; } content += ToLowerCaseLetter(**at); } if (!base) { // extension: base allowed to appear as suffix, too if (!(at = nextCh.Parse(state)).has_value() || !baseChar(**at) || !state.IsNonstandardOk(LanguageFeature::BOZExtensions, "nonstandard BOZ literal"_en_US)) { return std::nullopt; } spaceCheck.Parse(state); } if (content.empty()) { state.Say(start, "no digit in BOZ literal"_err_en_US); return std::nullopt; } return {std::string{base} + '"' + content + '"'}; } }; // R711 digit-string -> digit [digit]... // N.B. not a token -- no space is skipped constexpr struct DigitString { using resultType = CharBlock; static std::optional Parse(ParseState &state) { if (std::optional ch1{state.PeekAtNextChar()}) { if (IsDecimalDigit(**ch1)) { state.UncheckedAdvance(); while (std::optional p{state.PeekAtNextChar()}) { if (!IsDecimalDigit(**p)) { break; } state.UncheckedAdvance(); } return CharBlock{*ch1, state.GetLocation()}; } } return std::nullopt; } } digitString; struct SignedIntLiteralConstantWithoutKind { using resultType = CharBlock; static std::optional Parse(ParseState &state) { resultType result{state.GetLocation()}; static constexpr auto sign{maybe("+-"_ch / space)}; if (sign.Parse(state).has_value()) { if (auto digits{digitString.Parse(state)}) { result.ExtendToCover(*digits); return result; } } return std::nullopt; } }; constexpr struct DigitString64 { using resultType = std::uint64_t; static std::optional Parse(ParseState &state) { std::optional firstDigit{digit.Parse(state)}; if (!firstDigit.has_value()) { return std::nullopt; } std::uint64_t value = **firstDigit - '0'; bool overflow{false}; static constexpr auto getDigit{attempt(digit)}; while (auto nextDigit{getDigit.Parse(state)}) { if (value > std::numeric_limits::max() / 10) { overflow = true; } value *= 10; int digitValue = **nextDigit - '0'; if (value > std::numeric_limits::max() - digitValue) { overflow = true; } value += digitValue; } if (overflow) { state.Say(*firstDigit, "overflow in decimal literal"_err_en_US); } return {value}; } } digitString64; // R707 signed-int-literal-constant -> [sign] int-literal-constant // N.B. Spaces are consumed before and after the sign, since the sign // and the int-literal-constant are distinct tokens. Does not // handle a trailing kind parameter. static std::optional SignedInteger( const std::optional &x, Location at, bool negate, ParseState &state) { if (!x.has_value()) { return std::nullopt; } std::uint64_t limit{std::numeric_limits::max()}; if (negate) { limit = -(limit + 1); } if (*x > limit) { state.Say(at, "overflow in signed decimal literal"_err_en_US); } std::int64_t value = *x; return std::make_optional(negate ? -value : value); } // R710 signed-digit-string -> [sign] digit-string // N.B. Not a complete token -- no space is skipped. // Used only in the exponent parts of real literal constants. struct SignedDigitString { using resultType = std::int64_t; static std::optional Parse(ParseState &state) { std::optional sign{state.PeekAtNextChar()}; if (!sign.has_value()) { return std::nullopt; } bool negate{**sign == '-'}; if (negate || **sign == '+') { state.UncheckedAdvance(); } return SignedInteger(digitString64.Parse(state), *sign, negate, state); } }; // Variants of the above for use in FORMAT specifications, where spaces // must be ignored. struct DigitStringIgnoreSpaces { using resultType = std::uint64_t; static std::optional Parse(ParseState &state) { static constexpr auto getFirstDigit{space >> digit}; std::optional firstDigit{getFirstDigit.Parse(state)}; if (!firstDigit.has_value()) { return std::nullopt; } std::uint64_t value = **firstDigit - '0'; bool overflow{false}; static constexpr auto getDigit{space >> attempt(digit)}; while (auto nextDigit{getDigit.Parse(state)}) { if (value > std::numeric_limits::max() / 10) { overflow = true; } value *= 10; int digitValue = **nextDigit - '0'; if (value > std::numeric_limits::max() - digitValue) { overflow = true; } value += digitValue; } if (overflow) { state.Say(*firstDigit, "overflow in decimal literal"_err_en_US); } return value; } }; struct PositiveDigitStringIgnoreSpaces { using resultType = std::int64_t; static std::optional Parse(ParseState &state) { Location at{state.GetLocation()}; return SignedInteger( DigitStringIgnoreSpaces{}.Parse(state), at, false /*positive*/, state); } }; struct SignedDigitStringIgnoreSpaces { using resultType = std::int64_t; static std::optional Parse(ParseState &state) { static constexpr auto getSign{space >> attempt("+-"_ch)}; bool negate{false}; if (std::optional sign{getSign.Parse(state)}) { negate = **sign == '-'; } Location at{state.GetLocation()}; return SignedInteger( DigitStringIgnoreSpaces{}.Parse(state), at, negate, state); } }; // Legacy feature: Hollerith literal constants struct HollerithLiteral { using resultType = std::string; static std::optional Parse(ParseState &state) { space.Parse(state); const char *start{state.GetLocation()}; std::optional charCount{ DigitStringIgnoreSpaces{}.Parse(state)}; if (!charCount.has_value() || *charCount < 1) { return std::nullopt; } static constexpr auto letterH{"h"_ch}; std::optional h{letterH.Parse(state)}; if (!h.has_value()) { return std::nullopt; } std::string content; for (auto j{*charCount}; j-- > 0;) { int chBytes{UTF_8CharacterBytes(state.GetLocation())}; for (int bytes{chBytes}; bytes > 0; --bytes) { if (std::optional at{nextCh.Parse(state)}) { if (chBytes == 1 && !std::isprint(**at)) { state.Say(start, "Bad character in Hollerith"_err_en_US); return std::nullopt; } content += **at; } else { state.Say(start, "Insufficient characters in Hollerith"_err_en_US); return std::nullopt; } } } return content; } }; constexpr struct ConsumedAllInputParser { using resultType = Success; constexpr ConsumedAllInputParser() {} static inline std::optional Parse(ParseState &state) { if (state.IsAtEnd()) { return {Success{}}; } return std::nullopt; } } consumedAllInput; template struct SkipPast { using resultType = Success; constexpr SkipPast() {} constexpr SkipPast(const SkipPast &) {} static std::optional Parse(ParseState &state) { while (std::optional p{state.GetNextChar()}) { if (**p == goal) { return {Success{}}; } } return std::nullopt; } }; template struct SkipTo { using resultType = Success; constexpr SkipTo() {} constexpr SkipTo(const SkipTo &) {} static std::optional Parse(ParseState &state) { while (std::optional p{state.PeekAtNextChar()}) { if (**p == goal) { return {Success{}}; } state.UncheckedAdvance(); } return std::nullopt; } }; // A common idiom in the Fortran grammar is an optional item (usually // a nonempty comma-separated list) that, if present, must follow a comma // and precede a doubled colon. When the item is absent, the comma must // not appear, and the doubled colons are optional. // [[, xyz] ::] is optionalBeforeColons(xyz) // [[, xyz]... ::] is optionalBeforeColons(nonemptyList(xyz)) template inline constexpr auto optionalBeforeColons(const PA &p) { return "," >> construct>(p) / "::" || ("::"_tok || !","_tok) >> defaulted(cut >> maybe(p)); } template inline constexpr auto optionalListBeforeColons(const PA &p) { return "," >> nonemptyList(p) / "::" || ("::"_tok || !","_tok) >> defaulted(cut >> nonemptyList(p)); } // Skip over empty lines, leading spaces, and some compiler directives (viz., // the ones that specify the source form) that might appear before the // next statement. Skip over empty statements (bare semicolons) when // not in strict standard conformance mode. Always succeeds. constexpr struct SkipStuffBeforeStatement { using resultType = Success; static std::optional Parse(ParseState &state) { if (UserState * ustate{state.userState()}) { if (ParsingLog * log{ustate->log()}) { // Save memory: vacate the parsing log before each statement unless // we're logging the whole parse for debugging. if (!ustate->instrumentedParse()) { log->clear(); } } } while (std::optional at{state.PeekAtNextChar()}) { if (**at == '\n' || **at == ' ') { state.UncheckedAdvance(); } else if (**at == '!') { static const char fixed[] = "!dir$ fixed\n", free[] = "!dir$ free\n"; static constexpr std::size_t fixedBytes{sizeof fixed - 1}; static constexpr std::size_t freeBytes{sizeof free - 1}; std::size_t remain{state.BytesRemaining()}; if (remain >= fixedBytes && std::memcmp(*at, fixed, fixedBytes) == 0) { state.set_inFixedForm(true).UncheckedAdvance(fixedBytes); } else if (remain >= freeBytes && std::memcmp(*at, free, freeBytes) == 0) { state.set_inFixedForm(false).UncheckedAdvance(freeBytes); } else { break; } } else if (**at == ';' && state.IsNonstandardOk( LanguageFeature::EmptyStatement, "empty statement"_en_US)) { state.UncheckedAdvance(); } else { break; } } return {Success{}}; } } skipStuffBeforeStatement; // R602 underscore -> _ constexpr auto underscore{"_"_ch}; // R516 keyword -> name // R601 alphanumeric-character -> letter | digit | underscore // R603 name -> letter [alphanumeric-character]... // N.B. Don't accept an underscore if it is immediately followed by a // quotation mark, so that kindParameter_"character literal" is parsed properly. // PGI and ifort accept '$' in identifiers, even as the initial character. // Cray and gfortran accept '$', but not as the first character. // Cray accepts '@' as well. constexpr auto otherIdChar{underscore / !"'\""_ch || extension("$@"_ch)}; constexpr auto nonDigitIdChar{letter || otherIdChar}; constexpr auto rawName{nonDigitIdChar >> many(nonDigitIdChar || digit)}; TYPE_PARSER(space >> sourced(rawName >> construct())) constexpr auto keyword{construct(name)}; constexpr auto logicalTRUE{ (".TRUE."_tok || extension(".T."_tok)) >> pure(true)}; constexpr auto logicalFALSE{ (".FALSE."_tok || extension(".F."_tok)) >> pure(false)}; // R1003 defined-unary-op -> . letter [letter]... . // R1023 defined-binary-op -> . letter [letter]... . // R1414 local-defined-operator -> defined-unary-op | defined-binary-op // R1415 use-defined-operator -> defined-unary-op | defined-binary-op // C1003 A defined operator must be distinct from logical literal constants // and intrinsic operator names; this is handled by attempting their parses // first, and by name resolution on their definitions, for best errors. // N.B. The name of the operator is captured with the dots around it. constexpr auto definedOpNameChar{ letter || extension("$@"_ch)}; TYPE_PARSER( space >> construct(sourced("."_ch >> some(definedOpNameChar) >> construct() / "."_ch))) } #endif // FORTRAN_PARSER_TOKEN_PARSERS_H_