// llvm/flang/lib/parser/token-parsers.h
//
// (Source-viewer page residue: 669 lines, 22 KiB, C; "Raw", "Normal View",
// "History".)
//
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FORTRAN_PARSER_TOKEN_PARSERS_H_
#define FORTRAN_PARSER_TOKEN_PARSERS_H_
// These parsers are driven by the Fortran grammar (grammar.h) to consume
// the prescanned character stream and recognize context-sensitive tokens.
#include "basic-parsers.h"
#include "char-set.h"
#include "characters.h"
#include "instrumented-parser.h"
#include "provenance.h"
#include "type-parsers.h"
#include "../common/idioms.h"
#include <cctype>
#include <cstddef>
#include <cstring>
#include <functional>
#include <limits>
#include <list>
#include <optional>
#include <string>
namespace Fortran::parser {
// "xyz"_ch matches one instance of the characters x, y, or z without skipping
// any spaces before or after. The parser returns the location of the character
// on success.
// Single-character parser over a fixed set of characters.  Nothing is
// skipped around the character.  A match advances the state, marks that a
// token was matched, and yields the character's location; otherwise an
// "expected" message for the set is emitted and the parse fails.
class AnyOfChars {
public:
  using resultType = const char *;
  constexpr AnyOfChars(const AnyOfChars &) = default;
  constexpr AnyOfChars(SetOfChars set) : set_{set} {}
  std::optional<const char *> Parse(ParseState &state) const {
    std::optional<const char *> here{state.PeekAtNextChar()};
    if (!here.has_value() || !set_.Has(**here)) {
      // Either no character is available or it is not in the set.
      state.Say(MessageExpectedText{set_});
      return std::nullopt;
    }
    state.UncheckedAdvance();
    state.set_anyTokenMatched();
    return here;
  }

private:
  const SetOfChars set_;
};
// User-defined literal: "xyz"_ch builds an AnyOfChars parser over the
// characters of the literal.
constexpr AnyOfChars operator""_ch(const char str[], std::size_t n) {
  const SetOfChars members(str, n);
  return AnyOfChars{members};
}
constexpr auto letter{"abcdefghijklmnopqrstuvwxyz"_ch};
constexpr auto digit{"0123456789"_ch};
// Skips over optional spaces. Always succeeds.
constexpr struct Space {
  using resultType = Success;
  constexpr Space() {}
  // Advances past every ' ' at the current position; stops at the first
  // other character or when no character is available.
  static std::optional<Success> Parse(ParseState &state) {
    for (;;) {
      std::optional<const char *> next{state.PeekAtNextChar()};
      if (!next.has_value() || **next != ' ') {
        return {Success{}};
      }
      state.UncheckedAdvance();
    }
  }
} space;
// Skips a space that in free form requires a warning if it precedes a
// character that could begin an identifier or keyword. Always succeeds.
// Reports a missing space as a nonstandard usage
// (LanguageFeature::OptionalFreeFormSpace).  Nothing is reported when the
// state is in fixed form.
inline void MissingSpace(ParseState &state) {
  if (state.inFixedForm()) {
    return;
  }
  state.Nonstandard(
      LanguageFeature::OptionalFreeFormSpace, "missing space"_en_US);
}
constexpr struct SpaceCheck {
  using resultType = Success;
  constexpr SpaceCheck() {}
  // If the next character is a space, consumes it and any that follow.
  // Otherwise, if the next character could be part of an identifier,
  // calls MissingSpace().  Succeeds in every case.
  static std::optional<Success> Parse(ParseState &state) {
    std::optional<const char *> next{state.PeekAtNextChar()};
    if (!next.has_value()) {
      return {Success{}};
    }
    const char ch{**next};
    if (ch == ' ') {
      state.UncheckedAdvance();
      return space.Parse(state);
    }
    if (IsLegalInIdentifier(ch)) {
      MissingSpace(state);
    }
    return {Success{}};
  }
} spaceCheck;
// Matches a token string. Spaces in the token string denote where
// spaces may appear in the source; they can be made mandatory for
// some free form keyword sequences. Missing mandatory spaces in free
// form elicit a warning; they are not necessary for recognition.
// Spaces before and after the token are also skipped.
//
// Token strings appear in the grammar as C++ user-defined literals
// like "BIND ( C )"_tok and "SYNC ALL"_sptok. The _tok suffix is implied
// when a string literal appears before the sequencing operator >> or
// after the sequencing operator /.
// Parser that matches a fixed token string.  A space in the token string
// marks a position where a space may appear in the source; when
// mandatoryFreeFormSpace_ is set, a space that is absent at such a position
// is reported through MissingSpace() but does not prevent the match.
class TokenStringMatch {
public:
  using resultType = Success;
  constexpr TokenStringMatch(const TokenStringMatch &) = default;
  // Token text with an explicit byte count.
  constexpr TokenStringMatch(const char *str, std::size_t n, bool mandatory)
    : str_{str}, bytes_{n}, mandatoryFreeFormSpace_{mandatory} {}
  // NUL-terminated token text: bytes_ keeps its default (npos), so the
  // matching loop below ends at the terminator.
  constexpr TokenStringMatch(const char *str, bool mandatory)
    : str_{str}, mandatoryFreeFormSpace_{mandatory} {}

  // Skips leading spaces, matches the token text, then handles trailing
  // space.  Returns std::nullopt when a character fails to match (with an
  // "expected" message at the token's start) or when the input runs out.
  std::optional<Success> Parse(ParseState &state) const {
    space.Parse(state);
    const char *start{state.GetLocation()};
    const char *p{str_};
    // 'at' holds a source character that has been read but not yet matched
    // against the token; it is empty when the next one must be fetched.
    std::optional<const char *> at;
    for (std::size_t j{0}; j < bytes_ && *p != '\0'; ++j, ++p) {
      const auto spaceSkipping{*p == ' '};
      if (spaceSkipping) {
        // A space that ends the token, or that precedes another space,
        // adds nothing: skip it.
        if (j + 1 == bytes_ || p[1] == ' ' || p[1] == '\0') {
          continue; // redundant; ignore
        }
      }
      if (!at.has_value()) {
        at = nextCh.Parse(state);
        if (!at.has_value()) {
          return std::nullopt;
        }
      }
      if (spaceSkipping) {
        if (**at == ' ') {
          at = nextCh.Parse(state);
          if (!at.has_value()) {
            return std::nullopt;
          }
        } else if (mandatoryFreeFormSpace_) {
          MissingSpace(state);
        }
        // 'at' remains full for next iteration
      } else if (**at == ToLowerCaseLetter(*p)) {
        at.reset();
      } else {
        state.Say(start, MessageExpectedText{str_, bytes_});
        return std::nullopt;
      }
    }
    state.set_anyTokenMatched();
    // p == str_ only when the token string is empty; p[-1] would then read
    // before the start of the string, so treat that case as "not an
    // identifier character" and just skip spaces.
    if (p != str_ && IsLegalInIdentifier(p[-1])) {
      return spaceCheck.Parse(state);
    } else {
      return space.Parse(state);
    }
  }

private:
  const char *const str_;
  const std::size_t bytes_{std::string::npos};
  const bool mandatoryFreeFormSpace_;
};
// "..."_tok: token parser that does not call for mandatory free form spaces.
constexpr TokenStringMatch operator""_tok(const char str[], std::size_t n) {
  constexpr bool mandatorySpace{false};
  return TokenStringMatch{str, n, mandatorySpace};
}
// "..."_sptok: token parser whose embedded spaces are mandatory in free form.
constexpr TokenStringMatch operator""_sptok(const char str[], std::size_t n) {
  constexpr bool mandatorySpace{true};
  return TokenStringMatch{str, n, mandatorySpace};
}
// A bare string literal on the left of >> is treated as a token parser
// (without mandatory spaces) sequenced before the parser p.
template<class PA>
inline constexpr std::enable_if_t<std::is_class_v<PA>,
    SequenceParser<TokenStringMatch, PA>>
operator>>(const char *str, const PA &p) {
  using Sequence = SequenceParser<TokenStringMatch, PA>;
  return Sequence{TokenStringMatch{str, /*mandatory=*/false}, p};
}
// A bare string literal on the right of / is treated as a token parser
// (without mandatory spaces) that must follow the parser p.
template<class PA>
inline constexpr std::enable_if_t<std::is_class_v<PA>,
    FollowParser<PA, TokenStringMatch>>
operator/(const PA &p, const char *str) {
  using Follow = FollowParser<PA, TokenStringMatch>;
  return Follow{p, TokenStringMatch{str, /*mandatory=*/false}};
}
// ( p ): the parser p between "(" and ")" tokens.
template<class PA> inline constexpr auto parenthesized(const PA &p) {
  // '/' binds more tightly than '>>'; the grouping is spelled out here.
  return "(" >> (p / ")");
}
// [ p ]: the parser p between "[" and "]" tokens.
template<class PA> inline constexpr auto bracketed(const PA &p) {
  // '/' binds more tightly than '>>'; the grouping is spelled out here.
  return "[" >> (p / "]");
}
// Quoted character literal constants.
// Reads one character of a quoted character literal.  The result pairs the
// character with a flag telling whether it was preceded by a backslash that
// this parser consumed.  Fails on a newline (with an error message) or
// when no character can be read.
struct CharLiteralChar {
  using resultType = std::pair<char, bool /* was escaped */>;
  static std::optional<resultType> Parse(ParseState &state) {
    const auto begin{state.GetLocation()};
    std::optional<const char *> cp{nextCh.Parse(state)};
    if (!cp.has_value()) {
      return std::nullopt;
    }
    const char ch{**cp};
    if (ch == '\n') {
      state.Say(CharBlock{begin, state.GetLocation()},
          "Unclosed character constant"_err_en_US);
      return std::nullopt;
    }
    if (ch == '\\') {
      // Only \' \" and \\ are handled here, so that an escaped quote is not
      // mistaken for the end of the literal; any other backslash is
      // returned as an ordinary character.
      if (std::optional<const char *> next{state.PeekAtNextChar()}) {
        const char escaped{**next};
        if (escaped == '\'' || escaped == '"' || escaped == '\\') {
          state.UncheckedAdvance();
          return std::make_pair(escaped, true);
        }
      }
    }
    return std::make_pair(ch, false);
  }
};
// Collects the body of a character literal delimited by 'quote'; the
// opening quote is not consumed here.  A doubled quote contributes a single
// quote character; an escaped character keeps its backslash in the result.
// Fails if the characters run out before an undoubled closing quote.
template<char quote> struct CharLiteral {
  using resultType = std::string;
  static std::optional<std::string> Parse(ParseState &state) {
    static constexpr auto oneChar{attempt(CharLiteralChar{})};
    static constexpr auto secondQuote{attempt(AnyOfChars{SetOfChars{quote}})};
    std::string text;
    for (auto item{oneChar.Parse(state)}; item.has_value();
         item = oneChar.Parse(state)) {
      const auto [ch, wasEscaped] = *item;
      if (wasEscaped) {
        text += '\\';
      } else if (ch == quote && !secondQuote.Parse(state).has_value()) {
        return text; // closing quote that is not doubled
      }
      text += ch;
    }
    return std::nullopt;
  }
};
// Parse "BOZ" binary literal quoted constants.
// As extensions, support X as an alternate hexadecimal marker, and allow
// BOZX markers to appear as suffixes.
// Parses a quoted BOZ constant and returns it as base letter, '"', the
// lower-cased digits with spaces removed, and '"'.  The base marker may be
// a prefix or (as an extension) a suffix; 'x' is accepted as an extension
// and is recorded as 'z'.
struct BOZLiteral {
  using resultType = std::string;
  static std::optional<resultType> Parse(ParseState &state) {
    char base{'\0'};
    // Records the base for a marker letter; returns false for anything else.
    const auto noteBase{[&base](char ch) -> bool {
      if (ch == 'b' || ch == 'o' || ch == 'z') {
        base = ch;
        return true;
      }
      if (ch == 'x') {
        base = 'z';
        return true;
      }
      return false;
    }};

    space.Parse(state);
    const char *start{state.GetLocation()};
    std::optional<const char *> at{nextCh.Parse(state)};
    if (!at.has_value()) {
      return std::nullopt;
    }
    if (**at == 'x' &&
        !state.IsNonstandardOk(
            LanguageFeature::BOZExtensions, "nonstandard BOZ literal"_en_US)) {
      return std::nullopt;
    }
    // Optional prefix marker; the character after it must be the quote.
    if (noteBase(**at)) {
      at = nextCh.Parse(state);
      if (!at.has_value()) {
        return std::nullopt;
      }
    }
    const char quote{**at};
    if (quote != '\'' && quote != '"') {
      return std::nullopt;
    }

    // Digits up to the matching quote; embedded spaces are dropped.
    std::string content;
    for (;;) {
      at = nextCh.Parse(state);
      if (!at.has_value()) {
        return std::nullopt;
      }
      const char ch{**at};
      if (ch == quote) {
        break;
      }
      if (ch == ' ') {
        continue;
      }
      if (!IsHexadecimalDigit(ch)) {
        return std::nullopt;
      }
      content += ToLowerCaseLetter(ch);
    }

    if (!base) {
      // extension: base allowed to appear as suffix, too
      at = nextCh.Parse(state);
      if (!at.has_value() || !noteBase(**at) ||
          !state.IsNonstandardOk(LanguageFeature::BOZExtensions,
              "nonstandard BOZ literal"_en_US)) {
        return std::nullopt;
      }
      spaceCheck.Parse(state);
    }
    if (content.empty()) {
      state.Say(start, "no digit in BOZ literal"_err_en_US);
      return std::nullopt;
    }
    return {std::string{base} + '"' + content + '"'};
  }
};
// R711 digit-string -> digit [digit]...
// N.B. not a token -- no space is skipped
constexpr struct DigitString {
  using resultType = CharBlock;
  // Consumes a maximal run of one or more decimal digits and returns the
  // block of source text that they occupy; fails without consuming
  // anything when the next character is not a digit.
  static std::optional<resultType> Parse(ParseState &state) {
    const std::optional<const char *> first{state.PeekAtNextChar()};
    if (!first.has_value() || !IsDecimalDigit(**first)) {
      return std::nullopt;
    }
    state.UncheckedAdvance();
    for (std::optional<const char *> p{state.PeekAtNextChar()};
         p.has_value() && IsDecimalDigit(**p); p = state.PeekAtNextChar()) {
      state.UncheckedAdvance();
    }
    return CharBlock{*first, state.GetLocation()};
  }
} digitString;
// An optional sign (spaces after it are skipped) followed by a digit
// string.  The result covers the source text from the starting location
// through the digits.
struct SignedIntLiteralConstantWithoutKind {
  using resultType = CharBlock;
  static std::optional<resultType> Parse(ParseState &state) {
    static constexpr auto sign{maybe("+-"_ch / space)};
    resultType result{state.GetLocation()};
    if (!sign.Parse(state).has_value()) {
      return std::nullopt;
    }
    auto digits{digitString.Parse(state)};
    if (!digits.has_value()) {
      return std::nullopt;
    }
    result.ExtendToCover(*digits);
    return result;
  }
};
constexpr struct DigitString64 {
using resultType = std::uint64_t;
static std::optional<std::uint64_t> Parse(ParseState &state) {
std::optional<const char *> firstDigit{digit.Parse(state)};
if (!firstDigit.has_value()) {
return std::nullopt;
}
std::uint64_t value = **firstDigit - '0';
bool overflow{false};
static constexpr auto getDigit{attempt(digit)};
while (auto nextDigit{getDigit.Parse(state)}) {
if (value > std::numeric_limits<std::uint64_t>::max() / 10) {
overflow = true;
}
value *= 10;
int digitValue = **nextDigit - '0';
if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
overflow = true;
}
value += digitValue;
}
if (overflow) {
state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
}
return {value};
}
} digitString64;
// R707 signed-int-literal-constant -> [sign] int-literal-constant
// N.B. Spaces are consumed before and after the sign, since the sign
// and the int-literal-constant are distinct tokens. Does not
// handle a trailing kind parameter.
// Converts an unsigned magnitude and a sign flag to a signed 64-bit value.
//   x      - the magnitude; an empty optional is passed through as failure
//   at     - location used for the overflow message
//   negate - true when the value is to be negated
//   state  - receives the overflow message
// When the magnitude does not fit (more than 2**63-1, or more than 2**63
// when negating), an error is reported and the wrapped value is returned.
static std::optional<std::int64_t> SignedInteger(
    const std::optional<std::uint64_t> &x, Location at, bool negate,
    ParseState &state) {
  if (!x.has_value()) {
    return std::nullopt;
  }
  std::uint64_t limit{std::numeric_limits<std::int64_t>::max()};
  if (negate) {
    limit += 1; // the most negative value has magnitude 2**63
  }
  if (*x > limit) {
    state.Say(at, "overflow in signed decimal literal"_err_en_US);
  }
  // Negate in unsigned arithmetic, which wraps, rather than negating a
  // signed value: -INT64_MIN (magnitude 2**63 with negate) would be signed
  // overflow.
  const std::uint64_t bits{negate ? std::uint64_t{0} - *x : *x};
  return std::make_optional<std::int64_t>(static_cast<std::int64_t>(bits));
}
// R710 signed-digit-string -> [sign] digit-string
// N.B. Not a complete token -- no space is skipped.
// Used only in the exponent parts of real literal constants.
// An optional '+' or '-' immediately followed by a digit string, converted
// to a signed 64-bit value.  No spaces are skipped.
struct SignedDigitString {
  using resultType = std::int64_t;
  static std::optional<std::int64_t> Parse(ParseState &state) {
    std::optional<const char *> sign{state.PeekAtNextChar()};
    if (!sign.has_value()) {
      return std::nullopt;
    }
    const char first{**sign};
    const bool negate{first == '-'};
    if (negate || first == '+') {
      state.UncheckedAdvance();
    }
    return SignedInteger(digitString64.Parse(state), *sign, negate, state);
  }
};
// Variants of the above for use in FORMAT specifications, where spaces
// must be ignored.
struct DigitStringIgnoreSpaces {
using resultType = std::uint64_t;
static std::optional<std::uint64_t> Parse(ParseState &state) {
static constexpr auto getFirstDigit{space >> digit};
std::optional<const char *> firstDigit{getFirstDigit.Parse(state)};
if (!firstDigit.has_value()) {
return std::nullopt;
}
std::uint64_t value = **firstDigit - '0';
bool overflow{false};
static constexpr auto getDigit{space >> attempt(digit)};
while (auto nextDigit{getDigit.Parse(state)}) {
if (value > std::numeric_limits<std::uint64_t>::max() / 10) {
overflow = true;
}
value *= 10;
int digitValue = **nextDigit - '0';
if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
overflow = true;
}
value += digitValue;
}
if (overflow) {
state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
}
return value;
}
};
struct PositiveDigitStringIgnoreSpaces {
using resultType = std::int64_t;
static std::optional<std::int64_t> Parse(ParseState &state) {
Location at{state.GetLocation()};
return SignedInteger(
DigitStringIgnoreSpaces{}.Parse(state), at, false /*positive*/, state);
}
};
// An optional sign followed by DigitStringIgnoreSpaces, with spaces skipped
// before the sign, returned as a signed 64-bit value.
struct SignedDigitStringIgnoreSpaces {
  using resultType = std::int64_t;
  static std::optional<std::int64_t> Parse(ParseState &state) {
    static constexpr auto getSign{space >> attempt("+-"_ch)};
    const std::optional<const char *> sign{getSign.Parse(state)};
    const bool negate{sign.has_value() && **sign == '-'};
    const Location at{state.GetLocation()};
    return SignedInteger(
        DigitStringIgnoreSpaces{}.Parse(state), at, negate, state);
  }
};
// Legacy feature: Hollerith literal constants
// Parses a Hollerith constant: a positive character count, the letter 'h',
// and then that many characters, which are returned as the result.  The
// count is in characters; UTF_8CharacterBytes() gives the number of bytes
// to read for each one.
struct HollerithLiteral {
  using resultType = std::string;
  static std::optional<std::string> Parse(ParseState &state) {
    space.Parse(state);
    const char *start{state.GetLocation()};
    std::optional<std::uint64_t> charCount{
        DigitStringIgnoreSpaces{}.Parse(state)};
    if (!charCount.has_value() || *charCount < 1) {
      return std::nullopt;
    }
    static constexpr auto letterH{"h"_ch};
    if (!letterH.Parse(state).has_value()) {
      return std::nullopt;
    }
    std::string content;
    for (auto j{*charCount}; j-- > 0;) {
      int chBytes{UTF_8CharacterBytes(state.GetLocation())};
      for (int bytes{chBytes}; bytes > 0; --bytes) {
        if (std::optional<const char *> at{nextCh.Parse(state)}) {
          // std::isprint requires a value representable as unsigned char
          // (or EOF); passing a negative plain char is undefined behavior.
          if (chBytes == 1 &&
              !std::isprint(static_cast<unsigned char>(**at))) {
            state.Say(start, "Bad character in Hollerith"_err_en_US);
            return std::nullopt;
          }
          content += **at;
        } else {
          state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
          return std::nullopt;
        }
      }
    }
    return content;
  }
};
constexpr struct ConsumedAllInputParser {
using resultType = Success;
constexpr ConsumedAllInputParser() {}
static inline std::optional<Success> Parse(ParseState &state) {
if (state.IsAtEnd()) {
return {Success{}};
}
return std::nullopt;
}
} consumedAllInput;
template<char goal> struct SkipPast {
using resultType = Success;
constexpr SkipPast() {}
constexpr SkipPast(const SkipPast &) {}
static std::optional<Success> Parse(ParseState &state) {
while (std::optional<const char *> p{state.GetNextChar()}) {
if (**p == goal) {
return {Success{}};
}
}
return std::nullopt;
}
};
template<char goal> struct SkipTo {
using resultType = Success;
constexpr SkipTo() {}
constexpr SkipTo(const SkipTo &) {}
static std::optional<Success> Parse(ParseState &state) {
while (std::optional<const char *> p{state.PeekAtNextChar()}) {
if (**p == goal) {
return {Success{}};
}
state.UncheckedAdvance();
}
return std::nullopt;
}
};
// A common idiom in the Fortran grammar is an optional item (usually
// a nonempty comma-separated list) that, if present, must follow a comma
// and precede a doubled colon. When the item is absent, the comma must
// not appear, and the doubled colons are optional.
// [[, xyz] ::] is optionalBeforeColons(xyz)
// [[, xyz]... ::] is optionalBeforeColons(nonemptyList(xyz))
template<typename PA> inline constexpr auto optionalBeforeColons(const PA &p) {
  using Item = std::optional<typename PA::resultType>;
  // First alternative:  , p ::
  // Second alternative: either "::" or the absence of a comma, then an
  // optional p, defaulted when the parse after the cut fails.
  // ('/' binds more tightly than '>>', which binds more tightly than '||';
  // the grouping is spelled out here.)
  return ("," >> (construct<Item>(p) / "::")) ||
      (("::"_tok || !","_tok) >> defaulted(cut >> maybe(p)));
}
// The list form of the same idiom: [[, p-list] ::]
template<typename PA>
inline constexpr auto optionalListBeforeColons(const PA &p) {
  // First alternative:  , nonempty-list-of-p ::
  // Second alternative: either "::" or the absence of a comma, then a
  // nonempty list of p, defaulted when the parse after the cut fails.
  // ('/' binds more tightly than '>>', which binds more tightly than '||';
  // the grouping is spelled out here.)
  return ("," >> (nonemptyList(p) / "::")) ||
      (("::"_tok || !","_tok) >> defaulted(cut >> nonemptyList(p)));
}
// Skip over empty lines, leading spaces, and some compiler directives (viz.,
// the ones that specify the source form) that might appear before the
// next statement. Skip over empty statements (bare semicolons) when
// not in strict standard conformance mode. Always succeeds.
constexpr struct SkipStuffBeforeStatement {
  using resultType = Success;
  static std::optional<Success> Parse(ParseState &state) {
    // Save memory: empty the parsing log before each statement unless the
    // whole parse is being instrumented.
    if (UserState * ustate{state.userState()}) {
      ParsingLog *log{ustate->log()};
      if (log && !ustate->instrumentedParse()) {
        log->clear();
      }
    }

    // Source form directives that are recognized and consumed here.
    static const char fixedDirective[] = "!dir$ fixed\n";
    static const char freeDirective[] = "!dir$ free\n";
    static constexpr std::size_t fixedBytes{sizeof fixedDirective - 1};
    static constexpr std::size_t freeBytes{sizeof freeDirective - 1};

    for (;;) {
      std::optional<const char *> at{state.PeekAtNextChar()};
      if (!at.has_value()) {
        break;
      }
      const char ch{**at};
      if (ch == '\n' || ch == ' ') {
        state.UncheckedAdvance();
        continue;
      }
      if (ch == '!') {
        const std::size_t remain{state.BytesRemaining()};
        if (remain >= fixedBytes &&
            std::memcmp(*at, fixedDirective, fixedBytes) == 0) {
          state.set_inFixedForm(true).UncheckedAdvance(fixedBytes);
          continue;
        }
        if (remain >= freeBytes &&
            std::memcmp(*at, freeDirective, freeBytes) == 0) {
          state.set_inFixedForm(false).UncheckedAdvance(freeBytes);
          continue;
        }
        break; // some other '!' line: leave it in place
      }
      if (ch == ';' &&
          state.IsNonstandardOk(
              LanguageFeature::EmptyStatement, "empty statement"_en_US)) {
        state.UncheckedAdvance();
        continue;
      }
      break;
    }
    return {Success{}};
  }
} skipStuffBeforeStatement;
// R602 underscore -> _
constexpr auto underscore{"_"_ch};
// R516 keyword -> name
// R601 alphanumeric-character -> letter | digit | underscore
// R603 name -> letter [alphanumeric-character]...
// N.B. Don't accept an underscore if it is immediately followed by a
// quotation mark, so that kindParameter_"character literal" is parsed properly.
// PGI and ifort accept '$' in identifiers, even as the initial character.
// Cray and gfortran accept '$', but not as the first character.
// Cray accepts '@' as well.
// An identifier character that is neither a letter nor a digit: an
// underscore that is not followed by ' or ", or (under the
// PunctuationInNames extension) '$' or '@'.
constexpr auto otherIdChar{underscore / !"'\""_ch ||
extension<LanguageFeature::PunctuationInNames>("$@"_ch)};
// Any identifier character other than a digit.
constexpr auto nonDigitIdChar{letter || otherIdChar};
// A non-digit identifier character followed by any number of identifier
// characters; no spaces are skipped.
constexpr auto rawName{nonDigitIdChar >> many(nonDigitIdChar || digit)};
// Parser for Name: skips leading spaces, matches rawName, and wraps the
// constructed Name in sourced().
TYPE_PARSER(space >> sourced(rawName >> construct<Name>()))
// A Keyword is built from 'name', which is not declared in this section
// (presumably the parser for Name -- confirm against type-parsers.h).
constexpr auto keyword{construct<Keyword>(name)};
// Matches the token .TRUE. or, under the LogicalAbbreviations extension,
// .T.; the sequence ends with pure(true).
constexpr auto logicalTRUE{
(".TRUE."_tok ||
extension<LanguageFeature::LogicalAbbreviations>(".T."_tok)) >>
pure(true)};
// Matches the token .FALSE. or, under the LogicalAbbreviations extension,
// .F.; the sequence ends with pure(false).
constexpr auto logicalFALSE{
(".FALSE."_tok ||
extension<LanguageFeature::LogicalAbbreviations>(".F."_tok)) >>
pure(false)};
// R1003 defined-unary-op -> . letter [letter]... .
// R1023 defined-binary-op -> . letter [letter]... .
// R1414 local-defined-operator -> defined-unary-op | defined-binary-op
// R1415 use-defined-operator -> defined-unary-op | defined-binary-op
// C1003 A defined operator must be distinct from logical literal constants
// and intrinsic operator names; this is handled by attempting their parses
// first, and by name resolution on their definitions, for best errors.
// (Source-history annotation, not part of the code; dated
// 2019-03-18 19:48:02 +01:00.)
// [flang] Name resolution for defined operators
// Instead of tracking just genericName_ while in a generic interface block
// or generic statement, now we immediately create a symbol for it. A
// parser::Name isn't good enough because a defined-operator or
// defined-io-generic-spec doesn't have a name.
// Change the parse tree to add a source field to GenericSpec. Use these as
// names for symbols for defined-operator and defined-io-generic-spec (e.g.
// "operator(+)" or "read(formatted)").
// Change the source for defined-op-name to include the dots so that they
// can be distinguished from normal symbols with the same name (e.g. you can
// have both ".foo." and "foo"). These symbols have names in the symbol
// table like ".foo.", not "operator(.foo.)", because references to them
// have that form.
// Add GenericKind enum to GenericDetails and GenericBindingDetails. This
// allows us to know a symbol is "assignment(=)", for example, without
// having to do a string comparison.
// Add GenericSpecInfo to handle analyzing the various kinds of generic-spec
// and generating symbol names and GenericKind for them.
// Add reference to LanguageFeatureControl to SemanticsContext so that they
// can be checked during semantics. For this change, if LogicalAbbreviations
// is enabled, report an error if the user tries to define an operator named
// ".T." or ".F.".
// Add resolve-name-utils.cc to hold utility functions and classes that
// don't have to be in the ResolveNamesVisitor class hierarchy. The goal is
// to reduce the size of resolve-names.cc where possible.
// Original-commit: flang-compiler/f18@3081f694e21dbcaef2554198a682c9af57f2e185
// Reviewed-on: https://github.com/flang-compiler/f18/pull/338
// N.B. The name of the operator is captured with the dots around it.
// A character of a defined operator's name: a letter or, under the
// PunctuationInNames extension, '$' or '@'.
constexpr auto definedOpNameChar{
letter || extension<LanguageFeature::PunctuationInNames>("$@"_ch)};
// (Source-history annotation, not part of the code; dated
// 2019-03-18 19:48:02 +01:00.)
// [flang] Name resolution for defined operators
// Instead of tracking just genericName_ while in a generic interface block
// or generic statement, now we immediately create a symbol for it. A
// parser::Name isn't good enough because a defined-operator or
// defined-io-generic-spec doesn't have a name.
// Change the parse tree to add a source field to GenericSpec. Use these as
// names for symbols for defined-operator and defined-io-generic-spec (e.g.
// "operator(+)" or "read(formatted)").
// Change the source for defined-op-name to include the dots so that they
// can be distinguished from normal symbols with the same name (e.g. you can
// have both ".foo." and "foo"). These symbols have names in the symbol
// table like ".foo.", not "operator(.foo.)", because references to them
// have that form.
// Add GenericKind enum to GenericDetails and GenericBindingDetails. This
// allows us to know a symbol is "assignment(=)", for example, without
// having to do a string comparison.
// Add GenericSpecInfo to handle analyzing the various kinds of generic-spec
// and generating symbol names and GenericKind for them.
// Add reference to LanguageFeatureControl to SemanticsContext so that they
// can be checked during semantics. For this change, if LogicalAbbreviations
// is enabled, report an error if the user tries to define an operator named
// ".T." or ".F.".
// Add resolve-name-utils.cc to hold utility functions and classes that
// don't have to be in the ResolveNamesVisitor class hierarchy. The goal is
// to reduce the size of resolve-names.cc where possible.
// Original-commit: flang-compiler/f18@3081f694e21dbcaef2554198a682c9af57f2e185
// Reviewed-on: https://github.com/flang-compiler/f18/pull/338
// Parser for DefinedOpName: after leading spaces, a '.', one or more
// definedOpNameChar, and a closing '.'.  Both "."_ch parsers are inside
// sourced(), so the dots fall within the source range that it records.
TYPE_PARSER(
space >> construct<DefinedOpName>(sourced("."_ch >>
some(definedOpNameChar) >> construct<Name>() / "."_ch)))
}
#endif // FORTRAN_PARSER_TOKEN_PARSERS_H_