dbb202c5be
Original-commit: flang-compiler/f18@10a592a591 Reviewed-on: https://github.com/flang-compiler/f18/pull/535 Tree-same-pre-rewrite: false
233 lines
7.3 KiB
C++
233 lines
7.3 KiB
C++
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef FORTRAN_PARSER_CHARACTERS_H_
|
|
#define FORTRAN_PARSER_CHARACTERS_H_
|
|
|
|
// Define some character classification predicates and
|
|
// conversions here to avoid dependences upon <cctype> and
|
|
// also to accomodate Fortran tokenization.
|
|
|
|
#include <cstddef>
|
|
#include <optional>
|
|
#include <string>
|
|
|
|
namespace Fortran::parser {
|
|
|
|
// We can easily support Fortran program source in any character
|
|
// set whose first 128 code points correspond to ASCII codes 0-127 (ISO/IEC646).
|
|
// The specific encodings that we can handle include:
|
|
// LATIN_1: ISO 8859-1 Latin-1
|
|
// UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
|
|
enum class Encoding { LATIN_1, UTF_8 };
|
|
|
|
inline constexpr bool IsUpperCaseLetter(char ch) {
|
|
return ch >= 'A' && ch <= 'Z';
|
|
}
|
|
|
|
inline constexpr bool IsLowerCaseLetter(char ch) {
|
|
return ch >= 'a' && ch <= 'z';
|
|
}
|
|
|
|
inline constexpr bool IsLetter(char ch) {
|
|
return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
|
|
}
|
|
|
|
inline constexpr bool IsDecimalDigit(char ch) { return ch >= '0' && ch <= '9'; }
|
|
|
|
inline constexpr bool IsHexadecimalDigit(char ch) {
|
|
return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') ||
|
|
(ch >= 'a' && ch <= 'f');
|
|
}
|
|
|
|
inline constexpr bool IsOctalDigit(char ch) { return ch >= '0' && ch <= '7'; }
|
|
|
|
inline constexpr bool IsLegalIdentifierStart(char ch) {
|
|
return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
|
|
}
|
|
|
|
inline constexpr bool IsLegalInIdentifier(char ch) {
|
|
return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
|
|
}
|
|
|
|
inline constexpr char ToLowerCaseLetter(char ch) {
|
|
return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
|
|
}
|
|
|
|
inline constexpr char ToLowerCaseLetter(char &&ch) {
|
|
return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
|
|
}
|
|
|
|
inline std::string ToLowerCaseLetters(const std::string &str) {
|
|
std::string lowered{str};
|
|
for (char &ch : lowered) {
|
|
ch = ToLowerCaseLetter(ch);
|
|
}
|
|
return lowered;
|
|
}
|
|
|
|
inline constexpr char ToUpperCaseLetter(char ch) {
|
|
return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
|
|
}
|
|
|
|
inline constexpr char ToUpperCaseLetter(char &&ch) {
|
|
return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
|
|
}
|
|
|
|
inline std::string ToUpperCaseLetters(const std::string &str) {
|
|
std::string raised{str};
|
|
for (char &ch : raised) {
|
|
ch = ToUpperCaseLetter(ch);
|
|
}
|
|
return raised;
|
|
}
|
|
|
|
inline constexpr bool IsSameApartFromCase(char x, char y) {
|
|
return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
|
|
}
|
|
|
|
inline constexpr char DecimalDigitValue(char ch) { return ch - '0'; }
|
|
|
|
inline constexpr char HexadecimalDigitValue(char ch) {
|
|
return IsUpperCaseLetter(ch)
|
|
? ch - 'A' + 10
|
|
: IsLowerCaseLetter(ch) ? ch - 'a' + 10 : DecimalDigitValue(ch);
|
|
}
|
|
|
|
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
|
|
switch (ch) {
|
|
case 'a': return std::nullopt; // '\a'; PGF90 doesn't know \a
|
|
case 'b': return '\b';
|
|
case 'f': return '\f';
|
|
case 'n': return '\n';
|
|
case 'r': return '\r';
|
|
case 't': return '\t';
|
|
case 'v': return '\v';
|
|
case '"':
|
|
case '\'':
|
|
case '\\': return ch;
|
|
default: return std::nullopt;
|
|
}
|
|
}
|
|
|
|
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
|
|
switch (ch) {
|
|
case '\a': return std::nullopt; // 'a'; PGF90 doesn't know \a
|
|
case '\b': return 'b';
|
|
case '\f': return 'f';
|
|
case '\n': return 'n';
|
|
case '\r': return 'r';
|
|
case '\t': return 't';
|
|
case '\v': return 'v';
|
|
case '"':
|
|
case '\'':
|
|
case '\\': return ch;
|
|
default: return std::nullopt;
|
|
}
|
|
}
|
|
|
|
struct EncodedCharacter {
|
|
static constexpr int maxEncodingBytes{6};
|
|
char buffer[maxEncodingBytes];
|
|
int bytes{0};
|
|
};
|
|
|
|
template<Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
|
|
template<> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
|
|
template<> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);
|
|
|
|
EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
|
|
|
|
template<Encoding ENCODING, typename STRING>
|
|
std::string EncodeString(const STRING &);
|
|
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
|
|
const std::string &);
|
|
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
|
|
const std::u32string &);
|
|
|
|
// EmitQuotedChar drives callbacks "emit" and "insert" to output the
|
|
// bytes of an encoding for a codepoint.
|
|
template<typename NORMAL, typename INSERTED>
|
|
void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
|
|
bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
|
|
auto emitOneChar{[&](std::uint8_t ch) {
|
|
if (ch < ' ' || (backslashEscapes && (ch == '\\' || ch >= 0x7f))) {
|
|
insert('\\');
|
|
if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
|
|
emit(*escape);
|
|
} else {
|
|
// octal escape sequence; always emit 3 digits to avoid ambiguity
|
|
insert('0' + (ch >> 6));
|
|
insert('0' + ((ch >> 3) & 7));
|
|
insert('0' + (ch & 7));
|
|
}
|
|
} else {
|
|
emit(ch);
|
|
}
|
|
}};
|
|
if (ch <= 0x7f) {
|
|
emitOneChar(ch);
|
|
} else {
|
|
EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
|
|
for (int j{0}; j < encoded.bytes; ++j) {
|
|
emitOneChar(encoded.buffer[j]);
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string QuoteCharacterLiteral(const std::string &,
|
|
bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
|
|
std::string QuoteCharacterLiteral(const std::u16string &,
|
|
bool backslashEscapes = true, Encoding = Encoding::UTF_8);
|
|
std::string QuoteCharacterLiteral(const std::u32string &,
|
|
bool backslashEscapes = true, Encoding = Encoding::UTF_8);
|
|
|
|
int UTF_8CharacterBytes(const char *);
|
|
|
|
struct DecodedCharacter {
|
|
char32_t codepoint{0};
|
|
int bytes{0}; // signifying failure
|
|
};
|
|
|
|
template<Encoding ENCODING>
|
|
DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
|
|
template<>
|
|
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
|
|
const char *, std::size_t);
|
|
|
|
template<>
|
|
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
|
|
|
|
// DecodeCharacter optionally handles backslash escape sequences, too.
|
|
template<Encoding ENCODING>
|
|
DecodedCharacter DecodeCharacter(
|
|
const char *, std::size_t, bool backslashEscapes);
|
|
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
|
|
const char *, std::size_t, bool);
|
|
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
|
|
const char *, std::size_t, bool);
|
|
|
|
DecodedCharacter DecodeCharacter(
|
|
Encoding, const char *, std::size_t, bool backslashEscapes);
|
|
|
|
template<typename RESULT, Encoding ENCODING>
|
|
RESULT DecodeString(const std::string &, bool backslashEscapes);
|
|
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
|
|
const std::string &, bool);
|
|
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
|
|
const std::string &, bool);
|
|
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
|
|
const std::string &, bool);
|
|
}
|
|
#endif // FORTRAN_PARSER_CHARACTERS_H_
|