llvm/flang/lib/parser/char-set.h

// Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef FORTRAN_PARSER_CHAR_SET_H_
#define FORTRAN_PARSER_CHAR_SET_H_

// Sets of distinct characters that are valid in Fortran programs outside
// character literals are encoded as 64-bit integers by mapping them to a 6-bit
// character set encoding in which the case of letters is lost (even if
// mixed case input reached the parser, which it does not).  These sets
// need to be suitable for constexprs, so std::bitset<> was not eligible.

#include <cinttypes>
#include <string>

namespace Fortran::parser {

struct SetOfChars {
  constexpr SetOfChars() {}

  constexpr SetOfChars(char c) {
    // This is basically the old DECSIX encoding, which maps the
    // 7-bit ASCII codes [32..95] to [0..63].  Only '#', '&', '?', '\', and '^'
    // in that range are unused in Fortran after preprocessing outside
    // character literals.  We repurpose '^' and '?' for newline and unknown
    // characters (resp.), leaving the others alone in case this code might
    // be useful in preprocssing.
    // TODO: EBCDIC?
    if (c == '\n') {
      // map newline to '^'
      c = '^';
    } else if (c < 32 || c >= 127) {
      // map other control characters, DEL, and 8-bit characters to '?'
      c = '?';
    } else if (c >= 96) {
      // map lower-case letters to upper-case
      c -= 32;
    }
    // range is now [32..95]; reduce to [0..63] and use as a shift count
    bits_ = static_cast<std::uint64_t>(1) << (c - 32);
  }

  constexpr SetOfChars(const char str[], std::size_t n) {
    for (std::size_t j{0}; j < n; ++j) {
      bits_ |= SetOfChars{str[j]}.bits_;
    }
  }

  constexpr SetOfChars(const SetOfChars &) = default;
  constexpr SetOfChars(SetOfChars &&) = default;
  constexpr SetOfChars &operator=(const SetOfChars &) = default;
  constexpr SetOfChars &operator=(SetOfChars &&) = default;
  constexpr bool empty() const { return bits_ == 0; }

  constexpr bool Has(SetOfChars that) const {
    return (that.bits_ & ~bits_) == 0;
  }
  constexpr SetOfChars Union(SetOfChars that) const {
    return SetOfChars{bits_ | that.bits_};
  }
  constexpr SetOfChars Intersection(SetOfChars that) const {
    return SetOfChars{bits_ & that.bits_};
  }
  constexpr SetOfChars Difference(SetOfChars that) const {
    return SetOfChars{bits_ & ~that.bits_};
  }

  std::string ToString() const;

private:
  constexpr SetOfChars(std::uint64_t b) : bits_{b} {}
  std::uint64_t bits_{0};
};

}  // namespace Fortran::parser
#endif  // FORTRAN_PARSER_CHAR_SET_H_