[pseudo] A basic implementation of compiling cxx grammar at build time.

The main idea is to compile the cxx grammar at build time, and construct
the core pieces (Grammar, LRTable) of the pseudoparser based on the compiled
data sources.

This is a tiny implementation, which is a good starting point:

- defines what the public API should look like;
- integrates the cxx grammar compilation workflow with the cmake system;
- only the nonterminal symbols of the C++ grammar are compiled; everything
  else still does the real compilation work at runtime, and we can opt in
  more bits in the future;
- splits the monolithic clangPseudo library for better layering.

Reviewed By: sammccall

Differential Revision: https://reviews.llvm.org/D125667
This commit is contained in:
Haojian Wu 2022-05-24 20:21:45 +02:00
parent 8919447c71
commit cd2292ef82
14 changed files with 246 additions and 5 deletions

View file

@ -1,5 +1,7 @@
# Header search paths for everything configured from this directory: the
# checked-in headers plus the include directory inside the build tree.
include_directories(include)
include_directories("${CMAKE_CURRENT_BINARY_DIR}/include")

# Descend into each component directory, in this order.
foreach(pseudo_subdir IN ITEMS include gen lib tool fuzzer)
  add_subdirectory(${pseudo_subdir})
endforeach()

View file

@ -0,0 +1,10 @@
# LLVM components the generator tool links against.
set(LLVM_LINK_COMPONENTS Support)

# The grammar compiler executable; Main.cpp is its only source file.
add_clang_executable(pseudo-gen Main.cpp)

# The grammar library is only needed to build the tool itself.
target_link_libraries(pseudo-gen PRIVATE clangPseudoGrammar)

View file

@ -0,0 +1,89 @@
//===--- Main.cpp - Compile BNF grammar -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a tool to compile a BNF grammar. It is used by the build system to
// generate the data needed to statically construct the core pieces (Grammar,
// LRTable etc) of the LR parser.
//
//===----------------------------------------------------------------------===//
#include "clang-pseudo/Grammar.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MemoryBuffer.h"
#include <algorithm>
using llvm::cl::desc;
using llvm::cl::init;
using llvm::cl::opt;
using llvm::cl::values;
namespace {
// Selects which artifact the tool prints to stdout.
enum EmitType {
  EmitSymbolList,     // One NONTERMINAL(name, id) line per nonterminal.
  EmitGrammarContent, // The grammar text as C string literals, one per line.
};

// Path of the BNF grammar file to compile. main() refuses to run when this
// flag was not given on the command line.
opt<std::string> Grammar("grammar", desc("Parse a BNF grammar file."),
                         init(""));

// Output mode. The default is spelled out with init() so that it does not
// depend on EmitSymbolList being the zero-valued enumerator.
opt<EmitType>
    Emit(desc("which information to emit:"),
         values(clEnumValN(EmitSymbolList, "emit-symbol-list",
                           "Print nonterminal symbols (default)"),
                clEnumValN(EmitGrammarContent, "emit-grammar-content",
                           "Print the BNF grammar content as a string")),
         init(EmitSymbolList));

// Returns the whole content of the file at Path.
// If the file cannot be read, prints the error to stderr and terminates the
// process with exit status 1.
std::string readOrDie(llvm::StringRef Path) {
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
      llvm::MemoryBuffer::getFile(Path);
  if (std::error_code EC = Text.getError()) {
    llvm::errs() << "Error: can't read grammar file '" << Path
                 << "': " << EC.message() << "\n";
    ::exit(1);
  }
  return Text.get()->getBuffer().str();
}
} // namespace
// Entry point of the grammar compiler.
//
// Parses the grammar file named by --grammar and prints the artifact selected
// by the emit flag to stdout. Returns 0 on success, and 1 when --grammar is
// missing or the grammar produces diagnostics. An unreadable grammar file
// terminates the process inside readOrDie().
int main(int argc, char *argv[]) {
  llvm::cl::ParseCommandLineOptions(argc, argv, "");
  if (!Grammar.getNumOccurrences()) {
    llvm::errs() << "Grammar file must be provided!\n";
    return 1;
  }

  std::string GrammarText = readOrDie(Grammar);
  std::vector<std::string> Diags;
  auto G = clang::pseudo::Grammar::parseBNF(GrammarText, Diags);
  if (!Diags.empty()) {
    // join() only puts separators between diagnostics, so terminate the last
    // one explicitly to keep following output on its own line.
    llvm::errs() << llvm::join(Diags, "\n") << "\n";
    return 1;
  }

  switch (Emit) {
  case EmitSymbolList:
    // One NONTERMINAL(name, id) entry per nonterminal of the grammar.
    for (clang::pseudo::SymbolID ID = 0; ID < G->table().Nonterminals.size();
         ++ID) {
      std::string Name = G->symbolName(ID).str();
      // translation-unit -> translation_unit
      std::replace(Name.begin(), Name.end(), '-', '_');
      llvm::outs() << (llvm::formatv("NONTERMINAL({0}, {1})\n", Name, ID));
    }
    break;
  case EmitGrammarContent:
    // Each grammar line becomes one escaped, double-quoted string literal
    // that ends in an escaped newline.
    for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) {
      llvm::outs() << '"';
      llvm::outs().write_escaped((Line + "\n").str());
      llvm::outs() << "\"\n";
    }
    break;
  }

  return 0;
}

View file

@ -0,0 +1,29 @@
# The cxx.bnf grammar file
set(cxx_bnf ${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxx.bnf)

# Generate inc files.
#
# CXXSymbols.inc is the stdout of `pseudo-gen --emit-symbol-list`.
# - $<TARGET_FILE:pseudo-gen> resolves to wherever the tool is actually built,
#   instead of assuming it lands in CMAKE_RUNTIME_OUTPUT_DIRECTORY (which may
#   be unset, or differ per configuration with multi-config generators).
# - The grammar file is listed in DEPENDS so that editing it regenerates the
#   output; depending on the tool alone would leave a stale .inc file.
set(cxx_symbols_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXSymbols.inc)
add_custom_command(OUTPUT ${cxx_symbols_inc}
  COMMAND $<TARGET_FILE:pseudo-gen>
    --grammar ${cxx_bnf}
    --emit-symbol-list
    > ${cxx_symbols_inc}
  COMMENT "Generating nonterminal symbol file for cxx grammar..."
  DEPENDS pseudo-gen ${cxx_bnf}
  VERBATIM)
# CXXBNF.inc is the stdout of `pseudo-gen --emit-grammar-content`.
# - $<TARGET_FILE:pseudo-gen> resolves to wherever the tool is actually built,
#   instead of assuming it lands in CMAKE_RUNTIME_OUTPUT_DIRECTORY (which may
#   be unset, or differ per configuration with multi-config generators).
# - The grammar file is listed in DEPENDS so that editing it regenerates the
#   output; depending on the tool alone would leave a stale .inc file.
set(cxx_bnf_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXBNF.inc)
add_custom_command(OUTPUT ${cxx_bnf_inc}
  COMMAND $<TARGET_FILE:pseudo-gen>
    --grammar ${cxx_bnf}
    --emit-grammar-content
    > ${cxx_bnf_inc}
  COMMENT "Generating bnf string file for cxx grammar..."
  DEPENDS pseudo-gen ${cxx_bnf}
  VERBATIM)
# add_custom_command does not create a new target, so we need to define a
# target explicitly; other targets can then depend on cxx_gen to have both
# generated .inc files built.
add_custom_target(cxx_gen
DEPENDS ${cxx_symbols_inc} ${cxx_bnf_inc}
VERBATIM)

View file

@ -0,0 +1,51 @@
//===--- CXX.h - Public interfaces for the C++ grammar -----------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines public interfaces for the C++ grammar
// (pseudo/lib/cxx.bnf). It provides a fast way to access core building pieces
// of the LR parser, e.g. Grammar, LRTable, rather than parsing the grammar
// file at the runtime.
//
// We do a compilation of the C++ BNF grammar at build time, and generate
// critical data sources. The implementation of the interfaces is based on the
// generated data sources.
//
// FIXME: not everything is fully compiled yet. The implementation of the
// interfaces still parses the grammar file at runtime.
//
//===----------------------------------------------------------------------===//
#ifndef CLANG_PSEUDO_CXX_CXX_H
#define CLANG_PSEUDO_CXX_CXX_H
#include "clang-pseudo/Grammar.h"
namespace clang {
namespace pseudo {
// Only used by reference in this header, so a forward declaration is enough.
class LRTable;
namespace cxx {
// Symbol represents nonterminal symbols in the C++ grammar.
// It provides a simple uniform way to access a particular nonterminal.
//
// The enumerators come from CXXSymbols.inc via the NONTERMINAL X-macro: every
// NONTERMINAL(X, Y) entry in that file expands to the enumerator `X = Y,`.
enum class Symbol : SymbolID {
#define NONTERMINAL(X, Y) X = Y,
#include "CXXSymbols.inc"
#undef NONTERMINAL
};
// Returns the C++ grammar.
const Grammar &getGrammar();
// Returns the corresponding LRTable for the C++ grammar.
const LRTable &getLRTable();
} // namespace cxx
} // namespace pseudo
} // namespace clang
#endif // CLANG_PSEUDO_CXX_CXX_H

View file

@ -1,3 +1,6 @@
# Sub-libraries configured from their own directories.
add_subdirectory(cxx)
add_subdirectory(grammar)
# LLVM components to link into the library declared below.
set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangPseudo
@ -5,15 +8,11 @@ add_clang_library(clangPseudo
DirectiveTree.cpp
Forest.cpp
GLR.cpp
Grammar.cpp
GrammarBNF.cpp
Lex.cpp
LRGraph.cpp
LRTable.cpp
LRTableBuild.cpp
Token.cpp
LINK_LIBS
clangBasic
clangLex
clangPseudoGrammar
)

View file

@ -0,0 +1,9 @@
# Library exposing the C++ grammar.
# It depends on cxx_gen, the target that produces the generated .inc files,
# and links against the grammar library.
add_clang_library(clangPseudoCXX
  CXX.cpp

  DEPENDS
  cxx_gen

  LINK_LIBS
  clangPseudoGrammar
  )

View file

@ -0,0 +1,34 @@
//===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang-pseudo/cxx/CXX.h"
#include "clang-pseudo/LRTable.h"
namespace clang {
namespace pseudo {
namespace cxx {

// Text of the C++ BNF grammar; CXXBNF.inc supplies the string literal(s).
static const char *CXXBNF =
#include "CXXBNF.inc"
    ;

// Parses the embedded grammar on first use and returns that same instance on
// every call. The Grammar is released from its owner and never deleted.
const Grammar &getGrammar() {
  static Grammar *const Parsed = [] {
    std::vector<std::string> Diags;
    auto Owner = Grammar::parseBNF(CXXBNF, Diags);
    // The grammar is compiled into the binary, so it must parse cleanly.
    assert(Diags.empty());
    return Owner.release();
  }();
  return *Parsed;
}

// Builds the SLR table for the C++ grammar on first use and returns that same
// instance on every call. The table is heap-allocated and never deleted.
const LRTable &getLRTable() {
  static LRTable *const Built =
      new LRTable(LRTable::buildSLR(getGrammar()));
  return *Built;
}

} // namespace cxx
} // namespace pseudo
} // namespace clang

View file

@ -0,0 +1,18 @@
set(LLVM_LINK_COMPONENTS Support)
# This library intends to keep its dependencies as minimal as possible: it is a
# base library of the cxx generator, and a small dependency set avoids creating
# long dep paths in the build graph.
add_clang_library(clangPseudoGrammar
Grammar.cpp
GrammarBNF.cpp
LRGraph.cpp
LRTable.cpp
LRTableBuild.cpp
# FIXME: can we get rid of the clangBasic dependency? We need it for the
# clang::tok::getTokenName and clang::tok::getPunctuatorSpelling functions; we
# could consider reimplementing these functions.
LINK_LIBS
clangBasic
)