[NFC][Lexer] Make Lexer::LangOpts const reference

This change can be seen as code cleanup but motivation is more performance related. While browsing perf reports captured during Linux build we can notice unusual portion of instructions executed in std::vector<std::string> copy constructor like: 0.59% 0.58% clang-14 clang-14 [.] std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::vector or even: 1.42% 0.26% clang clang-14 [.] clang::LangOptions::LangOptions | --1.16%--clang::LangOptions::LangOptions | --0.74%--std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::vector After more digging we can see that relevant LangOptions std::vector members (*Files, ModuleFeatures and NoBuiltinFuncs) are constructed when Lexer::LangOpts field is initialized on list: Lexer::Lexer(..., const LangOptions &langOpts, ...) : ..., LangOpts(langOpts), Since LangOptions copy constructor is called by Lexer(..., const LangOptions &LangOpts,...) and local Lexer objects are created thousands times (in Lexer::getRawToken, Preprocessor::EnterSourceFile and more) during single module processing in frontend it makes std::vector copy constructors surprisingly hot. Unfortunately even though in current Lexer implementation mentioned std::vector members are unused and most of time empty, no compiler is smart enough to optimize their std::vector copy constructors out (take a look at test assembly): https://godbolt.org/z/hdoxPfMYY even with LTO enabled. However there is simple way to fix this. Since Lexer doesn't access *Files, ModuleFeatures, NoBuiltinFuncs and any other LangOptions fields (but only LangOptionsBase) we can simply get rid of redundant copy constructor assembly by changing LangOpts type to more appropriate const LangOptions reference: https://godbolt.org/z/fP7de9176 Additionally we need to store LineComment outside LangOpts because it's written in SkipLineComment function. Also FormatTokenLexer need to be adjusted a bit to avoid lifetime issues related to passing local LangOpts reference to Lexer. After this change I can see more than 1% speedup in some of my microbenchmarks when using Clang release binary built with LTO. For Linux build gains are not so significant but still nice at the level of -0.4%/-0.5% instructions drop. Differential Revision: https://reviews.llvm.org/D120334
2022-02-25 12:07:33 +01:00 · 2022-02-25 12:07:33 +01:00 · a64d3c602f
parent 3c53d3a733
commit a64d3c602f
4 changed files with 29 additions and 17 deletions
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@ -13,7 +13,6 @@
 #ifndef LLVM_CLANG_LEX_LEXER_H
 #define LLVM_CLANG_LEX_LEXER_H

-#include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/PreprocessorLexer.h"
@ -36,6 +35,7 @@ namespace clang {
 class DiagnosticBuilder;
 class Preprocessor;
 class SourceManager;
+class LangOptions;

 /// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
 /// recovering from.
@ -90,8 +90,18 @@ class Lexer : public PreprocessorLexer {
  // Location for start of file.
  SourceLocation FileLoc;

-  // LangOpts enabled by this language (cache).
-  LangOptions LangOpts;
+  // LangOpts enabled by this language.
+  // Storing LangOptions as reference here is important from performance point
+  // of view. Lack of reference means that LangOptions copy constructor would be
+  // called by Lexer(..., const LangOptions &LangOpts,...). Given that local
+  // Lexer objects are created thousands times (in Lexer::getRawToken,
+  // Preprocessor::EnterSourceFile and other places) during single module
+  // processing in frontend it would make std::vector<std::string> copy
+  // constructors surprisingly hot.
+  const LangOptions &LangOpts;
+
+  // True if '//' line comments are enabled.
+  bool LineComment;

  // True if lexer for _Pragma handling.
  bool Is_PragmaLexer;
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@ -28,13 +28,13 @@ FormatTokenLexer::FormatTokenLexer(
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
-      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
+      Column(Column), TrailingWhitespace(0),
+      LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
-  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
-                      getFormattingLangOpts(Style)));
+  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
  Lex->SetKeepWhitespaceMode(true);

  for (const std::string &ForEachMacro : Style.ForEachMacros) {
@ -1079,9 +1079,9 @@ void FormatTokenLexer::readRawToken(FormatToken &Tok) {

 void FormatTokenLexer::resetLexer(unsigned Offset) {
  StringRef Buffer = SourceMgr.getBufferData(ID);
-  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
-                      getFormattingLangOpts(Style), Buffer.begin(),
-                      Buffer.begin() + Offset, Buffer.end()));
+  LangOpts = getFormattingLangOpts(Style);
+  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
+                      Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
  Lex->SetKeepWhitespaceMode(true);
  TrailingWhitespace = 0;
 }
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@ -17,6 +17,7 @@

 #include "Encoding.h"
 #include "FormatToken.h"
+#include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
@ -101,6 +102,7 @@ private:
  unsigned Column;
  unsigned TrailingWhitespace;
  std::unique_ptr<Lexer> Lex;
+  LangOptions LangOpts;
  const SourceManager &SourceMgr;
  FileID ID;
  const FormatStyle &Style;
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@ -136,7 +136,8 @@ Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
             Preprocessor &PP, bool IsFirstIncludeOfFile)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
-      LangOpts(PP.getLangOpts()), IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
+      LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
+      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
            InputFile.getBufferEnd());

@ -149,7 +150,7 @@ Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
-    : FileLoc(fileloc), LangOpts(langOpts),
+    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

@ -2376,13 +2377,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
-  if (!LangOpts.LineComment) {
+  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
-    LangOpts.LineComment = true;
+    LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
@ -3433,8 +3434,7 @@ LexNextToken:
    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
-        LangOpts.LineComment &&
-        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
+        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
@ -3741,8 +3741,8 @@ LexNextToken:
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
-      bool TreatAsComment = LangOpts.LineComment &&
-                            (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
+      bool TreatAsComment =
+          LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';