[Demangle][Rust] Parse non-ASCII identifiers
Rust allows use of non-ASCII identifiers, which in the Rust mangling scheme are encoded using Punycode. The encoding deviates from the standard by using an underscore as the separator between the ASCII part and a base-36 encoding of non-ASCII characters (avoiding hyphen-minus in the symbol name). Other than that, the encoding follows the standard, and the decoder implemented here in turn follows the one given in RFC 3492. To avoid an extra intermediate memory allocation while decoding Punycode, the interface of OutputStream is extended with an insert method. Reviewed By: dblaikie Differential Revision: https://reviews.llvm.org/D104366
This commit is contained in:
parent
df672f66b6
commit
c8c2b4629f
|
@ -126,6 +126,16 @@ public:
|
|||
return this->operator<<(static_cast<unsigned long long>(N));
|
||||
}
|
||||
|
||||
// Inserts the N bytes starting at S at offset Pos of the buffer. The bytes
// currently stored in [Pos, CurrentPosition) are shifted N positions towards
// the end, and CurrentPosition advances by N. Pos must not exceed
// CurrentPosition (asserted). Inserting zero bytes changes nothing.
void insert(size_t Pos, const char *S, size_t N) {
  assert(Pos <= CurrentPosition);
  if (N != 0) {
    // NOTE(review): grow() is defined elsewhere; presumably it guarantees room
    // for N more bytes and may reallocate Buffer, so no pointer into Buffer is
    // formed before this call.
    grow(N);

    char *const Gap = Buffer + Pos;
    const size_t TailLength = CurrentPosition - Pos;

    // Source and destination of the shift may overlap, hence memmove.
    std::memmove(Gap + N, Gap, TailLength);
    std::memcpy(Gap, S, N);
    CurrentPosition += N;
  }
}
|
||||
|
||||
// Returns the current value of CurrentPosition.
size_t getCurrentPosition() const { return CurrentPosition; }
|
||||
// Overwrites CurrentPosition with NewPos. No validation of NewPos is done
// here; the buffer contents are left untouched.
void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
|
||||
|
||||
|
|
|
@ -126,6 +126,16 @@ public:
|
|||
return this->operator<<(static_cast<unsigned long long>(N));
|
||||
}
|
||||
|
||||
// Inserts the N bytes starting at S at offset Pos of the buffer. The bytes
// currently stored in [Pos, CurrentPosition) are shifted N positions towards
// the end, and CurrentPosition advances by N. Pos must not exceed
// CurrentPosition (asserted). Inserting zero bytes changes nothing.
void insert(size_t Pos, const char *S, size_t N) {
  assert(Pos <= CurrentPosition);
  if (N != 0) {
    // NOTE(review): grow() is defined elsewhere; presumably it guarantees room
    // for N more bytes and may reallocate Buffer, so no pointer into Buffer is
    // formed before this call.
    grow(N);

    char *const Gap = Buffer + Pos;
    const size_t TailLength = CurrentPosition - Pos;

    // Source and destination of the shift may overlap, hence memmove.
    std::memmove(Gap + N, Gap, TailLength);
    std::memcpy(Gap, S, N);
    CurrentPosition += N;
  }
}
|
||||
|
||||
// Returns the current value of CurrentPosition.
size_t getCurrentPosition() const { return CurrentPosition; }
|
||||
// Overwrites CurrentPosition with NewPos. No validation of NewPos is done
// here; the buffer contents are left untouched.
void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
|
||||
|
||||
|
|
|
@ -135,6 +135,7 @@ private:
|
|||
void printDecimalNumber(uint64_t N);
|
||||
void printBasicType(BasicType);
|
||||
void printLifetime(uint64_t Index);
|
||||
void printIdentifier(Identifier Ident);
|
||||
|
||||
char look() const;
|
||||
char consume();
|
||||
|
@ -283,8 +284,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
|
|||
switch (consume()) {
|
||||
case 'C': {
|
||||
parseOptionalBase62Number('s');
|
||||
Identifier Ident = parseIdentifier();
|
||||
print(Ident.Name);
|
||||
printIdentifier(parseIdentifier());
|
||||
break;
|
||||
}
|
||||
case 'M': {
|
||||
|
@ -333,7 +333,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
|
|||
print(NS);
|
||||
if (!Ident.empty()) {
|
||||
print(":");
|
||||
print(Ident.Name);
|
||||
printIdentifier(Ident);
|
||||
}
|
||||
print('#');
|
||||
printDecimalNumber(Disambiguator);
|
||||
|
@ -342,7 +342,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
|
|||
// Implementation internal namespaces.
|
||||
if (!Ident.empty()) {
|
||||
print("::");
|
||||
print(Ident.Name);
|
||||
printIdentifier(Ident);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -669,6 +669,8 @@ void Demangler::demangleFnSig() {
|
|||
print("C");
|
||||
} else {
|
||||
Identifier Ident = parseIdentifier();
|
||||
if (Ident.Punycode)
|
||||
Error = true;
|
||||
for (char C : Ident.Name) {
|
||||
// When mangling ABI string, the "-" is replaced with "_".
|
||||
if (C == '_')
|
||||
|
@ -1078,6 +1080,172 @@ void Demangler::printLifetime(uint64_t Index) {
|
|||
}
|
||||
}
|
||||
|
||||
static inline bool decodePunycodeDigit(char C, size_t &Value) {
|
||||
if (isLower(C)) {
|
||||
Value = C - 'a';
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isDigit(C)) {
|
||||
Value = 26 + (C - '0');
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Drops every '\0' byte stored in Output between offset StartIdx and the
// current position, compacting the remaining bytes towards StartIdx and
// moving the current position back to the end of the compacted data.
static void removeNullBytes(OutputStream &Output, size_t StartIdx) {
  char *const Base = Output.getBuffer();
  char *const First = Base + StartIdx;
  char *const Last = Base + Output.getCurrentPosition();

  // std::remove returns the new logical end of the kept bytes.
  char *const NewLast = std::remove(First, Last, '\0');
  Output.setCurrentPosition(NewLast - Base);
}
|
||||
|
||||
// Writes the UTF-8 encoding of CodePoint to Output and returns true. Only as
// many bytes as the encoding needs (1 to 4) are written; the caller must
// provide room for them and any remaining bytes are left as they were.
// Returns false without writing anything when CodePoint is not a Unicode
// scalar value, i.e. it is a surrogate (0xD800-0xDFFF) or above 0x10FFFF.
static inline bool encodeUTF8(size_t CodePoint, char *Output) {
  const bool IsSurrogate = CodePoint >= 0xD800 && CodePoint <= 0xDFFF;
  if (IsSurrogate || CodePoint > 0x10FFFF)
    return false;

  // Builds a continuation byte (10xxxxxx) from the six bits of CodePoint that
  // start at bit offset Shift.
  auto Trail = [CodePoint](unsigned Shift) -> char {
    return 0x80 | ((CodePoint >> Shift) & 0x3F);
  };

  if (CodePoint < 0x80) {
    // Single byte: 0xxxxxxx.
    Output[0] = CodePoint;
  } else if (CodePoint < 0x800) {
    // Two bytes: 110xxxxx 10xxxxxx.
    Output[0] = 0xC0 | (CodePoint >> 6);
    Output[1] = Trail(0);
  } else if (CodePoint < 0x10000) {
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    Output[0] = 0xE0 | (CodePoint >> 12);
    Output[1] = Trail(6);
    Output[2] = Trail(0);
  } else {
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    Output[0] = 0xF0 | (CodePoint >> 18);
    Output[1] = Trail(12);
    Output[2] = Trail(6);
    Output[3] = Trail(0);
  }
  return true;
}
|
||||
|
||||
// Decodes string encoded using punycode and appends results to Output.
|
||||
// Returns true if decoding was successful.
|
||||
static bool decodePunycode(StringView Input, OutputStream &Output) {
|
||||
size_t OutputSize = Output.getCurrentPosition();
|
||||
size_t InputIdx = 0;
|
||||
|
||||
// Rust uses an underscore as a delimiter.
|
||||
size_t DelimiterPos = StringView::npos;
|
||||
for (size_t I = 0; I != Input.size(); ++I)
|
||||
if (Input[I] == '_')
|
||||
DelimiterPos = I;
|
||||
|
||||
if (DelimiterPos != StringView::npos) {
|
||||
// Copy basic code points before the last delimiter to the output.
|
||||
for (; InputIdx != DelimiterPos; ++InputIdx) {
|
||||
char C = Input[InputIdx];
|
||||
if (!isValid(C))
|
||||
return false;
|
||||
// Code points are padded with zeros while decoding is in progress.
|
||||
char UTF8[4] = {C};
|
||||
Output += StringView(UTF8, UTF8 + 4);
|
||||
}
|
||||
// Skip over the delimiter.
|
||||
++InputIdx;
|
||||
}
|
||||
|
||||
size_t Base = 36;
|
||||
size_t Skew = 38;
|
||||
size_t Bias = 72;
|
||||
size_t N = 0x80;
|
||||
size_t TMin = 1;
|
||||
size_t TMax = 26;
|
||||
size_t Damp = 700;
|
||||
|
||||
auto Adapt = [&](size_t Delta, size_t NumPoints) {
|
||||
Delta /= Damp;
|
||||
Delta += Delta / NumPoints;
|
||||
Damp = 2;
|
||||
|
||||
size_t K = 0;
|
||||
while (Delta > (Base - TMin) * TMax / 2) {
|
||||
Delta /= Base - TMin;
|
||||
K += Base;
|
||||
}
|
||||
return K + (((Base - TMin + 1) * Delta) / (Delta + Skew));
|
||||
};
|
||||
|
||||
// Main decoding loop.
|
||||
for (size_t I = 0; InputIdx != Input.size(); I += 1) {
|
||||
size_t OldI = I;
|
||||
size_t W = 1;
|
||||
size_t Max = std::numeric_limits<size_t>::max();
|
||||
for (size_t K = Base; true; K += Base) {
|
||||
if (InputIdx == Input.size())
|
||||
return false;
|
||||
char C = Input[InputIdx++];
|
||||
size_t Digit = 0;
|
||||
if (!decodePunycodeDigit(C, Digit))
|
||||
return false;
|
||||
|
||||
if (Digit > (Max - I) / W)
|
||||
return false;
|
||||
I += Digit * W;
|
||||
|
||||
size_t T;
|
||||
if (K <= Bias)
|
||||
T = TMin;
|
||||
else if (K >= Bias + TMax)
|
||||
T = TMax;
|
||||
else
|
||||
T = K - Bias;
|
||||
|
||||
if (Digit < T)
|
||||
break;
|
||||
|
||||
if (W > Max / (Base - T))
|
||||
return false;
|
||||
W *= (Base - T);
|
||||
}
|
||||
size_t NumPoints = (Output.getCurrentPosition() - OutputSize) / 4 + 1;
|
||||
Bias = Adapt(I - OldI, NumPoints);
|
||||
|
||||
if (I / NumPoints > Max - N)
|
||||
return false;
|
||||
N += I / NumPoints;
|
||||
I = I % NumPoints;
|
||||
|
||||
// Insert N at position I in the output.
|
||||
char UTF8[4] = {};
|
||||
if (!encodeUTF8(N, UTF8))
|
||||
return false;
|
||||
Output.insert(OutputSize + I * 4, UTF8, 4);
|
||||
}
|
||||
|
||||
removeNullBytes(Output, OutputSize);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Prints the name of Ident. Does nothing when Error is already set or
// printing is disabled (Print is false). A name flagged as punycode is decoded
// straight into Output, and a decoding failure sets Error; any other name is
// printed verbatim.
void Demangler::printIdentifier(Identifier Ident) {
  const bool Suppressed = Error || !Print;
  if (Suppressed)
    return;

  if (!Ident.Punycode) {
    print(Ident.Name);
    return;
  }

  const bool Decoded = decodePunycode(Ident.Name, Output);
  if (!Decoded)
    Error = true;
}
|
||||
|
||||
char Demangler::look() const {
|
||||
if (Error || Position >= Input.size())
|
||||
return 0;
|
||||
|
|
|
@ -237,6 +237,11 @@ CHECK: function::<extern "cdecl" fn()>
|
|||
CHECK: function::<unsafe extern "C-cmse-nonsecure-call" fn()>
|
||||
_RIC8functionFUK21C_cmse_nonsecure_callEuE
|
||||
|
||||
; Invalid ABI with punycode.
|
||||
|
||||
CHECK: _RIC8functionFKu3n3hEuE
|
||||
_RIC8functionFKu3n3hEuE
|
||||
|
||||
; Trait objects
|
||||
|
||||
CHECK: trait::<dyn >
|
||||
|
@ -456,6 +461,44 @@ CHECK: dot (.llvm.1234)
|
|||
CHECK: dot (.llvm.6789)
|
||||
_RC3dotC5crate.llvm.6789
|
||||
|
||||
; Punycode
|
||||
|
||||
CHECK: punycode::東京
|
||||
_RNvC8punycodeu7_1lqs71d
|
||||
|
||||
CHECK: punycode::zażółć_gęślą_jaźń
|
||||
_RNvC8punycodeu29za_gl_ja_w3a7psa2tqtgb10airva
|
||||
|
||||
CHECK: punycode::საჭმელად_გემრიელი_სადილი
|
||||
_RNvC8punycodeu30____7hkackfecea1cbdathfdh9hlq6y
|
||||
|
||||
CHECK: Gödel::Escher::Bach
|
||||
_RNtNvCu8Gdel_5qa6Escher4Bach
|
||||
|
||||
CHECK: punycode::🦁🐅
|
||||
_RNvC8punycodeu7wn8hx1g
|
||||
|
||||
; Punycode - invalid code point
|
||||
|
||||
CHECK: _RCu5r731r
|
||||
_RCu5r731r
|
||||
|
||||
CHECK: _RCu8b44444yy
|
||||
_RCu8b44444yy
|
||||
|
||||
CHECK: _RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
|
||||
_RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
|
||||
|
||||
; Punycode - early EOF
|
||||
|
||||
CHECK: _RCu8_CCCAR_u4
|
||||
_RCu8_CCCAR_u4
|
||||
|
||||
; Punycode - overflow
|
||||
|
||||
CHECK: _RNvC1au21p18888888888888888888
|
||||
_RNvC1au21p18888888888888888888
|
||||
|
||||
; Invalid mangled characters
|
||||
|
||||
CHECK: _RNvC2a.1c
|
||||
|
|
|
@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS
|
|||
add_llvm_unittest(DemangleTests
|
||||
DemangleTest.cpp
|
||||
ItaniumDemangleTest.cpp
|
||||
OutputStreamTest.cpp
|
||||
PartialDemangleTest.cpp
|
||||
RustDemangleTest.cpp
|
||||
StringViewTest.cpp
|
||||
|
|
61
llvm/unittests/Demangle/OutputStreamTest.cpp
Normal file
61
llvm/unittests/Demangle/OutputStreamTest.cpp
Normal file
|
@ -0,0 +1,61 @@
|
|||
//===- llvm/unittest/OutputStreamTest.cpp - OutputStream unit tests -------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Demangle/Utility.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <string>
|
||||
|
||||
using namespace llvm;
|
||||
using llvm::itanium_demangle::OutputStream;
|
||||
|
||||
// Copies the bytes OS currently holds, from the start of its buffer up to its
// current position, into a std::string. OS is not modified.
static std::string toString(OutputStream &OS) {
  const char *Data = OS.getBuffer();
  const auto Length = OS.getCurrentPosition();
  return std::string(Data, Length);
}
|
||||
|
||||
template <typename T> static std::string printToString(const T &Value) {
|
||||
OutputStream OS;
|
||||
OS << Value;
|
||||
std::string s = toString(OS);
|
||||
std::free(OS.getBuffer());
|
||||
return s;
|
||||
}
|
||||
|
||||
// Checks how operator<< renders integers, single characters and string
// literals.
TEST(OutputStreamTest, Format) {
  // Integers, including zero and negative values.
  {
    EXPECT_EQ("0", printToString(0));
    EXPECT_EQ("1", printToString(1));
    EXPECT_EQ("-1", printToString(-1));
    EXPECT_EQ("-90", printToString(-90));
    EXPECT_EQ("109", printToString(109));
    EXPECT_EQ("400", printToString(400));
  }

  // Single characters are emitted as-is.
  {
    EXPECT_EQ("a", printToString('a'));
    EXPECT_EQ("?", printToString('?'));
  }

  // String literals.
  {
    EXPECT_EQ("abc", printToString("abc"));
  }
}
|
||||
|
||||
// Exercises OutputStream::insert with an empty insertion, an insertion into an
// empty stream, and insertions at the front, at the end and in the middle.
TEST(OutputStreamTest, Insert) {
  OutputStream OS;

  // Inserts the first Len bytes of Text at Pos and compares the whole stream
  // contents against Expected.
  const auto InsertAndExpect = [&OS](size_t Pos, const char *Text, size_t Len,
                                     const char *Expected) {
    OS.insert(Pos, Text, Len);
    EXPECT_EQ(Expected, toString(OS));
  };

  InsertAndExpect(0, "", 0, "");                       // Nothing to insert.
  InsertAndExpect(0, "abcd", 4, "abcd");               // Into an empty stream.
  InsertAndExpect(0, "x", 1, "xabcd");                 // At the front.
  InsertAndExpect(5, "y", 1, "xabcdy");                // At the end.
  InsertAndExpect(3, "defghi", 6, "xabdefghicdy");     // In the middle.

  std::free(OS.getBuffer());
}
|
Loading…
Reference in a new issue