[Demangle][Rust] Parse non-ASCII identifiers

Rust allows use of non-ASCII identifiers, which in the Rust mangling scheme
are encoded using Punycode.

The encoding deviates from the standard by using an underscore as the
separator between the ASCII part and a base-36 encoding of non-ASCII
characters (avoiding hyphen-minus in the symbol name). Other than that,
the encoding follows the standard, and the decoder implemented here in
turn follows the one given in RFC 3492.

To avoid an extra intermediate memory allocation while decoding
Punycode, the interface of OutputStream is extended with an insert
method.

Reviewed By: dblaikie

Differential Revision: https://reviews.llvm.org/D104366
This commit is contained in:
Tomasz Miąsko 2021-10-01 00:00:00 +00:00
parent df672f66b6
commit c8c2b4629f
6 changed files with 297 additions and 4 deletions

View file

@ -126,6 +126,16 @@ public:
return this->operator<<(static_cast<unsigned long long>(N));
}
// Inserts the N bytes starting at S into the stream at offset Pos. The bytes
// currently stored in [Pos, CurrentPosition) are shifted up by N to make room,
// and CurrentPosition advances by N. Pos must not exceed CurrentPosition
// (checked by assert). Inserting zero bytes leaves the stream untouched.
void insert(size_t Pos, const char *S, size_t N) {
  assert(Pos <= CurrentPosition);
  if (N != 0) {
    // Reserve room first; pointers into Buffer are only formed afterwards
    // (presumably grow() may reallocate Buffer -- same ordering as before).
    grow(N);
    const size_t TailLen = CurrentPosition - Pos;
    char *const Gap = Buffer + Pos;
    // The shifted tail overlaps its old location, hence memmove.
    std::memmove(Gap + N, Gap, TailLen);
    std::memcpy(Gap, S, N);
    CurrentPosition += N;
  }
}
// Returns the current value of CurrentPosition.
size_t getCurrentPosition() const { return CurrentPosition; }
// Overwrites CurrentPosition with NewPos. No validation of NewPos is
// performed here.
void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }

View file

@ -126,6 +126,16 @@ public:
return this->operator<<(static_cast<unsigned long long>(N));
}
// Inserts the N bytes starting at S into the stream at offset Pos. The bytes
// currently stored in [Pos, CurrentPosition) are shifted up by N to make room,
// and CurrentPosition advances by N. Pos must not exceed CurrentPosition
// (checked by assert). Inserting zero bytes leaves the stream untouched.
void insert(size_t Pos, const char *S, size_t N) {
  assert(Pos <= CurrentPosition);
  if (N != 0) {
    // Reserve room first; pointers into Buffer are only formed afterwards
    // (presumably grow() may reallocate Buffer -- same ordering as before).
    grow(N);
    const size_t TailLen = CurrentPosition - Pos;
    char *const Gap = Buffer + Pos;
    // The shifted tail overlaps its old location, hence memmove.
    std::memmove(Gap + N, Gap, TailLen);
    std::memcpy(Gap, S, N);
    CurrentPosition += N;
  }
}
// Returns the current value of CurrentPosition.
size_t getCurrentPosition() const { return CurrentPosition; }
// Overwrites CurrentPosition with NewPos. No validation of NewPos is
// performed here.
void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }

View file

@ -135,6 +135,7 @@ private:
void printDecimalNumber(uint64_t N);
void printBasicType(BasicType);
void printLifetime(uint64_t Index);
void printIdentifier(Identifier Ident);
char look() const;
char consume();
@ -283,8 +284,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
switch (consume()) {
case 'C': {
parseOptionalBase62Number('s');
Identifier Ident = parseIdentifier();
print(Ident.Name);
printIdentifier(parseIdentifier());
break;
}
case 'M': {
@ -333,7 +333,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
print(NS);
if (!Ident.empty()) {
print(":");
print(Ident.Name);
printIdentifier(Ident);
}
print('#');
printDecimalNumber(Disambiguator);
@ -342,7 +342,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
// Implementation internal namespaces.
if (!Ident.empty()) {
print("::");
print(Ident.Name);
printIdentifier(Ident);
}
}
break;
@ -669,6 +669,8 @@ void Demangler::demangleFnSig() {
print("C");
} else {
Identifier Ident = parseIdentifier();
if (Ident.Punycode)
Error = true;
for (char C : Ident.Name) {
// When mangling ABI string, the "-" is replaced with "_".
if (C == '_')
@ -1078,6 +1080,172 @@ void Demangler::printLifetime(uint64_t Index) {
}
}
// Converts a single punycode digit character into its numeric value.
// Characters accepted by isLower yield C - 'a', characters accepted by isDigit
// yield 26 + (C - '0') (presumably 'a'..'z' -> 0..25 and '0'..'9' -> 26..35;
// the two predicates are defined elsewhere in this file). Returns false and
// leaves Value untouched for any other character.
static inline bool decodePunycodeDigit(char C, size_t &Value) {
  const bool IsLetter = isLower(C);
  if (!IsLetter && !isDigit(C))
    return false;

  Value = IsLetter ? C - 'a' : 26 + (C - '0');
  return true;
}
// Drops every '\0' byte stored in Output at or after offset StartIdx,
// compacting the remaining bytes towards StartIdx and moving the current
// position back to the end of the compacted data. Bytes before StartIdx are
// not examined.
static void removeNullBytes(OutputStream &Output, size_t StartIdx) {
  char *const Base = Output.getBuffer();
  char *const Last = Base + Output.getCurrentPosition();
  char *const NewLast = std::remove(Base + StartIdx, Last, '\0');
  Output.setCurrentPosition(NewLast - Base);
}
// Writes the UTF-8 encoding of CodePoint to Output and returns true. Between
// one and four bytes are written depending on the magnitude of CodePoint, so
// Output must have room for four bytes; bytes past the encoded length are left
// untouched. Returns false, without writing anything, when CodePoint is a
// surrogate (0xD800..0xDFFF) or lies above 0x10FFFF.
static inline bool encodeUTF8(size_t CodePoint, char *Output) {
  const bool IsSurrogate = CodePoint >= 0xD800 && CodePoint <= 0xDFFF;
  if (IsSurrogate || CodePoint > 0x10FFFF)
    return false;

  // Builds a continuation byte (10xxxxxx) from the low six bits of Bits.
  auto Continuation = [](size_t Bits) {
    return static_cast<char>(0x80 | (Bits & 0x3F));
  };

  if (CodePoint < 0x80) {
    // Single byte: 0xxxxxxx.
    Output[0] = static_cast<char>(CodePoint);
  } else if (CodePoint < 0x800) {
    // Two bytes: 110xxxxx 10xxxxxx.
    Output[0] = static_cast<char>(0xC0 | (CodePoint >> 6));
    Output[1] = Continuation(CodePoint);
  } else if (CodePoint < 0x10000) {
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    Output[0] = static_cast<char>(0xE0 | (CodePoint >> 12));
    Output[1] = Continuation(CodePoint >> 6);
    Output[2] = Continuation(CodePoint);
  } else {
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    Output[0] = static_cast<char>(0xF0 | (CodePoint >> 18));
    Output[1] = Continuation(CodePoint >> 12);
    Output[2] = Continuation(CodePoint >> 6);
    Output[3] = Continuation(CodePoint);
  }
  return true;
}
// Decodes string encoded using punycode and appends results to Output.
// Returns true if decoding was successful.
//
// Input is the punycode payload using '_' as the delimiter between the basic
// (literal) code points and the encoded digits. While decoding, every code
// point occupies a fixed four-byte, zero-padded slot in Output so that new
// code points can be inserted at an index computed in code points; the padding
// is stripped at the very end. On failure the function returns false without
// rolling back whatever it has already appended to Output.
static bool decodePunycode(StringView Input, OutputStream &Output) {
// Offset in Output at which the decoded identifier starts.
size_t OutputSize = Output.getCurrentPosition();
size_t InputIdx = 0;
// Rust uses an underscore as a delimiter.
// The loop keeps overwriting DelimiterPos, so the last '_' wins.
size_t DelimiterPos = StringView::npos;
for (size_t I = 0; I != Input.size(); ++I)
if (Input[I] == '_')
DelimiterPos = I;
if (DelimiterPos != StringView::npos) {
// Copy basic code points before the last delimiter to the output.
for (; InputIdx != DelimiterPos; ++InputIdx) {
char C = Input[InputIdx];
if (!isValid(C))
return false;
// Code points are padded with zeros while decoding is in progress.
char UTF8[4] = {C};
Output += StringView(UTF8, UTF8 + 4);
}
// Skip over the delimiter.
++InputIdx;
}
// Decoder parameters and initial state.
// NOTE(review): these look like the parameter values of the RFC 3492
// decoder this code is modelled on -- confirm against the RFC.
size_t Base = 36;
size_t Skew = 38;
size_t Bias = 72;
size_t N = 0x80;
size_t TMin = 1;
size_t TMax = 26;
size_t Damp = 700;
// Bias adaptation. Damp is captured by reference: the first call divides
// Delta by 700, every subsequent call divides by 2.
auto Adapt = [&](size_t Delta, size_t NumPoints) {
Delta /= Damp;
Delta += Delta / NumPoints;
Damp = 2;
size_t K = 0;
while (Delta > (Base - TMin) * TMax / 2) {
Delta /= Base - TMin;
K += Base;
}
return K + (((Base - TMin + 1) * Delta) / (Delta + Skew));
};
// Main decoding loop.
// Each iteration decodes one variable-length integer and inserts one code
// point. The I += 1 step moves the running index past the code point that
// was just inserted.
for (size_t I = 0; InputIdx != Input.size(); I += 1) {
size_t OldI = I;
size_t W = 1;
size_t Max = std::numeric_limits<size_t>::max();
for (size_t K = Base; true; K += Base) {
// Running out of input in the middle of an integer is an error.
if (InputIdx == Input.size())
return false;
char C = Input[InputIdx++];
size_t Digit = 0;
if (!decodePunycodeDigit(C, Digit))
return false;
// Reject input for which I + Digit * W would overflow size_t.
if (Digit > (Max - I) / W)
return false;
I += Digit * W;
// Threshold for this digit position: K - Bias clamped to [TMin, TMax],
// written so that the unsigned subtraction cannot wrap.
size_t T;
if (K <= Bias)
T = TMin;
else if (K >= Bias + TMax)
T = TMax;
else
T = K - Bias;
// A digit below the threshold terminates the current integer.
if (Digit < T)
break;
// Reject input for which W * (Base - T) would overflow size_t.
if (W > Max / (Base - T))
return false;
W *= (Base - T);
}
// Code points decoded so far (four bytes each) plus the one being inserted.
size_t NumPoints = (Output.getCurrentPosition() - OutputSize) / 4 + 1;
Bias = Adapt(I - OldI, NumPoints);
// Reject input for which N + I / NumPoints would overflow size_t.
if (I / NumPoints > Max - N)
return false;
// Split I into a code point increment (quotient) and an insertion index
// (remainder).
N += I / NumPoints;
I = I % NumPoints;
// Insert N at position I in the output.
// UTF8 is zero-initialized, so bytes not written by encodeUTF8 act as
// padding of the four-byte slot.
char UTF8[4] = {};
if (!encodeUTF8(N, UTF8))
return false;
Output.insert(OutputSize + I * 4, UTF8, 4);
}
// Strip the zero padding from the decoded identifier.
removeNullBytes(Output, OutputSize);
return true;
}
// Prints an identifier. Does nothing when an error has already been recorded
// or printing is disabled. Identifiers flagged as punycode are decoded
// straight into Output, and a decoding failure sets Error; all other
// identifiers are printed verbatim.
void Demangler::printIdentifier(Identifier Ident) {
  if (Error || !Print)
    return;

  if (!Ident.Punycode) {
    print(Ident.Name);
    return;
  }

  const bool Decoded = decodePunycode(Ident.Name, Output);
  if (!Decoded)
    Error = true;
}
char Demangler::look() const {
if (Error || Position >= Input.size())
return 0;

View file

@ -237,6 +237,11 @@ CHECK: function::<extern "cdecl" fn()>
CHECK: function::<unsafe extern "C-cmse-nonsecure-call" fn()>
_RIC8functionFUK21C_cmse_nonsecure_callEuE
; Invalid ABI with punycode.
CHECK: _RIC8functionFKu3n3hEuE
_RIC8functionFKu3n3hEuE
; Trait objects
CHECK: trait::<dyn >
@ -456,6 +461,44 @@ CHECK: dot (.llvm.1234)
CHECK: dot (.llvm.6789)
_RC3dotC5crate.llvm.6789
; Punycode
CHECK: punycode::東京
_RNvC8punycodeu7_1lqs71d
CHECK: punycode::zażółć_gęślą_jaźń
_RNvC8punycodeu29za_gl_ja_w3a7psa2tqtgb10airva
CHECK: punycode::საჭმელად_გემრიელი_სადილი
_RNvC8punycodeu30____7hkackfecea1cbdathfdh9hlq6y
CHECK: Gödel::Escher::Bach
_RNtNvCu8Gdel_5qa6Escher4Bach
CHECK: punycode::🦁🐅
_RNvC8punycodeu7wn8hx1g
; Punycode - invalid code point
CHECK: _RCu5r731r
_RCu5r731r
CHECK: _RCu8b44444yy
_RCu8b44444yy
CHECK: _RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
_RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
; Punycode - early EOF
CHECK: _RCu8_CCCAR_u4
_RCu8_CCCAR_u4
; Punycode - overflow
CHECK: _RNvC1au21p18888888888888888888
_RNvC1au21p18888888888888888888
; Invalid mangled characters
CHECK: _RNvC2a.1c

View file

@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_unittest(DemangleTests
DemangleTest.cpp
ItaniumDemangleTest.cpp
OutputStreamTest.cpp
PartialDemangleTest.cpp
RustDemangleTest.cpp
StringViewTest.cpp

View file

@ -0,0 +1,61 @@
//===- llvm/unittest/OutputStreamTest.cpp - OutputStream unit tests -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/Demangle/Utility.h"
#include "gtest/gtest.h"
#include <string>
using namespace llvm;
using llvm::itanium_demangle::OutputStream;
// Copies the bytes written to OS so far, i.e. the first getCurrentPosition()
// bytes of its buffer, into a std::string. OS itself is not modified.
static std::string toString(OutputStream &OS) {
  const char *Data = OS.getBuffer();
  const size_t Length = OS.getCurrentPosition();
  return std::string(Data, Length);
}
template <typename T> static std::string printToString(const T &Value) {
OutputStream OS;
OS << Value;
std::string s = toString(OS);
std::free(OS.getBuffer());
return s;
}
// Checks the text produced by streaming integers, characters and a string
// literal into an OutputStream via printToString.
TEST(OutputStreamTest, Format) {
// Integers, including negative values.
EXPECT_EQ("0", printToString(0));
EXPECT_EQ("1", printToString(1));
EXPECT_EQ("-1", printToString(-1));
EXPECT_EQ("-90", printToString(-90));
EXPECT_EQ("109", printToString(109));
EXPECT_EQ("400", printToString(400));
// Single characters are emitted as-is rather than as numbers.
EXPECT_EQ("a", printToString('a'));
EXPECT_EQ("?", printToString('?'));
// String literal.
EXPECT_EQ("abc", printToString("abc"));
}
// Exercises OutputStream::insert on a single stream. Each step builds on the
// contents left by the previous one, so the order of the calls matters.
TEST(OutputStreamTest, Insert) {
OutputStream OS;
// Inserting zero bytes into an empty stream leaves it empty.
OS.insert(0, "", 0);
EXPECT_EQ("", toString(OS));
// Insert into an empty stream.
OS.insert(0, "abcd", 4);
EXPECT_EQ("abcd", toString(OS));
// Insert at the front.
OS.insert(0, "x", 1);
EXPECT_EQ("xabcd", toString(OS));
// Insert at the current end (position equals the current length).
OS.insert(5, "y", 1);
EXPECT_EQ("xabcdy", toString(OS));
// Insert in the middle; the tail "cdy" is shifted up.
OS.insert(3, "defghi", 6);
EXPECT_EQ("xabdefghicdy", toString(OS));
// The test releases the stream's buffer itself.
std::free(OS.getBuffer());
}