1 //===--- Encoding.h - Format C++ code -------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Contains functions for text encoding manipulation. Supports UTF-8, 12 /// 8-bit encodings and escape sequences in C++ string literals. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H 17 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H 18 19 #include "clang/Basic/LLVM.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/Support/ConvertUTF.h" 22 #include "llvm/Support/Unicode.h" 23 24 namespace clang { 25 namespace format { 26 namespace encoding { 27 28 enum Encoding { 29 Encoding_UTF8, 30 Encoding_Unknown // We treat all other encodings as 8-bit encodings. 31 }; 32 33 /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, 34 /// it is considered UTF8, otherwise we treat it as some 8-bit encoding. 35 inline Encoding detectEncoding(StringRef Text) { 36 const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin()); 37 const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end()); 38 if (::isLegalUTF8String(&Ptr, BufEnd)) 39 return Encoding_UTF8; 40 return Encoding_Unknown; 41 } 42 43 inline unsigned getCodePointCountUTF8(StringRef Text) { 44 unsigned CodePoints = 0; 45 for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { 46 ++CodePoints; 47 } 48 return CodePoints; 49 } 50 51 /// \brief Gets the number of code points in the Text using the specified 52 /// Encoding. 53 inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { 54 switch (Encoding) { 55 case Encoding_UTF8: 56 return getCodePointCountUTF8(Text); 57 default: 58 return Text.size(); 59 } 60 } 61 62 /// \brief Returns the number of columns required to display the \p Text on a 63 /// generic Unicode-capable terminal. Text is assumed to use the specified 64 /// \p Encoding. 65 inline unsigned columnWidth(StringRef Text, Encoding Encoding) { 66 if (Encoding == Encoding_UTF8) { 67 int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text); 68 // FIXME: Figure out the correct way to handle this in the presence of both 69 // printable and unprintable multi-byte UTF-8 characters. Falling back to 70 // returning the number of bytes may cause problems, as columnWidth suddenly 71 // becomes non-additive. 72 if (ContentWidth >= 0) 73 return ContentWidth; 74 } 75 return Text.size(); 76 } 77 78 /// \brief Returns the number of columns required to display the \p Text, 79 /// starting from the \p StartColumn on a terminal with the \p TabWidth. The 80 /// text is assumed to use the specified \p Encoding. 81 inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, 82 unsigned TabWidth, Encoding Encoding) { 83 unsigned TotalWidth = 0; 84 StringRef Tail = Text; 85 for (;;) { 86 StringRef::size_type TabPos = Tail.find('\t'); 87 if (TabPos == StringRef::npos) 88 return TotalWidth + columnWidth(Tail, Encoding); 89 TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding); 90 TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth; 91 Tail = Tail.substr(TabPos + 1); 92 } 93 } 94 95 /// \brief Gets the number of bytes in a sequence representing a single 96 /// codepoint and starting with FirstChar in the specified Encoding. 97 inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { 98 switch (Encoding) { 99 case Encoding_UTF8: 100 return getNumBytesForUTF8(FirstChar); 101 default: 102 return 1; 103 } 104 } 105 106 inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; } 107 108 inline bool isHexDigit(char c) { 109 return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 110 ('A' <= c && c <= 'F'); 111 } 112 113 /// \brief Gets the length of an escape sequence inside a C++ string literal. 114 /// Text should span from the beginning of the escape sequence (starting with a 115 /// backslash) to the end of the string literal. 116 inline unsigned getEscapeSequenceLength(StringRef Text) { 117 assert(Text[0] == '\\'); 118 if (Text.size() < 2) 119 return 1; 120 121 switch (Text[1]) { 122 case 'u': 123 return 6; 124 case 'U': 125 return 10; 126 case 'x': { 127 unsigned I = 2; // Point after '\x'. 128 while (I < Text.size() && isHexDigit(Text[I])) 129 ++I; 130 return I; 131 } 132 default: 133 if (isOctDigit(Text[1])) { 134 unsigned I = 1; 135 while (I < Text.size() && I < 4 && isOctDigit(Text[I])) 136 ++I; 137 return I; 138 } 139 return 1 + getNumBytesForUTF8(Text[1]); 140 } 141 } 142 143 } // namespace encoding 144 } // namespace format 145 } // namespace clang 146 147 #endif 148