1 //===--- Encoding.h - Format C++ code -------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Contains functions for text encoding manipulation. Supports UTF-8, 12 /// 8-bit encodings and escape sequences in C++ string literals. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_CLANG_FORMAT_ENCODING_H 17 #define LLVM_CLANG_FORMAT_ENCODING_H 18 19 #include "clang/Basic/LLVM.h" 20 #include "llvm/Support/ConvertUTF.h" 21 22 namespace clang { 23 namespace format { 24 namespace encoding { 25 26 enum Encoding { 27 Encoding_UTF8, 28 Encoding_Unknown // We treat all other encodings as 8-bit encodings. 29 }; 30 31 /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, 32 /// it is considered UTF8, otherwise we treat it as some 8-bit encoding. 33 inline Encoding detectEncoding(StringRef Text) { 34 const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin()); 35 const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end()); 36 if (::isLegalUTF8String(&Ptr, BufEnd)) 37 return Encoding_UTF8; 38 return Encoding_Unknown; 39 } 40 41 inline unsigned getCodePointCountUTF8(StringRef Text) { 42 unsigned CodePoints = 0; 43 for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { 44 ++CodePoints; 45 } 46 return CodePoints; 47 } 48 49 /// \brief Gets the number of code points in the Text using the specified 50 /// Encoding. 51 inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { 52 switch (Encoding) { 53 case Encoding_UTF8: 54 return getCodePointCountUTF8(Text); 55 default: 56 return Text.size(); 57 } 58 } 59 60 /// \brief Gets the number of bytes in a sequence representing a single 61 /// codepoint and starting with FirstChar in the specified Encoding. 62 inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { 63 switch (Encoding) { 64 case Encoding_UTF8: 65 return getNumBytesForUTF8(FirstChar); 66 default: 67 return 1; 68 } 69 } 70 71 inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; } 72 73 inline bool isHexDigit(char c) { 74 return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 75 ('A' <= c && c <= 'F'); 76 } 77 78 /// \brief Gets the length of an escape sequence inside a C++ string literal. 79 /// Text should span from the beginning of the escape sequence (starting with a 80 /// backslash) to the end of the string literal. 81 inline unsigned getEscapeSequenceLength(StringRef Text) { 82 assert(Text[0] == '\\'); 83 if (Text.size() < 2) 84 return 1; 85 86 switch (Text[1]) { 87 case 'u': 88 return 6; 89 case 'U': 90 return 10; 91 case 'x': { 92 unsigned I = 2; // Point after '\x'. 93 while (I < Text.size() && isHexDigit(Text[I])) 94 ++I; 95 return I; 96 } 97 default: 98 if (isOctDigit(Text[1])) { 99 unsigned I = 1; 100 while (I < Text.size() && I < 4 && isOctDigit(Text[I])) 101 ++I; 102 return I; 103 } 104 return 2; 105 } 106 } 107 108 } // namespace encoding 109 } // namespace format 110 } // namespace clang 111 112 #endif // LLVM_CLANG_FORMAT_ENCODING_H 113