Home | History | Annotate | Download | only in font
      1 // Copyright 2017 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "core/fpdfapi/font/cpdf_cmap.h"
      8 
      9 #include <memory>
     10 #include <utility>
     11 #include <vector>
     12 
     13 #include "core/fpdfapi/cmaps/cmap_int.h"
     14 #include "core/fpdfapi/font/cpdf_cmapmanager.h"
     15 #include "core/fpdfapi/font/cpdf_cmapparser.h"
     16 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
     17 
     18 namespace {
     19 
// A closed interval of byte values, [m_First, m_Last].
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the range.
  uint8_t m_Last;   // Highest byte value in the range. Inclusive.
};
     24 
// One row of the predefined CMap table consulted by
// CPDF_CMap::LoadPredefined().
struct PredefinedCMap {
  // Name the (suffix-stripped) CMap name is compared against.
  const char* m_pName;
  // Values copied into the CPDF_CMap when this entry is selected.
  CIDSet m_Charset;
  CIDCoding m_Coding;
  CPDF_CMap::CodingScheme m_CodingScheme;
  // Number of valid entries in |m_LeadingSegs|.
  uint8_t m_LeadingSegCount;
  // Byte ranges whose members are flagged as leading bytes of a two-byte
  // code. LoadPredefined() only reads these for the MixedTwoBytes scheme.
  ByteRange m_LeadingSegs[2];
};
     33 
// Table of predefined CMaps, searched linearly by CPDF_CMap::LoadPredefined().
// Each entry is {name, charset, coding, coding scheme, number of leading-byte
// segments, leading-byte segments}. LoadPredefined() drops the last two
// characters of any requested name longer than two characters before
// comparing it with the names stored here.
const PredefinedCMap g_PredefinedCMaps[] = {
    {"GB-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"GBpc-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfc}}},
    {"GBK-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"GBKp-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"GBK2K-EUC",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"GBK2K",
     CIDSET_GB1,
     CIDCODING_GB,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
    {"B5pc",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfc}}},
    {"HKscs-B5",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x88, 0xfe}}},
    {"ETen-B5",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"ETenms-B5",
     CIDSET_CNS1,
     CIDCODING_BIG5,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
    {"83pv-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"90ms-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"90msp-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"90pv-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"Add-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"EUC",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x8e, 0x8e}, {0xa1, 0xfe}}},
    {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
    {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
    {"Ext-RKSJ",
     CIDSET_JAPAN1,
     CIDCODING_JIS,
     CPDF_CMap::MixedTwoBytes,
     2,
     {{0x81, 0x9f}, {0xe0, 0xfc}}},
    {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniJIS-UCS2-HW",
     CIDSET_JAPAN1,
     CIDCODING_UCS2,
     CPDF_CMap::TwoBytes,
     0,
     {}},
    {"UniJIS-UTF16",
     CIDSET_JAPAN1,
     CIDCODING_UTF16,
     CPDF_CMap::TwoBytes,
     0,
     {}},
    {"KSC-EUC",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfe}}},
    {"KSCms-UHC",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"KSCms-UHC-HW",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0x81, 0xfe}}},
    {"KSCpc-EUC",
     CIDSET_KOREA1,
     CIDCODING_KOREA,
     CPDF_CMap::MixedTwoBytes,
     1,
     {{0xa1, 0xfd}}},
    {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
    {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
};
    183 
    184 int CheckFourByteCodeRange(uint8_t* codes,
    185                            size_t size,
    186                            const std::vector<CPDF_CMap::CodeRange>& ranges) {
    187   for (size_t i = ranges.size(); i > 0; i--) {
    188     size_t seg = i - 1;
    189     if (ranges[seg].m_CharSize < size)
    190       continue;
    191     size_t iChar = 0;
    192     while (iChar < size) {
    193       if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
    194           codes[iChar] > ranges[seg].m_Upper[iChar]) {
    195         break;
    196       }
    197       ++iChar;
    198     }
    199     if (iChar == ranges[seg].m_CharSize)
    200       return 2;
    201     if (iChar)
    202       return (size == ranges[seg].m_CharSize) ? 2 : 1;
    203   }
    204   return 0;
    205 }
    206 
    207 size_t GetFourByteCharSizeImpl(
    208     uint32_t charcode,
    209     const std::vector<CPDF_CMap::CodeRange>& ranges) {
    210   if (ranges.empty())
    211     return 1;
    212 
    213   uint8_t codes[4];
    214   codes[0] = codes[1] = 0x00;
    215   codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
    216   codes[3] = static_cast<uint8_t>(charcode);
    217   for (size_t offset = 0; offset < 4; offset++) {
    218     size_t size = 4 - offset;
    219     for (size_t j = 0; j < ranges.size(); j++) {
    220       size_t iSeg = (ranges.size() - 1) - j;
    221       if (ranges[iSeg].m_CharSize < size)
    222         continue;
    223       size_t iChar = 0;
    224       while (iChar < size) {
    225         if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
    226             codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
    227           break;
    228         }
    229         ++iChar;
    230       }
    231       if (iChar == ranges[iSeg].m_CharSize)
    232         return size;
    233     }
    234   }
    235   return 1;
    236 }
    237 
    238 }  // namespace
    239 
// Creates a CMap in its unloaded state: not loaded, |m_bVertical| false,
// unknown charset and coding, TwoBytes coding scheme, and no embedded map.
CPDF_CMap::CPDF_CMap()
    : m_bLoaded(false),
      m_bVertical(false),
      m_Charset(CIDSET_UNKNOWN),
      m_CodingScheme(TwoBytes),
      m_Coding(CIDCODING_UNKNOWN),
      m_pEmbedMap(nullptr) {}
    247 
    248 CPDF_CMap::~CPDF_CMap() {}
    249 
    250 void CPDF_CMap::LoadPredefined(CPDF_CMapManager* pMgr,
    251                                const ByteString& bsName,
    252                                bool bPromptCJK) {
    253   m_PredefinedCMap = bsName;
    254   if (m_PredefinedCMap == "Identity-H" || m_PredefinedCMap == "Identity-V") {
    255     m_Coding = CIDCODING_CID;
    256     m_bVertical = bsName.Last() == 'V';
    257     m_bLoaded = true;
    258     return;
    259   }
    260   ByteString cmapid = m_PredefinedCMap;
    261   m_bVertical = cmapid.Last() == 'V';
    262   if (cmapid.GetLength() > 2) {
    263     cmapid = cmapid.Left(cmapid.GetLength() - 2);
    264   }
    265   const PredefinedCMap* map = nullptr;
    266   for (size_t i = 0; i < FX_ArraySize(g_PredefinedCMaps); ++i) {
    267     if (cmapid == ByteStringView(g_PredefinedCMaps[i].m_pName)) {
    268       map = &g_PredefinedCMaps[i];
    269       break;
    270     }
    271   }
    272   if (!map)
    273     return;
    274 
    275   m_Charset = map->m_Charset;
    276   m_Coding = map->m_Coding;
    277   m_CodingScheme = map->m_CodingScheme;
    278   if (m_CodingScheme == MixedTwoBytes) {
    279     m_MixedTwoByteLeadingBytes = std::vector<bool>(256);
    280     for (uint32_t i = 0; i < map->m_LeadingSegCount; ++i) {
    281       const ByteRange& seg = map->m_LeadingSegs[i];
    282       for (int b = seg.m_First; b <= seg.m_Last; ++b)
    283         m_MixedTwoByteLeadingBytes[b] = true;
    284     }
    285   }
    286   m_pEmbedMap = FPDFAPI_FindEmbeddedCMap(bsName, m_Charset, m_Coding);
    287   if (!m_pEmbedMap)
    288     return;
    289 
    290   m_bLoaded = true;
    291 }
    292 
    293 void CPDF_CMap::LoadEmbedded(const uint8_t* pData, uint32_t size) {
    294   m_DirectCharcodeToCIDTable = std::vector<uint16_t>(65536);
    295   CPDF_CMapParser parser(this);
    296   CPDF_SimpleParser syntax(pData, size);
    297   while (1) {
    298     ByteStringView word = syntax.GetWord();
    299     if (word.IsEmpty()) {
    300       break;
    301     }
    302     parser.ParseWord(word);
    303   }
    304   if (m_CodingScheme == MixedFourBytes && parser.HasAdditionalMappings()) {
    305     m_AdditionalCharcodeToCIDMappings = parser.TakeAdditionalMappings();
    306     std::sort(
    307         m_AdditionalCharcodeToCIDMappings.begin(),
    308         m_AdditionalCharcodeToCIDMappings.end(),
    309         [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
    310           return arg1.m_EndCode < arg2.m_EndCode;
    311         });
    312   }
    313 }
    314 
    315 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
    316   if (m_Coding == CIDCODING_CID)
    317     return static_cast<uint16_t>(charcode);
    318 
    319   if (m_pEmbedMap)
    320     return FPDFAPI_CIDFromCharCode(m_pEmbedMap, charcode);
    321 
    322   if (m_DirectCharcodeToCIDTable.empty())
    323     return static_cast<uint16_t>(charcode);
    324 
    325   if (charcode < 0x10000)
    326     return m_DirectCharcodeToCIDTable[charcode];
    327 
    328   auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
    329                              m_AdditionalCharcodeToCIDMappings.end(), charcode,
    330                              [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
    331                                return arg.m_EndCode < val;
    332                              });
    333   if (it == m_AdditionalCharcodeToCIDMappings.end() ||
    334       it->m_StartCode > charcode) {
    335     return 0;
    336   }
    337   return it->m_StartCID + charcode - it->m_StartCode;
    338 }
    339 
    340 uint32_t CPDF_CMap::GetNextChar(const char* pString,
    341                                 int nStrLen,
    342                                 int& offset) const {
    343   auto* pBytes = reinterpret_cast<const uint8_t*>(pString);
    344   switch (m_CodingScheme) {
    345     case OneByte: {
    346       return pBytes[offset++];
    347     }
    348     case TwoBytes: {
    349       uint8_t byte1 = pBytes[offset++];
    350       return 256 * byte1 + pBytes[offset++];
    351     }
    352     case MixedTwoBytes: {
    353       uint8_t byte1 = pBytes[offset++];
    354       if (!m_MixedTwoByteLeadingBytes[byte1])
    355         return byte1;
    356       return 256 * byte1 + pBytes[offset++];
    357     }
    358     case MixedFourBytes: {
    359       uint8_t codes[4];
    360       int char_size = 1;
    361       codes[0] = pBytes[offset++];
    362       while (1) {
    363         int ret = CheckFourByteCodeRange(codes, char_size,
    364                                          m_MixedFourByteLeadingRanges);
    365         if (ret == 0)
    366           return 0;
    367         if (ret == 2) {
    368           uint32_t charcode = 0;
    369           for (int i = 0; i < char_size; i++)
    370             charcode = (charcode << 8) + codes[i];
    371           return charcode;
    372         }
    373         if (char_size == 4 || offset == nStrLen)
    374           return 0;
    375         codes[char_size++] = pBytes[offset++];
    376       }
    377       break;
    378     }
    379   }
    380   return 0;
    381 }
    382 
    383 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
    384   switch (m_CodingScheme) {
    385     case OneByte:
    386       return 1;
    387     case TwoBytes:
    388       return 2;
    389     case MixedTwoBytes:
    390       if (charcode < 0x100)
    391         return 1;
    392       return 2;
    393     case MixedFourBytes:
    394       if (charcode < 0x100)
    395         return 1;
    396       if (charcode < 0x10000)
    397         return 2;
    398       if (charcode < 0x1000000)
    399         return 3;
    400       return 4;
    401   }
    402   return 1;
    403 }
    404 
    405 int CPDF_CMap::CountChar(const char* pString, int size) const {
    406   switch (m_CodingScheme) {
    407     case OneByte:
    408       return size;
    409     case TwoBytes:
    410       return (size + 1) / 2;
    411     case MixedTwoBytes: {
    412       int count = 0;
    413       for (int i = 0; i < size; i++) {
    414         count++;
    415         if (m_MixedTwoByteLeadingBytes[reinterpret_cast<const uint8_t*>(
    416                 pString)[i]]) {
    417           i++;
    418         }
    419       }
    420       return count;
    421     }
    422     case MixedFourBytes: {
    423       int count = 0, offset = 0;
    424       while (offset < size) {
    425         GetNextChar(pString, size, offset);
    426         count++;
    427       }
    428       return count;
    429     }
    430   }
    431   return size;
    432 }
    433 
    434 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
    435   switch (m_CodingScheme) {
    436     case OneByte:
    437       str[0] = static_cast<char>(charcode);
    438       return 1;
    439     case TwoBytes:
    440       str[0] = static_cast<char>(charcode / 256);
    441       str[1] = static_cast<char>(charcode % 256);
    442       return 2;
    443     case MixedTwoBytes:
    444       if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
    445         str[0] = static_cast<char>(charcode);
    446         return 1;
    447       }
    448       str[0] = static_cast<char>(charcode >> 8);
    449       str[1] = static_cast<char>(charcode);
    450       return 2;
    451     case MixedFourBytes:
    452       if (charcode < 0x100) {
    453         int iSize = static_cast<int>(
    454             GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
    455         if (iSize == 0)
    456           iSize = 1;
    457         str[iSize - 1] = static_cast<char>(charcode);
    458         if (iSize > 1)
    459           memset(str, 0, iSize - 1);
    460         return iSize;
    461       }
    462       if (charcode < 0x10000) {
    463         str[0] = static_cast<char>(charcode >> 8);
    464         str[1] = static_cast<char>(charcode);
    465         return 2;
    466       }
    467       if (charcode < 0x1000000) {
    468         str[0] = static_cast<char>(charcode >> 16);
    469         str[1] = static_cast<char>(charcode >> 8);
    470         str[2] = static_cast<char>(charcode);
    471         return 3;
    472       }
    473       str[0] = static_cast<char>(charcode >> 24);
    474       str[1] = static_cast<char>(charcode >> 16);
    475       str[2] = static_cast<char>(charcode >> 8);
    476       str[3] = static_cast<char>(charcode);
    477       return 4;
    478   }
    479   return 0;
    480 }
    481