Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "core/fpdftext/cpdf_linkextract.h"
      8 
      9 #include <vector>
     10 
     11 #include "core/fpdftext/cpdf_textpage.h"
     12 #include "core/fxcrt/fx_extension.h"
     13 #include "core/fxcrt/fx_string.h"
     14 #include "core/fxcrt/fx_system.h"
     15 
     16 namespace {
     17 
     18 // Find the end of a web link starting from offset |start| and ending at offset
     19 // |end|. The purpose of this function is to separate url from the surrounding
     20 // context characters, we do not intend to fully validate the url. |str|
     21 // contains lower case characters only.
     22 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
     23   if (str.Contains(L'/', start)) {
     24     // When there is a path and query after '/', most ASCII chars are allowed.
     25     // We don't sanitize in this case.
     26     return end;
     27   }
     28 
     29   // When there is no path, it only has IP address or host name.
     30   // Port is optional at the end.
     31   if (str[start] == L'[') {
     32     // IPv6 reference.
     33     // Find the end of the reference.
     34     auto result = str.Find(L']', start + 1);
     35     if (result.has_value()) {
     36       end = result.value();
     37       if (end > start + 1) {  // Has content inside brackets.
     38         size_t len = str.GetLength();
     39         size_t off = end + 1;
     40         if (off < len && str[off] == L':') {
     41           off++;
     42           while (off < len && str[off] >= L'0' && str[off] <= L'9')
     43             off++;
     44           if (off > end + 2 &&
     45               off <= len)   // At least one digit in port number.
     46             end = off - 1;  // |off| is offset of the first invalid char.
     47         }
     48       }
     49     }
     50     return end;
     51   }
     52 
     53   // According to RFC1123, host name only has alphanumeric chars, hyphens,
     54   // and periods. Hyphen should not at the end though.
     55   // Non-ASCII chars are ignored during checking.
     56   while (end > start && str[end] < 0x80) {
     57     if ((str[end] >= L'0' && str[end] <= L'9') ||
     58         (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.')
     59       break;
     60     end--;
     61   }
     62   return end;
     63 }
     64 
     65 // Remove characters from the end of |str|, delimited by |start| and |end|, up
     66 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
     67 // |end| if characters were removed.
     68 void TrimBackwardsToChar(const WideString& str,
     69                          wchar_t charToFind,
     70                          size_t start,
     71                          size_t* end) {
     72   for (size_t pos = *end; pos >= start; pos--) {
     73     if (str[pos] == charToFind) {
     74       *end = pos - 1;
     75       break;
     76     }
     77   }
     78 }
     79 
     80 // Finds opening brackets ()[]{}<> and quotes "'  before the URL delimited by
     81 // |start| and |end| in |str|. Matches a closing bracket or quote for each
     82 // opening character and, if present, removes everything afterwards. Returns the
     83 // new end position for the string.
     84 size_t TrimExternalBracketsFromWebLink(const WideString& str,
     85                                        size_t start,
     86                                        size_t end) {
     87   for (size_t pos = 0; pos < start; pos++) {
     88     if (str[pos] == '(') {
     89       TrimBackwardsToChar(str, ')', start, &end);
     90     } else if (str[pos] == '[') {
     91       TrimBackwardsToChar(str, ']', start, &end);
     92     } else if (str[pos] == '{') {
     93       TrimBackwardsToChar(str, '}', start, &end);
     94     } else if (str[pos] == '<') {
     95       TrimBackwardsToChar(str, '>', start, &end);
     96     } else if (str[pos] == '"') {
     97       TrimBackwardsToChar(str, '"', start, &end);
     98     } else if (str[pos] == '\'') {
     99       TrimBackwardsToChar(str, '\'', start, &end);
    100     }
    101   }
    102   return end;
    103 }
    104 
    105 }  // namespace
    106 
    107 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
    108     : m_pTextPage(pTextPage) {}
    109 
    110 CPDF_LinkExtract::~CPDF_LinkExtract() {}
    111 
    112 void CPDF_LinkExtract::ExtractLinks() {
    113   m_LinkArray.clear();
    114   if (!m_pTextPage->IsParsed())
    115     return;
    116 
    117   m_strPageText = m_pTextPage->GetAllPageText();
    118   if (m_strPageText.IsEmpty())
    119     return;
    120 
    121   ParseLink();
    122 }
    123 
    124 void CPDF_LinkExtract::ParseLink() {
    125   int start = 0;
    126   int pos = 0;
    127   int nTotalChar = m_pTextPage->CountChars();
    128   bool bAfterHyphen = false;
    129   bool bLineBreak = false;
    130   while (pos < nTotalChar) {
    131     FPDF_CHAR_INFO pageChar;
    132     m_pTextPage->GetCharInfo(pos, &pageChar);
    133     if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
    134         pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) {
    135       int nCount = pos - start;
    136       if (pos == nTotalChar - 1) {
    137         nCount++;
    138       } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR ||
    139                                   pageChar.m_Unicode == TEXT_RETURN_CHAR)) {
    140         // Handle text breaks with a hyphen to the next line.
    141         bLineBreak = true;
    142         pos++;
    143         continue;
    144       }
    145       WideString strBeCheck;
    146       strBeCheck = m_pTextPage->GetPageText(start, nCount);
    147       if (bLineBreak) {
    148         strBeCheck.Remove(TEXT_LINEFEED_CHAR);
    149         strBeCheck.Remove(TEXT_RETURN_CHAR);
    150         bLineBreak = false;
    151       }
    152       // Replace the generated code with the hyphen char.
    153       strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN);
    154 
    155       if (strBeCheck.GetLength() > 5) {
    156         while (strBeCheck.GetLength() > 0) {
    157           wchar_t ch = strBeCheck[strBeCheck.GetLength() - 1];
    158           if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
    159             strBeCheck = strBeCheck.Left(strBeCheck.GetLength() - 1);
    160             nCount--;
    161           } else {
    162             break;
    163           }
    164         }
    165         // Check for potential web URLs and email addresses.
    166         // Ftp address, file system links, data, blob etc. are not checked.
    167         if (nCount > 5) {
    168           int32_t nStartOffset;
    169           int32_t nCountOverload;
    170           if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
    171             m_LinkArray.push_back(
    172                 {start + nStartOffset, nCountOverload, strBeCheck});
    173           } else if (CheckMailLink(&strBeCheck)) {
    174             m_LinkArray.push_back({start, nCount, strBeCheck});
    175           }
    176         }
    177       }
    178       start = ++pos;
    179     } else {
    180       bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN ||
    181                       (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL &&
    182                        pageChar.m_Unicode == TEXT_HYPHEN_CHAR));
    183       pos++;
    184     }
    185   }
    186 }
    187 
    188 bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
    189                                     int32_t* nStart,
    190                                     int32_t* nCount) {
    191   static const wchar_t kHttpScheme[] = L"http";
    192   static const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
    193   static const wchar_t kWWWAddrStart[] = L"www.";
    194   static const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
    195 
    196   WideString str = *strBeCheck;
    197   str.MakeLower();
    198 
    199   size_t len = str.GetLength();
    200   // First, try to find the scheme.
    201   auto start = str.Find(kHttpScheme);
    202   if (start.has_value()) {
    203     size_t off = start.value() + kHttpSchemeLen;  // move after "http".
    204     if (len > off + 4) {                      // At least "://<char>" follows.
    205       if (str[off] == L's')                   // "https" scheme is accepted.
    206         off++;
    207       if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
    208         off += 3;
    209         size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
    210                                                      str.GetLength() - 1);
    211         end = FindWebLinkEnding(str, off, end);
    212         if (end > off) {  // Non-empty host name.
    213           *nStart = start.value();
    214           *nCount = end - start.value() + 1;
    215           *strBeCheck = strBeCheck->Mid(*nStart, *nCount);
    216           return true;
    217         }
    218       }
    219     }
    220   }
    221 
    222   // When there is no scheme, try to find url starting with "www.".
    223   start = str.Find(kWWWAddrStart);
    224   if (start.has_value() && len > start.value() + kWWWAddrStartLen) {
    225     size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
    226                                                  str.GetLength() - 1);
    227     end = FindWebLinkEnding(str, start.value(), end);
    228     if (end > start.value() + kWWWAddrStartLen) {
    229       *nStart = start.value();
    230       *nCount = end - start.value() + 1;
    231       *strBeCheck = L"http://" + strBeCheck->Mid(*nStart, *nCount);
    232       return true;
    233     }
    234   }
    235   return false;
    236 }
    237 
    238 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
    239   auto aPos = str->Find(L'@');
    240   // Invalid when no '@' or when starts/ends with '@'.
    241   if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
    242     return false;
    243 
    244   // Check the local part.
    245   size_t pPos = aPos.value();  // Used to track the position of '@' or '.'.
    246   for (size_t i = aPos.value(); i > 0; i--) {
    247     wchar_t ch = (*str)[i - 1];
    248     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
    249       continue;
    250 
    251     if (ch != L'.' || i == pPos || i == 1) {
    252       if (i == aPos.value()) {
    253         // There is '.' or invalid char before '@'.
    254         return false;
    255       }
    256       // End extracting for other invalid chars, '.' at the beginning, or
    257       // consecutive '.'.
    258       size_t removed_len = i == pPos ? i + 1 : i;
    259       *str = str->Right(str->GetLength() - removed_len);
    260       break;
    261     }
    262     // Found a valid '.'.
    263     pPos = i - 1;
    264   }
    265 
    266   // Check the domain name part.
    267   aPos = str->Find(L'@');
    268   if (!aPos.has_value() || aPos.value() == 0)
    269     return false;
    270 
    271   str->TrimRight(L'.');
    272   // At least one '.' in domain name, but not at the beginning.
    273   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
    274   // Check whether we should remove this check.
    275   auto ePos = str->Find(L'.', aPos.value() + 1);
    276   if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
    277     return false;
    278 
    279   // Validate all other chars in domain name.
    280   size_t nLen = str->GetLength();
    281   pPos = 0;  // Used to track the position of '.'.
    282   for (size_t i = aPos.value() + 1; i < nLen; i++) {
    283     wchar_t wch = (*str)[i];
    284     if (wch == L'-' || FXSYS_iswalnum(wch))
    285       continue;
    286 
    287     if (wch != L'.' || i == pPos + 1) {
    288       // Domain name should end before invalid char.
    289       size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
    290       if (pPos > 0 && host_end - aPos.value() >= 3) {
    291         // Trim the ending invalid chars if there is at least one '.' and name.
    292         *str = str->Left(host_end + 1);
    293         break;
    294       }
    295       return false;
    296     }
    297     pPos = i;
    298   }
    299 
    300   if (!str->Contains(L"mailto:"))
    301     *str = L"mailto:" + *str;
    302 
    303   return true;
    304 }
    305 
    306 WideString CPDF_LinkExtract::GetURL(size_t index) const {
    307   return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
    308 }
    309 
    310 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
    311   if (index >= m_LinkArray.size())
    312     return std::vector<CFX_FloatRect>();
    313 
    314   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
    315                                    m_LinkArray[index].m_Count);
    316 }
    317