Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "core/fpdftext/cpdf_linkextract.h"
      8 
      9 #include <vector>
     10 
     11 #include "core/fpdftext/cpdf_textpage.h"
     12 #include "core/fxcrt/fx_ext.h"
     13 #include "core/fxcrt/fx_string.h"
     14 #include "core/fxcrt/fx_system.h"
     15 
     16 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
     17     : m_pTextPage(pTextPage) {}
     18 
     19 CPDF_LinkExtract::~CPDF_LinkExtract() {}
     20 
     21 void CPDF_LinkExtract::ExtractLinks() {
     22   m_LinkArray.clear();
     23   if (!m_pTextPage->IsParsed())
     24     return;
     25 
     26   m_strPageText = m_pTextPage->GetPageText(0, -1);
     27   if (m_strPageText.IsEmpty())
     28     return;
     29 
     30   ParseLink();
     31 }
     32 
     33 void CPDF_LinkExtract::ParseLink() {
     34   int start = 0, pos = 0;
     35   int TotalChar = m_pTextPage->CountChars();
     36   while (pos < TotalChar) {
     37     FPDF_CHAR_INFO pageChar;
     38     m_pTextPage->GetCharInfo(pos, &pageChar);
     39     if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
     40         pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
     41       int nCount = pos - start;
     42       if (pos == TotalChar - 1)
     43         nCount++;
     44       CFX_WideString strBeCheck;
     45       strBeCheck = m_pTextPage->GetPageText(start, nCount);
     46       if (strBeCheck.GetLength() > 5) {
     47         while (strBeCheck.GetLength() > 0) {
     48           FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
     49           if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
     50             strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
     51             nCount--;
     52           } else {
     53             break;
     54           }
     55         }
     56         if (nCount > 5 &&
     57             (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
     58           m_LinkArray.push_back({start, nCount, strBeCheck});
     59         }
     60       }
     61       start = ++pos;
     62     } else {
     63       pos++;
     64     }
     65   }
     66 }
     67 
     68 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
     69   CFX_WideString str = strBeCheck;
     70   str.MakeLower();
     71   if (str.Find(L"http://www.") != -1) {
     72     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
     73     return true;
     74   }
     75   if (str.Find(L"http://") != -1) {
     76     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
     77     return true;
     78   }
     79   if (str.Find(L"https://www.") != -1) {
     80     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
     81     return true;
     82   }
     83   if (str.Find(L"https://") != -1) {
     84     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
     85     return true;
     86   }
     87   if (str.Find(L"www.") != -1) {
     88     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
     89     strBeCheck = L"http://" + strBeCheck;
     90     return true;
     91   }
     92   return false;
     93 }
     94 
     95 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
     96   int aPos = str.Find(L'@');
     97   // Invalid when no '@'.
     98   if (aPos < 1)
     99     return false;
    100 
    101   // Check the local part.
    102   int pPos = aPos;  // Used to track the position of '@' or '.'.
    103   for (int i = aPos - 1; i >= 0; i--) {
    104     FX_WCHAR ch = str.GetAt(i);
    105     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
    106       continue;
    107 
    108     if (ch != L'.' || i == pPos - 1 || i == 0) {
    109       if (i == aPos - 1) {
    110         // There is '.' or invalid char before '@'.
    111         return false;
    112       }
    113       // End extracting for other invalid chars, '.' at the beginning, or
    114       // consecutive '.'.
    115       int removed_len = i == pPos - 1 ? i + 2 : i + 1;
    116       str = str.Right(str.GetLength() - removed_len);
    117       break;
    118     }
    119     // Found a valid '.'.
    120     pPos = i;
    121   }
    122 
    123   // Check the domain name part.
    124   aPos = str.Find(L'@');
    125   if (aPos < 1)
    126     return false;
    127 
    128   str.TrimRight(L'.');
    129   // At least one '.' in domain name, but not at the beginning.
    130   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
    131   // Check whether we should remove this check.
    132   int ePos = str.Find(L'.', aPos + 1);
    133   if (ePos == -1 || ePos == aPos + 1)
    134     return false;
    135 
    136   // Validate all other chars in domain name.
    137   int nLen = str.GetLength();
    138   pPos = 0;  // Used to track the position of '.'.
    139   for (int i = aPos + 1; i < nLen; i++) {
    140     FX_WCHAR wch = str.GetAt(i);
    141     if (wch == L'-' || FXSYS_iswalnum(wch))
    142       continue;
    143 
    144     if (wch != L'.' || i == pPos + 1) {
    145       // Domain name should end before invalid char.
    146       int host_end = i == pPos + 1 ? i - 2 : i - 1;
    147       if (pPos > 0 && host_end - aPos >= 3) {
    148         // Trim the ending invalid chars if there is at least one '.' and name.
    149         str = str.Left(host_end + 1);
    150         break;
    151       }
    152       return false;
    153     }
    154     pPos = i;
    155   }
    156 
    157   if (str.Find(L"mailto:") == -1)
    158     str = L"mailto:" + str;
    159 
    160   return true;
    161 }
    162 
    163 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
    164   return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
    165 }
    166 
    167 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
    168   if (index >= m_LinkArray.size())
    169     return std::vector<CFX_FloatRect>();
    170 
    171   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
    172                                    m_LinkArray[index].m_Count);
    173 }
    174