1 // Copyright 2016 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "core/fpdftext/cpdf_linkextract.h" 8 9 #include <vector> 10 11 #include "core/fpdftext/cpdf_textpage.h" 12 #include "core/fxcrt/fx_extension.h" 13 #include "core/fxcrt/fx_string.h" 14 #include "core/fxcrt/fx_system.h" 15 16 namespace { 17 18 // Find the end of a web link starting from offset |start| and ending at offset 19 // |end|. The purpose of this function is to separate url from the surrounding 20 // context characters, we do not intend to fully validate the url. |str| 21 // contains lower case characters only. 22 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) { 23 if (str.Contains(L'/', start)) { 24 // When there is a path and query after '/', most ASCII chars are allowed. 25 // We don't sanitize in this case. 26 return end; 27 } 28 29 // When there is no path, it only has IP address or host name. 30 // Port is optional at the end. 31 if (str[start] == L'[') { 32 // IPv6 reference. 33 // Find the end of the reference. 34 auto result = str.Find(L']', start + 1); 35 if (result.has_value()) { 36 end = result.value(); 37 if (end > start + 1) { // Has content inside brackets. 38 size_t len = str.GetLength(); 39 size_t off = end + 1; 40 if (off < len && str[off] == L':') { 41 off++; 42 while (off < len && str[off] >= L'0' && str[off] <= L'9') 43 off++; 44 if (off > end + 2 && 45 off <= len) // At least one digit in port number. 46 end = off - 1; // |off| is offset of the first invalid char. 47 } 48 } 49 } 50 return end; 51 } 52 53 // According to RFC1123, host name only has alphanumeric chars, hyphens, 54 // and periods. Hyphen should not at the end though. 55 // Non-ASCII chars are ignored during checking. 56 while (end > start && str[end] < 0x80) { 57 if ((str[end] >= L'0' && str[end] <= L'9') || 58 (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') 59 break; 60 end--; 61 } 62 return end; 63 } 64 65 // Remove characters from the end of |str|, delimited by |start| and |end|, up 66 // to and including |charToFind|. No-op if |charToFind| is not present. Updates 67 // |end| if characters were removed. 68 void TrimBackwardsToChar(const WideString& str, 69 wchar_t charToFind, 70 size_t start, 71 size_t* end) { 72 for (size_t pos = *end; pos >= start; pos--) { 73 if (str[pos] == charToFind) { 74 *end = pos - 1; 75 break; 76 } 77 } 78 } 79 80 // Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by 81 // |start| and |end| in |str|. Matches a closing bracket or quote for each 82 // opening character and, if present, removes everything afterwards. Returns the 83 // new end position for the string. 84 size_t TrimExternalBracketsFromWebLink(const WideString& str, 85 size_t start, 86 size_t end) { 87 for (size_t pos = 0; pos < start; pos++) { 88 if (str[pos] == '(') { 89 TrimBackwardsToChar(str, ')', start, &end); 90 } else if (str[pos] == '[') { 91 TrimBackwardsToChar(str, ']', start, &end); 92 } else if (str[pos] == '{') { 93 TrimBackwardsToChar(str, '}', start, &end); 94 } else if (str[pos] == '<') { 95 TrimBackwardsToChar(str, '>', start, &end); 96 } else if (str[pos] == '"') { 97 TrimBackwardsToChar(str, '"', start, &end); 98 } else if (str[pos] == '\'') { 99 TrimBackwardsToChar(str, '\'', start, &end); 100 } 101 } 102 return end; 103 } 104 105 } // namespace 106 107 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) 108 : m_pTextPage(pTextPage) {} 109 110 CPDF_LinkExtract::~CPDF_LinkExtract() {} 111 112 void CPDF_LinkExtract::ExtractLinks() { 113 m_LinkArray.clear(); 114 if (!m_pTextPage->IsParsed()) 115 return; 116 117 m_strPageText = m_pTextPage->GetAllPageText(); 118 if (m_strPageText.IsEmpty()) 119 return; 120 121 ParseLink(); 122 } 123 124 void CPDF_LinkExtract::ParseLink() { 125 int start = 0; 126 int pos = 0; 127 int nTotalChar = m_pTextPage->CountChars(); 128 bool bAfterHyphen = false; 129 bool bLineBreak = false; 130 while (pos < nTotalChar) { 131 FPDF_CHAR_INFO pageChar; 132 m_pTextPage->GetCharInfo(pos, &pageChar); 133 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || 134 pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) { 135 int nCount = pos - start; 136 if (pos == nTotalChar - 1) { 137 nCount++; 138 } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR || 139 pageChar.m_Unicode == TEXT_RETURN_CHAR)) { 140 // Handle text breaks with a hyphen to the next line. 141 bLineBreak = true; 142 pos++; 143 continue; 144 } 145 WideString strBeCheck; 146 strBeCheck = m_pTextPage->GetPageText(start, nCount); 147 if (bLineBreak) { 148 strBeCheck.Remove(TEXT_LINEFEED_CHAR); 149 strBeCheck.Remove(TEXT_RETURN_CHAR); 150 bLineBreak = false; 151 } 152 // Replace the generated code with the hyphen char. 153 strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN); 154 155 if (strBeCheck.GetLength() > 5) { 156 while (strBeCheck.GetLength() > 0) { 157 wchar_t ch = strBeCheck[strBeCheck.GetLength() - 1]; 158 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { 159 strBeCheck = strBeCheck.Left(strBeCheck.GetLength() - 1); 160 nCount--; 161 } else { 162 break; 163 } 164 } 165 // Check for potential web URLs and email addresses. 166 // Ftp address, file system links, data, blob etc. are not checked. 167 if (nCount > 5) { 168 int32_t nStartOffset; 169 int32_t nCountOverload; 170 if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) { 171 m_LinkArray.push_back( 172 {start + nStartOffset, nCountOverload, strBeCheck}); 173 } else if (CheckMailLink(&strBeCheck)) { 174 m_LinkArray.push_back({start, nCount, strBeCheck}); 175 } 176 } 177 } 178 start = ++pos; 179 } else { 180 bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN || 181 (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL && 182 pageChar.m_Unicode == TEXT_HYPHEN_CHAR)); 183 pos++; 184 } 185 } 186 } 187 188 bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck, 189 int32_t* nStart, 190 int32_t* nCount) { 191 static const wchar_t kHttpScheme[] = L"http"; 192 static const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme); 193 static const wchar_t kWWWAddrStart[] = L"www."; 194 static const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart); 195 196 WideString str = *strBeCheck; 197 str.MakeLower(); 198 199 size_t len = str.GetLength(); 200 // First, try to find the scheme. 201 auto start = str.Find(kHttpScheme); 202 if (start.has_value()) { 203 size_t off = start.value() + kHttpSchemeLen; // move after "http". 204 if (len > off + 4) { // At least "://<char>" follows. 205 if (str[off] == L's') // "https" scheme is accepted. 206 off++; 207 if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') { 208 off += 3; 209 size_t end = TrimExternalBracketsFromWebLink(str, start.value(), 210 str.GetLength() - 1); 211 end = FindWebLinkEnding(str, off, end); 212 if (end > off) { // Non-empty host name. 213 *nStart = start.value(); 214 *nCount = end - start.value() + 1; 215 *strBeCheck = strBeCheck->Mid(*nStart, *nCount); 216 return true; 217 } 218 } 219 } 220 } 221 222 // When there is no scheme, try to find url starting with "www.". 223 start = str.Find(kWWWAddrStart); 224 if (start.has_value() && len > start.value() + kWWWAddrStartLen) { 225 size_t end = TrimExternalBracketsFromWebLink(str, start.value(), 226 str.GetLength() - 1); 227 end = FindWebLinkEnding(str, start.value(), end); 228 if (end > start.value() + kWWWAddrStartLen) { 229 *nStart = start.value(); 230 *nCount = end - start.value() + 1; 231 *strBeCheck = L"http://" + strBeCheck->Mid(*nStart, *nCount); 232 return true; 233 } 234 } 235 return false; 236 } 237 238 bool CPDF_LinkExtract::CheckMailLink(WideString* str) { 239 auto aPos = str->Find(L'@'); 240 // Invalid when no '@' or when starts/ends with '@'. 241 if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1) 242 return false; 243 244 // Check the local part. 245 size_t pPos = aPos.value(); // Used to track the position of '@' or '.'. 246 for (size_t i = aPos.value(); i > 0; i--) { 247 wchar_t ch = (*str)[i - 1]; 248 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) 249 continue; 250 251 if (ch != L'.' || i == pPos || i == 1) { 252 if (i == aPos.value()) { 253 // There is '.' or invalid char before '@'. 254 return false; 255 } 256 // End extracting for other invalid chars, '.' at the beginning, or 257 // consecutive '.'. 258 size_t removed_len = i == pPos ? i + 1 : i; 259 *str = str->Right(str->GetLength() - removed_len); 260 break; 261 } 262 // Found a valid '.'. 263 pPos = i - 1; 264 } 265 266 // Check the domain name part. 267 aPos = str->Find(L'@'); 268 if (!aPos.has_value() || aPos.value() == 0) 269 return false; 270 271 str->TrimRight(L'.'); 272 // At least one '.' in domain name, but not at the beginning. 273 // TODO(weili): RFC5322 allows domain names to be a local name without '.'. 274 // Check whether we should remove this check. 275 auto ePos = str->Find(L'.', aPos.value() + 1); 276 if (!ePos.has_value() || ePos.value() == aPos.value() + 1) 277 return false; 278 279 // Validate all other chars in domain name. 280 size_t nLen = str->GetLength(); 281 pPos = 0; // Used to track the position of '.'. 282 for (size_t i = aPos.value() + 1; i < nLen; i++) { 283 wchar_t wch = (*str)[i]; 284 if (wch == L'-' || FXSYS_iswalnum(wch)) 285 continue; 286 287 if (wch != L'.' || i == pPos + 1) { 288 // Domain name should end before invalid char. 289 size_t host_end = i == pPos + 1 ? i - 2 : i - 1; 290 if (pPos > 0 && host_end - aPos.value() >= 3) { 291 // Trim the ending invalid chars if there is at least one '.' and name. 292 *str = str->Left(host_end + 1); 293 break; 294 } 295 return false; 296 } 297 pPos = i; 298 } 299 300 if (!str->Contains(L"mailto:")) 301 *str = L"mailto:" + *str; 302 303 return true; 304 } 305 306 WideString CPDF_LinkExtract::GetURL(size_t index) const { 307 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; 308 } 309 310 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { 311 if (index >= m_LinkArray.size()) 312 return std::vector<CFX_FloatRect>(); 313 314 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, 315 m_LinkArray[index].m_Count); 316 } 317