// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0.  You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//
package org.ccil.cowan.tagsoup;
import java.io.*;
import org.xml.sax.SAXException;
import org.xml.sax.Locator;

/**
This class implements a table-driven scanner for HTML, allowing for lots of
defects.  It implements the Scanner interface, which accepts a Reader
object to fetch characters from and a ScanHandler object to report lexical
events to.
*/

public class HTMLScanner implements Scanner, Locator {

	// Start of state table
	private static final int S_ANAME = 1;
	private static final int S_APOS = 2;
	private static final int S_AVAL = 3;
	private static final int S_BB = 4;
	private static final int S_BBC = 5;
	private static final int S_BBCD = 6;
	private static final int S_BBCDA = 7;
	private static final int S_BBCDAT = 8;
	private static final int S_BBCDATA = 9;
	private static final int S_CDATA = 10;
	private static final int S_CDATA2 = 11;
	private static final int S_CDSECT = 12;
	private static final int S_CDSECT1 = 13;
	private static final int S_CDSECT2 = 14;
	private static final int S_COM = 15;
	private static final int S_COM2 = 16;
	private static final int S_COM3 = 17;
	private static final int S_COM4 = 18;
	private static final int S_DECL = 19;
	private static final int S_DECL2 = 20;
	private static final int S_DONE = 21;
	private static final int S_EMPTYTAG = 22;
	private static final int S_ENT = 23;
	private static final int S_EQ = 24;
	private static final int S_ETAG = 25;
	private static final int S_GI = 26;
	private static final int S_NCR = 27;
	private static final int S_PCDATA = 28;
	private static final int S_PI = 29;
	private static final int S_PITARGET = 30;
	private static final int S_QUOT = 31;
	private static final int S_STAGC = 32;
	private static final int S_TAG = 33;
	private static final int S_TAGWS = 34;
	private static final int S_XNCR = 35;
	private static final int A_ADUP = 1;
	private static final int A_ADUP_SAVE = 2;
	private static final int A_ADUP_STAGC = 3;
	private static final int A_ANAME = 4;
	private static final int A_ANAME_ADUP = 5;
	private static final int A_ANAME_ADUP_STAGC = 6;
	private static final int A_AVAL = 7;
	private static final int A_AVAL_STAGC = 8;
	private static final int A_CDATA = 9;
	private static final int A_CMNT = 10;
	private static final int A_DECL = 11;
	private static final int A_EMPTYTAG = 12;
	private static final int A_ENTITY = 13;
	private static final int A_ENTITY_START = 14;
	private static final int A_ETAG = 15;
	private static final int A_GI = 16;
	private static final int A_GI_STAGC = 17;
	private static final int A_LT = 18;
	private static final int A_LT_PCDATA = 19;
	private static final int A_MINUS = 20;
	private static final int A_MINUS2 = 21;
	private static final int A_MINUS3 = 22;
	private static final int A_PCDATA = 23;
	private static final int A_PI = 24;
	private static final int A_PITARGET = 25;
	private static final int A_PITARGET_PI = 26;
	private static final int A_SAVE = 27;
	private static final int A_SKIP = 28;
	private static final int A_SP = 29;
	private static final int A_STAGC = 30;
	private static final int A_UNGET = 31;
	private static final int A_UNSAVE_PCDATA = 32;

	// Flat list of 4-entry runs: { current state, input character, action, next state }.
	// An input character of 0 is the default entry for a state; -1 stands for EOF.
	private static final int[] statetable = {
		S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
		S_ANAME, '=', A_ANAME, S_AVAL,
		S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
		S_ANAME, 0, A_SAVE, S_ANAME,
		S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
		S_ANAME, ' ', A_ANAME, S_EQ,
		S_ANAME, '\n', A_ANAME, S_EQ,
		S_ANAME, '\t', A_ANAME, S_EQ,
		S_APOS, '\'', A_AVAL, S_TAGWS,
		S_APOS, 0, A_SAVE, S_APOS,
		S_APOS, -1, A_AVAL_STAGC, S_DONE,
		S_APOS, ' ', A_SP, S_APOS,
		S_APOS, '\n', A_SP, S_APOS,
		S_APOS, '\t', A_SP, S_APOS,
		S_AVAL, '"', A_SKIP, S_QUOT,
		S_AVAL, '\'', A_SKIP, S_APOS,
		S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
		S_AVAL, 0, A_SAVE, S_STAGC,
		S_AVAL, -1, A_AVAL_STAGC, S_DONE,
		S_AVAL, ' ', A_SKIP, S_AVAL,
		S_AVAL, '\n', A_SKIP, S_AVAL,
		S_AVAL, '\t', A_SKIP, S_AVAL,
		S_BB, 'C', A_SKIP, S_BBC,
		S_BB, 0, A_SKIP, S_DECL,
		S_BB, -1, A_SKIP, S_DONE,
		S_BBC, 'D', A_SKIP, S_BBCD,
		S_BBC, 0, A_SKIP, S_DECL,
		S_BBC, -1, A_SKIP, S_DONE,
		S_BBCD, 'A', A_SKIP, S_BBCDA,
		S_BBCD, 0, A_SKIP, S_DECL,
		S_BBCD, -1, A_SKIP, S_DONE,
		S_BBCDA, 'T', A_SKIP, S_BBCDAT,
		S_BBCDA, 0, A_SKIP, S_DECL,
		S_BBCDA, -1, A_SKIP, S_DONE,
		S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
		S_BBCDAT, 0, A_SKIP, S_DECL,
		S_BBCDAT, -1, A_SKIP, S_DONE,
		S_BBCDATA, '[', A_SKIP, S_CDSECT,
		S_BBCDATA, 0, A_SKIP, S_DECL,
		S_BBCDATA, -1, A_SKIP, S_DONE,
		S_CDATA, '<', A_SAVE, S_CDATA2,
		S_CDATA, 0, A_SAVE, S_CDATA,
		S_CDATA, -1, A_PCDATA, S_DONE,
		S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
		S_CDATA2, 0, A_SAVE, S_CDATA,
		S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
		S_CDSECT, ']', A_SAVE, S_CDSECT1,
		S_CDSECT, 0, A_SAVE, S_CDSECT,
		S_CDSECT, -1, A_SKIP, S_DONE,
		S_CDSECT1, ']', A_SAVE, S_CDSECT2,
		S_CDSECT1, 0, A_SAVE, S_CDSECT,
		S_CDSECT1, -1, A_SKIP, S_DONE,
		S_CDSECT2, '>', A_CDATA, S_PCDATA,
		S_CDSECT2, 0, A_SAVE, S_CDSECT,
		S_CDSECT2, -1, A_SKIP, S_DONE,
		S_COM, '-', A_SKIP, S_COM2,
		S_COM, 0, A_SAVE, S_COM2,
		S_COM, -1, A_CMNT, S_DONE,
		S_COM2, '-', A_SKIP, S_COM3,
		S_COM2, 0, A_SAVE, S_COM2,
		S_COM2, -1, A_CMNT, S_DONE,
		S_COM3, '-', A_SKIP, S_COM4,
		S_COM3, 0, A_MINUS, S_COM2,
		S_COM3, -1, A_CMNT, S_DONE,
		S_COM4, '-', A_MINUS3, S_COM4,
		S_COM4, '>', A_CMNT, S_PCDATA,
		S_COM4, 0, A_MINUS2, S_COM2,
		S_COM4, -1, A_CMNT, S_DONE,
		S_DECL, '-', A_SKIP, S_COM,
		S_DECL, '>', A_SKIP, S_PCDATA,
		S_DECL, '[', A_SKIP, S_BB,
		S_DECL, 0, A_SAVE, S_DECL2,
		S_DECL, -1, A_SKIP, S_DONE,
		S_DECL2, '>', A_DECL, S_PCDATA,
		S_DECL2, 0, A_SAVE, S_DECL2,
		S_DECL2, -1, A_SKIP, S_DONE,
		S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
		S_EMPTYTAG, 0, A_SAVE, S_ANAME,
		S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
		S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
		S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
		S_ENT, 0, A_ENTITY, S_ENT,
		S_ENT, -1, A_ENTITY, S_DONE,
		S_EQ, '=', A_SKIP, S_AVAL,
		S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
		S_EQ, 0, A_ADUP_SAVE, S_ANAME,
		S_EQ, -1, A_ADUP_STAGC, S_DONE,
		S_EQ, ' ', A_SKIP, S_EQ,
		S_EQ, '\n', A_SKIP, S_EQ,
		S_EQ, '\t', A_SKIP, S_EQ,
		S_ETAG, '>', A_ETAG, S_PCDATA,
		S_ETAG, 0, A_SAVE, S_ETAG,
		S_ETAG, -1, A_ETAG, S_DONE,
		S_ETAG, ' ', A_SKIP, S_ETAG,
		S_ETAG, '\n', A_SKIP, S_ETAG,
		S_ETAG, '\t', A_SKIP, S_ETAG,
		S_GI, '/', A_SKIP, S_EMPTYTAG,
		S_GI, '>', A_GI_STAGC, S_PCDATA,
		S_GI, 0, A_SAVE, S_GI,
		S_GI, -1, A_SKIP, S_DONE,
		S_GI, ' ', A_GI, S_TAGWS,
		S_GI, '\n', A_GI, S_TAGWS,
		S_GI, '\t', A_GI, S_TAGWS,
		S_NCR, 0, A_ENTITY, S_NCR,
		S_NCR, -1, A_ENTITY, S_DONE,
		S_PCDATA, '&', A_ENTITY_START, S_ENT,
		S_PCDATA, '<', A_PCDATA, S_TAG,
		S_PCDATA, 0, A_SAVE, S_PCDATA,
		S_PCDATA, -1, A_PCDATA, S_DONE,
		S_PI, '>', A_PI, S_PCDATA,
		S_PI, 0, A_SAVE, S_PI,
		S_PI, -1, A_PI, S_DONE,
		S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
		S_PITARGET, 0, A_SAVE, S_PITARGET,
		S_PITARGET, -1, A_PITARGET_PI, S_DONE,
		S_PITARGET, ' ', A_PITARGET, S_PI,
		S_PITARGET, '\n', A_PITARGET, S_PI,
		S_PITARGET, '\t', A_PITARGET, S_PI,
		S_QUOT, '"', A_AVAL, S_TAGWS,
		S_QUOT, 0, A_SAVE, S_QUOT,
		S_QUOT, -1, A_AVAL_STAGC, S_DONE,
		S_QUOT, ' ', A_SP, S_QUOT,
		S_QUOT, '\n', A_SP, S_QUOT,
		S_QUOT, '\t', A_SP, S_QUOT,
		S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
		S_STAGC, 0, A_SAVE, S_STAGC,
		S_STAGC, -1, A_AVAL_STAGC, S_DONE,
		S_STAGC, ' ', A_AVAL, S_TAGWS,
		S_STAGC, '\n', A_AVAL, S_TAGWS,
		S_STAGC, '\t', A_AVAL, S_TAGWS,
		S_TAG, '!', A_SKIP, S_DECL,
		S_TAG, '/', A_SKIP, S_ETAG,
		S_TAG, '<', A_SAVE, S_TAG,
		S_TAG, '?', A_SKIP, S_PITARGET,
		S_TAG, 0, A_SAVE, S_GI,
		S_TAG, -1, A_LT_PCDATA, S_DONE,
		S_TAG, ' ', A_LT, S_PCDATA,
		S_TAG, '\n', A_LT, S_PCDATA,
		S_TAG, '\t', A_LT, S_PCDATA,
		S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
		S_TAGWS, '>', A_STAGC, S_PCDATA,
		S_TAGWS, 0, A_SAVE, S_ANAME,
		S_TAGWS, -1, A_STAGC, S_DONE,
		S_TAGWS, ' ', A_SKIP, S_TAGWS,
		S_TAGWS, '\n', A_SKIP, S_TAGWS,
		S_TAGWS, '\t', A_SKIP, S_TAGWS,
		S_XNCR, 0, A_ENTITY, S_XNCR,
		S_XNCR, -1, A_ENTITY, S_DONE,

	};
	// Names indexed by action / state number; used only by the commented-out trace in scan().
	private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
	private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};


	// End of state table

	private String thePublicid;	// Locator state
	private String theSystemid;
	private int theLastLine;	// Position recorded by the last mark()
	private int theLastColumn;
	private int theCurrentLine;	// Position of the character being scanned
	private int theCurrentColumn;

	int theState;					// Current state
	int theNextState;				// Next state
	char[] theOutputBuffer = new char[200];	// Output buffer
	int theSize;					// Current buffer size
	int[] theWinMap = {				// Windows chars map (0x80..0x9F)
		0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
		0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};

	/**
	 * Index into the state table for [state][input character + 2].
	 * The state table consists of 4-entry runs on the form
	 * { current state, input character, action, next state }.
	 * We precompute the index into the state table for all possible
	 * { current state, input character } and store the result in
	 * the statetableIndex array. Since only some input characters
	 * are present in the state table, we only do the computation for
	 * characters 0 to the highest character value in the state table.
	 * An input character of -2 is used to cover all other characters
	 * as -2 is guaranteed not to match any input character entry
	 * in the state table.
	 *
	 * <p>When doing lookups, the input character should first be tested
	 * to be in the range [-1 (inclusive), statetableIndexMaxChar (exclusive)].
	 * If it isn't, use -2 as the input character.
	 *
	 * <p>Finally, add 2 to the input character to cover for the fact that
	 * Java doesn't support negative array indexes. Then look up
	 * the value in the statetableIndex. If the value is -1, then
	 * no action or next state was found for the { state, input } that
	 * you had. If it isn't -1, then action = statetable[value + 2] and
	 * next state = statetable[value + 3]. That is, the value points
	 * to the start of the answer 4-tuple in the statetable.
	 */
	static short[][] statetableIndex;
	/**
	 * One more than the highest character value seen in the statetable
	 * (i.e. the exclusive upper bound for direct lookups).
	 * See the doc comment for statetableIndex to see how this
	 * is used.
	 */
	static int statetableIndexMaxChar;
	static {
		// Find the largest state number and input character in the table.
		int maxState = -1;
		int maxChar = -1;
		for (int i = 0; i < statetable.length; i += 4) {
			if (statetable[i] > maxState) {
				maxState = statetable[i];
			}
			if (statetable[i + 1] > maxChar) {
				maxChar = statetable[i + 1];
			}
		}
		statetableIndexMaxChar = maxChar + 1;

		statetableIndex = new short[maxState + 1][maxChar + 3];
		for (int state = 0; state <= maxState; ++state) {
			for (int ch = -2; ch <= maxChar; ++ch) {
				int hit = -1;
				int action = 0;
				for (int i = 0; i < statetable.length; i += 4) {
					if (state != statetable[i]) {
						// Rows for one state are contiguous: once we have
						// a candidate and leave the run, stop searching.
						if (action != 0) break;
						continue;
					}
					if (statetable[i + 1] == 0) {
						// Default row: remember it, but keep looking for
						// an exact match later in the same run.
						hit = i;
						action = statetable[i + 2];
					}
					else if (statetable[i + 1] == ch) {
						hit = i;
						action = statetable[i + 2];
						break;
					}
				}
				statetableIndex[state][ch + 2] = (short) hit;
			}
		}
	}

	// Compensate for bug in PushbackReader that allows
	// pushing back EOF.
	private void unread(PushbackReader r, int c) throws IOException {
		if (c != -1) r.unread(c);
	}

	// Locator implementation

	/** Returns the line recorded by the most recent mark(). */
	public int getLineNumber() {
		return theLastLine;
	}
	/** Returns the column recorded by the most recent mark(). */
	public int getColumnNumber() {
		return theLastColumn;
	}
	/** Returns the public id supplied to resetDocumentLocator. */
	public String getPublicId() {
		return thePublicid;
	}
	/** Returns the system id supplied to resetDocumentLocator. */
	public String getSystemId() {
		return theSystemid;
	}


	// Scanner implementation

	/**
	Reset document locator, supplying systemid and publicid.
	All line and column counters are set back to zero.
	@param publicid Public id
	@param systemid System id
	*/

	public void resetDocumentLocator(String publicid, String systemid) {
		thePublicid = publicid;
		theSystemid = systemid;
		theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
	}

	/**
	Scan HTML source, reporting lexical events.
	@param r0 Reader that provides characters
	@param h ScanHandler that accepts lexical events.
	@throws IOException if reading from r0 fails
	@throws SAXException if the handler (or save, via the handler) throws one
	*/

	public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
		theState = S_PCDATA;
		// Discard anything left in the buffer by a previous scan: EOF
		// transitions that use A_SKIP (e.g. an unterminated declaration)
		// finish without emptying it, and the leftovers would otherwise be
		// reported as part of this document's first PCDATA.
		theSize = 0;
		PushbackReader r;
		if (r0 instanceof BufferedReader) {
			r = new PushbackReader(r0, 5);
		}
		else {
			r = new PushbackReader(new BufferedReader(r0), 5);
		}

		int firstChar = r.read();	// Remove any leading BOM
		if (firstChar != '\uFEFF') unread(r, firstChar);

		while (theState != S_DONE) {
			int ch = r.read();

			// Process control characters
			if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];

			// Normalize CR and CRLF to a single LF.
			if (ch == '\r') {
				ch = r.read();		// expect LF next
				if (ch != '\n') {
					unread(r, ch);	// nope
					ch = '\n';
				}
			}

			if (ch == '\n') {
				theCurrentLine++;
				theCurrentColumn = 0;
			}
			else {
				theCurrentColumn++;
			}

			// Ignore remaining control characters other than LF, TAB and EOF.
			if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;

			// Search state table
			int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
			int statetableRow = statetableIndex[theState][adjCh + 2];
			int action = 0;
			if (statetableRow != -1) {
				action = statetable[statetableRow + 2];
				theNextState = statetable[statetableRow + 3];
			}

//			System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
			switch (action) {
			case 0:
				throw new Error(
					"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
					Integer.toString(theState));
			case A_ADUP:
				h.adup(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_ADUP_SAVE:
				h.adup(theOutputBuffer, 0, theSize);
				theSize = 0;
				save(ch, h);
				break;
			case A_ADUP_STAGC:
				h.adup(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stagc(theOutputBuffer, 0, theSize);
				break;
			case A_ANAME:
				h.aname(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_ANAME_ADUP:
				h.aname(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.adup(theOutputBuffer, 0, theSize);
				break;
			case A_ANAME_ADUP_STAGC:
				h.aname(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.adup(theOutputBuffer, 0, theSize);
				h.stagc(theOutputBuffer, 0, theSize);
				break;
			case A_AVAL:
				h.aval(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_AVAL_STAGC:
				h.aval(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stagc(theOutputBuffer, 0, theSize);
				break;
			case A_CDATA:
				mark();
				// suppress the final "]]" in the buffer
				if (theSize > 1) theSize -= 2;
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_ENTITY_START:
				// Flush pending text, then start collecting the reference
				// (the '&' itself is stored at buffer index 0).
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				save(ch, h);
				break;
			case A_ENTITY:
				mark();
				char ch1 = (char)ch;
				// While the character can still belong to the reference,
				// keep collecting (possibly switching to the NCR states).
				if (theState == S_ENT && ch1 == '#') {
					theNextState = S_NCR;
					save(ch, h);
					break;
				}
				else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
					theNextState = S_XNCR;
					save(ch, h);
					break;
				}
				else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
					save(ch, h);
					break;
				}
				else if (theState == S_NCR && Character.isDigit(ch1)) {
					save(ch, h);
					break;
				}
				else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
					save(ch, h);
					break;
				}

				// The whole entity reference has been collected
				// (offset 1 skips the leading '&').
				h.entity(theOutputBuffer, 1, theSize - 1);
				int ent = h.getEntity();
				if (ent != 0) {
					// Recognized: replace the collected text by its value.
					theSize = 0;
					if (ent >= 0x80 && ent <= 0x9F) {
						ent = theWinMap[ent-0x80];
					}
					if (ent < 0x20) {
						// Control becomes space
						save(0x20, h);
					}
					else if ((ent >= 0xD800 && ent <= 0xDFFF) || ent > 0x10FFFF) {
						// Surrogates and values beyond the Unicode range
						// cannot be represented as valid UTF-16: drop them.
					}
					else if (ent <= 0xFFFF) {
						// BMP character
						save(ent, h);
					}
					else {
						// Astral converted to two surrogates
						ent -= 0x10000;
						save((ent>>10) + 0xD800, h);
						save((ent&0x3FF) + 0xDC00, h);
					}
					// A terminating ';' is consumed; anything else is rescanned.
					if (ch != ';') {
						unread(r, ch);
						theCurrentColumn--;
					}
				}
				else {
					// Unrecognized: leave the collected text in the buffer
					// as ordinary character data and rescan the terminator.
					unread(r, ch);
					theCurrentColumn--;
				}
				theNextState = S_PCDATA;
				break;
			case A_ETAG:
				h.etag(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_DECL:
				h.decl(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_GI:
				h.gi(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_GI_STAGC:
				h.gi(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stagc(theOutputBuffer, 0, theSize);
				break;
			case A_LT:
				mark();
				save('<', h);
				save(ch, h);
				break;
			case A_LT_PCDATA:
				mark();
				save('<', h);
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_PCDATA:
				mark();
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_CMNT:
				mark();
				h.cmnt(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_MINUS3:
				save('-', h);
				save(' ', h);
				break;
			case A_MINUS2:
				save('-', h);
				save(' ', h);
				// fall through into A_MINUS
			case A_MINUS:
				save('-', h);
				save(ch, h);
				break;
			case A_PI:
				mark();
				h.pi(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_PITARGET:
				h.pitarget(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_PITARGET_PI:
				h.pitarget(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.pi(theOutputBuffer, 0, theSize);
				break;
			case A_SAVE:
				save(ch, h);
				break;
			case A_SKIP:
				break;
			case A_SP:
				save(' ', h);
				break;
			case A_STAGC:
				h.stagc(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_EMPTYTAG:
				mark();
				if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stage(theOutputBuffer, 0, theSize);
				break;
			case A_UNGET:
				unread(r, ch);
				theCurrentColumn--;
				break;
			case A_UNSAVE_PCDATA:
				// Drop the last saved character before reporting the text.
				if (theSize > 0) theSize--;
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			default:
				throw new Error("Can't process state " + action);
			}
			theState = theNextState;
		}
		h.eof(theOutputBuffer, 0, 0);
	}

	/**
	 * Mark the current scan position as a "point of interest" - start of a tag,
	 * cdata, processing instruction etc.
	 */

	private void mark() {
		theLastColumn = theCurrentColumn;
		theLastLine = theCurrentLine;
	}

	/**
	A callback for the ScanHandler that allows it to force
	the lexer state to CDATA content (no markup is recognized except
	the end of element).
	*/

	public void startCDATA() { theNextState = S_CDATA; }

	/**
	Append one character to the output buffer.  When the buffer is within
	20 characters of being full it is either flushed to the handler as a
	chunk of PCDATA (in the S_PCDATA and S_CDATA states) or doubled in size
	(in every other state).
	@param ch Character to append (narrowed to char)
	@param h ScanHandler that receives a flushed PCDATA chunk
	*/
	private void save(int ch, ScanHandler h) throws IOException, SAXException {
		if (theSize >= theOutputBuffer.length - 20) {
			if (theState == S_PCDATA || theState == S_CDATA) {
				// Return a buffer-sized chunk of PCDATA
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
			}
			else {
				// Grow the buffer size; only the first theSize chars are valid
				char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
				System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize);
				theOutputBuffer = newOutputBuffer;
			}
		}
		theOutputBuffer[theSize++] = (char)ch;
	}

	/**
	Test procedure.  Reads HTML from the standard input and writes
	PYX to the standard output.
	*/

	public static void main(String[] argv) throws IOException, SAXException {
		Scanner s = new HTMLScanner();
		Reader r = new InputStreamReader(System.in, "UTF-8");
		Writer w = new OutputStreamWriter(System.out, "UTF-8");
		PYXWriter pw = new PYXWriter(w);
		s.scan(r, pw);
		w.close();
	}


	// Render a character readably for the debug trace in scan().
	private static String nicechar(int in) {
		if (in == '\n') return "\\n";
		if (in < 32) return "0x"+Integer.toHexString(in);
		return "'"+((char)in)+"'";
	}

}