1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2001-2012, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.text; 11 12 import com.ibm.icu.impl.UBiDiProps; 13 import com.ibm.icu.lang.UCharacterDirection; 14 15 /** 16 * Shape Arabic text on a character basis. 17 * 18 * <p>ArabicShaping performs basic operations for "shaping" Arabic text. It is most 19 * useful for use with legacy data formats and legacy display technology 20 * (simple terminals). All operations are performed on Unicode characters.</p> 21 * 22 * <p>Text-based shaping means that some character code points in the text are 23 * replaced by others depending on the context. It transforms one kind of text 24 * into another. In comparison, modern displays for Arabic text select 25 * appropriate, context-dependent font glyphs for each text element, which means 26 * that they transform text into a glyph vector.</p> 27 * 28 * <p>Text transformations are necessary when modern display technology is not 29 * available or when text needs to be transformed to or from legacy formats that 30 * use "shaped" characters. Since the Arabic script is cursive, connecting 31 * adjacent letters to each other, computers select images for each letter based 32 * on the surrounding letters. This usually results in four images per Arabic 33 * letter: initial, middle, final, and isolated forms. In Unicode, on the other 34 * hand, letters are normally stored abstract, and a display system is expected 35 * to select the necessary glyphs. (This makes searching and other text 36 * processing easier because the same letter has only one code.) 
It is possible 37 * to mimic this with text transformations because there are characters in 38 * Unicode that are rendered as letters with a specific shape 39 * (or cursive connectivity). They were included for interoperability with 40 * legacy systems and codepages, and for unsophisticated display systems.</p> 41 * 42 * <p>A second kind of text transformations is supported for Arabic digits: 43 * For compatibility with legacy codepages that only include European digits, 44 * it is possible to replace one set of digits by another, changing the 45 * character code points. These operations can be performed for either 46 * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic 47 * digits (U+06f0...U+06f9).</p> 48 * 49 * <p>Some replacements may result in more or fewer characters (code points). 50 * By default, this means that the destination buffer may receive text with a 51 * length different from the source length. Some legacy systems rely on the 52 * length of the text to be constant. They expect extra spaces to be added 53 * or consumed either next to the affected character or at the end of the 54 * text.</p> 55 * @stable ICU 2.0 56 */ 57 public final class ArabicShaping { 58 private final int options; 59 private boolean isLogical; // convenience 60 private boolean spacesRelativeToTextBeginEnd; 61 private char tailChar; 62 63 /** 64 * Convert a range of text in the source array, putting the result 65 * into a range of text in the destination array, and return the number 66 * of characters written. 67 * 68 * @param source An array containing the input text 69 * @param sourceStart The start of the range of text to convert 70 * @param sourceLength The length of the range of text to convert 71 * @param dest The destination array that will receive the result. 72 * It may be <code>NULL</code> only if <code>destSize</code> is 0. 73 * @param destStart The start of the range of the destination buffer to use. 
74 * @param destSize The size (capacity) of the destination buffer. 75 * If <code>destSize</code> is 0, then no output is produced, 76 * but the necessary buffer size is returned ("preflighting"). This 77 * does not validate the text against the options, for example, 78 * if letters are being unshaped, and spaces are being consumed 79 * following lamalef, this will not detect a lamalef without a 80 * corresponding space. An error will be thrown when the actual 81 * conversion is attempted. 82 * @return The number of chars written to the destination buffer. 83 * If an error occurs, then no output was written, or it may be 84 * incomplete. 85 * @throws ArabicShapingException if the text cannot be converted according to the options. 86 * @stable ICU 2.0 87 */ 88 public int shape(char[] source, int sourceStart, int sourceLength, 89 char[] dest, int destStart, int destSize) throws ArabicShapingException { 90 if (source == null) { 91 throw new IllegalArgumentException("source can not be null"); 92 } 93 if (sourceStart < 0 || sourceLength < 0 || sourceStart + sourceLength > source.length) { 94 throw new IllegalArgumentException("bad source start (" + sourceStart + 95 ") or length (" + sourceLength + 96 ") for buffer of length " + source.length); 97 } 98 if (dest == null && destSize != 0) { 99 throw new IllegalArgumentException("null dest requires destSize == 0"); 100 } 101 if ((destSize != 0) && 102 (destStart < 0 || destSize < 0 || destStart + destSize > dest.length)) { 103 throw new IllegalArgumentException("bad dest start (" + destStart + 104 ") or size (" + destSize + 105 ") for buffer of length " + dest.length); 106 } 107 /* Validate input options */ 108 if ( ((options&TASHKEEL_MASK) != 0) && 109 !(((options & TASHKEEL_MASK)==TASHKEEL_BEGIN) || 110 ((options & TASHKEEL_MASK)==TASHKEEL_END) || 111 ((options & TASHKEEL_MASK)==TASHKEEL_RESIZE) || 112 ((options & TASHKEEL_MASK)==TASHKEEL_REPLACE_BY_TATWEEL))) { 113 throw new IllegalArgumentException("Wrong Tashkeel 
argument"); 114 } 115 116 ///CLOVER:OFF 117 //According to Steven Loomis, the code is unreachable when you OR all the constants within the if statements 118 if(((options&LAMALEF_MASK) != 0) && 119 !(((options & LAMALEF_MASK)==LAMALEF_BEGIN) || 120 ((options & LAMALEF_MASK)==LAMALEF_END) || 121 ((options & LAMALEF_MASK)==LAMALEF_RESIZE) || 122 ((options & LAMALEF_MASK)==LAMALEF_AUTO) || 123 ((options & LAMALEF_MASK)==LAMALEF_NEAR))) { 124 throw new IllegalArgumentException("Wrong Lam Alef argument"); 125 } 126 ///CLOVER:ON 127 128 /* Validate Tashkeel (Tashkeel replacement options should be enabled in shaping mode only)*/ 129 if(((options&TASHKEEL_MASK) != 0) && (options&LETTERS_MASK) == LETTERS_UNSHAPE) { 130 throw new IllegalArgumentException("Tashkeel replacement should not be enabled in deshaping mode "); 131 } 132 return internalShape(source, sourceStart, sourceLength, dest, destStart, destSize); 133 } 134 135 /** 136 * Convert a range of text in place. This may only be used if the Length option 137 * does not grow or shrink the text. 138 * 139 * @param source An array containing the input text 140 * @param start The start of the range of text to convert 141 * @param length The length of the range of text to convert 142 * @throws ArabicShapingException if the text cannot be converted according to the options. 143 * @stable ICU 2.0 144 */ 145 public void shape(char[] source, int start, int length) throws ArabicShapingException { 146 if ((options & LAMALEF_MASK) == LAMALEF_RESIZE) { 147 throw new ArabicShapingException("Cannot shape in place with length option resize."); 148 } 149 shape(source, start, length, source, start, length); 150 } 151 152 /** 153 * Convert a string, returning the new string. 154 * 155 * @param text the string to convert 156 * @return the converted string 157 * @throws ArabicShapingException if the string cannot be converted according to the options. 
158 * @stable ICU 2.0 159 */ 160 public String shape(String text) throws ArabicShapingException { 161 char[] src = text.toCharArray(); 162 char[] dest = src; 163 if (((options & LAMALEF_MASK) == LAMALEF_RESIZE) && 164 ((options & LETTERS_MASK) == LETTERS_UNSHAPE)) { 165 166 dest = new char[src.length * 2]; // max 167 } 168 int len = shape(src, 0, src.length, dest, 0, dest.length); 169 170 return new String(dest, 0, len); 171 } 172 173 /** 174 * Construct ArabicShaping using the options flags. 175 * The flags are as follows:<br> 176 * 'LENGTH' flags control whether the text can change size, and if not, 177 * how to maintain the size of the text when LamAlef ligatures are 178 * formed or broken.<br> 179 * 'TEXT_DIRECTION' flags control whether the text is read and written 180 * in visual order or in logical order.<br> 181 * 'LETTERS_SHAPE' flags control whether conversion is to or from 182 * presentation forms.<br> 183 * 'DIGITS' flags control whether digits are shaped, and whether from 184 * European to Arabic-Indic or vice-versa.<br> 185 * 'DIGIT_TYPE' flags control whether standard or extended Arabic-Indic 186 * digits are used when performing digit conversion. 187 * @stable ICU 2.0 188 */ 189 public ArabicShaping(int options) { 190 this.options = options; 191 if ((options & DIGITS_MASK) > 0x80) { 192 throw new IllegalArgumentException("bad DIGITS options"); 193 } 194 195 isLogical = ( (options & TEXT_DIRECTION_MASK) == TEXT_DIRECTION_LOGICAL ); 196 /* Validate options */ 197 spacesRelativeToTextBeginEnd = ( (options & SPACES_RELATIVE_TO_TEXT_MASK) == SPACES_RELATIVE_TO_TEXT_BEGIN_END ); 198 if ( (options&SHAPE_TAIL_TYPE_MASK) == SHAPE_TAIL_NEW_UNICODE){ 199 tailChar = NEW_TAIL_CHAR; 200 } else { 201 tailChar = OLD_TAIL_CHAR; 202 } 203 } 204 205 /* Seen Tail options */ 206 /** 207 * Memory option: the result must have the same length as the source. 
* Shaping mode: The SEEN family character will expand into two characters using space near
 *               the SEEN family character (i.e. the space after the character).
 *               If there are no spaces found, ArabicShapingException will be thrown.
 *
 * De-shaping mode: Any Seen character followed by Tail character will be
 *                  replaced by one cell Seen and a space will replace the Tail.
 * Affects: Seen options
 * @stable ICU 4.2
 */
public static final int SEEN_TWOCELL_NEAR = 0x200000;

/** Bit mask for Seen memory options.
 * @stable ICU 4.2
 */
public static final int SEEN_MASK = 0x700000;

/* YehHamza options */
/**
 * Memory option: the result must have the same length as the source.
 * Shaping mode: The YEHHAMZA character will expand into two characters using space near it
 *              (i.e. the space after the character).
 *               If there are no spaces found, ArabicShapingException will be thrown.
 *
 * De-shaping mode: Any Yeh (final or isolated) character followed by Hamza character will be
 *                  replaced by one cell YehHamza and space will replace the Hamza.
 * Affects: YehHamza options
 * @stable ICU 4.2
 */
public static final int YEHHAMZA_TWOCELL_NEAR = 0x1000000;


/** Bit mask for YehHamza memory options.
 * @stable ICU 4.2
 */
public static final int YEHHAMZA_MASK = 0x3800000;

/* New Tashkeel options */
/**
 * Memory option: the result must have the same length as the source.
 * Shaping mode: Tashkeel characters will be replaced by spaces.
 *               Spaces will be placed at beginning of the buffer
 *
 * De-shaping mode: N/A
 * Affects: Tashkeel options
 * @stable ICU 4.2
 */
public static final int TASHKEEL_BEGIN = 0x40000;

/**
 * Memory option: the result must have the same length as the source.
 * Shaping mode: Tashkeel characters will be replaced by spaces.
 *               Spaces will be placed at end of the buffer
 *
 * De-shaping mode: N/A
 * Affects: Tashkeel options
 * @stable ICU 4.2
 */
public static final int TASHKEEL_END = 0x60000;

/**
 * Memory option: allow the result to have a different length than the source.
 * Shaping mode: Tashkeel characters will be removed, buffer length will shrink.
 * De-shaping mode: N/A
 *
 * Affects: Tashkeel options
 * @stable ICU 4.2
 */
public static final int TASHKEEL_RESIZE = 0x80000;

/**
 * Memory option: the result must have the same length as the source.
 * Shaping mode: Tashkeel characters will be replaced by Tatweel if it is connected to adjacent
 *               characters (i.e. shaped on Tatweel) or replaced by space if it is not connected.
 *
 * De-shaping mode: N/A
 * Affects: Tashkeel options
 * @stable ICU 4.2
 */
public static final int TASHKEEL_REPLACE_BY_TATWEEL = 0xC0000;

/** Bit mask for Tashkeel replacement with Space or Tatweel memory options.
 * @stable ICU 4.2
 */
public static final int TASHKEEL_MASK = 0xE0000;

/* Space location Control options */
/**
 * This option affects the meaning of BEGIN and END options. If this option is not used the default
 * for BEGIN and END will be as following:
 * The Default (for both Visual LTR, Visual RTL and Logical Text)
 *    1. BEGIN always refers to the start address of physical memory.
 *    2. END always refers to the end address of physical memory.
 *
 * If this option is used it will swap the meaning of BEGIN and END only for Visual LTR text.
 *
 * The effect on BEGIN and END Memory Options will be as following:
 *    A. BEGIN For Visual LTR text: This will be the beginning (right side) of the visual text
 *       (corresponding to the physical memory address end, same as END in default behavior)
 *    B. BEGIN For Logical text: Same as BEGIN in default behavior.
 *    C. END For Visual LTR text: This will be the end (left side) of the visual text. (corresponding to
 *      the physical memory address beginning, same as BEGIN in default behavior)
 *    D. END For Logical text: Same as END in default behavior.
 * Affects: All LamAlef BEGIN, END and AUTO options.
 * @stable ICU 4.2
 */
public static final int SPACES_RELATIVE_TO_TEXT_BEGIN_END = 0x4000000;

/** Bit mask for swapping BEGIN and END for Visual LTR text
 * @stable ICU 4.2
 */
public static final int SPACES_RELATIVE_TO_TEXT_MASK = 0x4000000;

/**
 * If this option is used, shaping will use the new Unicode code point for TAIL (i.e. 0xFE73).
 * If this option is not specified (Default), old unofficial Unicode TAIL code point is used (i.e. 0x200B)
 * De-shaping will not use this option as it will always search for both the new Unicode code point for the
 * TAIL (i.e. 0xFE73) or the old unofficial Unicode TAIL code point (i.e. 0x200B) and de-shape the
 * Seen-Family letter accordingly.
 *
 * Shaping Mode: Only shaping.
 * De-shaping Mode: N/A.
 * Affects: All Seen options
 * @stable ICU 4.2
 */
public static final int SHAPE_TAIL_NEW_UNICODE = 0x8000000;

/** Bit mask for new Unicode Tail option
 * @stable ICU 4.2
 */
public static final int SHAPE_TAIL_TYPE_MASK = 0x8000000;

/**
 * Memory option: allow the result to have a different length than the source.
 * @stable ICU 2.0
 */
public static final int LENGTH_GROW_SHRINK = 0;

/**
 * Memory option: allow the result to have a different length than the source.
 * Affects: LamAlef options
 * This option is an alias to LENGTH_GROW_SHRINK
 * @stable ICU 4.2
 */
public static final int LAMALEF_RESIZE   = 0;

/**
 * Memory option: the result must have the same length as the source.
 * If more room is necessary, then try to consume spaces next to modified characters.
 * @stable ICU 2.0
 */
public static final int LENGTH_FIXED_SPACES_NEAR = 1;

/**
 * Memory option: the result must have the same length as the source.
 * If more room is necessary, then try to consume spaces next to modified characters.
 * Affects: LamAlef options
 * This option is an alias to LENGTH_FIXED_SPACES_NEAR
 * @stable ICU 4.2
 */
public static final int LAMALEF_NEAR = 1 ;

/**
 * Memory option: the result must have the same length as the source.
 * If more room is necessary, then try to consume spaces at the end of the text.
 * @stable ICU 2.0
 */
public static final int LENGTH_FIXED_SPACES_AT_END = 2;


/**
 * Memory option: the result must have the same length as the source.
 * If more room is necessary, then try to consume spaces at the end of the text.
 * Affects: LamAlef options
 * This option is an alias to LENGTH_FIXED_SPACES_AT_END
 * @stable ICU 4.2
 */
public static final int LAMALEF_END = 2;

/**
 * Memory option: the result must have the same length as the source.
 * If more room is necessary, then try to consume spaces at the beginning of the text.
 * @stable ICU 2.0
 */
public static final int LENGTH_FIXED_SPACES_AT_BEGINNING = 3;

/**
 * Memory option: the result must have the same length as the source.
 * If more room is necessary, then try to consume spaces at the beginning of the text.
 * Affects: LamAlef options
 * This option is an alias to LENGTH_FIXED_SPACES_AT_BEGINNING
 * @stable ICU 4.2
 */
public static final int LAMALEF_BEGIN = 3;

/**
 * Memory option: the result must have the same length as the source.
 * Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at end.
 *               If there is no space at end, use spaces at beginning of the buffer. If there
 *               is no space at beginning of the buffer, use spaces at the near (i.e. the space
 *               after the LAMALEF character).
 *
 * Deshaping Mode: Perform the same function as the flag equals LAMALEF_END.
 * Affects: LamAlef options
 * @stable ICU 4.2
 */
public static final int LAMALEF_AUTO  = 0x10000;

/**
 * Bit mask for memory options.
 * @stable ICU 2.0
 */
public static final int LENGTH_MASK = 0x10003;

/** Bit mask for LamAlef memory options.
 * @stable ICU 4.2
 */

public static final int LAMALEF_MASK  = 0x10003;

/**
 * Direction indicator: the source is in logical (keyboard) order.
 * @stable ICU 2.0
 */
public static final int TEXT_DIRECTION_LOGICAL = 0;

/**
 * Direction indicator: the source is in visual RTL order,
 * the rightmost displayed character stored first.
 * This option is an alias to TEXT_DIRECTION_LOGICAL
 * @stable ICU 4.2
 */
public static final int TEXT_DIRECTION_VISUAL_RTL = 0;

/**
 * Direction indicator: the source is in visual (display) order, that is,
 * the leftmost displayed character is stored first.
 * @stable ICU 2.0
 */
public static final int TEXT_DIRECTION_VISUAL_LTR = 4;

/**
 * Bit mask for direction indicators.
 * @stable ICU 2.0
 */
public static final int TEXT_DIRECTION_MASK = 4;


/**
 * Letter shaping option: do not perform letter shaping.
 * @stable ICU 2.0
 */
public static final int LETTERS_NOOP = 0;

/**
 * Letter shaping option: replace normative letter characters in the U+0600 (Arabic) block,
 * by shaped ones in the U+FE70 (Presentation Forms B) block. Performs Lam-Alef ligature
 * substitution.
 * @stable ICU 2.0
 */
public static final int LETTERS_SHAPE = 8;

/**
 * Letter shaping option: replace shaped letter characters in the U+FE70 (Presentation Forms B) block
 * by normative ones in the U+0600 (Arabic) block.  Converts Lam-Alef ligatures to pairs of Lam and
 * Alef characters, consuming spaces if required.
 * @stable ICU 2.0
 */
public static final int LETTERS_UNSHAPE = 0x10;

/**
 * Letter shaping option: replace normative letter characters in the U+0600 (Arabic) block,
 * except for the TASHKEEL characters at U+064B...U+0652, by shaped ones in the U+FE70
 * (Presentation Forms B) block.  The TASHKEEL characters will always be converted to
 * the isolated forms rather than to their correct shape.
 * @stable ICU 2.0
 */
public static final int LETTERS_SHAPE_TASHKEEL_ISOLATED = 0x18;

/**
 * Bit mask for letter shaping options.
 * @stable ICU 2.0
 */
public static final int LETTERS_MASK = 0x18;


/**
 * Digit shaping option: do not perform digit shaping.
 * @stable ICU 2.0
 */
public static final int DIGITS_NOOP = 0;

/**
 * Digit shaping option: Replace European digits (U+0030...U+0039) by Arabic-Indic digits.
 * @stable ICU 2.0
 */
public static final int DIGITS_EN2AN = 0x20;

/**
 * Digit shaping option: Replace Arabic-Indic digits by European digits (U+0030...U+0039).
 * @stable ICU 2.0
 */
public static final int DIGITS_AN2EN = 0x40;

/**
 * Digit shaping option:
 * Replace European digits (U+0030...U+0039) by Arabic-Indic digits
 * if the most recent strongly directional character
 * is an Arabic letter (its Bidi direction value is RIGHT_TO_LEFT_ARABIC).
 * The initial state at the start of the text is assumed to be not an Arabic
 * letter, so European digits at the start of the text will not change.
 * Compare to DIGITS_EN2AN_INIT_AL.
 * @stable ICU 2.0
 */
public static final int DIGITS_EN2AN_INIT_LR = 0x60;

/**
 * Digit shaping option:
 * Replace European digits (U+0030...U+0039) by Arabic-Indic digits
 * if the most recent strongly directional character
 * is an Arabic letter (its Bidi direction value is RIGHT_TO_LEFT_ARABIC).
 * The initial state at the start of the text is assumed to be an Arabic
 * letter, so European digits at the start of the text will change.
 * Compare to DIGITS_EN2AN_INIT_LR.
 * @stable ICU 2.0
 */
public static final int DIGITS_EN2AN_INIT_AL = 0x80;

/** Not a valid option value. */
//private static final int DIGITS_RESERVED = 0xa0;

/**
 * Bit mask for digit shaping options.
 * @stable ICU 2.0
 */
public static final int DIGITS_MASK = 0xe0;

/**
 * Digit type option: Use Arabic-Indic digits (U+0660...U+0669).
 * @stable ICU 2.0
 */
public static final int DIGIT_TYPE_AN = 0;

/**
 * Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9).
 * @stable ICU 2.0
 */
public static final int DIGIT_TYPE_AN_EXTENDED = 0x100;

/**
 * Bit mask for digit type options.
 * @stable ICU 2.0
 */
public static final int DIGIT_TYPE_MASK = 0x0100; // 0x3f00?
/**
 * some constants
 *
 * Named code points used by the shaping code. Names ending in 06 hold code
 * points in the U+06xx block and names containing FE hold code points in the
 * U+FExx block, as the literal values show.
 */
private static final char HAMZAFE_CHAR       = '\ufe80';
private static final char HAMZA06_CHAR       = '\u0621';
private static final char YEH_HAMZA_CHAR     = '\u0626';
private static final char YEH_HAMZAFE_CHAR   = '\uFE89';
// U+FFFF and U+FFFE; presumably in-buffer placeholders for removed characters
// (inferred from the names; the code that uses them is outside this section).
private static final char LAMALEF_SPACE_SUB  = '\uffff';
private static final char TASHKEEL_SPACE_SUB = '\ufffe';
private static final char LAM_CHAR           = '\u0644';
private static final char SPACE_CHAR         = '\u0020';
private static final char SHADDA_CHAR        = '\uFE7C';
private static final char SHADDA06_CHAR      = '\u0651';
private static final char TATWEEL_CHAR       = '\u0640';
private static final char SHADDA_TATWEEL_CHAR = '\uFE7D';
// The constructor stores one of these two in tailChar, depending on SHAPE_TAIL_NEW_UNICODE.
private static final char NEW_TAIL_CHAR      = '\uFE73';
private static final char OLD_TAIL_CHAR      = '\u200B';
private static final int SHAPE_MODE   = 0;
private static final int DESHAPE_MODE = 1;

/**
 * Compares this object with another for equality. Two objects are equal when
 * the other one is non-null, is exactly of class ArabicShaping, and carries
 * the same options value.
 *
 * @param rhs the object to compare against; may be null
 * @return true if rhs is an ArabicShaping with identical options
 * @stable ICU 2.0
 */
@Override
public boolean equals(Object rhs) {
    return rhs != null &&
        rhs.getClass() == ArabicShaping.class &&
        options == ((ArabicShaping)rhs).options;
}

/**
 * Returns the options value as the hash code, which keeps it consistent
 * with equals.
 *
 * @return the options flags
 * @stable ICU 2.0
 */
///CLOVER:OFF
@Override
public int hashCode() {
    return options;
}

/**
 * Returns the default Object string followed by a bracketed, comma-separated
 * description of the option groups decoded from the options value.
 *
 * @return a human-readable description of this object's options
 * @stable ICU 2.0
 */
@Override
public String toString() {
    StringBuilder buf = new StringBuilder(super.toString());
    buf.append('[');

    switch (options & LAMALEF_MASK) {
    case LAMALEF_RESIZE: buf.append("LamAlef resize"); break;
    case LAMALEF_NEAR: buf.append("LamAlef spaces at near"); break;
    case LAMALEF_BEGIN: buf.append("LamAlef spaces at begin"); break;
    case LAMALEF_END: buf.append("LamAlef spaces at end"); break;
    case LAMALEF_AUTO: buf.append("lamAlef auto"); break;
    }
    // TEXT_DIRECTION_VISUAL_RTL has the same value (0) as TEXT_DIRECTION_LOGICAL,
    // so it is reported as "logical" here.
    switch (options & TEXT_DIRECTION_MASK) {
    case TEXT_DIRECTION_LOGICAL: buf.append(", logical"); break;
    case TEXT_DIRECTION_VISUAL_LTR: buf.append(", visual"); break;
    }
    switch (options & LETTERS_MASK) {
    case LETTERS_NOOP: buf.append(", no letter shaping"); break;
    case LETTERS_SHAPE: buf.append(", shape letters"); break;
    case LETTERS_SHAPE_TASHKEEL_ISOLATED: buf.append(", shape letters tashkeel isolated"); break;
    case LETTERS_UNSHAPE: buf.append(", unshape letters"); break;
    }
    // The Seen, YehHamza and Tashkeel groups print nothing when their bits are 0.
    switch (options & SEEN_MASK) {
    case SEEN_TWOCELL_NEAR: buf.append(", Seen at near"); break;
    }
    switch (options & YEHHAMZA_MASK) {
    case YEHHAMZA_TWOCELL_NEAR: buf.append(", Yeh Hamza at near"); break;
    }
    switch (options & TASHKEEL_MASK) {
    case TASHKEEL_BEGIN: buf.append(", Tashkeel at begin"); break;
    case TASHKEEL_END: buf.append(", Tashkeel at end"); break;
    case TASHKEEL_REPLACE_BY_TATWEEL: buf.append(", Tashkeel replace with tatweel"); break;
    case TASHKEEL_RESIZE: buf.append(", Tashkeel resize"); break;
    }

    switch (options & DIGITS_MASK) {
    case DIGITS_NOOP: buf.append(", no digit shaping"); break;
    case DIGITS_EN2AN: buf.append(", shape digits to AN"); break;
    case DIGITS_AN2EN: buf.append(", shape digits to EN"); break;
    case DIGITS_EN2AN_INIT_LR: buf.append(", shape digits to AN contextually: default EN"); break;
    case DIGITS_EN2AN_INIT_AL: buf.append(", shape digits to AN contextually: default AL"); break;
    }
    switch (options & DIGIT_TYPE_MASK) {
    case DIGIT_TYPE_AN: buf.append(", standard Arabic-Indic digits"); break;
    case DIGIT_TYPE_AN_EXTENDED: buf.append(", extended Arabic-Indic digits"); break;
    }
    buf.append("]");

    return buf.toString();
}
///CLOVER:ON

//
// ported api
//

// Flag values that also occur as literal addends (1, 2, 4, 16, 32) in the
// araLink and presLink tables below; presumably those entries are built from
// these flags (the tables spell the numbers out, so confirm before relying on it).
private static final int IRRELEVANT = 4;
private static final int LAMTYPE    = 16;
private static final int ALEFTYPE   = 32;

private static final int LINKR      = 1;
private static final int LINKL      = 2;
// Equals LINKR | LINKL.
private static final int LINK_MASK  = 3;

// The even values 0x0 through 0xE.
// NOTE(review): usage is outside this section; the name suggests positions
// whose link type is IRRELEVANT - confirm against the code that reads it.
private static final int irrelevantPos[] = {
    0x0, 0x2, 0x4, 0x6, 0x8, 0xA, 0xC, 0xE
};

/*
  private static final char convertLamAlef[] =  {
  '\u0622', // FEF5
  '\u0622', // FEF6
  '\u0623', // FEF7
  '\u0623', // FEF8
  '\u0625', // FEF9
  '\u0625', // FEFA
  '\u0627', // FEFB
  '\u0627'  // FEFC
  };
*/

// One 0/1 entry per code point FEB1..FEBE, as labelled on each row.
// Presumably 1 marks the isolated/final forms (inferred from the name).
private static final int tailFamilyIsolatedFinal[] = {
    /* FEB1 */ 1,
    /* FEB2 */ 1,
    /* FEB3 */ 0,
    /* FEB4 */ 0,
    /* FEB5 */ 1,
    /* FEB6 */ 1,
    /* FEB7 */ 0,
    /* FEB8 */ 0,
    /* FEB9 */ 1,
    /* FEBA */ 1,
    /* FEBB */ 0,
    /* FEBC */ 0,
    /* FEBD */ 1,
    /* FEBE */ 1
};

// One 0/1 entry per code point FE70..FE7F, as labelled on each row.
// Presumably 1 marks the medial tashkeel forms (inferred from the name).
private static final int tashkeelMedial[] = {
    /* FE70 */ 0,
    /* FE71 */ 1,
    /* FE72 */ 0,
    /* FE73 */ 0,
    /* FE74 */ 0,
    /* FE75 */ 0,
    /* FE76 */ 0,
    /* FE77 */ 1,
    /* FE78 */ 0,
    /* FE79 */ 1,
    /* FE7A */ 0,
    /* FE7B */ 1,
    /* FE7C */ 0,
    /* FE7D */ 1,
    /* FE7E */ 0,
    /* FE7F */ 1
};

// Two replacement code points: index 0 is labelled isolated, index 1 final.
private static final char yehHamzaToYeh[] =
{
/* isolated*/ 0xFEEF,
/* final   */ 0xFEF0
};

// Four Alef variants, labelled with the code points 065C..065F.
private static final char convertNormalizedLamAlef[] = {
    '\u0622', // 065C
    '\u0623', // 065D
    '\u0625', // 065E
    '\u0627', // 065F
};

// One entry per code point, starting at 0x0622 (see the label on each row).
// Each entry is a sum of small values (1, 2, 4, 16, 32) and, for many letters,
// a term of the form 256 * n.
// NOTE(review): presumably the small values are the LINKR/LINKL/IRRELEVANT/
// LAMTYPE/ALEFTYPE flags and n is an offset into the presentation forms -
// confirm against the code that decodes this table.
private static final int[] araLink = {
    1           + 32 + 256 * 0x11,  /*0x0622*/
    1           + 32 + 256 * 0x13,  /*0x0623*/
    1                + 256 * 0x15,  /*0x0624*/
    1           + 32 + 256 * 0x17,  /*0x0625*/
    1 + 2            + 256 * 0x19,  /*0x0626*/
    1           + 32 + 256 * 0x1D,  /*0x0627*/
    1 + 2            + 256 * 0x1F,  /*0x0628*/
    1                + 256 * 0x23,  /*0x0629*/
    1 + 2            + 256 * 0x25,  /*0x062A*/
    1 + 2            + 256 * 0x29,  /*0x062B*/
    1 + 2            + 256 * 0x2D,  /*0x062C*/
    1 + 2            + 256 * 0x31,  /*0x062D*/
    1 + 2            + 256 * 0x35,  /*0x062E*/
    1                + 256 * 0x39,  /*0x062F*/
    1                + 256 * 0x3B,  /*0x0630*/
    1                + 256 * 0x3D,  /*0x0631*/
    1                + 256 * 0x3F,  /*0x0632*/
    1 + 2            + 256 * 0x41,  /*0x0633*/
    1 + 2            + 256 * 0x45,  /*0x0634*/
    1 + 2            + 256 * 0x49,  /*0x0635*/
    1 + 2            + 256 * 0x4D,  /*0x0636*/
    1 + 2            + 256 * 0x51,  /*0x0637*/
    1 + 2            + 256 * 0x55,  /*0x0638*/
    1 + 2            + 256 * 0x59,  /*0x0639*/
    1 + 2            + 256 * 0x5D,  /*0x063A*/
    0, 0, 0, 0, 0,                  /*0x063B-0x063F*/
    1 + 2,                          /*0x0640*/
    1 + 2            + 256 * 0x61,  /*0x0641*/
    1 + 2            + 256 * 0x65,  /*0x0642*/
    1 + 2            + 256 * 0x69,  /*0x0643*/
    1 + 2       + 16 + 256 * 0x6D,  /*0x0644*/
    1 + 2            + 256 * 0x71,  /*0x0645*/
    1 + 2            + 256 * 0x75,  /*0x0646*/
    1 + 2            + 256 * 0x79,  /*0x0647*/
    1                + 256 * 0x7D,  /*0x0648*/
    1                + 256 * 0x7F,  /*0x0649*/
    1 + 2            + 256 * 0x81,  /*0x064A*/
    4, 4, 4, 4,                     /*0x064B-0x064E*/
    4, 4, 4, 4,                     /*0x064F-0x0652*/
    4, 4, 4, 0, 0,                  /*0x0653-0x0657*/
    0, 0, 0, 0,                     /*0x0658-0x065B*/
    1                + 256 * 0x85,  /*0x065C*/
    1                + 256 * 0x87,  /*0x065D*/
    1                + 256 * 0x89,  /*0x065E*/
    1                + 256 * 0x8B,  /*0x065F*/
    0, 0, 0, 0, 0,                  /*0x0660-0x0664*/
    0, 0, 0, 0, 0,                  /*0x0665-0x0669*/
    0, 0, 0, 0, 0, 0,               /*0x066A-0x066F*/
    4,                              /*0x0670*/
    0,                              /*0x0671*/
    1           + 32,               /*0x0672*/
    1           + 32,               /*0x0673*/
    0,                              /*0x0674*/
    1           + 32,               /*0x0675*/
    1, 1,                           /*0x0676-0x0677*/
    1+2, 1+2, 1+2, 1+2, 1+2, 1+2,   /*0x0678-0x067D*/
    1+2, 1+2, 1+2, 1+2, 1+2, 1+2,   /*0x067E-0x0683*/
    1+2, 1+2, 1+2, 1+2,             /*0x0684-0x0687*/
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /*0x0688-0x0691*/
    1, 1, 1, 1, 1, 1, 1, 1,         /*0x0692-0x0699*/
    1+2, 1+2, 1+2, 1+2, 1+2, 1+2,   /*0x069A-0x06A3*/
    1+2, 1+2, 1+2, 1+2,             /*0x069A-0x06A3*/
    1+2, 1+2, 1+2, 1+2, 1+2, 1+2,   /*0x06A4-0x06AD*/
    1+2, 1+2, 1+2, 1+2,             /*0x06A4-0x06AD*/
    1+2, 1+2, 1+2, 1+2, 1+2, 1+2,   /*0x06AE-0x06B7*/
    1+2, 1+2, 1+2, 1+2,             /*0x06AE-0x06B7*/
    1+2, 1+2, 1+2, 1+2, 1+2, 1+2,   /*0x06B8-0x06BF*/
    1+2, 1+2,                       /*0x06B8-0x06BF*/
    1,                              /*0x06C0*/
    1+2,                            /*0x06C1*/
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /*0x06C2-0x06CB*/
    1+2,                            /*0x06CC*/
    1,                              /*0x06CD*/
    1+2, 1+2, 1+2, 1+2,             /*0x06CE-0x06D1*/
    1, 1                            /*0x06D2-0x06D3*/
};

// Link values for the presentation forms, starting at 0xFE70.
// NOTE(review): the range labels on the rows do not line up with the number of
// entries per row (for example the row labelled 0xFE78-0xFE81 holds four
// entries although that range spans ten code points), so treat the labels as
// approximate and count entries from 0xFE70 when an exact index matters.
private static final int[] presLink = {
    1 + 2,                        /*0xFE70*/
    1 + 2,                        /*0xFE71*/
    1 + 2, 0, 1+ 2, 0, 1+ 2,      /*0xFE72-0xFE76*/
    1 + 2,                        /*0xFE77*/
    1+ 2, 1 + 2, 1+2, 1 + 2,      /*0xFE78-0xFE81*/
    1+ 2, 1 + 2, 1+2, 1 + 2,      /*0xFE82-0xFE85*/
    0, 0 + 32, 1 + 32, 0 + 32,    /*0xFE86-0xFE89*/
    1 + 32, 0, 1,  0 + 32,        /*0xFE8A-0xFE8D*/
    1 + 32, 0, 2,  1 + 2,         /*0xFE8E-0xFE91*/
    1, 0 + 32, 1 + 32, 0,         /*0xFE92-0xFE95*/
    2, 1 + 2, 1, 0,               /*0xFE96-0xFE99*/
    1, 0, 2, 1 + 2,               /*0xFE9A-0xFE9D*/
    1, 0, 2, 1 + 2,               /*0xFE9E-0xFEA1*/
    1, 0, 2, 1 + 2,               /*0xFEA2-0xFEA5*/
    1, 0, 2, 1 + 2,               /*0xFEA6-0xFEA9*/
    1, 0, 2, 1 + 2,               /*0xFEAA-0xFEAD*/
    1, 0, 1, 0,                   /*0xFEAE-0xFEB1*/
    1, 0, 1, 0,                   /*0xFEB2-0xFEB5*/
    1, 0, 2, 1+2,                 /*0xFEB6-0xFEB9*/
    1, 0, 2, 1+2,                 /*0xFEBA-0xFEBD*/
    1, 0, 2, 1+2,                 /*0xFEBE-0xFEC1*/
    1, 0, 2, 1+2,                 /*0xFEC2-0xFEC5*/
    1, 0, 2, 1+2,                 /*0xFEC6-0xFEC9*/
    1, 0, 2, 1+2,                 /*0xFECA-0xFECD*/
    1, 0, 2, 1+2,                 /*0xFECE-0xFED1*/
    1, 0, 2, 1+2,                 /*0xFED2-0xFED5*/
    1, 0, 2, 1+2,                 /*0xFED6-0xFED9*/
    1, 0, 2, 1+2,                 /*0xFEDA-0xFEDD*/
    1, 0, 2, 1+2,                 /*0xFEDE-0xFEE1*/
    1, 0 + 16, 2 + 16, 1 + 2 +16, /*0xFEE2-0xFEE5*/
    1 + 16, 0, 2, 1+2,            /*0xFEE6-0xFEE9*/
    1, 0, 2, 1+2,                 /*0xFEEA-0xFEED*/
    1, 0, 2, 1+2,                 /*0xFEEE-0xFEF1*/
    1, 0, 1, 0,                   /*0xFEF2-0xFEF5*/
    1, 0, 2, 1+2,                 /*0xFEF6-0xFEF9*/
    1, 0, 1, 0,                   /*0xFEFA-0xFEFD*/
    1, 0, 1, 0,
    1
};

// Maps presentation-form code points to U+06xx code points. Rows are labelled
// by the upper three hex digits (FE7..FEF) and columns by the last hex digit,
// so the entry for code point c sits at index c - 0xFE70. The last row holds
// 13 entries (FEF0..FEFC).
// NOTE(review): this array is not declared final although nothing in this
// section reassigns it - confirm before tightening.
private static int[] convertFEto06 = {
    /***********0******1******2******3******4******5******6******7******8******9******A******B******C******D******E******F***/
    /*FE7*/   0x64B, 0x64B, 0x64C, 0x64C, 0x64D, 0x64D, 0x64E, 0x64E, 0x64F, 0x64F, 0x650, 0x650, 0x651, 0x651, 0x652, 0x652,
    /*FE8*/   0x621, 0x622, 0x622, 0x623, 0x623, 0x624, 0x624, 0x625, 0x625, 0x626, 0x626, 0x626, 0x626, 0x627, 0x627, 0x628,
    /*FE9*/   0x628, 0x628, 0x628, 0x629, 0x629, 0x62A, 0x62A, 0x62A, 0x62A, 0x62B, 0x62B, 0x62B, 0x62B, 0x62C, 0x62C, 0x62C,
    /*FEA*/   0x62C, 0x62D, 0x62D, 0x62D, 0x62D, 0x62E, 0x62E, 0x62E, 0x62E, 0x62F, 0x62F, 0x630, 0x630, 0x631, 0x631, 0x632,
    /*FEB*/   0x632, 0x633, 0x633, 0x633, 0x633, 0x634, 0x634, 0x634, 0x634, 0x635, 0x635, 0x635, 0x635, 0x636, 0x636, 0x636,
    /*FEC*/   0x636, 0x637, 0x637, 0x637, 0x637, 0x638, 0x638, 0x638, 0x638, 0x639, 0x639, 0x639, 0x639, 0x63A, 0x63A, 0x63A,
    /*FED*/   0x63A, 0x641, 0x641, 0x641, 0x641, 0x642, 0x642, 0x642, 0x642, 0x643, 0x643, 0x643, 0x643, 0x644, 0x644, 0x644,
    /*FEE*/   0x644, 0x645, 0x645, 0x645, 0x645, 0x646, 0x646, 0x646, 0x646, 0x647, 0x647, 0x647, 0x647, 0x648, 0x648, 0x649,
    /*FEF*/   0x649, 0x64A, 0x64A, 0x64A, 0x64A, 0x65C, 0x65C, 0x65D, 0x65D, 0x65E, 0x65E, 0x65F, 0x65F
};

// A 4 x 4 x 4 table of small form indices (0..3).
// NOTE(review): the meaning of the three index dimensions is established by the
// shaping code outside this section - confirm there before documenting further.
private static final int shapeTable[][][] = {
    { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,1} },
    { {0,0,2,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} },
    { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,3} },
    { {0,0,1,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} }
};

/*
 * This function shapes European digits to Arabic-Indic digits
 * in-place, writing over the input characters.  Data is in visual
 * order.
877 */ 878 private void shapeToArabicDigitsWithContext(char[] dest, 879 int start, 880 int length, 881 char digitBase, 882 boolean lastStrongWasAL) { 883 UBiDiProps bdp=UBiDiProps.INSTANCE; 884 digitBase -= '0'; // move common adjustment out of loop 885 886 for(int i = start + length; --i >= start;) { 887 char ch = dest[i]; 888 switch (bdp.getClass(ch)) { 889 case UCharacterDirection.LEFT_TO_RIGHT: 890 case UCharacterDirection.RIGHT_TO_LEFT: 891 lastStrongWasAL = false; 892 break; 893 case UCharacterDirection.RIGHT_TO_LEFT_ARABIC: 894 lastStrongWasAL = true; 895 break; 896 case UCharacterDirection.EUROPEAN_NUMBER: 897 if (lastStrongWasAL && ch <= '\u0039') { 898 dest[i] = (char)(ch + digitBase); 899 } 900 break; 901 default: 902 break; 903 } 904 } 905 } 906 907 /* 908 * Name : invertBuffer 909 * Function: This function inverts the buffer, it's used 910 * in case the user specifies the buffer to be 911 * TEXT_DIRECTION_LOGICAL 912 */ 913 private static void invertBuffer(char[] buffer, 914 int start, 915 int length) { 916 917 for(int i = start, j = start + length - 1; i < j; i++, --j) { 918 char temp = buffer[i]; 919 buffer[i] = buffer[j]; 920 buffer[j] = temp; 921 } 922 } 923 924 /* 925 * Name : changeLamAlef 926 * Function: Converts the Alef characters into an equivalent 927 * LamAlef location in the 0x06xx Range, this is an 928 * intermediate stage in the operation of the program 929 * later it'll be converted into the 0xFExx LamAlefs 930 * in the shaping function. 
931 */ 932 private static char changeLamAlef(char ch) { 933 switch(ch) { 934 case '\u0622': return '\u065C'; 935 case '\u0623': return '\u065D'; 936 case '\u0625': return '\u065E'; 937 case '\u0627': return '\u065F'; 938 default: return '\u0000'; // not a lamalef 939 } 940 } 941 942 /* 943 * Name : specialChar 944 * Function: Special Arabic characters need special handling in the shapeUnicode 945 * function, this function returns 1 or 2 for these special characters 946 */ 947 private static int specialChar(char ch) { 948 if ((ch > '\u0621' && ch < '\u0626') || 949 (ch == '\u0627') || 950 (ch > '\u062E' && ch < '\u0633') || 951 (ch > '\u0647' && ch < '\u064A') || 952 (ch == '\u0629')) { 953 return 1; 954 } else if (ch >= '\u064B' && ch<= '\u0652') { 955 return 2; 956 } else if (ch >= 0x0653 && ch <= 0x0655 || 957 ch == 0x0670 || 958 ch >= 0xFE70 && ch <= 0xFE7F) { 959 return 3; 960 } else { 961 return 0; 962 } 963 } 964 965 /* 966 * Name : getLink 967 * Function: Resolves the link between the characters as 968 * Arabic characters have four forms : 969 * Isolated, Initial, Middle and Final Form 970 */ 971 private static int getLink(char ch) { 972 if (ch >= '\u0622' && ch <= '\u06D3') { 973 return araLink[ch - '\u0622']; 974 } else if (ch == '\u200D') { 975 return 3; 976 } else if (ch >= '\u206D' && ch <= '\u206F') { 977 return 4; 978 } else if (ch >= '\uFE70' && ch <= '\uFEFC') { 979 return presLink[ch - '\uFE70']; 980 } else { 981 return 0; 982 } 983 } 984 985 /* 986 * Name : countSpaces 987 * Function: Counts the number of spaces 988 * at each end of the logical buffer 989 */ 990 private static int countSpacesLeft(char[] dest, 991 int start, 992 int count) { 993 for (int i = start, e = start + count; i < e; ++i) { 994 if (dest[i] != SPACE_CHAR) { 995 return i - start; 996 } 997 } 998 return count; 999 } 1000 1001 private static int countSpacesRight(char[] dest, 1002 int start, 1003 int count) { 1004 1005 for (int i = start + count; --i >= start;) { 1006 if (dest[i] 
!= SPACE_CHAR) { 1007 return start + count - 1 - i; 1008 } 1009 } 1010 return count; 1011 } 1012 1013 /* 1014 * Name : isTashkeelChar 1015 * Function: Returns true for Tashkeel characters else return false 1016 */ 1017 private static boolean isTashkeelChar(char ch) { 1018 return ( ch >='\u064B' && ch <= '\u0652' ); 1019 } 1020 1021 /* 1022 *Name : isSeenTailFamilyChar 1023 *Function : returns 1 if the character is a seen family isolated character 1024 * in the FE range otherwise returns 0 1025 */ 1026 1027 private static int isSeenTailFamilyChar(char ch) { 1028 if (ch >= 0xfeb1 && ch < 0xfebf){ 1029 return tailFamilyIsolatedFinal [ch - 0xFEB1]; 1030 } else { 1031 return 0; 1032 } 1033 } 1034 1035 /* Name : isSeenFamilyChar 1036 * Function : returns 1 if the character is a seen family character in the Unicode 1037 * 06 range otherwise returns 0 1038 */ 1039 1040 private static int isSeenFamilyChar(char ch){ 1041 if (ch >= 0x633 && ch <= 0x636){ 1042 return 1; 1043 }else { 1044 return 0; 1045 } 1046 } 1047 1048 /* 1049 *Name : isTailChar 1050 *Function : returns true if the character matches one of the tail characters 1051 * (0xfe73 or 0x200b) otherwise returns false 1052 */ 1053 1054 private static boolean isTailChar(char ch) { 1055 if(ch == OLD_TAIL_CHAR || ch == NEW_TAIL_CHAR){ 1056 return true; 1057 }else{ 1058 return false; 1059 } 1060 } 1061 1062 /* 1063 *Name : isAlefMaksouraChar 1064 *Function : returns true if the character is a Alef Maksoura Final or isolated 1065 * otherwise returns false 1066 */ 1067 private static boolean isAlefMaksouraChar(char ch) { 1068 return ( (ch == 0xFEEF) || ( ch == 0xFEF0) || (ch == 0x0649)); 1069 } 1070 1071 /* 1072 * Name : isYehHamzaChar 1073 * Function : returns true if the character is a yehHamza isolated or yehhamza 1074 * final is found otherwise returns false 1075 */ 1076 private static boolean isYehHamzaChar(char ch) { 1077 if((ch==0xFE89)||(ch==0xFE8A)){ 1078 return true; 1079 }else{ 1080 return false; 1081 } 1082 } 
1083 1084 /* 1085 *Name : isTashkeelCharFE 1086 *Function : Returns true for Tashkeel characters in FE range else return false 1087 */ 1088 1089 private static boolean isTashkeelCharFE(char ch) { 1090 return ( ch!=0xFE75 &&(ch>=0xFE70 && ch<= 0xFE7F) ); 1091 } 1092 1093 /* 1094 * Name: isTashkeelOnTatweelChar 1095 * Function: Checks if the Tashkeel Character is on Tatweel or not,if the 1096 * Tashkeel on tatweel (FE range), it returns 1 else if the 1097 * Tashkeel with shadda on tatweel (FC range)return 2 otherwise 1098 * returns 0 1099 */ 1100 private static int isTashkeelOnTatweelChar(char ch){ 1101 if (ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75 && ch != SHADDA_TATWEEL_CHAR) 1102 { 1103 return tashkeelMedial [ch - 0xFE70]; 1104 } else if( (ch >= 0xfcf2 && ch <= 0xfcf4) || (ch == SHADDA_TATWEEL_CHAR)) { 1105 return 2; 1106 } else { 1107 return 0; 1108 } 1109 } 1110 1111 /* 1112 * Name: isIsolatedTashkeelChar 1113 * Function: Checks if the Tashkeel Character is in the isolated form 1114 * (i.e. Unicode FE range) returns 1 else if the Tashkeel 1115 * with shadda is in the isolated form (i.e. 
Unicode FC range) 1116 * returns 1 otherwise returns 0 1117 */ 1118 private static int isIsolatedTashkeelChar(char ch){ 1119 if (ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75){ 1120 return (1 - tashkeelMedial [ch - 0xFE70]); 1121 } else if(ch >= 0xfc5e && ch <= 0xfc63){ 1122 return 1; 1123 } else{ 1124 return 0; 1125 } 1126 } 1127 1128 /* 1129 * Name : isAlefChar 1130 * Function: Returns 1 for Alef characters else return 0 1131 */ 1132 private static boolean isAlefChar(char ch) { 1133 return ch == '\u0622' || ch == '\u0623' || ch == '\u0625' || ch == '\u0627'; 1134 } 1135 1136 /* 1137 * Name : isLamAlefChar 1138 * Function: Returns true for LamAlef characters else return false 1139 */ 1140 private static boolean isLamAlefChar(char ch) { 1141 return ch >= '\uFEF5' && ch <= '\uFEFC'; 1142 } 1143 1144 private static boolean isNormalizedLamAlefChar(char ch) { 1145 return ch >= '\u065C' && ch <= '\u065F'; 1146 } 1147 1148 /* 1149 * Name : calculateSize 1150 * Function: This function calculates the destSize to be used in preflighting 1151 * when the destSize is equal to 0 1152 */ 1153 private int calculateSize(char[] source, 1154 int sourceStart, 1155 int sourceLength) { 1156 1157 int destSize = sourceLength; 1158 1159 switch (options & LETTERS_MASK) { 1160 case LETTERS_SHAPE: 1161 case LETTERS_SHAPE_TASHKEEL_ISOLATED: 1162 if (isLogical) { 1163 for (int i = sourceStart, e = sourceStart + sourceLength - 1; i < e; ++i) { 1164 if ((source[i] == LAM_CHAR && isAlefChar(source[i+1])) || isTashkeelCharFE(source[i])){ 1165 --destSize; 1166 } 1167 } 1168 } else { // visual 1169 for(int i = sourceStart + 1, e = sourceStart + sourceLength; i < e; ++i) { 1170 if ((source[i] == LAM_CHAR && isAlefChar(source[i-1])) || isTashkeelCharFE(source[i])) { 1171 --destSize; 1172 } 1173 } 1174 } 1175 break; 1176 1177 case LETTERS_UNSHAPE: 1178 for(int i = sourceStart, e = sourceStart + sourceLength; i < e; ++i) { 1179 if (isLamAlefChar(source[i])) { 1180 destSize++; 1181 
} 1182 } 1183 break; 1184 1185 default: 1186 break; 1187 } 1188 1189 return destSize; 1190 } 1191 1192 1193 /* 1194 * Name : countSpaceSub 1195 * Function: Counts number of times the subChar appears in the array 1196 */ 1197 private static int countSpaceSub(char [] dest,int length, char subChar){ 1198 int i = 0; 1199 int count = 0; 1200 while (i < length) { 1201 if (dest[i] == subChar) { 1202 count++; 1203 } 1204 i++; 1205 } 1206 return count; 1207 } 1208 1209 /* 1210 * Name : shiftArray 1211 * Function: Shifts characters to replace space sub characters 1212 */ 1213 private static void shiftArray(char [] dest,int start, int e, char subChar){ 1214 int w = e; 1215 int r = e; 1216 while (--r >= start) { 1217 char ch = dest[r]; 1218 if (ch != subChar) { 1219 --w; 1220 if (w != r) { 1221 dest[w] = ch; 1222 } 1223 } 1224 } 1225 } 1226 1227 /* 1228 * Name : flipArray 1229 * Function: inverts array, so that start becomes end and vice versa 1230 */ 1231 private static int flipArray(char [] dest, int start, int e, int w){ 1232 int r; 1233 if (w > start) { 1234 // shift, assume small buffer size so don't use arraycopy 1235 r = w; 1236 w = start; 1237 while (r < e) { 1238 dest[w++] = dest[r++]; 1239 } 1240 } else { 1241 w = e; 1242 } 1243 return w; 1244 } 1245 1246 /* 1247 * Name : handleTashkeelWithTatweel 1248 * Function : Replaces Tashkeel as following: 1249 * Case 1 :if the Tashkeel on tatweel, replace it with Tatweel. 1250 * Case 2 :if the Tashkeel aggregated with Shadda on Tatweel, replace 1251 * it with Shadda on Tatweel. 1252 * Case 3: if the Tashkeel is isolated replace it with Space. 
1253 * 1254 */ 1255 private static int handleTashkeelWithTatweel(char[] dest, int sourceLength) { 1256 int i; 1257 for(i = 0; i < sourceLength; i++){ 1258 if((isTashkeelOnTatweelChar(dest[i]) == 1)){ 1259 dest[i] = TATWEEL_CHAR; 1260 }else if((isTashkeelOnTatweelChar(dest[i]) == 2)){ 1261 dest[i] = SHADDA_TATWEEL_CHAR; 1262 }else if((isIsolatedTashkeelChar(dest[i])==1) && dest[i] != SHADDA_CHAR){ 1263 dest[i] = SPACE_CHAR; 1264 } 1265 } 1266 return sourceLength; 1267 } 1268 1269 /* 1270 *Name : handleGeneratedSpaces 1271 *Function : The shapeUnicode function converts Lam + Alef into LamAlef + space, 1272 * and Tashkeel to space. 1273 * handleGeneratedSpaces function puts these generated spaces 1274 * according to the options the user specifies. LamAlef and Tashkeel 1275 * spaces can be replaced at begin, at end, at near or decrease the 1276 * buffer size. 1277 * 1278 * There is also Auto option for LamAlef and tashkeel, which will put 1279 * the spaces at end of the buffer (or end of text if the user used 1280 * the option SPACES_RELATIVE_TO_TEXT_BEGIN_END). 1281 * 1282 * If the text type was visual_LTR and the option 1283 * SPACES_RELATIVE_TO_TEXT_BEGIN_END was selected the END 1284 * option will place the space at the beginning of the buffer and 1285 * BEGIN will place the space at the end of the buffer. 
1286 */ 1287 private int handleGeneratedSpaces(char[] dest, 1288 int start, 1289 int length) { 1290 1291 int lenOptionsLamAlef = options & LAMALEF_MASK; 1292 int lenOptionsTashkeel = options & TASHKEEL_MASK; 1293 boolean lamAlefOn = false; 1294 boolean tashkeelOn = false; 1295 1296 if (!isLogical & !spacesRelativeToTextBeginEnd) { 1297 switch (lenOptionsLamAlef) { 1298 case LAMALEF_BEGIN: lenOptionsLamAlef = LAMALEF_END; break; 1299 case LAMALEF_END: lenOptionsLamAlef = LAMALEF_BEGIN; break; 1300 default: break; 1301 } 1302 switch (lenOptionsTashkeel){ 1303 case TASHKEEL_BEGIN: lenOptionsTashkeel = TASHKEEL_END; break; 1304 case TASHKEEL_END: lenOptionsTashkeel = TASHKEEL_BEGIN; break; 1305 default: break; 1306 } 1307 } 1308 1309 1310 if (lenOptionsLamAlef == LAMALEF_NEAR) { 1311 for (int i = start, e = i + length; i < e; ++i) { 1312 if (dest[i] == LAMALEF_SPACE_SUB) { 1313 dest[i] = SPACE_CHAR; 1314 } 1315 } 1316 1317 } else { 1318 1319 final int e = start + length; 1320 int wL = countSpaceSub(dest, length, LAMALEF_SPACE_SUB); 1321 int wT = countSpaceSub(dest, length, TASHKEEL_SPACE_SUB); 1322 1323 if (lenOptionsLamAlef == LAMALEF_END){ 1324 lamAlefOn = true; 1325 } 1326 if (lenOptionsTashkeel == TASHKEEL_END){ 1327 tashkeelOn = true; 1328 } 1329 1330 1331 if (lamAlefOn && (lenOptionsLamAlef == LAMALEF_END)) { 1332 shiftArray(dest, start, e, LAMALEF_SPACE_SUB); 1333 while (wL > start) { 1334 dest[--wL] = SPACE_CHAR; 1335 } 1336 } 1337 1338 if (tashkeelOn && (lenOptionsTashkeel == TASHKEEL_END)){ 1339 shiftArray(dest, start, e, TASHKEEL_SPACE_SUB); 1340 while (wT > start) { 1341 dest[--wT] = SPACE_CHAR; 1342 } 1343 } 1344 1345 lamAlefOn = false; 1346 tashkeelOn = false; 1347 1348 if (lenOptionsLamAlef == LAMALEF_RESIZE){ 1349 lamAlefOn = true; 1350 } 1351 if (lenOptionsTashkeel == TASHKEEL_RESIZE){ 1352 tashkeelOn = true; 1353 } 1354 1355 if (lamAlefOn && (lenOptionsLamAlef == LAMALEF_RESIZE)){ 1356 shiftArray(dest, start, e, LAMALEF_SPACE_SUB); 1357 wL = 
flipArray(dest,start,e, wL); 1358 length = wL - start; 1359 } 1360 if (tashkeelOn && (lenOptionsTashkeel == TASHKEEL_RESIZE)) { 1361 shiftArray(dest, start, e, TASHKEEL_SPACE_SUB); 1362 wT = flipArray(dest,start,e, wT); 1363 length = wT - start; 1364 } 1365 1366 lamAlefOn = false; 1367 tashkeelOn = false; 1368 1369 if ((lenOptionsLamAlef == LAMALEF_BEGIN) || 1370 (lenOptionsLamAlef == LAMALEF_AUTO)){ 1371 lamAlefOn = true; 1372 } 1373 if (lenOptionsTashkeel == TASHKEEL_BEGIN){ 1374 tashkeelOn = true; 1375 } 1376 1377 if (lamAlefOn && ((lenOptionsLamAlef == LAMALEF_BEGIN)|| 1378 (lenOptionsLamAlef == LAMALEF_AUTO))) { // spaces at beginning 1379 shiftArray(dest, start, e, LAMALEF_SPACE_SUB); 1380 wL = flipArray(dest,start,e, wL); 1381 while (wL < e) { 1382 dest[wL++] = SPACE_CHAR; 1383 } 1384 } 1385 if(tashkeelOn && (lenOptionsTashkeel == TASHKEEL_BEGIN)){ 1386 shiftArray(dest, start, e, TASHKEEL_SPACE_SUB); 1387 wT = flipArray(dest,start,e, wT); 1388 while (wT < e) { 1389 dest[wT++] = SPACE_CHAR; 1390 } 1391 } 1392 } 1393 1394 return length; 1395 } 1396 1397 1398 /* 1399 *Name :expandCompositCharAtBegin 1400 *Function :Expands the LamAlef character to Lam and Alef consuming the required 1401 * space from beginning of the buffer. If the text type was visual_LTR 1402 * and the option SPACES_RELATIVE_TO_TEXT_BEGIN_END was selected 1403 * the spaces will be located at end of buffer. 1404 * If there are no spaces to expand the LamAlef, an exception is thrown. 
1405 */ 1406 private boolean expandCompositCharAtBegin(char[] dest,int start, int length, 1407 int lacount) { 1408 boolean spaceNotFound = false; 1409 1410 if (lacount > countSpacesRight(dest, start, length)) { 1411 spaceNotFound = true; 1412 return spaceNotFound; 1413 } 1414 for (int r = start + length - lacount, w = start + length; --r >= start;) { 1415 char ch = dest[r]; 1416 if (isNormalizedLamAlefChar(ch)) { 1417 dest[--w] = LAM_CHAR; 1418 dest[--w] = convertNormalizedLamAlef[ch - '\u065C']; 1419 } else { 1420 dest[--w] = ch; 1421 } 1422 } 1423 return spaceNotFound; 1424 1425 } 1426 1427 /* 1428 *Name : expandCompositCharAtEnd 1429 *Function : Expands the LamAlef character to Lam and Alef consuming the 1430 * required space from end of the buffer. If the text type was 1431 * Visual LTR and the option SPACES_RELATIVE_TO_TEXT_BEGIN_END 1432 * was used, the spaces will be consumed from begin of buffer. If 1433 * there are no spaces to expand the LamAlef, an exception is thrown. 1434 */ 1435 1436 private boolean expandCompositCharAtEnd(char[] dest,int start, int length, 1437 int lacount){ 1438 boolean spaceNotFound = false; 1439 1440 if (lacount > countSpacesLeft(dest, start, length)) { 1441 spaceNotFound = true; 1442 return spaceNotFound; 1443 } 1444 for (int r = start + lacount, w = start, e = start + length; r < e; ++r) { 1445 char ch = dest[r]; 1446 if (isNormalizedLamAlefChar(ch)) { 1447 dest[w++] = convertNormalizedLamAlef[ch - '\u065C']; 1448 dest[w++] = LAM_CHAR; 1449 } else { 1450 dest[w++] = ch; 1451 } 1452 } 1453 return spaceNotFound; 1454 } 1455 1456 /* 1457 *Name : expandCompositCharAtNear 1458 *Function : Expands the LamAlef character into Lam + Alef, YehHamza character 1459 * into Yeh + Hamza, SeenFamily character into SeenFamily character 1460 * + Tail, while consuming the space next to the character. 
1461 */ 1462 1463 private boolean expandCompositCharAtNear(char[] dest,int start, int length, 1464 int yehHamzaOption, int seenTailOption, int lamAlefOption){ 1465 1466 boolean spaceNotFound = false; 1467 1468 1469 1470 if (isNormalizedLamAlefChar(dest[start])) { 1471 spaceNotFound = true; 1472 return spaceNotFound; 1473 } 1474 for (int i = start + length; --i >=start;) { 1475 char ch = dest[i]; 1476 if (lamAlefOption == 1 && isNormalizedLamAlefChar(ch)) { 1477 if (i>start &&dest[i-1] == SPACE_CHAR) { 1478 dest[i] = LAM_CHAR; 1479 dest[--i] = convertNormalizedLamAlef[ch - '\u065C']; 1480 } else { 1481 spaceNotFound = true; 1482 return spaceNotFound; 1483 } 1484 }else if(seenTailOption == 1 && isSeenTailFamilyChar(ch) == 1){ 1485 if(i>start &&dest[i-1] == SPACE_CHAR){ 1486 dest[i-1] = tailChar; 1487 } else{ 1488 spaceNotFound = true; 1489 return spaceNotFound; 1490 } 1491 }else if(yehHamzaOption == 1 && isYehHamzaChar(ch)){ 1492 1493 if(i>start &&dest[i-1] == SPACE_CHAR){ 1494 dest[i] = yehHamzaToYeh[ch - YEH_HAMZAFE_CHAR]; 1495 dest[i-1] = HAMZAFE_CHAR; 1496 }else{ 1497 spaceNotFound = true; 1498 return spaceNotFound; 1499 } 1500 1501 1502 } 1503 } 1504 return false; 1505 1506 } 1507 1508 /* 1509 * Name : expandCompositChar 1510 * Function: LamAlef needs special handling as the LamAlef is 1511 * one character while expanding it will give two 1512 * characters Lam + Alef, so we need to expand the LamAlef 1513 * in near or far spaces according to the options the user 1514 * specifies or increase the buffer size. 1515 * Dest has enough room for the expansion if we are growing. 
1516 * lamalef are normalized to the 'special characters' 1517 */ 1518 private int expandCompositChar(char[] dest, 1519 int start, 1520 int length, 1521 int lacount, 1522 int shapingMode) throws ArabicShapingException { 1523 1524 int lenOptionsLamAlef = options & LAMALEF_MASK; 1525 int lenOptionsSeen = options & SEEN_MASK; 1526 int lenOptionsYehHamza = options & YEHHAMZA_MASK; 1527 boolean spaceNotFound = false; 1528 1529 if (!isLogical && !spacesRelativeToTextBeginEnd) { 1530 switch (lenOptionsLamAlef) { 1531 case LAMALEF_BEGIN: lenOptionsLamAlef = LAMALEF_END; break; 1532 case LAMALEF_END: lenOptionsLamAlef = LAMALEF_BEGIN; break; 1533 default: break; 1534 } 1535 } 1536 1537 if(shapingMode == 1){ 1538 if(lenOptionsLamAlef == LAMALEF_AUTO){ 1539 if(isLogical){ 1540 spaceNotFound = expandCompositCharAtEnd(dest, start, length, lacount); 1541 if(spaceNotFound){ 1542 spaceNotFound = expandCompositCharAtBegin(dest, start, length, lacount); 1543 } 1544 if(spaceNotFound){ 1545 spaceNotFound = expandCompositCharAtNear(dest, start, length,0,0,1); 1546 } 1547 if(spaceNotFound){ 1548 throw new ArabicShapingException("No spacefor lamalef"); 1549 } 1550 }else{ 1551 spaceNotFound = expandCompositCharAtBegin(dest, start, length, lacount); 1552 if(spaceNotFound){ 1553 spaceNotFound = expandCompositCharAtEnd(dest, start, length, lacount); 1554 } 1555 if(spaceNotFound){ 1556 spaceNotFound = expandCompositCharAtNear(dest, start, length,0,0,1); 1557 } 1558 if(spaceNotFound){ 1559 throw new ArabicShapingException("No spacefor lamalef"); 1560 } 1561 } 1562 }else if(lenOptionsLamAlef == LAMALEF_END){ 1563 spaceNotFound = expandCompositCharAtEnd(dest, start, length, lacount); 1564 if(spaceNotFound){ 1565 throw new ArabicShapingException("No spacefor lamalef"); 1566 } 1567 }else if(lenOptionsLamAlef == LAMALEF_BEGIN){ 1568 spaceNotFound = expandCompositCharAtBegin(dest, start, length, lacount); 1569 if(spaceNotFound){ 1570 throw new ArabicShapingException("No spacefor lamalef"); 1571 } 
1572 }else if(lenOptionsLamAlef == LAMALEF_NEAR){ 1573 spaceNotFound = expandCompositCharAtNear(dest, start, length,0,0,1); 1574 if(spaceNotFound){ 1575 throw new ArabicShapingException("No spacefor lamalef"); 1576 } 1577 }else if(lenOptionsLamAlef == LAMALEF_RESIZE){ 1578 for (int r = start + length, w = r + lacount; --r >= start;) { 1579 char ch = dest[r]; 1580 if (isNormalizedLamAlefChar(ch)) { 1581 dest[--w] = '\u0644'; 1582 dest[--w] = convertNormalizedLamAlef[ch - '\u065C']; 1583 } else { 1584 dest[--w] = ch; 1585 } 1586 } 1587 length += lacount; 1588 } 1589 }else{ 1590 if(lenOptionsSeen == SEEN_TWOCELL_NEAR){ 1591 spaceNotFound = expandCompositCharAtNear(dest, start, length,0,1,0); 1592 if(spaceNotFound){ 1593 throw new ArabicShapingException("No space for Seen tail expansion"); 1594 } 1595 } 1596 if(lenOptionsYehHamza == YEHHAMZA_TWOCELL_NEAR){ 1597 spaceNotFound = expandCompositCharAtNear(dest, start, length,1,0,0); 1598 if(spaceNotFound){ 1599 throw new ArabicShapingException("No space for YehHamza expansion"); 1600 } 1601 } 1602 } 1603 return length; 1604 } 1605 1606 1607 /* Convert the input buffer from FExx Range into 06xx Range 1608 * to put all characters into the 06xx range 1609 * even the lamalef is converted to the special region in 1610 * the 06xx range. Return the number of lamalef chars found. 1611 */ 1612 private int normalize(char[] dest, int start, int length) { 1613 int lacount = 0; 1614 for (int i = start, e = i + length; i < e; ++i) { 1615 char ch = dest[i]; 1616 if (ch >= '\uFE70' && ch <= '\uFEFC') { 1617 if (isLamAlefChar(ch)) { 1618 ++lacount; 1619 } 1620 dest[i] = (char)convertFEto06[ch - '\uFE70']; 1621 } 1622 } 1623 return lacount; 1624 } 1625 1626 /* 1627 * Name : deshapeNormalize 1628 * Function: Convert the input buffer from FExx Range into 06xx Range 1629 * even the lamalef is converted to the special region in the 06xx range. 
1630 * According to the options the user enters, all seen family characters 1631 * followed by a tail character are merged to seen tail family character and 1632 * any yeh followed by a hamza character are merged to yehhamza character. 1633 * Method returns the number of lamalef chars found. 1634 */ 1635 private int deshapeNormalize(char[] dest, int start, int length) { 1636 int lacount = 0; 1637 int yehHamzaComposeEnabled = 0; 1638 int seenComposeEnabled = 0; 1639 1640 yehHamzaComposeEnabled = ((options&YEHHAMZA_MASK) == YEHHAMZA_TWOCELL_NEAR) ? 1 : 0; 1641 seenComposeEnabled = ((options&SEEN_MASK) == SEEN_TWOCELL_NEAR)? 1 : 0; 1642 1643 for (int i = start, e = i + length; i < e; ++i) { 1644 char ch = dest[i]; 1645 1646 if( (yehHamzaComposeEnabled == 1) && ((ch == HAMZA06_CHAR) || (ch == HAMZAFE_CHAR)) 1647 && (i < (length - 1)) && isAlefMaksouraChar(dest[i+1] )) { 1648 dest[i] = SPACE_CHAR; 1649 dest[i+1] = YEH_HAMZA_CHAR; 1650 } else if ( (seenComposeEnabled == 1) && (isTailChar(ch)) && (i< (length - 1)) 1651 && (isSeenTailFamilyChar(dest[i+1])==1) ) { 1652 dest[i] = SPACE_CHAR; 1653 } 1654 else if (ch >= '\uFE70' && ch <= '\uFEFC') { 1655 if (isLamAlefChar(ch)) { 1656 ++lacount; 1657 } 1658 dest[i] = (char)convertFEto06[ch - '\uFE70']; 1659 } 1660 } 1661 return lacount; 1662 } 1663 1664 /* 1665 * Name : shapeUnicode 1666 * Function: Converts an Arabic Unicode buffer in 06xx Range into a shaped 1667 * arabic Unicode buffer in FExx Range 1668 */ 1669 private int shapeUnicode(char[] dest, 1670 int start, 1671 int length, 1672 int destSize, 1673 int tashkeelFlag)throws ArabicShapingException { 1674 1675 int lamalef_count = normalize(dest, start, length); 1676 1677 // resolve the link between the characters. 1678 // Arabic characters have four forms: Isolated, Initial, Medial and Final. 1679 // Tashkeel characters have two, isolated or medial, and sometimes only isolated. 
1680 // tashkeelFlag == 0: shape normally, 1: shape isolated, 2: don't shape 1681 1682 boolean lamalef_found = false, seenfam_found = false; 1683 boolean yehhamza_found = false, tashkeel_found = false; 1684 int i = start + length - 1; 1685 int currLink = getLink(dest[i]); 1686 int nextLink = 0; 1687 int prevLink = 0; 1688 int lastLink = 0; 1689 //int prevPos = i; 1690 int lastPos = i; 1691 int nx = -2; 1692 int nw = 0; 1693 1694 while (i >= 0) { 1695 // If high byte of currLink != 0 then there might be more than one shape 1696 if ((currLink & '\uFF00') != 0 || isTashkeelChar(dest[i])) { 1697 nw = i - 1; 1698 nx = -2; 1699 while (nx < 0) { // we need to know about next char 1700 if (nw == -1) { 1701 nextLink = 0; 1702 nx = Integer.MAX_VALUE; 1703 } else { 1704 nextLink = getLink(dest[nw]); 1705 if ((nextLink & IRRELEVANT) == 0) { 1706 nx = nw; 1707 } else { 1708 --nw; 1709 } 1710 } 1711 } 1712 1713 if (((currLink & ALEFTYPE) > 0) && ((lastLink & LAMTYPE) > 0)) { 1714 lamalef_found = true; 1715 char wLamalef = changeLamAlef(dest[i]); // get from 0x065C-0x065f 1716 if (wLamalef != '\u0000') { 1717 // replace alef by marker, it will be removed later 1718 dest[i] = '\uffff'; 1719 dest[lastPos] = wLamalef; 1720 i = lastPos; 1721 } 1722 1723 lastLink = prevLink; 1724 currLink = getLink(wLamalef); // requires '\u0000', unfortunately 1725 } 1726 if ((i > 0) && (dest[i-1] == SPACE_CHAR)) 1727 { 1728 if ( isSeenFamilyChar(dest[i]) == 1){ 1729 seenfam_found = true; 1730 } else if (dest[i] == YEH_HAMZA_CHAR) { 1731 yehhamza_found = true; 1732 } 1733 } 1734 else if(i==0){ 1735 if ( isSeenFamilyChar(dest[i]) == 1){ 1736 seenfam_found = true; 1737 } else if (dest[i] == YEH_HAMZA_CHAR) { 1738 yehhamza_found = true; 1739 } 1740 } 1741 1742 1743 // get the proper shape according to link ability of neighbors 1744 // and of character; depends on the order of the shapes 1745 // (isolated, initial, middle, final) in the compatibility area 1746 1747 int flag = specialChar(dest[i]); 1748 
1749 int shape = shapeTable[nextLink & LINK_MASK] 1750 [lastLink & LINK_MASK] 1751 [currLink & LINK_MASK]; 1752 1753 if (flag == 1) { 1754 shape &= 0x1; 1755 } else if (flag == 2) { 1756 if (tashkeelFlag == 0 && 1757 ((lastLink & LINKL) != 0) && 1758 ((nextLink & LINKR) != 0) && 1759 dest[i] != '\u064C' && 1760 dest[i] != '\u064D' && 1761 !((nextLink & ALEFTYPE) == ALEFTYPE && 1762 (lastLink & LAMTYPE) == LAMTYPE)) { 1763 1764 shape = 1; 1765 1766 } else if(tashkeelFlag == 2 && dest[i] == SHADDA06_CHAR){ 1767 shape = 1; 1768 1769 } else { 1770 shape = 0; 1771 } 1772 } 1773 if (flag == 2) { 1774 if (tashkeelFlag == 2 && dest[i] != SHADDA06_CHAR) { 1775 dest[i] = TASHKEEL_SPACE_SUB; 1776 tashkeel_found = true; 1777 } 1778 else{ 1779 dest[i] = (char)('\uFE70' + irrelevantPos[dest[i] - '\u064B'] + shape); 1780 } 1781 // else leave tashkeel alone 1782 } else { 1783 dest[i] = (char)('\uFE70' + (currLink >> 8) + shape); 1784 } 1785 } 1786 1787 // move one notch forward 1788 if ((currLink & IRRELEVANT) == 0) { 1789 prevLink = lastLink; 1790 lastLink = currLink; 1791 //prevPos = lastPos; 1792 lastPos = i; 1793 } 1794 1795 --i; 1796 if (i == nx) { 1797 currLink = nextLink; 1798 nx = -2; 1799 } else if (i != -1) { 1800 currLink = getLink(dest[i]); 1801 } 1802 } 1803 1804 // If we found a lam/alef pair in the buffer 1805 // call handleGeneratedSpaces to remove the spaces that were added 1806 1807 destSize = length; 1808 if (lamalef_found || tashkeel_found) { 1809 destSize = handleGeneratedSpaces(dest, start, length); 1810 } 1811 if (seenfam_found || yehhamza_found){ 1812 destSize = expandCompositChar(dest, start, destSize, lamalef_count, SHAPE_MODE); 1813 } 1814 return destSize; 1815 } 1816 1817 /* 1818 * Name : deShapeUnicode 1819 * Function: Converts an Arabic Unicode buffer in FExx Range into unshaped 1820 * arabic Unicode buffer in 06xx Range 1821 */ 1822 private int deShapeUnicode(char[] dest, 1823 int start, 1824 int length, 1825 int destSize) throws 
ArabicShapingException { 1826 1827 int lamalef_count = deshapeNormalize(dest, start, length); 1828 1829 // If there was a lamalef in the buffer call expandLamAlef 1830 if (lamalef_count != 0) { 1831 // need to adjust dest to fit expanded buffer... !!! 1832 destSize = expandCompositChar(dest, start, length, lamalef_count,DESHAPE_MODE); 1833 } else { 1834 destSize = length; 1835 } 1836 1837 return destSize; 1838 } 1839 1840 private int internalShape(char[] source, 1841 int sourceStart, 1842 int sourceLength, 1843 char[] dest, 1844 int destStart, 1845 int destSize) throws ArabicShapingException { 1846 1847 if (sourceLength == 0) { 1848 return 0; 1849 } 1850 1851 if (destSize == 0) { 1852 if (((options & LETTERS_MASK) != LETTERS_NOOP) && 1853 ((options & LAMALEF_MASK) == LAMALEF_RESIZE)) { 1854 1855 return calculateSize(source, sourceStart, sourceLength); 1856 } else { 1857 return sourceLength; // by definition 1858 } 1859 } 1860 1861 // always use temp buffer 1862 char[] temp = new char[sourceLength * 2]; // all lamalefs requiring expansion 1863 System.arraycopy(source, sourceStart, temp, 0, sourceLength); 1864 1865 if (isLogical) { 1866 invertBuffer(temp, 0, sourceLength); 1867 } 1868 1869 int outputSize = sourceLength; 1870 1871 switch (options & LETTERS_MASK) { 1872 case LETTERS_SHAPE_TASHKEEL_ISOLATED: 1873 outputSize = shapeUnicode(temp, 0, sourceLength, destSize, 1); 1874 break; 1875 1876 case LETTERS_SHAPE: 1877 if( ((options&TASHKEEL_MASK) != 0) && 1878 ((options&TASHKEEL_MASK) !=TASHKEEL_REPLACE_BY_TATWEEL)) { 1879 /* Call the shaping function with tashkeel flag == 2 for removal of tashkeel */ 1880 outputSize = shapeUnicode(temp, 0, sourceLength, destSize, 2); 1881 }else { 1882 //default Call the shaping function with tashkeel flag == 1 */ 1883 outputSize = shapeUnicode(temp, 0, sourceLength, destSize, 0); 1884 1885 /*After shaping text check if user wants to remove tashkeel and replace it with tatweel*/ 1886 if( (options&TASHKEEL_MASK) == 
TASHKEEL_REPLACE_BY_TATWEEL){ 1887 outputSize = handleTashkeelWithTatweel(temp,sourceLength); 1888 } 1889 } 1890 break; 1891 1892 case LETTERS_UNSHAPE: 1893 outputSize = deShapeUnicode(temp, 0, sourceLength, destSize); 1894 break; 1895 1896 default: 1897 break; 1898 } 1899 1900 if (outputSize > destSize) { 1901 throw new ArabicShapingException("not enough room for result data"); 1902 } 1903 1904 if ((options & DIGITS_MASK) != DIGITS_NOOP) { 1905 char digitBase = '\u0030'; // European digits 1906 switch (options & DIGIT_TYPE_MASK) { 1907 case DIGIT_TYPE_AN: 1908 digitBase = '\u0660'; // Arabic-Indic digits 1909 break; 1910 1911 case DIGIT_TYPE_AN_EXTENDED: 1912 digitBase = '\u06f0'; // Eastern Arabic-Indic digits (Persian and Urdu) 1913 break; 1914 1915 default: 1916 break; 1917 } 1918 1919 switch (options & DIGITS_MASK) { 1920 case DIGITS_EN2AN: 1921 { 1922 int digitDelta = digitBase - '\u0030'; 1923 for (int i = 0; i < outputSize; ++i) { 1924 char ch = temp[i]; 1925 if (ch <= '\u0039' && ch >= '\u0030') { 1926 temp[i] += digitDelta; 1927 } 1928 } 1929 } 1930 break; 1931 1932 case DIGITS_AN2EN: 1933 { 1934 char digitTop = (char)(digitBase + 9); 1935 int digitDelta = '\u0030' - digitBase; 1936 for (int i = 0; i < outputSize; ++i) { 1937 char ch = temp[i]; 1938 if (ch <= digitTop && ch >= digitBase) { 1939 temp[i] += digitDelta; 1940 } 1941 } 1942 } 1943 break; 1944 1945 case DIGITS_EN2AN_INIT_LR: 1946 shapeToArabicDigitsWithContext(temp, 0, outputSize, digitBase, false); 1947 break; 1948 1949 case DIGITS_EN2AN_INIT_AL: 1950 shapeToArabicDigitsWithContext(temp, 0, outputSize, digitBase, true); 1951 break; 1952 1953 default: 1954 break; 1955 } 1956 } 1957 1958 if (isLogical) { 1959 invertBuffer(temp, 0, outputSize); 1960 } 1961 1962 System.arraycopy(temp, 0, dest, destStart, outputSize); 1963 1964 return outputSize; 1965 } 1966 } 1967