1 /* 2 ******************************************************************************* 3 * Copyright (C) 2001-2003, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: bocsu.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Author: Markus W. Scherer 12 * 13 * Modification history: 14 * 05/18/2001 weiv Made into separate module 15 */ 16 17 18 #include "unicode/utypes.h" 19 20 #if !UCONFIG_NO_COLLATION 21 22 #include "bocsu.h" 23 24 /* 25 * encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes, 26 * preserving lexical order 27 */ 28 U_CFUNC uint8_t * 29 u_writeDiff(int32_t diff, uint8_t *p) { 30 if(diff>=SLOPE_REACH_NEG_1) { 31 if(diff<=SLOPE_REACH_POS_1) { 32 *p++=(uint8_t)(SLOPE_MIDDLE+diff); 33 } else if(diff<=SLOPE_REACH_POS_2) { 34 *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT)); 35 *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 36 } else if(diff<=SLOPE_REACH_POS_3) { 37 p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 38 diff/=SLOPE_TAIL_COUNT; 39 p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 40 *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT)); 41 p+=3; 42 } else { 43 p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 44 diff/=SLOPE_TAIL_COUNT; 45 p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 46 diff/=SLOPE_TAIL_COUNT; 47 p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 48 *p=SLOPE_MAX; 49 p+=4; 50 } 51 } else { 52 int32_t m; 53 54 if(diff>=SLOPE_REACH_NEG_2) { 55 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 56 *p++=(uint8_t)(SLOPE_START_NEG_2+diff); 57 *p++=(uint8_t)(SLOPE_MIN+m); 58 } else if(diff>=SLOPE_REACH_NEG_3) { 59 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 60 p[2]=(uint8_t)(SLOPE_MIN+m); 61 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 62 p[1]=(uint8_t)(SLOPE_MIN+m); 63 *p=(uint8_t)(SLOPE_START_NEG_3+diff); 64 p+=3; 65 } else { 66 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 67 p[3]=(uint8_t)(SLOPE_MIN+m); 68 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 69 p[2]=(uint8_t)(SLOPE_MIN+m); 70 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 71 p[1]=(uint8_t)(SLOPE_MIN+m); 72 *p=SLOPE_MIN; 73 p+=4; 74 } 75 } 76 return p; 77 } 78 79 /* How many bytes would writeDiff() write? */ 80 static int32_t 81 lengthOfDiff(int32_t diff) { 82 if(diff>=SLOPE_REACH_NEG_1) { 83 if(diff<=SLOPE_REACH_POS_1) { 84 return 1; 85 } else if(diff<=SLOPE_REACH_POS_2) { 86 return 2; 87 } else if(diff<=SLOPE_REACH_POS_3) { 88 return 3; 89 } else { 90 return 4; 91 } 92 } else { 93 if(diff>=SLOPE_REACH_NEG_2) { 94 return 2; 95 } else if(diff>=SLOPE_REACH_NEG_3) { 96 return 3; 97 } else { 98 return 4; 99 } 100 } 101 } 102 103 /* 104 * Encode the code points of a string as 105 * a sequence of byte-encoded differences (slope detection), 106 * preserving lexical order. 107 * 108 * Optimize the difference-taking for runs of Unicode text within 109 * small scripts: 110 * 111 * Most small scripts are allocated within aligned 128-blocks of Unicode 112 * code points. Lexical order is preserved if "prev" is always moved 113 * into the middle of such a block. 114 * 115 * Additionally, "prev" is moved from anywhere in the Unihan 116 * area into the middle of that area. 117 * Note that the identical-level run in a sort key is generated from 118 * NFD text - there are never Hangul characters included. 119 */ 120 U_CFUNC int32_t 121 u_writeIdenticalLevelRun(const UChar *s, int32_t length, uint8_t *p) { 122 uint8_t *p0; 123 int32_t c, prev; 124 int32_t i; 125 126 prev=0; 127 p0=p; 128 i=0; 129 while(i<length) { 130 if(prev<0x4e00 || prev>=0xa000) { 131 prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; 132 } else { 133 /* 134 * Unihan U+4e00..U+9fa5: 135 * double-bytes down from the upper end 136 */ 137 prev=0x9fff-SLOPE_REACH_POS_2; 138 } 139 140 UTF_NEXT_CHAR(s, i, length, c); 141 p=u_writeDiff(c-prev, p); 142 prev=c; 143 } 144 return (int32_t)(p-p0); 145 } 146 147 U_CFUNC int32_t 148 u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) { 149 uint8_t *p0 = p; 150 if(first<0x4e00 || first>=0xa000) { 151 first=(first&~0x7f)-SLOPE_REACH_NEG_1; 152 } else { 153 /* 154 * Unihan U+4e00..U+9fa5: 155 * double-bytes down from the upper end 156 */ 157 first=0x9fff-SLOPE_REACH_POS_2; 158 } 159 160 p=u_writeDiff(second-first, p); 161 return (int32_t)(p-p0); 162 } 163 164 /* How many bytes would writeIdenticalLevelRun() write? */ 165 U_CFUNC int32_t 166 u_lengthOfIdenticalLevelRun(const UChar *s, int32_t length) { 167 int32_t c, prev; 168 int32_t i, runLength; 169 170 prev=0; 171 runLength=0; 172 i=0; 173 while(i<length) { 174 if(prev<0x4e00 || prev>=0xa000) { 175 prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; 176 } else { 177 /* 178 * Unihan U+4e00..U+9fa5: 179 * double-bytes down from the upper end 180 */ 181 prev=0x9fff-SLOPE_REACH_POS_2; 182 } 183 184 UTF_NEXT_CHAR(s, i, length, c); 185 runLength+=lengthOfDiff(c-prev); 186 prev=c; 187 } 188 return runLength; 189 } 190 191 #endif /* #if !UCONFIG_NO_COLLATION */ 192