icu/patches/search_collation.patch

Index: source/test/cintltst/usrchtst.c
===================================================================
--- source/test/cintltst/usrchtst.c	(revision 75773)
+++ source/test/cintltst/usrchtst.c	(working copy)
@@ -1,5 +1,5 @@
 /********************************************************************
- * Copyright (c) 2001-2010 International Business Machines
+ * Copyright (c) 2001-2011 International Business Machines
  * Corporation and others. All Rights Reserved.
  ********************************************************************
  * File usrchtst.c
@@ -2553,7 +2553,173 @@
     ucol_close(coll);
 }

+/**
+* TestUsingSearchCollator
+*/

+#define ARRAY_LENGTH(array) (sizeof(array)/sizeof(array[0]))
+
+typedef struct {
+    const UChar *   pattern;
+    const int32_t * offsets;
+    int32_t         offsetsLen;
+} PatternAndOffsets;
+
+static const UChar scKoText[] = {
+       0x0020,
+/*01*/ 0xAC00, 0x0020,                         /* simple LV Hangul */
+/*03*/ 0xAC01, 0x0020,                         /* simple LVT Hangul */
+/*05*/ 0xAC0F, 0x0020,                         /* LVTT, last jamo expands for search */
+/*07*/ 0xAFFF, 0x0020,                         /* LLVVVTT, every jamo expands for search */
+/*09*/ 0x1100, 0x1161, 0x11A8, 0x0020,         /* 0xAC01 as conjoining jamo */
+/*13*/ 0x1100, 0x1161, 0x1100, 0x0020,         /* 0xAC01 as basic conjoining jamo (per search rules) */
+/*17*/ 0x3131, 0x314F, 0x3131, 0x0020,         /* 0xAC01 as compatibility jamo */
+/*21*/ 0x1100, 0x1161, 0x11B6, 0x0020,         /* 0xAC0F as conjoining jamo; last expands for search */
+/*25*/ 0x1100, 0x1161, 0x1105, 0x1112, 0x0020, /* 0xAC0F as basic conjoining jamo; last expands for search */
+/*30*/ 0x1101, 0x1170, 0x11B6, 0x0020,         /* 0xAFFF as conjoining jamo; all expand for search */
+/*34*/ 0x00E6, 0x0020,                         /* small letter ae, expands */
+/*36*/ 0x1E4D, 0x0020,                         /* small letter o with tilde and acute, decomposes */
+       0
+};
+
+static const UChar scKoPat0[] = { 0xAC01, 0 };
+static const UChar scKoPat1[] = { 0x1100, 0x1161, 0x11A8, 0 }; /* 0xAC01 as conjoining jamo */
+static const UChar scKoPat2[] = { 0xAC0F, 0 };
+static const UChar scKoPat3[] = { 0x1100, 0x1161, 0x1105, 0x1112, 0 }; /* 0xAC0F as basic conjoining jamo */
+static const UChar scKoPat4[] = { 0xAFFF, 0 };
+static const UChar scKoPat5[] = { 0x1101, 0x1170, 0x11B6, 0 }; /* 0xAFFF as conjoining jamo */
+
+static const int32_t scKoSrchOff01[] = { 3,  9, 13 };
+static const int32_t scKoSrchOff23[] = { 5, 21, 25 };
+static const int32_t scKoSrchOff45[] = { 7, 30     };
+
+static const PatternAndOffsets scKoSrchPatternsOffsets[] = {
+    { scKoPat0, scKoSrchOff01, ARRAY_LENGTH(scKoSrchOff01) },
+    { scKoPat1, scKoSrchOff01, ARRAY_LENGTH(scKoSrchOff01) },
+    { scKoPat2, scKoSrchOff23, ARRAY_LENGTH(scKoSrchOff23) },
+    { scKoPat3, scKoSrchOff23, ARRAY_LENGTH(scKoSrchOff23) },
+    { scKoPat4, scKoSrchOff45, ARRAY_LENGTH(scKoSrchOff45) },
+    { scKoPat5, scKoSrchOff45, ARRAY_LENGTH(scKoSrchOff45) },
+    { NULL,     NULL,          0                           }
+};
+
+static const int32_t scKoStndOff01[] = { 3,  9 };
+static const int32_t scKoStndOff2[]  = { 5, 21 };
+static const int32_t scKoStndOff3[]  = { 25    };
+static const int32_t scKoStndOff45[] = { 7, 30 };
+
+static const PatternAndOffsets scKoStndPatternsOffsets[] = {
+    { scKoPat0, scKoStndOff01, ARRAY_LENGTH(scKoStndOff01) },
+    { scKoPat1, scKoStndOff01, ARRAY_LENGTH(scKoStndOff01) },
+    { scKoPat2, scKoStndOff2,  ARRAY_LENGTH(scKoStndOff2)  },
+    { scKoPat3, scKoStndOff3,  ARRAY_LENGTH(scKoStndOff3)  },
+    { scKoPat4, scKoStndOff45, ARRAY_LENGTH(scKoStndOff45) },
+    { scKoPat5, scKoStndOff45, ARRAY_LENGTH(scKoStndOff45) },
+    { NULL,     NULL,          0                           }
+};
+
+typedef struct {
+    const char *  locale;
+    const UChar * text;
+    const PatternAndOffsets * patternsAndOffsets;
+} TUSCItem;
+
+static const TUSCItem tuscItems[] = {
+    { "root",                  scKoText, scKoStndPatternsOffsets },
+    { "root@collation=search", scKoText, scKoSrchPatternsOffsets },
+    { "ko@collation=search",   scKoText, scKoSrchPatternsOffsets },
+    { NULL,                    NULL,     NULL                    }
+};
+
+static const UChar dummyPat[] = { 0x0061, 0 };
+
+static void TestUsingSearchCollator(void)
+{
+    const TUSCItem * tuscItemPtr;
+    for (tuscItemPtr = tuscItems; tuscItemPtr->locale != NULL; tuscItemPtr++) {
+        UErrorCode status = U_ZERO_ERROR;
+        UCollator* ucol = ucol_open(tuscItemPtr->locale, &status);
+        if ( U_SUCCESS(status) ) {
+            UStringSearch* usrch = usearch_openFromCollator(dummyPat, -1, tuscItemPtr->text, -1, ucol, NULL, &status);
+            if ( U_SUCCESS(status) ) {
+                const PatternAndOffsets * patternsOffsetsPtr;
+                for ( patternsOffsetsPtr = tuscItemPtr->patternsAndOffsets; patternsOffsetsPtr->pattern != NULL; patternsOffsetsPtr++) {
+                    usearch_setPattern(usrch, patternsOffsetsPtr->pattern, -1, &status);
+                    if ( U_SUCCESS(status) ) {
+                        int32_t offset;
+                        const int32_t * nextOffsetPtr;
+                        const int32_t * limitOffsetPtr;
+
+                        usearch_reset(usrch);
+                        nextOffsetPtr = patternsOffsetsPtr->offsets;
+                        limitOffsetPtr = patternsOffsetsPtr->offsets + patternsOffsetsPtr->offsetsLen;
+                        while (TRUE) {
+                            offset = usearch_next(usrch, &status);
+                            if ( U_FAILURE(status) || offset == USEARCH_DONE ) {
+                                break;
+                            }
+                            if ( nextOffsetPtr < limitOffsetPtr ) {
+                                 if (offset != *nextOffsetPtr) {
+                                     log_err("error, locale %s, expected usearch_next %d, got %d\n", tuscItemPtr->locale, *nextOffsetPtr, offset);
+                                     nextOffsetPtr = limitOffsetPtr;
+                                     break;
+                                 }
+                                 nextOffsetPtr++;
+                            } else {
+                                log_err("error, locale %s, usearch_next returned more matches than expected\n", tuscItemPtr->locale );
+                            }
+                        }
+                        if ( U_FAILURE(status) ) {
+                            log_err("error, locale %s, usearch_next failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+                        } else if ( nextOffsetPtr < limitOffsetPtr ) {
+                            log_err("error, locale %s, usearch_next returned fewer matches than expected\n", tuscItemPtr->locale );
+                        }
+
+                        status = U_ZERO_ERROR;
+                        usearch_reset(usrch);
+                        nextOffsetPtr = patternsOffsetsPtr->offsets + patternsOffsetsPtr->offsetsLen;
+                        limitOffsetPtr = patternsOffsetsPtr->offsets;
+                        while (TRUE) {
+                            offset = usearch_previous(usrch, &status);
+                            if ( U_FAILURE(status) || offset == USEARCH_DONE ) {
+                                break;
+                            }
+                            if ( nextOffsetPtr > limitOffsetPtr ) {
+                                nextOffsetPtr--;
+                                if (offset != *nextOffsetPtr) {
+                                     log_err("error, locale %s, expected usearch_previous %d, got %d\n", tuscItemPtr->locale, *nextOffsetPtr, offset);
+                                     nextOffsetPtr = limitOffsetPtr;
+                                      break;
+                                }
+                            } else {
+                                log_err("error, locale %s, usearch_previous returned more matches than expected\n", tuscItemPtr->locale );
+                            }
+                        }
+                        if ( U_FAILURE(status) ) {
+                            log_err("error, locale %s, usearch_previous failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+                        } else if ( nextOffsetPtr > limitOffsetPtr ) {
+                            log_err("error, locale %s, usearch_previous returned fewer matches than expected\n", tuscItemPtr->locale );
+                        }
+
+                    } else {
+                        log_err("error, locale %s, usearch_setPattern failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+                    }
+                }
+                usearch_close(usrch);
+            } else {
+                log_err("error, locale %s, usearch_openFromCollator failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+            }
+            ucol_close(ucol);
+        } else {
+            log_err("error, locale %s, ucol_open failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+        }
+    }
+}
+
+/**
+* addSearchTest
+*/
+
 void addSearchTest(TestNode** root)
 {
     addTest(root, &TestStart, "tscoll/usrchtst/TestStart");
@@ -2608,6 +2774,7 @@
     addTest(root, &TestForwardBackward, "tscoll/usrchtst/TestForwardBackward");
 	addTest(root, &TestSearchForNull, "tscoll/usrchtst/TestSearchForNull");
     addTest(root, &TestStrengthIdentical, "tscoll/usrchtst/TestStrengthIdentical");
+    addTest(root, &TestUsingSearchCollator, "tscoll/usrchtst/TestUsingSearchCollator");
 }

 #endif /* #if !UCONFIG_NO_COLLATION */
Index: source/test/cintltst/citertst.c
===================================================================
--- source/test/cintltst/citertst.c	(revision 75773)
+++ source/test/cintltst/citertst.c	(working copy)
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 1997-2010, International Business Machines Corporation and
+ * Copyright (c) 1997-2011, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 /********************************************************************************
@@ -22,6 +22,7 @@
 #if !UCONFIG_NO_COLLATION

 #include "unicode/ucol.h"
+#include "unicode/ucoleitr.h"
 #include "unicode/uloc.h"
 #include "unicode/uchar.h"
 #include "unicode/ustring.h"
@@ -58,6 +59,7 @@
     addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
     addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity");
     addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity");
+    addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements");
 }

 /* The locales we support */
@@ -2017,4 +2019,141 @@
     T_FileStream_close(file);
 }

+/**
+* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
+* normalization on AND jamo tailoring, among other things.
+*/
+static const UChar tsceText[] = {   /* Nothing in here should be ignorable */
+    0x0020, 0xAC00,                 /* simple LV Hangul */
+    0x0020, 0xAC01,                 /* simple LVT Hangul */
+    0x0020, 0xAC0F,                 /* LVTT, last jamo expands for search */
+    0x0020, 0xAFFF,                 /* LLVVVTT, every jamo expands for search */
+    0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */
+    0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */
+    0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */
+    0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */
+    0x0020, 0x00E6,                 /* small letter ae, expands */
+    0x0020, 0x1E4D,                 /* small letter o with tilde and acute, decomposes */
+    0x0020
+};
+enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) };
+
+static const int32_t rootStandardOffsets[] = {
+    0,  1,2,
+    2,  3,4,4,
+    4,  5,6,6,
+    6,  7,8,8,
+    8,  9,10,11,
+    12, 13,14,15,
+    16, 17,18,19,
+    20, 21,22,23,
+    24, 25,26,26,26,
+    26, 27,28,28,
+    28,
+    29
+};
+enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandardOffsets[0]) };
+
+static const int32_t rootSearchOffsets[] = {
+    0,  1,2,
+    2,  3,4,4,
+    4,  5,6,6,6,
+    6,  7,8,8,8,8,8,8,
+    8,  9,10,11,
+    12, 13,14,15,
+    16, 17,18,19,20,
+    20, 21,22,22,23,23,23,24,
+    24, 25,26,26,26,
+    26, 27,28,28,
+    28,
+    29
+};
+enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffsets[0]) };
+
+typedef struct {
+    const char *    locale;
+    const int32_t * offsets;
+    int32_t         offsetsLen;
+} TSCEItem;
+
+static const TSCEItem tsceItems[] = {
+    { "root",                  rootStandardOffsets, kLen_rootStandardOffsets },
+    { "root@collation=search", rootSearchOffsets,   kLen_rootSearchOffsets   },
+    { NULL,                    NULL,                0                        }
+};
+
+static void TestSearchCollatorElements(void)
+{
+    const TSCEItem * tsceItemPtr;
+    for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) {
+        UErrorCode status = U_ZERO_ERROR;
+        UCollator* ucol = ucol_open(tsceItemPtr->locale, &status);
+        if ( U_SUCCESS(status) ) {
+            UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status);
+            if ( U_SUCCESS(status) ) {
+                int32_t offset, element;
+                const int32_t * nextOffsetPtr;
+                const int32_t * limitOffsetPtr;
+
+                nextOffsetPtr = tsceItemPtr->offsets;
+                limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
+                do {
+                    offset = ucol_getOffset(uce);
+                    element = ucol_next(uce, &status);
+                    if ( element == 0 ) {
+                        log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale );
+                    }
+                    if ( nextOffsetPtr < limitOffsetPtr ) {
+                        if (offset != *nextOffsetPtr) {
+                            log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n",
+                                                            tsceItemPtr->locale, *nextOffsetPtr, offset );
+                            nextOffsetPtr = limitOffsetPtr;
+                            break;
+                        }
+                        nextOffsetPtr++;
+                    } else {
+                        log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale );
+                    }
+                } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
+                if ( nextOffsetPtr < limitOffsetPtr ) {
+                    log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale );
+                }
+
+                ucol_setOffset(uce, kLen_tsceText, &status);
+                status = U_ZERO_ERROR;
+                nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
+                limitOffsetPtr = tsceItemPtr->offsets;
+                do {
+                    offset = ucol_getOffset(uce);
+                    element = ucol_previous(uce, &status);
+                    if ( element == 0 ) {
+                        log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale );
+                    }
+                    if ( nextOffsetPtr > limitOffsetPtr ) {
+                        nextOffsetPtr--;
+                        if (offset != *nextOffsetPtr) {
+                            log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n",
+                                                                tsceItemPtr->locale, *nextOffsetPtr, offset );
+                            nextOffsetPtr = limitOffsetPtr;
+                            break;
+                        }
+                   } else {
+                        log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale );
+                    }
+                } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
+                if ( nextOffsetPtr > limitOffsetPtr ) {
+                    log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale );
+                }
+
+                ucol_closeElements(uce);
+            } else {
+                log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
+            }
+            ucol_close(ucol);
+        } else {
+            log_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
+        }
+    }
+}
+
 #endif /* #if !UCONFIG_NO_COLLATION */
Index: source/test/cintltst/citertst.h
===================================================================
--- source/test/cintltst/citertst.h	(revision 75773)
+++ source/test/cintltst/citertst.h	(working copy)
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 1997-2008, International Business Machines Corporation and
+ * Copyright (c) 1997-2008,2011, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 /********************************************************************************
@@ -101,6 +101,11 @@
 * Bound checkings.
 */
 static void TestSortKeyValidity(void);
+/**
+* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
+* normalization on AND jamo tailoring, among other things.
+*/
+static void TestSearchCollatorElements(void);

 /*------------------------------------------------------------------------
  Internal utilities
Index: source/i18n/ucol.cpp
===================================================================
--- source/i18n/ucol.cpp	(revision 75773)
+++ source/i18n/ucol.cpp	(working copy)
@@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-*   Copyright (C) 1996-2010, International Business Machines
+*   Copyright (C) 1996-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  ucol.cpp
@@ -1444,173 +1444,176 @@
     UChar ch = 0;
     collationSource->offsetReturn = NULL;

-    for (;;)                           /* Loop handles case when incremental normalize switches   */
-    {                                  /*   to or from the side buffer / original string, and we  */
-        /*   need to start again to get the next character.        */
+    do {
+        for (;;)                           /* Loop handles case when incremental normalize switches   */
+        {                                  /*   to or from the side buffer / original string, and we  */
+            /*   need to start again to get the next character.        */

-        if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
-        {
-            // The source string is null terminated and we're not working from the side buffer,
-            //   and we're not normalizing.  This is the fast path.
-            //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
-            ch = *collationSource->pos++;
-            if (ch != 0) {
-                break;
+            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
+            {
+                // The source string is null terminated and we're not working from the side buffer,
+                //   and we're not normalizing.  This is the fast path.
+                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
+                ch = *collationSource->pos++;
+                if (ch != 0) {
+                    break;
+                }
+                else {
+                    return UCOL_NO_MORE_CES;
+                }
             }
-            else {
-                return UCOL_NO_MORE_CES;
-            }
-        }

-        if (collationSource->flags & UCOL_ITER_HASLEN) {
-            // Normal path for strings when length is specified.
-            //   (We can't be in side buffer because it is always null terminated.)
-            if (collationSource->pos >= collationSource->endp) {
-                // Ran off of the end of the main source string.  We're done.
-                return UCOL_NO_MORE_CES;
+            if (collationSource->flags & UCOL_ITER_HASLEN) {
+                // Normal path for strings when length is specified.
+                //   (We can't be in side buffer because it is always null terminated.)
+                if (collationSource->pos >= collationSource->endp) {
+                    // Ran off of the end of the main source string.  We're done.
+                    return UCOL_NO_MORE_CES;
+                }
+                ch = *collationSource->pos++;
             }
-            ch = *collationSource->pos++;
-        }
-        else if(collationSource->flags & UCOL_USE_ITERATOR) {
-            UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
-            if(iterCh == U_SENTINEL) {
-                return UCOL_NO_MORE_CES;
-            }
-            ch = (UChar)iterCh;
-        }
-        else
-        {
-            // Null terminated string.
-            ch = *collationSource->pos++;
-            if (ch == 0) {
-                // Ran off end of buffer.
-                if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
-                    // Ran off end of main string. backing up one character.
-                    collationSource->pos--;
+            else if(collationSource->flags & UCOL_USE_ITERATOR) {
+                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
+                if(iterCh == U_SENTINEL) {
                     return UCOL_NO_MORE_CES;
                 }
-                else
-                {
-                    // Hit null in the normalize side buffer.
-                    // Usually this means the end of the normalized data,
-                    // except for one odd case: a null followed by combining chars,
-                    //   which is the case if we are at the start of the buffer.
-                    if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
-                        break;
+                ch = (UChar)iterCh;
+            }
+            else
+            {
+                // Null terminated string.
+                ch = *collationSource->pos++;
+                if (ch == 0) {
+                    // Ran off end of buffer.
+                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
+                        // Ran off end of main string. backing up one character.
+                        collationSource->pos--;
+                        return UCOL_NO_MORE_CES;
                     }
+                    else
+                    {
+                        // Hit null in the normalize side buffer.
+                        // Usually this means the end of the normalized data,
+                        // except for one odd case: a null followed by combining chars,
+                        //   which is the case if we are at the start of the buffer.
+                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
+                            break;
+                        }

-                    //  Null marked end of side buffer.
-                    //   Revert to the main string and
-                    //   loop back to top to try again to get a character.
-                    collationSource->pos   = collationSource->fcdPosition;
-                    collationSource->flags = collationSource->origFlags;
-                    continue;
+                        //  Null marked end of side buffer.
+                        //   Revert to the main string and
+                        //   loop back to top to try again to get a character.
+                        collationSource->pos   = collationSource->fcdPosition;
+                        collationSource->flags = collationSource->origFlags;
+                        continue;
+                    }
                 }
             }
-        }

-        if(collationSource->flags&UCOL_HIRAGANA_Q) {
-            /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
-             * based on whether the previous codepoint was Hiragana or Katakana.
-             */
-            if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
-                    ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
-                collationSource->flags |= UCOL_WAS_HIRAGANA;
-            } else {
-                collationSource->flags &= ~UCOL_WAS_HIRAGANA;
+            if(collationSource->flags&UCOL_HIRAGANA_Q) {
+                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
+                 * based on whether the previous codepoint was Hiragana or Katakana.
+                 */
+                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
+                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
+                    collationSource->flags |= UCOL_WAS_HIRAGANA;
+                } else {
+                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
+                }
             }
-        }

-        // We've got a character.  See if there's any fcd and/or normalization stuff to do.
-        //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
-        if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
-            break;
-        }
+            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
+            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
+            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
+                break;
+            }

-        if (collationSource->fcdPosition >= collationSource->pos) {
-            // An earlier FCD check has already covered the current character.
-            // We can go ahead and process this char.
-            break;
-        }
-
-        if (ch < ZERO_CC_LIMIT_ ) {
-            // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
-            break;
-        }
-
-        if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
-            // We need to peek at the next character in order to tell if we are FCD
-            if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
-                // We are at the last char of source string.
-                //  It is always OK for FCD check.
+            if (collationSource->fcdPosition >= collationSource->pos) {
+                // An earlier FCD check has already covered the current character.
+                // We can go ahead and process this char.
                 break;
             }

-            // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
-            if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
+            if (ch < ZERO_CC_LIMIT_ ) {
+                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
                 break;
             }
-        }

+            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
+                // We need to peek at the next character in order to tell if we are FCD
+                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
+                    // We are at the last char of source string.
+                    //  It is always OK for FCD check.
+                    break;
+                }

-        // Need a more complete FCD check and possible normalization.
-        if (collIterFCD(collationSource)) {
-            collIterNormalize(collationSource);
-        }
-        if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
-            //  No normalization was needed.  Go ahead and process the char we already had.
-            break;
-        }
+                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
+                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
+                    break;
+                }
+            }

-        // Some normalization happened.  Next loop iteration will pick up a char
-        //   from the normalization buffer.

-    }   // end for (;;)
+            // Need a more complete FCD check and possible normalization.
+            if (collIterFCD(collationSource)) {
+                collIterNormalize(collationSource);
+            }
+            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
+                //  No normalization was needed.  Go ahead and process the char we already had.
+                break;
+            }

+            // Some normalization happened.  Next loop iteration will pick up a char
+            //   from the normalization buffer.

-    if (ch <= 0xFF) {
-        /*  For latin-1 characters we never need to fall back to the UCA table        */
-        /*    because all of the UCA data is replicated in the latinOneMapping array  */
-        order = coll->latinOneMapping[ch];
-        if (order > UCOL_NOT_FOUND) {
-            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
+        }   // end for (;;)
+
+
+        if (ch <= 0xFF) {
+            /*  For latin-1 characters we never need to fall back to the UCA table        */
+            /*    because all of the UCA data is replicated in the latinOneMapping array  */
+            order = coll->latinOneMapping[ch];
+            if (order > UCOL_NOT_FOUND) {
+                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
+            }
         }
-    }
-    else
-    {
-        // Always use UCA for Han, Hangul
-        // (Han extension A is before main Han block)
-        // **** Han compatibility chars ?? ****
-        if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
-            (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
-            if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
-                // between the two target ranges; do normal lookup
-                // **** this range is YI, Modifier tone letters, ****
-                // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
-                // **** Latin-D might be tailored, so we need to ****
-                // **** do the normal lookup for these guys.     ****
+        else
+        {
+            // Always use UCA for Han, Hangul
+            // (Han extension A is before main Han block)
+            // **** Han compatibility chars ?? ****
+            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
+                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
+                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
+                    // between the two target ranges; do normal lookup
+                    // **** this range is YI, Modifier tone letters, ****
+                    // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
+                    // **** Latin-D might be tailored, so we need to ****
+                    // **** do the normal lookup for these guys.     ****
+                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+                } else {
+                    // in one of the target ranges; use UCA
+                    order = UCOL_NOT_FOUND;
+                }
+            } else {
                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
-            } else {
-                // in one of the target ranges; use UCA
-                order = UCOL_NOT_FOUND;
             }
-        } else {
-            order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
-        }

-        if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
-            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
-        }
+            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
+                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
+            }

-        if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
-            /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
-            order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
+            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
+                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
+                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

-            if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
-                order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
+                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
+                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
+                }
             }
         }
-    }
+    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
+
     if(order == UCOL_NOT_FOUND) {
         order = getImplicit(ch, collationSource);
     }
@@ -1958,161 +1961,163 @@
     else {
         UChar ch = 0;

-        /*
-        Loop handles case when incremental normalize switches to or from the
-        side buffer / original string, and we need to start again to get the
-        next character.
-        */
-        for (;;) {
-            if (data->flags & UCOL_ITER_HASLEN) {
-                /*
-                Normal path for strings when length is specified.
-                Not in side buffer because it is always null terminated.
-                */
-                if (data->pos <= data->string) {
-                    /* End of the main source string */
-                    return UCOL_NO_MORE_CES;
-                }
-                data->pos --;
-                ch = *data->pos;
-            }
-            // we are using an iterator to go back. Pray for us!
-            else if (data->flags & UCOL_USE_ITERATOR) {
-              UChar32 iterCh = data->iterator->previous(data->iterator);
-              if(iterCh == U_SENTINEL) {
-                return UCOL_NO_MORE_CES;
-              } else {
-                ch = (UChar)iterCh;
-              }
-            }
-            else {
-                data->pos --;
-                ch = *data->pos;
-                /* we are in the side buffer. */
-                if (ch == 0) {
+        do {
+            /*
+            Loop handles case when incremental normalize switches to or from the
+            side buffer / original string, and we need to start again to get the
+            next character.
+            */
+            for (;;) {
+                if (data->flags & UCOL_ITER_HASLEN) {
                     /*
-                    At the start of the normalize side buffer.
-                    Go back to string.
-                    Because pointer points to the last accessed character,
-                    hence we have to increment it by one here.
+                    Normal path for strings when length is specified.
+                    Not in side buffer because it is always null terminated.
                     */
-                    data->flags = data->origFlags;
-                    data->offsetRepeatValue = 0;
-
-                     if (data->fcdPosition == NULL) {
-                        data->pos = data->string;
+                    if (data->pos <= data->string) {
+                        /* End of the main source string */
                         return UCOL_NO_MORE_CES;
                     }
-                    else {
-                        data->pos   = data->fcdPosition + 1;
+                    data->pos --;
+                    ch = *data->pos;
+                }
+                // we are using an iterator to go back. Pray for us!
+                else if (data->flags & UCOL_USE_ITERATOR) {
+                  UChar32 iterCh = data->iterator->previous(data->iterator);
+                  if(iterCh == U_SENTINEL) {
+                    return UCOL_NO_MORE_CES;
+                  } else {
+                    ch = (UChar)iterCh;
+                  }
+                }
+                else {
+                    data->pos --;
+                    ch = *data->pos;
+                    /* we are in the side buffer. */
+                    if (ch == 0) {
+                        /*
+                        At the start of the normalize side buffer.
+                        Go back to string.
+                        Because pointer points to the last accessed character,
+                        hence we have to increment it by one here.
+                        */
+                        data->flags = data->origFlags;
+                        data->offsetRepeatValue = 0;
+
+                         if (data->fcdPosition == NULL) {
+                            data->pos = data->string;
+                            return UCOL_NO_MORE_CES;
+                        }
+                        else {
+                            data->pos   = data->fcdPosition + 1;
+                        }
+
+                       continue;
                     }
-
-                   continue;
                 }
-            }

-            if(data->flags&UCOL_HIRAGANA_Q) {
-              if(ch>=0x3040 && ch<=0x309f) {
-                data->flags |= UCOL_WAS_HIRAGANA;
-              } else {
-                data->flags &= ~UCOL_WAS_HIRAGANA;
-              }
-            }
+                if(data->flags&UCOL_HIRAGANA_Q) {
+                  if(ch>=0x3040 && ch<=0x309f) {
+                    data->flags |= UCOL_WAS_HIRAGANA;
+                  } else {
+                    data->flags &= ~UCOL_WAS_HIRAGANA;
+                  }
+                }

-            /*
-            * got a character to determine if there's fcd and/or normalization
-            * stuff to do.
-            * if the current character is not fcd.
-            * if current character is at the start of the string
-            * Trailing combining class == 0.
-            * Note if pos is in the writablebuffer, norm is always 0
-            */
-            if (ch < ZERO_CC_LIMIT_ ||
-              // this should propel us out of the loop in the iterator case
-                (data->flags & UCOL_ITER_NORM) == 0 ||
-                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
-                || data->string == data->pos) {
-                break;
-            }
-
-            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
-                /* if next character is FCD */
-                if (data->pos == data->string) {
-                    /* First char of string is always OK for FCD check */
+                /*
+                * got a character to determine if there's fcd and/or normalization
+                * stuff to do.
+                * if the current character is not fcd.
+                * if current character is at the start of the string
+                * Trailing combining class == 0.
+                * Note if pos is in the writablebuffer, norm is always 0
+                */
+                if (ch < ZERO_CC_LIMIT_ ||
+                  // this should propel us out of the loop in the iterator case
+                    (data->flags & UCOL_ITER_NORM) == 0 ||
+                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
+                    || data->string == data->pos) {
                     break;
                 }

-                /* Not first char of string, do the FCD fast test */
-                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
+                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
+                    /* if next character is FCD */
+                    if (data->pos == data->string) {
+                        /* First char of string is always OK for FCD check */
+                        break;
+                    }
+
+                    /* Not first char of string, do the FCD fast test */
+                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
+                        break;
+                    }
+                }
+
+                /* Need a more complete FCD check and possible normalization. */
+                if (collPrevIterFCD(data)) {
+                    collPrevIterNormalize(data);
+                }
+
+                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
+                    /*  No normalization. Go ahead and process the char. */
                     break;
                 }
-            }

-            /* Need a more complete FCD check and possible normalization. */
-            if (collPrevIterFCD(data)) {
-                collPrevIterNormalize(data);
+                /*
+                Some normalization happened.
+                Next loop picks up a char from the normalization buffer.
+                */
             }

-            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
-                /*  No normalization. Go ahead and process the char. */
-                break;
-            }
-
-            /*
-            Some normalization happened.
-            Next loop picks up a char from the normalization buffer.
+            /* attempt to handle contractions, after removal of the backwards
+            contraction
             */
-        }
-
-        /* attempt to handle contractions, after removal of the backwards
-        contraction
-        */
-        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
-            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
-        } else {
-            if (ch <= 0xFF) {
-                result = coll->latinOneMapping[ch];
-            }
-            else {
-                // Always use UCA for [3400..9FFF], [AC00..D7AF]
-                // **** [FA0E..FA2F] ?? ****
-                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
-                    (ch >= 0x3400 && ch <= 0xD7AF)) {
-                    if (ch > 0x9FFF && ch < 0xAC00) {
-                        // between the two target ranges; do normal lookup
-                        // **** this range is YI, Modifier tone letters, ****
-                        // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
-                        // **** Latin-D might be tailored, so we need to ****
-                        // **** do the normal lookup for these guys.     ****
-                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
+                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
+            } else {
+                if (ch <= 0xFF) {
+                    result = coll->latinOneMapping[ch];
+                }
+                else {
+                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
+                    // **** [FA0E..FA2F] ?? ****
+                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
+                        (ch >= 0x3400 && ch <= 0xD7AF)) {
+                        if (ch > 0x9FFF && ch < 0xAC00) {
+                            // between the two target ranges; do normal lookup
+                            // **** this range is YI, Modifier tone letters, ****
+                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
+                            // **** Latin-D might be tailored, so we need to ****
+                            // **** do the normal lookup for these guys.     ****
+                             result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+                        } else {
+                            result = UCOL_NOT_FOUND;
+                        }
                     } else {
-                        result = UCOL_NOT_FOUND;
+                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                     }
-                } else {
-                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                 }
-            }
-            if (result > UCOL_NOT_FOUND) {
-                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
-            }
-            if (result == UCOL_NOT_FOUND) { // Not found in master list
-                if (!isAtStartPrevIterate(data) &&
-                    ucol_contractionEndCP(ch, data->coll))
-                {
-                    result = UCOL_CONTRACTION;
-                } else {
-                    if(coll->UCA) {
-                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
+                if (result > UCOL_NOT_FOUND) {
+                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
+                }
+                if (result == UCOL_NOT_FOUND) { // Not found in master list
+                    if (!isAtStartPrevIterate(data) &&
+                        ucol_contractionEndCP(ch, data->coll))
+                    {
+                        result = UCOL_CONTRACTION;
+                    } else {
+                        if(coll->UCA) {
+                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
+                        }
                     }
-                }

-                if (result > UCOL_NOT_FOUND) {
-                    if(coll->UCA) {
-                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
+                    if (result > UCOL_NOT_FOUND) {
+                        if(coll->UCA) {
+                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
+                        }
                     }
                 }
             }
-        }
+        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

         if(result == UCOL_NOT_FOUND) {
             result = getPrevImplicit(ch, data);
@@ -3193,6 +3198,7 @@
                     // Since Hanguls pass the FCD check, it is
                     // guaranteed that we won't be in
                     // the normalization buffer if something like this happens
+
                     // However, if we are using a uchar iterator and normalization
                     // is ON, the Hangul that lead us here is going to be in that
                     // normalization buffer. Here we want to restore the uchar
@@ -3201,6 +3207,7 @@
                         source->flags = source->origFlags; // restore the iterator
                         source->pos = NULL;
                     }
+
                     // Move Jamos into normalization buffer
                     UChar *buffer = source->writableBuffer.getBuffer(4);
                     int32_t bufferLength;
@@ -3214,8 +3221,9 @@
                     }
                     source->writableBuffer.releaseBuffer(bufferLength);

-                    source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
-                    //   after exhausting the writableBuffer
+                    // Indicate where to continue in main input string after exhausting the writableBuffer
+                    source->fcdPosition       = source->pos;
+
                     source->pos   = source->writableBuffer.getTerminatedBuffer();
                     source->origFlags   = source->flags;
                     source->flags       |= UCOL_ITER_INNORMBUF;
@@ -3966,13 +3974,10 @@
                     // Since Hanguls pass the FCD check, it is
                     // guaranteed that we won't be in
                     // the normalization buffer if something like this happens
+
                     // Move Jamos into normalization buffer
-                    /*
-                    Move the Jamos into the
-                    normalization buffer
-                    */
                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
-                    int32_t tempbufferLength;
+                    int32_t tempbufferLength, jamoOffset;
                     tempbuffer[0] = 0;
                     tempbuffer[1] = (UChar)L;
                     tempbuffer[2] = (UChar)V;
@@ -3984,16 +3989,30 @@
                     }
                     source->writableBuffer.releaseBuffer(tempbufferLength);

-                    /*
-                    Indicate where to continue in main input string after exhausting
-                    the writableBuffer
-                    */
+                    // Indicate where to continue in main input string after exhausting the writableBuffer
                     if (source->pos  == source->string) {
+                        jamoOffset = 0;
                         source->fcdPosition = NULL;
                     } else {
+                        jamoOffset = source->pos - source->string;
                         source->fcdPosition       = source->pos-1;
                     }
+
+					// Append offsets for the additional chars
+					// (not the 0, and not the L whose offsets match the original Hangul)
+                    int32_t jamoRemaining = tempbufferLength - 2;
+                    jamoOffset++; // appended offsets should match end of original Hangul
+                    while (jamoRemaining-- > 0) {
+                        source->appendOffset(jamoOffset, *status);
+                    }

+                    source->offsetRepeatValue = jamoOffset;
+
+                    source->offsetReturn = source->offsetStore - 1;
+                    if (source->offsetReturn == source->offsetBuffer) {
+                        source->offsetStore = source->offsetBuffer;
+                    }
+
                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
                     source->origFlags         = source->flags;
                     source->flags            |= UCOL_ITER_INNORMBUF;