icu/impl/IDNA2003.java

//  2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2003-2010, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;

import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrep;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UCharacterIterator;

/**
 * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
 * while extending that class to support IDNA2008/UTS #46 as well.
 * @author Ram Viswanadha
 */
public final class IDNA2003 {
    /* IDNA ACE Prefix is "xn--" */
    private static char[] ACE_PREFIX                = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
    //private static final int ACE_PREFIX_LENGTH      = ACE_PREFIX.length;

    private static final int MAX_LABEL_LENGTH       = 63;
    private static final int HYPHEN                 = 0x002D;
    private static final int CAPITAL_A              = 0x0041;
    private static final int CAPITAL_Z              = 0x005A;
    private static final int LOWER_CASE_DELTA       = 0x0020;
    private static final int FULL_STOP              = 0x002E;
    private static final int MAX_DOMAIN_NAME_LENGTH = 255;

    // The NamePrep profile object
    private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);

    private static boolean startsWithPrefix(StringBuffer src){
        boolean startsWithPrefix = true;

        if(src.length() < ACE_PREFIX.length){
            return false;
        }
        for(int i=0; i<ACE_PREFIX.length;i++){
            if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
                startsWithPrefix = false;
            }
        }
        return startsWithPrefix;
    }

    private static char toASCIILower(char ch){
        if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
            return (char)(ch + LOWER_CASE_DELTA);
        }
        return ch;
    }

    private static StringBuffer toASCIILower(CharSequence src){
        StringBuffer dest = new StringBuffer();
        for(int i=0; i<src.length();i++){
            dest.append(toASCIILower(src.charAt(i)));
        }
        return dest;
    }

    private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
        char c1,c2;
        int rc;
        for(int i =0;/* no condition */;i++) {
            /* If we reach the ends of both strings then they match */
            if(i == s1.length()) {
                return 0;
            }

            c1 = s1.charAt(i);
            c2 = s2.charAt(i);

            /* Case-insensitive comparison */
            if(c1!=c2) {
                rc=toASCIILower(c1)-toASCIILower(c2);
                if(rc!=0) {
                    return rc;
                }
            }
        }
    }

    private static int getSeparatorIndex(char[] src,int start, int limit){
        for(; start<limit;start++){
            if(isLabelSeparator(src[start])){
                return start;
            }
        }
        // we have not found the separator just return length
        return start;
    }

    /*
    private static int getSeparatorIndex(UCharacterIterator iter){
        int currentIndex = iter.getIndex();
        int separatorIndex = 0;
        int ch;
        while((ch=iter.next())!= UCharacterIterator.DONE){
            if(isLabelSeparator(ch)){
                separatorIndex = iter.getIndex();
                iter.setIndex(currentIndex);
                return separatorIndex;
            }
        }
        // reset index
        iter.setIndex(currentIndex);
        // we have not found the separator just return the length

    }
    */


    private static boolean isLDHChar(int ch){
        // high runner case
        if(ch>0x007A){
            return false;
        }
        //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
        if( (ch==0x002D) ||
            (0x0030 <= ch && ch <= 0x0039) ||
            (0x0041 <= ch && ch <= 0x005A) ||
            (0x0061 <= ch && ch <= 0x007A)
          ){
            return true;
        }
        return false;
    }

    /**
     * Ascertain if the given code point is a label separator as
     * defined by the IDNA RFC
     *
     * @param ch The code point to be ascertained
     * @return true if the char is a label separator
     * @stable ICU 2.8
     */
    private static boolean isLabelSeparator(int ch){
        switch(ch){
            case 0x002e:
            case 0x3002:
            case 0xFF0E:
            case 0xFF61:
                return true;
            default:
                return false;
        }
    }

    public static StringBuffer convertToASCII(UCharacterIterator src, int options)
            throws StringPrepParseException{

        boolean[] caseFlags = null;

        // the source contains all ascii codepoints
        boolean srcIsASCII  = true;
        // assume the source contains all LDH codepoints
        boolean srcIsLDH = true;

        //get the options
        boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
        int ch;
        // step 1
        while((ch = src.next())!= UCharacterIterator.DONE){
            if(ch> 0x7f){
                srcIsASCII = false;
            }
        }
        int failPos = -1;
        src.setToStart();
        StringBuffer processOut = null;
        // step 2 is performed only if the source contains non ASCII
        if(!srcIsASCII){
            // step 2
            processOut = namePrep.prepare(src, options);
        }else{
            processOut = new StringBuffer(src.getText());
        }
        int poLen = processOut.length();

        if(poLen==0){
            throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
        }
        StringBuffer dest = new StringBuffer();

        // reset the variable to verify if output of prepare is ASCII or not
        srcIsASCII = true;

        // step 3 & 4
        for(int j=0;j<poLen;j++ ){
            ch=processOut.charAt(j);
            if(ch > 0x7F){
                srcIsASCII = false;
            }else if(isLDHChar(ch)==false){
                // here we do not assemble surrogates
                // since we know that LDH code points
                // are in the ASCII range only
                srcIsLDH = false;
                failPos = j;
            }
        }

        if(useSTD3ASCIIRules == true){
            // verify 3a and 3b
            if( srcIsLDH == false /* source contains some non-LDH characters */
                || processOut.charAt(0) ==  HYPHEN
                || processOut.charAt(processOut.length()-1) == HYPHEN){

                /* populate the parseError struct */
                if(srcIsLDH==false){
                     throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
                                              StringPrepParseException.STD3_ASCII_RULES_ERROR,
                                              processOut.toString(),
                                             (failPos>0) ? (failPos-1) : failPos);
                }else if(processOut.charAt(0) == HYPHEN){
                    throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
                                              StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);

                }else{
                     throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
                                              StringPrepParseException.STD3_ASCII_RULES_ERROR,
                                              processOut.toString(),
                                              (poLen>0) ? poLen-1 : poLen);

                }
            }
        }
        if(srcIsASCII){
            dest =  processOut;
        }else{
            // step 5 : verify the sequence does not begin with ACE prefix
            if(!startsWithPrefix(processOut)){

                //step 6: encode the sequence with punycode
                caseFlags = new boolean[poLen];

                StringBuilder punyout = Punycode.encode(processOut,caseFlags);

                // convert all codepoints to lower case ASCII
                StringBuffer lowerOut = toASCIILower(punyout);

                //Step 7: prepend the ACE prefix
                dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
                //Step 6: copy the contents in b2 into dest
                dest.append(lowerOut);
            }else{

                throw new StringPrepParseException("The input does not start with the ACE Prefix.",
                                         StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
            }
        }
        if(dest.length() > MAX_LABEL_LENGTH){
            throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
                                     StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
        }
        return dest;
    }

    public static StringBuffer convertIDNToASCII(String src,int options)
            throws StringPrepParseException{

        char[] srcArr = src.toCharArray();
        StringBuffer result = new StringBuffer();
        int sepIndex=0;
        int oldSepIndex=0;
        for(;;){
            sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
            String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
            //make sure this is not a root label separator.
            if(!(label.length()==0 && sepIndex==srcArr.length)){
                UCharacterIterator iter = UCharacterIterator.getInstance(label);
                result.append(convertToASCII(iter,options));
            }
            if(sepIndex==srcArr.length){
                break;
            }

            // increment the sepIndex to skip past the separator
            sepIndex++;
            oldSepIndex = sepIndex;
            result.append((char)FULL_STOP);
        }
        if(result.length() > MAX_DOMAIN_NAME_LENGTH){
            throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
        }
        return result;
    }

    public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
            throws StringPrepParseException{

        boolean[] caseFlags = null;

        // the source contains all ascii codepoints
        boolean srcIsASCII  = true;
        // assume the source contains all LDH codepoints
        //boolean srcIsLDH = true;

        //get the options
        //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);

        //int failPos = -1;
        int ch;
        int saveIndex = src.getIndex();
        // step 1: find out if all the codepoints in src are ASCII
        while((ch=src.next())!= UCharacterIterator.DONE){
            if(ch>0x7F){
                srcIsASCII = false;
            }/*else if((srcIsLDH = isLDHChar(ch))==false){
                failPos = src.getIndex();
            }*/
        }
        StringBuffer processOut;

        if(srcIsASCII == false){
            try {
                // step 2: process the string
                src.setIndex(saveIndex);
                processOut = namePrep.prepare(src,options);
            } catch (StringPrepParseException ex) {
                return new StringBuffer(src.getText());
            }

        }else{
            //just point to source
            processOut = new StringBuffer(src.getText());
        }
        // TODO:
        // The RFC states that
        // <quote>
        // ToUnicode never fails. If any step fails, then the original input
        // is returned immediately in that step.
        // </quote>

        //step 3: verify ACE Prefix
        if(startsWithPrefix(processOut)){
            StringBuffer decodeOut = null;

            //step 4: Remove the ACE Prefix
            String temp = processOut.substring(ACE_PREFIX.length,processOut.length());

            //step 5: Decode using punycode
            try {
                decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
            } catch (StringPrepParseException e) {
                decodeOut = null;
            }

            //step 6:Apply toASCII
            if (decodeOut != null) {
                StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);

                //step 7: verify
                if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
//                    throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
//                                             StringPrepParseException.VERIFICATION_ERROR);
                    decodeOut = null;
                }
            }

            //step 8: return output of step 5
             if (decodeOut != null) {
                 return decodeOut;
             }
        }

//        }else{
//            // verify that STD3 ASCII rules are satisfied
//            if(useSTD3ASCIIRules == true){
//                if( srcIsLDH == false /* source contains some non-LDH characters */
//                    || processOut.charAt(0) ==  HYPHEN
//                    || processOut.charAt(processOut.length()-1) == HYPHEN){
//
//                    if(srcIsLDH==false){
//                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
//                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
//                                                 (failPos>0) ? (failPos-1) : failPos);
//                    }else if(processOut.charAt(0) == HYPHEN){
//                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
//                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
//                                                 processOut.toString(),0);
//
//                    }else{
//                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
//                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
//                                                 processOut.toString(),
//                                                 processOut.length());
//
//                    }
//                }
//            }
//            // just return the source
//            return new StringBuffer(src.getText());
//        }

        return new StringBuffer(src.getText());
    }

    public static StringBuffer convertIDNToUnicode(String src, int options)
            throws StringPrepParseException{

        char[] srcArr = src.toCharArray();
        StringBuffer result = new StringBuffer();
        int sepIndex=0;
        int oldSepIndex=0;
        for(;;){
            sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
            String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
            if(label.length()==0 && sepIndex!=srcArr.length ){
                throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
            }
            UCharacterIterator iter = UCharacterIterator.getInstance(label);
            result.append(convertToUnicode(iter,options));
            if(sepIndex==srcArr.length){
                break;
            }
            // Unlike the ToASCII operation we don't normalize the label separators
            result.append(srcArr[sepIndex]);
            // increment the sepIndex to skip past the separator
            sepIndex++;
            oldSepIndex =sepIndex;
        }
        if(result.length() > MAX_DOMAIN_NAME_LENGTH){
            throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
        }
        return result;
    }

    public static int compare(String s1, String s2, int options) throws StringPrepParseException{
        StringBuffer s1Out = convertIDNToASCII(s1, options);
        StringBuffer s2Out = convertIDNToASCII(s2, options);
        return compareCaseInsensitiveASCII(s1Out,s2Out);
    }
}