Home | History | Annotate | Download | only in unicode
      1 /*
      2  *******************************************************************************
      3  *
      4  *   Copyright (C) 2003-2004, International Business Machines
      5  *   Corporation and others.  All Rights Reserved.
      6  *
      7  *******************************************************************************
      8  *   file name:  uidna.h
      9  *   encoding:   US-ASCII
     10  *   tab size:   8 (not used)
     11  *   indentation:4
     12  *
     13  *   created on: 2003feb1
     14  *   created by: Ram Viswanadha
     15  */
     16 
     17 #ifndef __UIDNA_H__
     18 #define __UIDNA_H__
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_IDNA
     23 
     24 #include "unicode/parseerr.h"
     25 
     26 /**
     27  *\file
     28  * UIDNA API implements the IDNA protocol as defined in the IDNA RFC
     29  * (http://www.ietf.org/rfc/rfc3490.txt).
     30  * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels
     31  * containing non-ASCII code points are required to be processed by
     32  * ToASCII operation before passing it to resolver libraries. Domain names
     33  * that are obtained from resolver libraries are required to be processed by
     34  * ToUnicode operation before displaying the domain name to the user.
     35  * IDNA requires that implementations process input strings with Nameprep
     36  * (http://www.ietf.org/rfc/rfc3491.txt),
     37  * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
     38  * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt).
     39  * Implementations of IDNA MUST fully implement Nameprep and Punycode;
     40  * neither Nameprep nor Punycode are optional.
     41  * The input and output of ToASCII and ToUnicode operations are Unicode
     42  * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
     43  * multiple times to an input string will yield the same result as applying the operation
     44  * once.
     45  * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
     46  * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
     47  *
     48  */
     49 
     50 #ifndef U_HIDE_DRAFT_API
     51 /* NOTE(review): the options below are tagged @stable but sit inside the U_HIDE_DRAFT_API guard; confirm that is intended. */
     52 /**
     53  * Default option value (no option bits set): unassigned code points in the
     54  * input are not processed and conformance to STD-3 ASCII rules is not checked.
     55  *
     56  * @see  uidna_toASCII uidna_toUnicode
     57  * @stable ICU 2.6
     58  */
     59 #define UIDNA_DEFAULT          0x0000
     60 /**
     61  * Option bit to allow processing of unassigned code points in the input.
     62  *
     63  * @see  uidna_toASCII uidna_toUnicode
     64  * @stable ICU 2.6
     65  */
     66 #define UIDNA_ALLOW_UNASSIGNED 0x0001
     67 /**
     68  * Option bit to check whether the input conforms to STD-3 ASCII rules.
     69  *
     70  * @see  uidna_toASCII uidna_toUnicode
     71  * @stable ICU 2.6
     72  */
     73 #define UIDNA_USE_STD3_RULES   0x0002
     74 
     75 #endif /*U_HIDE_DRAFT_API*/
     76 
     77 /**
     78  * This function implements the ToASCII operation as defined in the IDNA RFC.
     79  * This operation is done on <b>single labels</b> before sending them to something that expects
     80  * ASCII names. A label is an individual part of a domain name. Labels are usually
     81  * separated by dots; e.g., "www.example.com" is composed of 3 labels:
     82  * "www", "example", and "com".
     83  *
     84  *
     85  * @param src               Input UChar array containing label in Unicode.
     86  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
     87  * @param dest              Output UChar array with ASCII (ACE encoded) label.
     88  * @param destCapacity      Size of dest.
     89  * @param options           A bit set of options:
     90  *
     91  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
     92  *                              and do not use STD3 ASCII rules.
     93  *                              If unassigned code points are found the operation fails with
     94  *                              U_UNASSIGNED_ERROR error code.
     95  *
     96  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
     97  *                              If this option is set, the unassigned code points in the input
     98  *                              are treated as normal Unicode code points.
     99  *
    100  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
    101  *                              If this option is set and the input does not satisfy STD3 rules,
    102  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
    103  *
    104  * @param parseError        Pointer to UParseError struct to receive information on position
    105  *                          of error if an error is encountered. Can be NULL.
    106  * @param status            ICU in/out error code parameter.
    107  *                          U_INVALID_CHAR_FOUND if src contains
    108  *                          unmatched single surrogates.
    109  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
    110  *                          too many code points.
    111  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
    112  * @return                  Number of ASCII characters converted.
    113  * @stable ICU 2.6
    114  */
    115 U_STABLE int32_t U_EXPORT2
    116 uidna_toASCII(const UChar* src, int32_t srcLength,
    117               UChar* dest, int32_t destCapacity,
    118               int32_t options,
    119               UParseError* parseError,
    120               UErrorCode* status);
    121 
    122 
    123 /**
    124  * This function implements the ToUnicode operation as defined in the IDNA RFC.
    125  * This operation is done on <b>single labels</b> before sending them to something that expects
    126  * Unicode names. A label is an individual part of a domain name. Labels are usually
    127  * separated by dots; e.g., "www.example.com" is composed of 3 labels:
    128  * "www", "example", and "com".
    129  *
    130  * @param src               Input UChar array containing ASCII (ACE encoded) label.
    131  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
    132  * @param dest              Output UChar array containing Unicode equivalent of label.
    133  * @param destCapacity      Size of dest.
    134  * @param options           A bit set of options:
    135  *
    136  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
    137  *                              and do not use STD3 ASCII rules.
    138  *                              If unassigned code points are found the operation fails with
    139  *                              U_UNASSIGNED_ERROR error code.
    140  *
    141  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
    142  *                              If this option is set, the unassigned code points in the input
    143  *                              are treated as normal Unicode code points. <b>Note:</b> This option is
    144  *                              required on toUnicode operation because the RFC mandates
    145  *                              verification of decoded ACE input by applying toASCII and comparing
    146  *                              its output with the source.
    147  *
    148  *
    149  *
    150  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
    151  *                              If this option is set and the input does not satisfy STD3 rules,
    152  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
    153  *
    154  * @param parseError        Pointer to UParseError struct to receive information on position
    155  *                          of error if an error is encountered. Can be NULL.
    156  * @param status            ICU in/out error code parameter.
    157  *                          U_INVALID_CHAR_FOUND if src contains
    158  *                          unmatched single surrogates.
    159  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
    160  *                          too many code points.
    161  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
    162  * @return                  Number of Unicode characters converted.
    163  * @stable ICU 2.6
    164  */
    165 U_STABLE int32_t U_EXPORT2
    166 uidna_toUnicode(const UChar* src, int32_t srcLength,
    167                 UChar* dest, int32_t destCapacity,
    168                 int32_t options,
    169                 UParseError* parseError,
    170                 UErrorCode* status);
    171 
    172 
    173 /**
    174  * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
    175  * This operation is done on complete domain names, e.g., "www.example.com".
    176  * It is important to note that this operation can fail. If it fails, then the input
    177  * domain name cannot be used as an Internationalized Domain Name and the application
    178  * should have methods defined to deal with the failure.
    179  *
    180  * <b>Note:</b> The IDNA RFC specifies that a conformant application should divide a domain name
    181  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
    182  * and then convert. This function does not offer that level of granularity. The options, once
    183  * set, will apply to all labels in the domain name.
    184  *
    185  * @param src               Input UChar array containing IDN in Unicode.
    186  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
    187  * @param dest              Output UChar array with ASCII (ACE encoded) IDN.
    188  * @param destCapacity      Size of dest.
    189  * @param options           A bit set of options:
    190  *
    191  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
    192  *                              and do not use STD3 ASCII rules.
    193  *                              If unassigned code points are found the operation fails with
    194  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
    195  *
    196  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
    197  *                              If this option is set, the unassigned code points in the input
    198  *                              are treated as normal Unicode code points.
    199  *
    200  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
    201  *                              If this option is set and the input does not satisfy STD3 rules,
    202  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
    203  *
    204  * @param parseError        Pointer to UParseError struct to receive information on position
    205  *                          of error if an error is encountered. Can be NULL.
    206  * @param status            ICU in/out error code parameter.
    207  *                          U_INVALID_CHAR_FOUND if src contains
    208  *                          unmatched single surrogates.
    209  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
    210  *                          too many code points.
    211  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
    212  * @return                  Number of ASCII characters converted.
    213  * @stable ICU 2.6
    214  */
    215 U_STABLE int32_t U_EXPORT2
    216 uidna_IDNToASCII(  const UChar* src, int32_t srcLength,
    217                    UChar* dest, int32_t destCapacity,
    218                    int32_t options,
    219                    UParseError* parseError,
    220                    UErrorCode* status);
    221 
    222 /**
    223  * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
    224  * This operation is done on complete domain names, e.g., "www.example.com".
    225  *
    226  * <b>Note:</b> The IDNA RFC specifies that a conformant application should divide a domain name
    227  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
    228  * and then convert. This function does not offer that level of granularity. The options, once
    229  * set, will apply to all labels in the domain name.
    230  *
    231  * @param src               Input UChar array containing IDN in ASCII (ACE encoded) form.
    232  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
    233  * @param dest              Output UChar array containing Unicode equivalent of source IDN.
    234  * @param destCapacity      Size of dest.
    235  * @param options           A bit set of options:
    236  *
    237  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
    238  *                              and do not use STD3 ASCII rules.
    239  *                              If unassigned code points are found the operation fails with
    240  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
    241  *
    242  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
    243  *                              If this option is set, the unassigned code points in the input
    244  *                              are treated as normal Unicode code points.
    245  *
    246  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
    247  *                              If this option is set and the input does not satisfy STD3 rules,
    248  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
    249  *
    250  * @param parseError        Pointer to UParseError struct to receive information on position
    251  *                          of error if an error is encountered. Can be NULL.
    252  * @param status            ICU in/out error code parameter.
    253  *                          U_INVALID_CHAR_FOUND if src contains
    254  *                          unmatched single surrogates.
    255  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
    256  *                          too many code points.
    257  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
    258  * @return                  Number of Unicode characters converted.
    259  * @stable ICU 2.6
    260  */
    261 U_STABLE int32_t U_EXPORT2
    262 uidna_IDNToUnicode(  const UChar* src, int32_t srcLength,
    263                      UChar* dest, int32_t destCapacity,
    264                      int32_t options,
    265                      UParseError* parseError,
    266                      UErrorCode* status);
    267 
    268 /**
    269  * Compare two IDN strings for equivalence.
    270  * This function splits the domain names into labels and compares them.
    271  * According to the IDNA RFC, whenever two labels are compared, they are
    272  * considered equal if and only if their ASCII forms (obtained by
    273  * applying toASCII) match using a case-insensitive ASCII comparison.
    274  * Two domain names are considered a match if and only if all labels
    275  * match, regardless of whether label separators match.
    276  *
    277  * @param s1                First source string.
    278  * @param length1           Length of first source string, or -1 if NUL-terminated.
    279  *
    280  * @param s2                Second source string.
    281  * @param length2           Length of second source string, or -1 if NUL-terminated.
    282  * @param options           A bit set of options:
    283  *
    284  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
    285  *                              and do not use STD3 ASCII rules.
    286  *                              If unassigned code points are found the operation fails with
    287  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
    288  *
    289  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
    290  *                              If this option is set, the unassigned code points in the input
    291  *                              are treated as normal Unicode code points.
    292  *
    293  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
    294  *                              If this option is set and the input does not satisfy STD3 rules,
    295  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
    296  *
    297  * @param status            ICU error code in/out parameter.
    298  *                          Must fulfill U_SUCCESS before the function call.
    299  * @return <0 or 0 or >0 as usual for string comparisons.
    300  * @stable ICU 2.6
    301  */
    302 U_STABLE int32_t U_EXPORT2
    303 uidna_compare(  const UChar *s1, int32_t length1,
    304                 const UChar *s2, int32_t length2,
    305                 int32_t options,
    306                 UErrorCode* status);
    307 
    308 #endif /* #if !UCONFIG_NO_IDNA */
    309 
    310 #endif
    311