Home | History | Annotate | Download | only in test
      1 <?xml version="1.0"?>
      2 <!DOCTYPE kanjidic2 [
      3 	<!-- Version 1.3
      4 	This is the DTD of the XML-format kanji file combining information from
      5 	the KANJIDIC and KANJD212 files. It is intended to be largely self-
      6 	documenting, with each field being accompanied by an explanatory
      7 	comment.
      8 
      9 	The file covers the following kanji:
     10 	(a) the 6,355 kanji from JIS X 0208;
     11 	(b) the 5,801 kanji from JIS X 0212;
     12 	(c) the 3,625 kanji from JIS X 0213 as follows:
     13 		(i) the 2,741 kanji which are also in JIS X 0212 have
     14 		JIS X 0213 code-points (kuten) added to the existing entry;
     15 		(ii) the 884 "new" kanji have new entries.
     16 
     17 	At the end of the explanation for a number of fields there is a tag
     18 	with the format [N]. This indicates the leading letter(s) of the
     19 	equivalent field in the KANJIDIC and KANJD212 files.
     20 
     21 	The KANJIDIC documentation should also be read for additional
     22 	details about the information in the file.
     23 	-->
     24 <!ELEMENT kanjidic2 (header,character*)>
     25 <!ELEMENT header (file_version,database_version,date_of_creation)>
     26 <!--
     27 	The single header element contains identification information
     28 	about the version of the file.
     29 	-->
     30 <!ELEMENT file_version (#PCDATA)>
     31 <!--
     32 	This field denotes the version of the kanjidic2 structure, as more
     33 	than one version may exist.
     34 	-->
     35 <!ELEMENT database_version (#PCDATA)>
     36 <!--
     37 	The version of the file, in the format YYYY-NN, where NN is
     38 	a number starting with 01 for the first version released in a
     39 	calendar year, then increasing for each version in that year.
     40 	-->
     41 <!ELEMENT date_of_creation (#PCDATA)>
     42 <!--
     43 	The date the file was created, in international format (YYYY-MM-DD).
     44 	-->
     45 <!ELEMENT character (literal,codepoint, radical, misc, dic_number?, query_code?, reading_meaning?,nanori?)*>
     46 <!ELEMENT literal (#PCDATA)>
     47 <!--
     48 	The character itself, encoded in UTF-8.
     49 	-->
     50 <!ELEMENT codepoint (cp_value+)>
     51 	<!-- 
     52 	The codepoint element states the code of the character in the various
     53 	character set standards.
     54 	-->
     55 <!ELEMENT cp_value (#PCDATA)>
     56 	<!-- 
     57 	The cp_value contains the codepoint of the character in a particular
     58 	standard. The standard is identified by the cp_type attribute.
     59 	-->
     60 <!ATTLIST cp_value cp_type CDATA #REQUIRED>
     61 	<!-- 
     62 	The cp_type attribute states the coding standard applying to the
     63 	element. The values assigned so far are:
     64 		jis208 - JIS X 0208-1997 - kuten coding (nn-nn)
     65 		jis212 - JIS X 0212-1990 - kuten coding (nn-nn)
     66 		jis213 - JIS X 0213-2000 - kuten coding (p-nn-nn)
     67 		ucs - Unicode 4.0 - hex coding (4 or 5 hexadecimal digits)
     68 	-->
     69 <!ELEMENT radical (rad_value+)>
     70 <!ELEMENT rad_value (#PCDATA)>
     71 	<!-- 
     72 	The radical number, in the range 1 to 214. The particular
     73 	classification type is stated in the rad_type attribute.
     74 	-->
     75 <!ATTLIST rad_value rad_type CDATA #REQUIRED>
     76 	<!-- 
     77 	The rad_type attribute states the type of radical classification:
     78 		classical - as recorded in the KangXi Zidian.
     79 		nelson - as used in the Nelson "Modern Reader's Japanese-English
     80 		Character Dictionary" (i.e. the Classic, not the New Nelson).
     81 		This will only be used where Nelson reclassified the kanji.
     82 	-->
     83 <!ELEMENT misc (grade?, stroke_count+, variant*, freq*, rad_name*)>
     84 <!ELEMENT grade (#PCDATA)>
     85 	<!-- 
     86 	The Jouyou Kanji grade level. 1 through 6 indicate the grade in which
     87 	the kanji is taught in Japanese schools. 8 indicates it is one of the
     88 	remaining Jouyou Kanji to be learned in junior high school, and 9
     89 	indicates it is a Jinmeiyou (for use in names) kanji. [G]
     90 	-->
     91 <!ELEMENT stroke_count (#PCDATA)>
     92 	<!-- 
     93 	The stroke count of the kanji, including the radical. If more than one
     94 	is given, the first is considered the accepted count, while subsequent
     95 	ones are common miscounts. (See Appendix E of the KANJIDIC documentation
     96 	for some of the rules applied when counting strokes in some of the
     97 	radicals.) [S]
     98 	-->
     99 <!ELEMENT variant (#PCDATA)>
    100 	<!-- 
    101 	A cross-reference code to another kanji, usually regarded as a variant.
    102 	The type of cross-reference is given in the var_type attribute.
    103 	-->
    104 <!ATTLIST variant var_type CDATA #REQUIRED>
    105 	<!-- 
    106 	The var_type attribute indicates the type of variant code. The current
    107 	values are:
    108 		jis208 - in JIS X 0208 - kuten coding
    109 		jis212 - in JIS X 0212 - kuten coding
    110 		jis213 - in JIS X 0213 - kuten coding
    111 		deroo - De Roo number - numeric
    112 		njecd - Halpern NJECD index number - numeric
    113 		s_h - The Kanji Dictionary (Spahn & Hadamitzky) - descriptor
    114 		nelson - "Classic" Nelson - numeric
    115 		oneill - Japanese Names (O'Neill) - numeric
    116 	-->
    117 <!ELEMENT freq (#PCDATA)>
    118 	<!-- 
    119 	A frequency-of-use ranking. The 2,500 most-used characters have a
    120 	ranking; those characters that lack this field are not ranked. The
    121 	frequency is a number from 1 to 2,500 that expresses the relative
    122 	frequency of occurrence of a character in modern Japanese. This is
    123 	based on a survey in newspapers, so it is biased towards kanji
    124 	used in newspaper articles. The discrimination between the less
    125 	frequently used kanji is not strong.
    126 	-->
    127 <!ELEMENT rad_name (#PCDATA)>
    128 	<!-- 
    129 	When the kanji is itself a radical and has a name, this element
    130 	contains the name (in hiragana). [T2]
    131 	-->
    132 <!ELEMENT dic_number (dic_ref+)>
    133 	<!-- 
    134 	This element contains the index numbers and similar unstructured
    135 	information, such as page numbers, in a number of published
    136 	dictionaries and instructional books on kanji.
    137 	-->
    138 <!ELEMENT dic_ref (#PCDATA)>
    139 	<!-- 
    140 	Each dic_ref contains an index number. The particular dictionary,
    141 	etc. is defined by the dr_type attribute.
    142 	-->
    143 <!ATTLIST dic_ref dr_type CDATA #REQUIRED>
    144 	<!-- 
    145 	The dr_type attribute defines the dictionary or reference book, etc. to
    146 	which the dic_ref element applies. The initial allocation is:
    147 	  nelson_c - "Modern Reader's Japanese-English Character Dictionary",  
    148 	  	edited by Andrew Nelson (now published as the "Classic" 
    149 	  	Nelson).
    150 	  nelson_n - "The New Nelson Japanese-English Character Dictionary", 
    151 	  	edited by John Haig.
    152 	  halpern_njecd - "New Japanese-English Character Dictionary", 
    153 	  	edited by Jack Halpern.
    154 	  halpern_kkld - "Kanji Learners Dictionary" (Kodansha) edited by 
    155 	  	Jack Halpern.
    156 	  heisig - "Remembering The Kanji" by James Heisig.
    157 	  gakken - "A New Dictionary of Kanji Usage" (Gakken)
    158 	  oneill_names - "Japanese Names", by P.G. O'Neill. 
    159 	  oneill_kk - "Essential Kanji" by P.G. O'Neill.
    160 	  moro - "Daikanwajiten" compiled by Morohashi. For some kanji two
    161 	  	additional attributes are used: m_vol:  the volume of the
    162 	  	dictionary in which the kanji is found, and m_page: the page
    163 	  	number in the volume.
    164 	  henshall - "A Guide To Remembering Japanese Characters" by
    165 	  	Kenneth G.  Henshall.
    166 	  sh_kk - "Kanji and Kana" by Spahn and Hadamitzky.
    167 	  sakade - "A Guide To Reading and Writing Japanese" edited by
    168 	  	Florence Sakade.
    169 	  henshall3 - "A Guide To Reading and Writing Japanese" 3rd
    170 		edition, edited by Henshall, Seeley and De Groot.
    171 	  tutt_cards - Tuttle Kanji Cards, compiled by Alexander Kask.
    172 	  crowley - "The Kanji Way to Japanese Language Power" by
    173 	  	Dale Crowley.
    174 	  kanji_in_context - "Kanji in Context" by Nishiguchi and Kono.
    175 	  busy_people - "Japanese For Busy People" vols I-III, published
    176 		by the AJLT. The codes are the volume.chapter.
    177 	  kodansha_compact - the "Kodansha Compact Kanji Guide".
    178 	-->
    179 <!ATTLIST dic_ref m_vol CDATA #IMPLIED>
    180 	<!-- 
    181 	The volume of the dictionary; see above under "moro".
    182 	-->
    183 <!ATTLIST dic_ref m_page CDATA #IMPLIED>
    184 	<!-- 
    185 	The page number in the volume; see above under "moro".
    186 	-->
    187 <!ELEMENT query_code (q_code+)>
    188 	<!-- 
    189 	These codes contain information relating to the glyph, and can be used
    190 	for finding a required kanji. The type of code is defined by the
    191 	qc_type attribute.
    192 	-->
    193 <!ELEMENT q_code (#PCDATA)>
    194 	<!--
    195 	The q_code contains the actual query-code value, according to the
    196 	qc_type attribute.
    197 	-->
    198 <!ATTLIST q_code qc_type CDATA #REQUIRED>
    199 	<!-- 
    200 	The qc_type attribute defines the type of query code. The current
    201 	values are:
    202 	  skip - Halpern's SKIP (System of Kanji Indexing by Patterns)
    203 	  	code. The format is n-nn-nn. See the KANJIDIC documentation
    204 	  	for a description of the code and restrictions on the
    205 	  	commercial use of this data. [P]
    206 
    207 	  sh_desc - the descriptor codes for The Kanji Dictionary (Tuttle
    208 	  	1996) by Spahn and Hadamitzky. They are in the form nxnn.n,
    209 	  	e.g. 3k11.2, where the kanji has 3 strokes in the
    210 	  	identifying radical, it is radical "k" in the SH
    211 	  	classification system, there are 11 other strokes, and it is
    212 	  	the 2nd kanji in the 3k11 sequence. (I am very grateful to
    213 	  	Mark Spahn for providing the list of these descriptor codes
    214 	  	for the kanji in this file.) [I]
    215 	  four_corner - the "Four Corner" code for the kanji. This is a code
    216 	  	invented by Wang Chen in 1928. See the KANJIDIC documentation
    217 	  	for an overview of the Four Corner System. [Q]
    218 
    219 	  deroo - the codes developed by the late Father Joseph De Roo, and
    220 	  	published in his book "2001 Kanji" (Bojinsha). Fr De Roo
    221 	  	gave his permission for these codes to be included. [DR]
    222 	  misclass - a possible misclassification of the kanji according
    223 		to one of the code types. (See the "Z" codes in the KANJIDIC
    224 		documentation for more details.)
    225 	  
    226 	-->
    227 <!ELEMENT reading_meaning (rmgroup*, nanori*)>
    228 	<!-- 
    229 	The readings for the kanji in several languages, and the meanings, also
    230 	in several languages. The readings and meanings are grouped to enable
    231 	the handling of the situation where the meaning is differentiated by
    232 	reading. [T1]
    233 	-->
    234 <!ELEMENT nanori (#PCDATA)>
    235 	<!-- 
    236 	Japanese readings that are now only associated with names.
    237 	-->
    238 <!ELEMENT rmgroup (reading*, meaning*)>
    239 <!ELEMENT reading (#PCDATA)>
    240 	<!-- 
    241 	The reading element contains the reading or pronunciation
    242 	of the kanji.
    243 	-->
    244 <!ATTLIST reading r_type CDATA #REQUIRED>
    245 	<!-- 
    246 	The r_type attribute defines the type of reading in the reading
    247 	element. The current values are:
    248 	  pinyin - the modern PinYin romanization of the Chinese reading
    249 	  	of the kanji. The tones are represented by a concluding
    250 	  	digit. [Y]
    251 	  korean_r - the romanized form of the Korean reading(s) of the
    252 	  	kanji. The readings are in the (Republic of Korea) Ministry
    253 	  	of Education style of romanization. [W]
    254 	  korean_h - the Korean reading(s) of the kanji in hangul.
    255 	  ja_on - the "on" Japanese reading of the kanji, in katakana. A
    256 	  	second attribute r_status, if present, will indicate with
    257 	  	a value of "jy" that the reading is approved for a
    258 	  	"Jouyou kanji".
    259 	  ja_kun - the "kun" Japanese reading of the kanji, in hiragana.
    260 	  	Where relevant the okurigana is also included separated by a
    261 	  	".". Readings associated with prefixes and suffixes are
    262 	  	marked with a "-". A second attribute r_status, if present,
    263 	  	will indicate with a value of "jy" that the reading is
    264 	  	approved for a "Jouyou kanji".
    265 	-->
    266 <!ATTLIST reading r_status CDATA #IMPLIED>
    267 	<!-- 
    268 	Approval status of a reading; see under ja_on and ja_kun above.
    269 	-->
    270 <!ELEMENT meaning (#PCDATA)>
    271 	<!-- 
    272 	The meaning associated with the kanji.
    273 	-->
    274 <!ATTLIST meaning m_lang CDATA #IMPLIED>
    275 	<!-- 
    276 	The m_lang attribute defines the target language of the meaning. It
    277 	is coded using the two-letter language code from the ISO 639
    278 	standard. When absent, the value "en" (i.e. English) is implied. [{}]
    279 	-->
    280 ] >
    281 <kanjidic2>
    282 </kanjidic2>
    283