1 /* character.c -- character module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24 @addtogroup m17nCharacter
25 @brief Character objects and API for them.
27 The m17n library represents a @e character by a character code (an
28 integer). The minimum character code is @c 0. The maximum
29 character code is defined by the macro #MCHAR_MAX. It is
30 assured that #MCHAR_MAX is not smaller than @c 0x3FFFFF (22
33 Characters @c 0 to @c 0x10FFFF are equivalent to the Unicode
34 characters of the same code values.
36 A character can have zero or more properties called @e character
37 @e properties. A character property consists of a @e key and a
38 @e value, where key is a symbol and value is anything that can be
39 cast to <tt>(void *)</tt>. "The character property that belongs
40 to character C and whose key is K" may be shortened to "the K
44 @addtogroup m17nCharacter
45 @brief 文字オブジェクトとそれに関する API.
47 m17n ライブラリは @e 文字 を文字コード（整数）で表現する。最小の文
48 字コードは @c 0 で、最大の文字コードはマクロ #MCHAR_MAX によって
49 定義されている。#MCHAR_MAX は @c 0x3FFFFF（22ビット） 以上である
50 ことが保証されている。
52 @c 0 から @c 0x10FFFF までの文字は、それと同じ値を持つ Unicode の
53 文字に割り当てられている。
55 各文字は @e 文字プロパティ と呼ぶプロパティを 0 個以上持つことがで
56 きる。文字プロパティは @e キー と @e 値 からなる。キーはシンボルで
57 あり、値は <tt>(void *)</tt> 型にキャストできるものなら何でもよい。
58 「文字 C の文字プロパティのうちキーが K であるもの」を簡単に
59 「文字 C の K プロパティ」と呼ぶことがある。 */
62 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
63 /*** @addtogroup m17nInternal
73 #include "m17n-core.h"
74 #include "m17n-misc.h"
84 static MPlist *char_prop_list;
87 free_string (int from, int to, void *str, void *arg)
98 char_prop_list = mplist ();
101 = mchar_define_property ("name", Mstring);
103 = mchar_define_property ("category", Msymbol);
105 = mchar_define_property ("combining-class", Minteger);
107 = mchar_define_property ("bidirectional-category", Msymbol);
109 = mchar_define_property ("simple-case-folding", Minteger);
110 Mcomplicated_case_folding
111 = mchar_define_property ("complicated-case-folding", Mtext);
113 = mchar_define_property ("script", Msymbol);
123 for (p = char_prop_list; mplist_key (p) != Mnil; p = mplist_next (p))
125 MCharPropRecord *record = mplist_value (p);
129 if (record->type == Mstring)
130 mchartable_map (record->table, NULL, free_string, NULL);
131 M17N_OBJECT_UNREF (record->table);
135 M17N_OBJECT_UNREF (char_prop_list);
139 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
143 /*** @addtogroup m17nCharacter */
149 @brief Maximum character code.
151 The macro #MCHAR_MAX gives the maximum character code. */
154 @brief 文字コードの最大値.
156 マクロ #MCHAR_MAX は文字コードの最大値を与える。 */
160 #endif /* FOR_DOXYGEN */
163 @ingroup m17nCharacter
164 @name Variables: Keys of character properties
166 These symbols are used as keys of character properties. */
169 @name 変数: 文字プロパティのキー
171 これらのシンボルは文字プロパティのキーとして使われる。*/
176 @brief Key for script.
178 The symbol #Mscript has the name <tt>"script"</tt> and is used as the key
179 of a character property. The value of such a property is a symbol
180 representing the script to which the character belongs.
182 Each symbol that represents a script has one of the names listed in
183 the <em>Unicode Technical Report #24</em>. */
186 @brief スクリプトを表わすキー.
188 シンボル #Mscript は <tt>"script"</tt> という名前を持ち、文字プ
189 ロパティのキーとして使われる。このプロパティの値は、この文字の属す
190 るスクリプトを表わすシンボルである。
192 スクリプトを表わすシンボルの名前は、<em>Unicode Technical Report
193 #24</em> にリストされているもののいずれかである。 */
200 @brief Key for character name.
202 The symbol #Mname has the name <tt>"name"</tt> and is used as
203 the key of a character property. The value of such a property is a
204 C-string representing the name of the character. */
207 @brief 名前を表わすキー.
209 シンボル #Mname は <tt>"name"</tt> という名前を持ち、文字プロパ
210 ティのキーとして使われる。このプロパティの値は C-string であり、そ
211 の文字の名前を表わす。 */
218 @brief Key for general category.
220 The symbol #Mcategory has the name <tt>"category"</tt> and is
221 used as the key of a character property. The value of such a
222 property is a symbol representing the <em>general category</em> of
225 Each symbol that represents a general category has one of the
226 names listed as abbreviations for <em>General Category</em> in
230 @brief 一般カテゴリを表わすキー.
232 シンボル #Mcategory は <tt>"category"</tt> という名前を持ち、文
233 字プロパティのキーとして使われる。このプロパティの値は、対応する
234 <em>一般カテゴリ</em> を表わすシンボルである。
236 一般カテゴリを表わすシンボルの名前は、<em>General Category</em>の
237 省略形として Unicode に定義されているものである。 */
244 @brief Key for canonical combining class.
246 The symbol #Mcombining_class has the name
247 <tt>"combining-class"</tt> and is used as the key of a character
248 property. The value of such a property is an integer that
249 represents the <em>canonical combining class</em> of the character.
251 The meaning of each integer that represents a canonical combining
252 class is identical to the one defined in Unicode. */
255 @brief 標準結合クラスを表わすキー.
257 シンボル #Mcombining_class は <tt>"combining-class"</tt> という
258 名前を持ち、文字プロパティのキーとして使われる。このプロパティの値
259 は、対応する @e 標準結合クラス を表わす整数である。
261 標準結合クラスを表わす整数の意味は、Unicode に定義されているもの
264 MSymbol Mcombining_class;
268 @brief Key for bidi category.
270 The symbol #Mbidi_category has the name <tt>"bidirectional-category"</tt>
271 and is used as the key of a character property. The value of such
272 a property is a symbol that represents the <em>bidirectional
273 category</em> of the character.
275 Each symbol that represents a bidirectional category has one of
276 the names listed as types of <em>Bidirectional Category</em> in
280 @brief 双方向カテゴリを表わすキー.
282 シンボル #Mbidi_category は <tt>"bidirectional-category"</tt> という名前
283 を持ち、文字プロパティのキーとして使われる。このプロパティの値は、
284 対応する @e 双方向カテゴリ を表わすシンボルである。
286 双方向カテゴリを表わすシンボルの名前は、<em>Bidirectional
287 Category</em> の型として Unicode に定義されているものである。 */
289 MSymbol Mbidi_category;
293 @brief Key for corresponding single lowercase character.
295 The symbol #Msimple_case_folding has the name
296 <tt>"simple-case-folding"</tt> and is used as the key of a
297 character property. The value of such a property is the
298 corresponding single lowercase character that is used when
299 comparing M-texts ignoring case.
301 If a character requires a complicated comparison (i.e. cannot be
302 compared by simply mapping to another single character), the value
303 of such a property is @c 0xFFFF. In this case, the character has
304 another property whose key is #Mcomplicated_case_folding. */
307 @brief 対応する小文字一文字を表わすキー.
309 シンボル #Msimple_case_folding は <tt>"simple-case-folding"</tt>
310 という名前を持ち、文字プロパティのキーとして使われる。このプロパティ
311 の値は、対応する小文字一文字であり、大文字／小文字の区別を無視した
312 文字列比較の際に使われる。
314 複雑な比較方法を必要とする文字であった場合（別の一文字と対応付ける
315 ことによって比較できない場合）、このプロパティの値は @c 0xFFFF に
316 なる。この場合その文字は、#Mcomplicated_case_folding というキーの
317 文字プロパティを持つ。 */
319 MSymbol Msimple_case_folding;
321 @brief Key for corresponding multiple lowercase characters.
323 The symbol #Mcomplicated_case_folding has the name
324 <tt>"complicated-case-folding"</tt> and is used as the key of a
325 character property. The value of such a property is the
326 corresponding M-text that contains a sequence of lowercase
327 characters to be used for comparing M-texts ignoring case. */
330 @brief 対応する小文字の列を表わすキー.
332 シンボル #Mcomplicated_case_folding は
333 <tt>"complicated-case-folding"</tt> という名前を持ち、文字プロパティ
334 のキーとして使われる。このプロパティの値は、対応する小文字列からな
335 る M-text であり、大文字／小文字の区別を無視した文字列比較の際に使
339 MSymbol Mcomplicated_case_folding;
345 @brief Define a character property.
347 The mchar_define_property () function searches the m17n database
348 for data whose tags are \<#Mchar_table, $TYPE, $SYM \>.
349 Here, $SYM is a symbol whose name is $NAME. $TYPE must be
350 #Mstring, #Mtext, #Msymbol, #Minteger, or #Mplist.
353 If the operation was successful, mchar_define_property () returns
354 $SYM. Otherwise it returns #Mnil. */
357 @brief 文字プロパティを定義する.
359 関数 mchar_define_property () は、 \<#Mchar_table, $TYPE, $SYM \>
360 というタグを持ったデータベースを m17n 言語情報ベースから探す。 こ
361 こで $SYM は $NAME という名前のシンボルである。$TYPE は#Mstring,
362 #Mtext, #Msymbol, #Minteger, #Mplist のいずれかでなければならない。
365 処理に成功すれば mchar_define_property () は$SYM を返す。
366 失敗した場合は #Mnil を返す。 */
373 mchar_get_prop (), mchar_put_prop () */
376 mchar_define_property (char *name, MSymbol type)
378 MSymbol key = msymbol (name);
379 MCharPropRecord *record;
381 record = mplist_get (char_prop_list, key);
385 M17N_OBJECT_UNREF (record->table);
389 MSTRUCT_CALLOC (record, MERROR_CHAR);
390 mplist_put (char_prop_list, key, record);
394 if (mdatabase__finder
395 && (record->mdb = (*mdatabase__finder) (Mchar_table, type, key, Mnil)))
397 record->table = NULL;
401 void *default_value = NULL;
404 if (type == Minteger)
405 default_value = (void *) -1;
406 record->table = mchartable (type, default_value);
415 @brief Get the value of a character property.
417 The mchar_get_prop () function searches character $C for the
418 character property whose key is $KEY.
421 If the operation was successful, mchar_get_prop () returns the
422 value of the character property. Otherwise it returns @c
426 @brief 文字プロパティの値を得る.
428 関数 mchar_get_prop () は、文字 $C の文字プロパティのうちキーが
429 $KEY であるものを探す。
432 処理が成功すれば mchar_get_prop () は見つかったプロパティの値を返
433 す。失敗した場合は @c NULL を返す。
435 @latexonly \IPAlabel{mchar_get_prop} @endlatexonly
439 @c MERROR_SYMBOL, @c MERROR_DB
442 mchar_define_property (), mchar_put_prop () */
445 mchar_get_prop (int c, MSymbol key)
447 MCharPropRecord *record;
449 record = mplist_get (char_prop_list, key);
454 record->table = (*mdatabase__loader) (record->mdb);
456 MERROR (MERROR_DB, NULL);
459 return mchartable_lookup (record->table, c);
465 @brief Set the value of a character property.
467 The mchar_put_prop () function searches character $C for the
468 character property whose key is $KEY and assigns $VAL to the value
469 of the found property.
472 If the operation was successful, mchar_put_prop () returns 0.
473 Otherwise, it returns -1. */
475 @brief 文字プロパティの値を設定する.
477 関数 mchar_put_prop () は、文字 $C の文字プロパティのうちキーが $KEY で
478 あるものを探し、その値として $VAL を設定する。
481 処理が成功すれば mchar_put_prop () は0を返す。失敗した場合は-1を返
485 @c MERROR_SYMBOL, @c MERROR_DB
488 mchar_define_property (), mchar_get_prop () */
491 mchar_put_prop (int c, MSymbol key, void *val)
493 MCharPropRecord *record;
495 record = mplist_get (char_prop_list, key);
500 record->table = (*mdatabase__loader) (record->mdb);
502 MERROR (MERROR_DB, -1);
503 M17N_OBJECT_REF (record->table);
506 return mchartable_set (record->table, c, val);