1 /* character.c -- character module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24 @addtogroup m17nCharacter
25 @brief Character objects and API for them.
27 The m17n library represents a @e character by a character code (an
28 integer). The minimum character code is @c 0. The maximum
29 character code is defined by the macro #MCHAR_MAX. It is
30 assured that #MCHAR_MAX is not smaller than @c 0x3FFFFF (22 bits).
33 Characters @c 0 to @c 0x10FFFF are equivalent to the Unicode
34 characters of the same code values.
36 A character can have zero or more properties called @e character
37 @e properties. A character property consists of a @e key and a
38 @e value, where key is a symbol and value is anything that can be
39 cast to <tt>(void *)</tt>. "The character property that belongs
40 to character C and whose key is K" may be shortened to "the K property of C".
44 @addtogroup m17nCharacter
45 @brief ʸ»ú¥ª¥Ö¥¸¥§¥¯¥È¤È¤½¤ì¤Ë´Ø¤¹¤ë API
47 m17n ¥é¥¤¥Ö¥é¥ê¤Ï @e ʸ»ú ¤òʸ»ú¥³¡¼¥É¡ÊÀ°¿ô¡Ë¤Çɽ¸½¤¹¤ë¡£ºÇ¾®¤Îʸ
48 »ú¥³¡¼¥É¤Ï @c 0 ¤Ç¡¢ºÇÂç¤Îʸ»ú¥³¡¼¥É¤Ï¥Þ¥¯¥í #MCHAR_MAX ¤Ë¤è¤Ã¤Æ
49 ÄêµÁ¤µ¤ì¤Æ¤¤¤ë¡£#MCHAR_MAX ¤Ï @c 0x3FFFFF¡Ê22¥Ó¥Ã¥È¡Ë°Ê¾å¤Ç¤¢¤ë
50 ¤³¤È¤¬Êݾڤµ¤ì¤Æ¤¤¤ë¡£
52 @c 0 ¤«¤é @c 0x10FFFF ¤Þ¤Ç¤Îʸ»ú¤Ï¡¢¤½¤ì¤ÈƱ¤¸Ãͤò»ý¤Ä Unicode ¤Î
53 ʸ»ú¤Ë³ä¤êÅö¤Æ¤é¤ì¤Æ¤¤¤ë¡£
55 ³Æʸ»ú¤Ï @e ʸ»ú¥×¥í¥Ñ¥Æ¥£ ¤È¸Æ¤Ö¥×¥í¥Ñ¥Æ¥£¤ò 0 ¸Ä°Ê¾å»ý¤Ä¤³¤È¤¬¤Ç
56 ¤¤ë¡£Ê¸»ú¥×¥í¥Ñ¥Æ¥£¤Ï @e ¥¡¼ ¤È @e ÃÍ ¤«¤é¤Ê¤ë¡£¥¡¼¤Ï¥·¥ó¥Ü¥ë¤Ç
57 ¤¢¤ê¡¢ÃÍ¤Ï <tt>(void *)</tt> ·¿¤Ë¥¥ã¥¹¥È¤Ç¤¤ë¤â¤Î¤Ê¤é²¿¤Ç¤â¤è¤¤¡£
58 ¡Öʸ»ú C ¤Îʸ»ú¥×¥í¥Ñ¥Æ¥£¤Î¤¦¤Á¥¡¼¤¬ @c K ¤Ç¤¢¤ë¤â¤Î¡×¤ò´Êñ¤Ë
59 ¡Öʸ»ú C ¤Î K ¥×¥í¥Ñ¥Æ¥£¡×¤È¸Æ¤Ö¤³¤È¤¬¤¢¤ë¡£ */
62 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
63 /*** @addtogroup m17nInternal
73 #include "m17n-core.h"
74 #include "m17n-misc.h"
/* Property list that maps each property key (a symbol) to its
   MCharPropRecord.  The property functions later in this file look
   records up with mplist_get () and register them with mplist_put ().  */
84 static MPlist *char_prop_list;
/* Callback handed to mchartable_map () for Mstring-typed tables in the
   cleanup loop below.  Its body is not visible in this excerpt.  */
87 free_string (int from, int to, void *str, void *arg)
/* Start from an empty list, then register the built-in properties with
   their value types.  NOTE(review): the enclosing function header and
   most of the assignment targets are not visible in this excerpt; only
   Mcomplicated_case_folding can be seen receiving a result.  */
98 char_prop_list = mplist ();
101 = mchar_define_property ("name", Mstring);
103 = mchar_define_property ("category", Msymbol);
105 = mchar_define_property ("combining-class", Minteger);
107 = mchar_define_property ("bidirectional-category", Msymbol);
109 = mchar_define_property ("simple-case-folding", Minteger);
110 Mcomplicated_case_folding
111 = mchar_define_property ("complicated-case-folding", Mtext);
113 = mchar_define_property ("script", Msymbol);
/* Cleanup: visit every element of the list until the element whose key
   is Mnil is reached.  */
123 for (p = char_prop_list; mplist_key (p) != Mnil; p = mplist_next (p))
125 MCharPropRecord *record = mplist_value (p);
/* For Mstring-typed properties, free_string is first applied across the
   table; the reference to the table is then dropped.  NOTE(review): any
   guard on record->table around these statements is not visible here.  */
129 if (record->type == Mstring)
130 mchartable_map (record->table, NULL, free_string, NULL);
131 M17N_OBJECT_UNREF (record->table);
/* Finally drop the reference to the list itself.  */
135 M17N_OBJECT_UNREF (char_prop_list);
139 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
143 /*** @addtogroup m17nCharacter */
149 @brief Maximum character code.
151 The macro #MCHAR_MAX gives the maximum character code. */
154 @brief ʸ»ú¥³¡¼¥É¤ÎºÇÂçÃÍ
156 ¥Þ¥¯¥í #MCHAR_MAX ¤Ïʸ»ú¥³¡¼¥É¤ÎºÇÂçÃͤòÍ¿¤¨¤ë¡£ */
160 #endif /* FOR_DOXYGEN */
163 @ingroup m17nCharacter
164 @name Variables: Keys of character properties
166 These symbols are used as keys of character properties. */
169 @name ÊÑ¿ô: ʸ»ú¥×¥í¥Ñ¥Æ¥£¤Î¥¡¼
171 ¤³¤ì¤é¤Î¥·¥ó¥Ü¥ë¤Ïʸ»ú¥×¥í¥Ñ¥Æ¥£¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£*/
176 @brief Key for script.
178 The symbol #Mscript has the name <tt>"script"</tt> and is used as the key
179 of a character property. The value of such a property is a symbol
180 representing the script to which the character belongs.
182 Each symbol that represents a script has one of the names listed in
183 the <em>Unicode Technical Report #24</em>. */
186 @brief ¥¹¥¯¥ê¥×¥È¤òɽ¤ï¤¹¥¡¼.
188 ¥·¥ó¥Ü¥ë #Mscript ¤Ï <tt>"script"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥×
189 ¥í¥Ñ¥Æ¥£¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£¤³¤Î¥×¥í¥Ñ¥Æ¥£¤ÎÃͤϡ¢¤³¤Îʸ»ú¤Î°¤¹
190 ¤ë¥¹¥¯¥ê¥×¥È¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
192 ¥¹¥¯¥ê¥×¥È¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤Î̾Á°¤Ï¡¢<em>Unicode Technical Report
193 #24</em> ¤Ë¥ê¥¹¥È¤µ¤ì¤Æ¤¤¤ë¤â¤Î¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ë¡£ */
200 @brief Key for character name.
202 The symbol #Mname has the name <tt>"name"</tt> and is used as
203 the key of a character property. The value of such a property is a
204 C-string representing the name of the character. */
207 @brief ̾Á°¤òɽ¤ï¤¹¥¡¼.
209 ¥·¥ó¥Ü¥ë #Mname ¤Ï <tt>"name"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥×¥í¥Ñ
210 ¥Æ¥£¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£¤³¤Î¥×¥í¥Ñ¥Æ¥£¤ÎÃÍ¤Ï C-string ¤Ç¤¢¤ê¡¢¤½
211 ¤Îʸ»ú¤Î̾Á°¤òɽ¤ï¤¹¡£ */
218 @brief Key for general category.
220 The symbol #Mcategory has the name <tt>"category"</tt> and is
221 used as the key of a character property. The value of such a
222 property is a symbol representing the <em>general category</em> of the character.
225 Each symbol that represents a general category has one of the
226 names listed as abbreviations for <em>General Category</em> in Unicode.
230 @brief °ìÈÌ¥«¥Æ¥´¥ê¤òɽ¤ï¤¹¥¡¼.
232 ¥·¥ó¥Ü¥ë #Mcategory ¤Ï <tt>"category"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸
233 »ú¥×¥í¥Ñ¥Æ¥£¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£¤³¤Î¥×¥í¥Ñ¥Æ¥£¤ÎÃͤϡ¢Âбþ¤¹¤ë
234 <em>°ìÈÌ¥«¥Æ¥´¥ê</em> ¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
236 °ìÈÌ¥«¥Æ¥´¥ê¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤Î̾Á°¤Ï¡¢<em>General Category</em>¤Î
237 ¾Êά·Á¤È¤·¤Æ Unicode ¤ËÄêµÁ¤µ¤ì¤Æ¤¤¤ë¤â¤Î¤Ç¤¢¤ë¡£ */
244 @brief Key for canonical combining class.
246 The symbol #Mcombining_class has the name
247 <tt>"combining-class"</tt> and is used as the key of a character
248 property. The value of such a property is an integer that
249 represents the <em>canonical combining class</em> of the character.
251 The meaning of each integer that represents a canonical combining
252 class is identical to the one defined in Unicode. */
255 @brief ɸ½à·ë¹ç¥¯¥é¥¹¤òɽ¤ï¤¹¥¡¼
257 ¥·¥ó¥Ü¥ë #Mcombining_class ¤Ï <tt>"combining-class"</tt> ¤È¤¤¤¦
258 ̾Á°¤ò»ý¤Á¡¢Ê¸»ú¥×¥í¥Ñ¥Æ¥£¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£¤³¤Î¥×¥í¥Ñ¥Æ¥£¤ÎÃÍ
259 ¤Ï¡¢Âбþ¤¹¤ë @e ɸ½à·ë¹ç¥¯¥é¥¹ ¤òɽ¤ï¤¹ÈóÉéÀ°¿ô¤Ç¤¢¤ë¡£
261 ɸ½à·ë¹ç¥¯¥é¥¹¤òɽ¤ï¤¹ÈóÉéÀ°¿ô¤Î°ÕÌ£¤Ï¡¢Unicode ¤ËÄêµÁ¤µ¤ì¤Æ¤¤¤ë¤â¤Î
264 MSymbol Mcombining_class;
268 @brief Key for bidi category.
270 The symbol #Mbidi_category has the name <tt>"bidirectional-category"</tt>
271 and is used as the key of a character property. The value of such
272 a property is a symbol that represents the <em>bidirectional
273 category</em> of the character.
275 Each symbol that represents a bidirectional category has one of
276 the names listed as types of <em>Bidirectional Category</em> in Unicode.
280 @brief ÁÐÊý¸þ¥«¥Æ¥´¥ê¤òɽ¤ï¤¹¥¡¼.
282 ¥·¥ó¥Ü¥ë #Mbidi_category ¤Ï <tt>"bidirectional-category"</tt> ¤È¤¤¤¦Ì¾Á°
283 ¤ò»ý¤Á¡¢Ê¸»ú¥×¥í¥Ñ¥Æ¥£¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£¤³¤Î¥×¥í¥Ñ¥Æ¥£¤ÎÃͤϡ¢
284 Âбþ¤¹¤ë @e ÁÐÊý¸þ¥«¥Æ¥´¥ê ¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
286 ÁÐÊý¸þ¥«¥Æ¥´¥ê¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤Î̾Á°¤Ï¡¢<em>Bidirectional
287 Category</em> ¤Î·¿¤È¤·¤Æ Unicode ¤ËÄêµÁ¤µ¤ì¤Æ¤¤¤ë¤â¤Î¤Ç¤¢¤ë¡£ */
289 MSymbol Mbidi_category;
293 @brief Key for corresponding single lowercase character.
295 The symbol #Msimple_case_folding has the name
296 <tt>"simple-case-folding"</tt> and is used as the key of a
297 character property. The value of such a property is the
298 corresponding single lowercase character that is used when
299 comparing M-texts ignoring case.
301 If a character requires a complicated comparison (i.e. cannot be
302 compared by simply mapping to another single character), the value
303 of such a property is @c 0xFFFF. In this case, the character has
304 another property whose key is #Mcomplicated_case_folding. */
307 @brief Âбþ¤¹¤ë¾®Ê¸»ú°ìʸ»ú¤òɽ¤ï¤¹¥¡¼.
309 ¥·¥ó¥Ü¥ë #Msimple_case_folding ¤Ï <tt>"simple-case-folding"</tt>
310 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥×¥í¥Ñ¥Æ¥£¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£¤³¤Î¥×¥í¥Ñ¥Æ¥£
311 ¤ÎÃͤϡ¢Âбþ¤¹¤ë¾®Ê¸»ú°ìʸ»ú¤Ç¤¢¤ê¡¢Âçʸ»ú¡¿¾®Ê¸»ú¤Î¶èÊ̤ò̵»ë¤·¤¿
312 ʸ»úÎóÈæ³Ó¤ÎºÝ¤Ë»È¤ï¤ì¤ë¡£
314 ¤â¤·¤½¤Î¤è¤¦¤ÊÈæ³Ó¤Ë»ÈÍѤ·ÆÀ¤ëñ°ì¤Î¾®Ê¸»ú¤¬Â¸ºß¤·¤Ê¤¤¾ì¹ç¡¢¤³¤Î¥×
315 ¥í¥Ñ¥Æ¥£¤ÎÃÍ¤Ï 0xFFFF ¤Ë¤Ê¤ë¡£¤³¤Î¾ì¹ç¤½¤Îʸ»ú¤Ï¡¢
316 #Mcomplicated_case_folding ¤È¤¤¤¦¥¡¼¤Îʸ»ú¥×¥í¥Ñ¥Æ¥£¤ò»ý¤Ä¡£ */
318 MSymbol Msimple_case_folding;
320 @brief Key for corresponding multiple lowercase characters.
322 The symbol #Mcomplicated_case_folding has the name
323 <tt>"complicated-case-folding"</tt> and is used as the key of a
324 character property. The value of such a property is the
325 corresponding M-text that contains a sequence of lowercase
326 characters to be used for comparing M-texts ignoring case. */
329 @brief Âбþ¤¹¤ë¾®Ê¸»ú¤ÎÎó¤òɽ¤ï¤¹¥¡¼.
331 ¥·¥ó¥Ü¥ë #Mcomplicated_case_folding ¤Ï
332 <tt>"complicated-case-folding"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥×¥í¥Ñ¥Æ¥£
333 ¤Î¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¡£¤³¤Î¥×¥í¥Ñ¥Æ¥£¤ÎÃͤϡ¢Âбþ¤¹¤ë¾®Ê¸»úÎ󤫤é¤Ê
334 ¤ë M-text ¤Ç¤¢¤ê¡¢Âçʸ»ú¡¿¾®Ê¸»ú¤Î¶èÊ̤ò̵»ë¤·¤¿Ê¸»úÎóÈæ³Ó¤ÎºÝ¤Ë»È
338 MSymbol Mcomplicated_case_folding;
344 @brief Define a character property.
346 The mchar_define_property () function searches the m17n database
347 for data whose tags are \<#Mchar_table, $TYPE, $SYM \>.
348 Here, $SYM is a symbol whose name is $NAME. $TYPE must be
349 #Mstring, #Mtext, #Msymbol, #Minteger, or #Mplist.
352 If the operation was successful, mchar_define_property () returns
353 $SYM. Otherwise it returns #Mnil. */
356 @brief ʸ»ú¥×¥í¥Ñ¥Æ¥£¤òÄêµÁ¤¹¤ë.
358 ´Ø¿ô mchar_define_property () ¤Ï¡¢ \<#Mchar_table, $TYPE, $SYM \>
359 ¤È¤¤¤¦¥¿¥°¤ò»ý¤Ã¤¿¥Ç¡¼¥¿¥Ù¡¼¥¹¤ò m17n ¸À¸ì¾ðÊó¥Ù¡¼¥¹¤«¤éõ¤¹¡£ ¤³
360 ¤³¤Ç $SYM ¤Ï $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£$TYPE ¤Ï#Mstring,
361 #Mtext, #Msymbol, #Minteger, #Mplist ¤Î¤¤¤º¤ì¤«¤Ç¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£
364 ½èÍý¤ËÀ®¸ù¤¹¤ì¤Ð mchar_define_property () ¤Ï$SYM ¤òÊÖ¤¹¡£
365 ¼ºÇÔ¤·¤¿¾ì¹ç¤Ï #Mnil ¤òÊÖ¤¹¡£ */
372 mchar_get_prop (), mchar_put_prop () */
/* Define the character property called NAME whose values have type
   TYPE.  NAME is interned with msymbol () and the resulting symbol is
   the key of the property's record in char_prop_list.
   NOTE(review): the return type, braces, some conditions and the return
   statement of this function are not visible in this excerpt; the
   comments below describe only the statements that are shown.  */
375 mchar_define_property (char *name, MSymbol type)
377 MSymbol key = msymbol (name);
378 MCharPropRecord *record;
/* Look for a record already registered under this key.  */
380 record = mplist_get (char_prop_list, key);
/* Drop the reference to the table the record currently holds --
   presumably only when the property is being redefined; the guarding
   condition is not visible.  */
384 M17N_OBJECT_UNREF (record->table);
/* Allocate a record with MSTRUCT_CALLOC (MERROR_CHAR is the error code
   given to the macro) and register it under KEY -- presumably the path
   taken when no record existed; the branch itself is not visible.  */
388 MSTRUCT_CALLOC (record, MERROR_CHAR);
389 mplist_put (char_prop_list, key, record);
/* If a database finder is installed and it returns a database for the
   tags <Mchar_table, TYPE, KEY, Mnil>, remember that database in
   record->mdb and leave the table unset for now; mchar_get_prop () and
   mchar_put_prop () call mdatabase__loader on record->mdb to fill it.  */
393 if (mdatabase__finder
394 && (record->mdb = (*mdatabase__finder) (Mchar_table, type, key, Mnil)))
396 record->table = NULL;
/* Otherwise (presumably the else branch -- not visible) create a fresh
   char table right away.  Its default value is NULL, except that
   Minteger-typed properties use (void *) -1 as the default.  */
400 void *default_value = NULL;
403 if (type == Minteger)
404 default_value = (void *) -1;
405 record->table = mchartable (type, default_value);
414 @brief Get the value of a character property.
416 The mchar_get_prop () function searches character $C for the
417 character property whose key is $KEY.
420 If the operation was successful, mchar_get_prop () returns the
421 value of the character property. Otherwise it returns @c NULL.
425 @brief ʸ»ú¥×¥í¥Ñ¥Æ¥£¤ÎÃͤòÆÀ¤ë.
427 ´Ø¿ô mchar_get_prop () ¤Ï¡¢Ê¸»ú $C ¤Îʸ»ú¥×¥í¥Ñ¥Æ¥£¤Î¤¦¤Á¥¡¼¤¬
428 $KEY ¤Ç¤¢¤ë¤â¤Î¤òõ¤¹¡£
431 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð mchar_get_prop () ¤Ï¸«¤Ä¤«¤Ã¤¿¥×¥í¥Ñ¥Æ¥£¤ÎÃͤòÊÖ
432 ¤¹¡£¼ºÇÔ¤·¤¿¾ì¹ç¤Ï @c NULL ¤òÊÖ¤¹¡£
434 @latexonly \IPAlabel{mchar_get_prop} @endlatexonly
439 @c MERROR_SYMBOL, @c MERROR_DB
442 mchar_define_property (), mchar_put_prop () */
/* Return the value of the property KEY for character C, read from the
   property's char table with mchartable_lookup ().
   NOTE(review): the return type, braces and the conditions guarding the
   statements below are not visible in this excerpt.  */
445 mchar_get_prop (int c, MSymbol key)
447 MCharPropRecord *record;
/* Fetch the record registered for KEY by mchar_define_property ().  */
449 record = mplist_get (char_prop_list, key);
/* Load the table from the database stored in record->mdb -- presumably
   only while mdb is still set, i.e. on first use; the condition is not
   visible.  */
454 record->table = (*mdatabase__loader) (record->mdb);
/* Report MERROR_DB with NULL as the result value -- presumably when the
   loader produced no table; the guard is not visible.  */
456 MERROR (MERROR_DB, NULL);
459 return mchartable_lookup (record->table, c);
465 @brief Set the value of a character property.
467 The mchar_put_prop () function searches character $C for the
468 character property whose key is $KEY and assigns $VAL to the value
469 of the found property.
472 If the operation was successful, mchar_put_prop () returns 0.
473 Otherwise, it returns -1. */
476 @brief ʸ»ú¥×¥í¥Ñ¥Æ¥£¤ÎÃͤòÀßÄꤹ¤ë.
478 ´Ø¿ô mchar_put_prop () ¤Ï¡¢Ê¸»ú $C ¤Îʸ»ú¥×¥í¥Ñ¥Æ¥£¤Î¤¦¤Á¥¡¼¤¬ $KEY ¤Ç
479 ¤¢¤ë¤â¤Î¤òõ¤·¡¢¤½¤ÎÃͤȤ·¤Æ $VAL ¤òÀßÄꤹ¤ë¡£
482 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð mchar_put_prop () ¤Ï0¤òÊÖ¤¹¡£¼ºÇÔ¤·¤¿¾ì¹ç¤Ï-1¤òÊÖ
487 @c MERROR_SYMBOL, @c MERROR_DB
490 mchar_define_property (), mchar_get_prop () */
/* Set the value of the property KEY for character C to VAL by storing
   it in the property's char table with mchartable_set (), whose result
   is returned.
   NOTE(review): the return type, braces and the conditions guarding the
   statements below are not visible in this excerpt.  */
493 mchar_put_prop (int c, MSymbol key, void *val)
495 MCharPropRecord *record;
/* Fetch the record registered for KEY by mchar_define_property ().  */
497 record = mplist_get (char_prop_list, key);
/* Load the table from the database stored in record->mdb -- presumably
   only while mdb is still set, i.e. on first use; the condition is not
   visible.  */
502 record->table = (*mdatabase__loader) (record->mdb);
/* Report MERROR_DB with -1 as the result value -- presumably when the
   loader produced no table; the guard is not visible.  */
504 MERROR (MERROR_DB, -1);
/* Take an additional reference on the freshly loaded table.
   NOTE(review): the visible lines of mchar_get_prop () perform the same
   load without this M17N_OBJECT_REF; confirm that the difference in
   reference counting between the two load paths is intentional.  */
505 M17N_OBJECT_REF (record->table);
508 return mchartable_set (record->table, c, val);