1 /* charset.c -- charset module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24 @addtogroup m17nCharset
25 @brief Charset objects and API for them.
27 The m17n library uses @e charset objects to represent coded
28 character sets (CCS). The m17n library supports many predefined
29 coded character sets. Moreover, application programs can add
30 other charsets. A character can belong to multiple charsets.
32 The m17n library distinguishes the following three concepts:
34 @li A @e code-point is a number assigned by the CCS to each
35 character. Code-points may or may not be continuous. The type
36 @c unsigned is used to represent a code-point. An invalid
37 code-point is represented by the macro @c MCHAR_INVALID_CODE.
39 @li A @e character @e index is the canonical index of a character
40 in a CCS. The character that has the character index N occupies
41 the Nth position when all the characters in the current CCS are
42 sorted by their code-points. Character indices in a CCS are
43 continuous and start with 0.
45 @li A @e character @e code is the internal representation in the
46 m17n library of a character. A character code is a signed integer
49 Each charset object defines how characters are converted between
50 code-points and character codes. To @e decode means converting
51 code-points to character codes and to @e encode means converting
52 character codes to code-points. */
55 @addtogroup m17nCharset
56 @brief 文字セットオブジェクトとそれに関する API
58 m17n ライブラリは、符号化文字集合 (CCS) を @e 文字セット と呼ぶオ
59 ブジェクトで表現する。m17n ライブラリは多くの符号化文字集合を予め
60 サポートしているが、アプリケーションプログラムが独自に文字セットを
61 追加することも可能である。一つの文字は複数の文字セットに属してもよ
64 m17n ライブラリには、以下の異なる概念がある:
66 @li @e コードポイント とは、CCS がその中の個々の文字に対して定義す
67 る数値である。コードポイントは連続しているとは限らない。
69 @li @e 文字インデックス とは、CCS 内で各文字に割り当てられる正規化さ
70 れたインデックスである。文字インデックスがNの文字は、CCS 中の全文字を
71 コードポイントでソートしたときにN番目に来る。
73 @li @e 文字コードとは、m17n ライブラリ内における文字の内部表現であ
74 り、21 ビット以上の長さを持つ符号付き整数である。
76 各文字セットオブジェクトは、それに属する文字のコードポイントと文字
77 コードとの相互変換を規定する。コードポイントから文字コードへの変換
78 を @e デコード と呼び、文字コードからコードポイントへの変換を @e
79 エンコード と呼ぶ。 */
82 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
83 /*** @addtogroup m17nInternal
93 #include "m17n-misc.h"
// Boundary of the character-code space handed out to Munify charsets.
// It starts at MCHAR_MAX and make_charset lowers it each time a charset
// whose method is Munify is created.
101 static int unified_max = MCHAR_MAX;
103 /** List of all charsets ever defined. */
111 static struct MCharsetList charset_list;
// Charset definitions stored by mcharset__load_from_database, keyed by
// charset name; mcharset__find looks a name up here in order to define
// the charset from its stored parameters.
113 static MPlist *charset_definition_list;
115 /** Make a charset object from the template of MCharset structure
116 CHARSET, and return a pointer to the new charset object.
117 CHARSET->code_range[4N + 2] and CHARSET->code_range[4N + 3] are not
121 make_charset (MCharset *charset)
123 unsigned min_code, max_code;
125 int *range = charset->code_range;
// Reject a dimension outside 1..4, and a final byte that is either
// positive but below '0' or above 127.
127 if (charset->dimension < 1 || charset->dimension > 4)
128 MERROR (MERROR_CHARSET, NULL);
129 if ((charset->final_byte > 0 && charset->final_byte < '0')
130 || charset->final_byte > 127)
131 MERROR (MERROR_CHARSET, NULL);
// For each of the four byte positions I, range[4I] and range[4I + 1]
// hold the minimum and maximum byte value.  Fill range[4I + 2] with the
// number of values at that position and range[4I + 3] with the running
// product of those counts (so range[15] ends up as the overall product).
133 for (i = 0, n = 1; i < 4; i++)
135 if (range[i * 4] > range[i * 4 + 1])
136 MERROR (MERROR_CHARSET, NULL);
137 range[i * 4 + 2] = range[i * 4 + 1] - range[i * 4] + 1;
138 n *= range[i * 4 + 2];
139 range[i * 4 + 3] = n;
// Pack the per-position minima (resp. maxima) into a single code-point,
// one byte per position.  A zero min_code/max_code in CHARSET is
// replaced by the packed value; a min_code below it, or a max_code
// above it, is an error.
// NOTE(review): range[] is int, so `range[12] << 24' and
// `range[13] << 24' can overflow a 32-bit signed int when the byte value
// is >= 128 -- consider converting to unsigned before shifting.
142 min_code = range[0] | (range[4] << 8) | (range[8] << 16) | (range[12] << 24);
143 if (charset->min_code == 0)
144 charset->min_code = min_code;
145 else if (charset->min_code < min_code)
146 MERROR (MERROR_CHARSET, NULL);
147 max_code = range[1] | (range[5] << 8) | (range[9] << 16) | (range[13] << 24);
148 if (charset->max_code == 0)
149 charset->max_code = max_code;
150 else if (charset->max_code > max_code)
151 MERROR (MERROR_CHARSET, NULL);
153 charset->code_range_min_code = min_code;
// A subset charset needs exactly one parent, which must not be a
// superset, and its code range shifted down by subset_offset must lie
// within the parent's code range.
155 if (charset->method == Msubset)
159 if (charset->nparents != 1)
160 MERROR (MERROR_CHARSET, NULL);
161 parent = charset->parents[0];
162 if (parent->method == Msuperset
163 || charset->min_code - charset->subset_offset < parent->min_code
164 || charset->max_code - charset->subset_offset > parent->max_code)
165 MERROR (MERROR_CHARSET, NULL);
// With an Moffset parent, decoding the two ends of the code range gives
// min_char and max_char directly.
166 if (parent->method == Moffset)
170 code = charset->min_code - charset->subset_offset;
171 charset->min_char = DECODE_CHAR (parent, code);
172 code = charset->max_code - charset->subset_offset;
173 charset->max_char = DECODE_CHAR (parent, code);
// Otherwise (presumably the else branch) every code-point of the range
// is decoded through the parent and the extremes are tracked.
177 unsigned min_code = charset->min_code - charset->subset_offset;
178 unsigned max_code = charset->max_code - charset->subset_offset;
179 int min_char = DECODE_CHAR (parent, min_code);
180 int max_char = min_char;
182 for (++min_code; min_code <= max_code; min_code++)
184 int c = DECODE_CHAR (parent, min_code);
190 else if (c > max_char)
194 charset->min_char = min_char;
195 charset->max_char = max_char;
// A superset charset needs at least two parents, and every parent's
// code range must lie within this charset's code range.
199 else if (charset->method == Msuperset)
201 int min_char = 0, max_char = 0;
203 if (charset->nparents < 2)
204 MERROR (MERROR_CHARSET, NULL);
205 for (i = 0; i < charset->nparents; i++)
206 if (charset->min_code > charset->parents[i]->min_code
207 || charset->max_code < charset->parents[i]->max_code)
208 MERROR (MERROR_CHARSET, NULL);
// NOTE(review): the loop below repeats the range check just made above.
// Also, the min_char and max_char updates form a single else-if chain,
// so a parent that both lowers min_char and raises max_char appears to
// update only min_char -- confirm whether two independent tests were
// intended.
210 for (i = 0; i < charset->nparents; i++)
212 MCharset *parent = charset->parents[i];
214 if (charset->min_code > parent->min_code
215 || charset->max_code < parent->max_code)
216 MERROR (MERROR_CHARSET, NULL);
218 min_char = parent->min_char, max_char = parent->max_char;
219 else if (parent->min_char < min_char)
220 min_char = parent->min_char;
221 else if (parent->max_char > max_char)
222 max_char = parent->max_char;
224 charset->min_char = min_char;
225 charset->max_char = max_char;
231 = (charset->dimension == 1
233 && (charset->dimension == 2
235 && (charset->dimension == 3
236 || range[10] == 256)))));
// When the code space has gaps, set bit I of code_range_mask[J] for
// every byte value J that is valid at byte position I.
238 if (! charset->no_code_gap)
242 memset (charset->code_range_mask, 0,
243 sizeof charset->code_range_mask);
244 for (i = 0; i < 4; i++)
245 for (j = range[i * 4]; j <= range[i * 4 + 1]; j++)
246 charset->code_range_mask[j] |= (1 << i);
// Moffset: max_char is min_char plus range[15] minus one; the result
// must be non-negative and must not exceed unified_max.
249 if (charset->method == Moffset)
251 charset->max_char = charset->min_char + range[15] - 1;
252 if (charset->min_char < 0
253 || charset->max_char < 0 || charset->max_char > unified_max)
254 MERROR (MERROR_CHARSET, NULL);
255 charset->simple = charset->no_code_gap;
257 else if (charset->method == Mmap || charset->method == Munify)
259 MDatabase *mdb = mdatabase_find (Mcharset, charset->name,
// Munify: lower unified_max by a block sized from range[15] and record
// the new boundary in the charset.
264 if (charset->method == Munify)
266 /* The magic number 12 below is to align to the
267 SUB_BITS_2 (defined in chartab.c) boundary in a
269 unified_max -= ((range[15] >> 12) + 1) << 12;
270 charset->unified_max = unified_max;
// The loaded plist carries the decoder as its first value and the
// encoder as its second; min_char/max_char are taken from the encoder.
273 if (! mdb || ! (plist = mdatabase_load (mdb)))
274 MERROR (MERROR_CHARSET, NULL);
275 charset->decoder = mplist_value (plist);
276 charset->encoder = mplist_value (mplist_next (plist));
277 M17N_OBJECT_UNREF (plist);
278 mchartable_range (charset->encoder,
279 &charset->min_char, &charset->max_char);
280 if (charset->method == Mmap)
281 charset->simple = charset->no_code_gap;
284 = charset->unified_max + 1 + charset->code_range[15];
287 MERROR (MERROR_CHARSET, NULL);
// Record the new charset in the list of all charsets and, when it has a
// positive final byte, in the ISO-2022 table as well.
290 MLIST_APPEND1 (&charset_list, charsets, charset, MERROR_CHARSET);
292 if (charset->final_byte > 0)
294 MLIST_APPEND1 (&mcharset__iso_2022_table, charsets, charset,
296 if (charset->revision <= 0)
298 int chars = range[2];
300 if (chars == 128) /* ASCII case */
302 else if (chars == 256) /* ISO-8859-X case */
304 MCHARSET_ISO_2022 (charset->dimension, chars, charset->final_byte)
309 charset->fully_loaded = 1;
314 load_charset_fully (MCharset *charset)
316 if (charset->method == Msubset)
318 MCharset *parent = charset->parents[0];
320 if (! parent->fully_loaded
321 && load_charset_fully (parent) < 0)
322 MERROR (MERROR_CHARSET, -1);
323 if (parent->method == Moffset)
327 code = charset->min_code - charset->subset_offset;
328 charset->min_char = DECODE_CHAR (parent, code);
329 code = charset->max_code - charset->subset_offset;
330 charset->max_char = DECODE_CHAR (parent, code);
334 unsigned min_code = charset->min_code - charset->subset_offset;
335 unsigned max_code = charset->max_code - charset->subset_offset;
336 int min_char = DECODE_CHAR (parent, min_code);
337 int max_char = min_char;
339 for (++min_code; min_code <= max_code; min_code++)
341 int c = DECODE_CHAR (parent, min_code);
347 else if (c > max_char)
351 charset->min_char = min_char;
352 charset->max_char = max_char;
355 else if (charset->method == Msuperset)
357 int min_char = 0, max_char = 0;
360 for (i = 0; i < charset->nparents; i++)
362 MCharset *parent = charset->parents[i];
364 if (! parent->fully_loaded
365 && load_charset_fully (parent) < 0)
366 MERROR (MERROR_CHARSET, -1);
368 min_char = parent->min_char, max_char = parent->max_char;
369 else if (parent->min_char < min_char)
370 min_char = parent->min_char;
371 else if (parent->max_char > max_char)
372 max_char = parent->max_char;
374 charset->min_char = min_char;
375 charset->max_char = max_char;
377 else /* charset->method is Mmap or Munify */
379 MDatabase *mdb = mdatabase_find (Mcharset, charset->name, Mnil, Mnil);
382 if (! mdb || ! (plist = mdatabase_load (mdb)))
383 MERROR (MERROR_CHARSET, -1);
384 charset->decoder = mplist_value (plist);
385 charset->encoder = mplist_value (mplist_next (plist));
386 M17N_OBJECT_UNREF (plist);
387 mchartable_range (charset->encoder,
388 &charset->min_char, &charset->max_char);
389 if (charset->method == Mmap)
390 charset->simple = charset->no_code_gap;
392 charset->max_char = charset->unified_max + 1 + charset->code_range[15];
395 charset->fully_loaded = 1;
/* Lookup cache used by mcharset__find: its key is set to the name most
   recently looked up and its value to the charset found for it.  */
402 MPlist *mcharset__cache;
404 /* Predefined charsets. */
405 MCharset *mcharset__ascii;
406 MCharset *mcharset__binary;
407 MCharset *mcharset__m17n;
408 MCharset *mcharset__unicode;
/* Table of charsets; make_charset appends a charset here when its
   final byte is positive.  */
410 MCharsetISO2022Table mcharset__iso_2022_table;
412 /** Initialize charset handler. */
/* Create the lookup cache, the two charset lists and the (initially
   empty) list of stored charset definitions.  */
419 mcharset__cache = mplist ();
420 mplist_set (mcharset__cache, Mt, NULL);
422 MLIST_INIT1 (&charset_list, charsets, 128);
423 MLIST_INIT1 (&mcharset__iso_2022_table, charsets, 128);
424 charset_definition_list = mplist ();
426 memset (mcharset__iso_2022_table.classified, 0,
427 sizeof (mcharset__iso_2022_table.classified));
/* Intern the symbols used as the charset property key, as method
   names, and as parameter keys of mchar_define_charset ().  */
429 Mcharset = msymbol ("charset");
431 Mmethod = msymbol ("method");
432 Moffset = msymbol ("offset");
433 Mmap = msymbol ("map");
434 Munify = msymbol ("unify");
435 Msubset = msymbol ("subset");
436 Msuperset = msymbol ("superset");
438 Mdimension = msymbol ("dimension");
439 Mmin_range = msymbol ("min-range");
440 Mmax_range = msymbol ("max-range");
441 Mmin_code = msymbol ("min-code");
442 Mmax_code = msymbol ("max-code");
443 Mascii_compatible = msymbol ("ascii-compatible");
444 Mfinal_byte = msymbol ("final-byte");
445 Mrevision = msymbol ("revision");
446 Mmin_char = msymbol ("min-char");
447 Mmapfile = msymbol_as_managing_key ("mapfile");
448 Mparents = msymbol_as_managing_key ("parents");
449 Msubset_offset = msymbol ("subset-offset");
450 Mdefine_coding = msymbol ("define-coding");
451 Maliases = msymbol_as_managing_key ("aliases");
455 /* Setup predefined charsets. */
/* The parameters for "ascii" are: method Moffset, range 0..0x7F,
   ASCII compatible, final byte 'B', min-char 0.  (PL presumably starts
   out pointing at PARAM -- its initialization is not visible here.)  */
456 pl = mplist_add (pl, Mmethod, Moffset);
457 pl = mplist_add (pl, Mmin_range, (void *) 0);
458 pl = mplist_add (pl, Mmax_range, (void *) 0x7F);
459 pl = mplist_add (pl, Mascii_compatible, Mt);
460 pl = mplist_add (pl, Mfinal_byte, (void *) 'B');
461 pl = mplist_add (pl, Mmin_char, (void *) 0);
462 Mcharset_ascii = mchar_define_charset ("ascii", param);
/* The same parameter list is then edited in place for each following
   charset: only max-range changes, after the final byte is cleared.  */
464 mplist_put (param, Mmax_range, (void *) 0xFF);
465 mplist_put (param, Mfinal_byte, NULL);
466 Mcharset_iso_8859_1 = mchar_define_charset ("iso-8859-1", param);
468 mplist_put (param, Mmax_range, (void *) 0x10FFFF);
469 Mcharset_unicode = mchar_define_charset ("unicode", param);
471 mplist_put (param, Mmax_range, (void *) MCHAR_MAX);
472 Mcharset_m17n = mchar_define_charset ("m17n", param);
474 mplist_put (param, Mmax_range, (void *) 0xFF);
475 Mcharset_binary = mchar_define_charset ("binary", param);
477 M17N_OBJECT_UNREF (param);
/* Keep the MCharset objects of four predefined charsets in globals.  */
479 mcharset__ascii = MCHARSET (Mcharset_ascii);
480 mcharset__binary = MCHARSET (Mcharset_binary);
481 mcharset__m17n = MCHARSET (Mcharset_m17n);
482 mcharset__unicode = MCHARSET (Mcharset_unicode);
/* Release what the charset module holds: the decoder and encoder of
   every defined charset, the lookup cache, the two charset lists, and
   the stored charset definitions.  */
488 mcharset__fini (void)
493 for (i = 0; i < charset_list.used; i++)
495 MCharset *charset = charset_list.charsets[i];
/* The decoder is released with free (), the encoder by dropping a
   reference to it.  */
497 if (charset->decoder)
498 free (charset->decoder);
499 if (charset->encoder)
500 M17N_OBJECT_UNREF (charset->encoder);
503 M17N_OBJECT_UNREF (mcharset__cache);
504 MLIST_FREE1 (&charset_list, charsets);
505 MLIST_FREE1 (&mcharset__iso_2022_table, charsets);
/* Drop the reference held on each stored definition plist (taken in
   mcharset__load_from_database), then the list itself.  */
506 MPLIST_DO (plist, charset_definition_list)
507 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
508 M17N_OBJECT_UNREF (charset_definition_list);
/* Look up the charset stored as the Mcharset property of symbol NAME.
   A definition stored for NAME in charset_definition_list can be used
   to define the charset through mchar_define_charset () (presumably
   only when the property lookup fails -- the guarding condition is not
   visible here).  The name and the charset found are recorded in
   mcharset__cache.  */
513 mcharset__find (MSymbol name)
517 charset = msymbol_get (name, Mcharset);
520 MPlist *param = mplist_get (charset_definition_list, name);
/* Reset the cache key to Mt before the charset gets defined.  */
522 MPLIST_KEY (mcharset__cache) = Mt;
/* Define the charset from a plist built from the stored definition,
   then fetch the property again.  */
525 param = mplist__from_plist (param);
526 mchar_define_charset (MSYMBOL_NAME (name), param);
527 charset = msymbol_get (name, Mcharset);
528 M17N_OBJECT_UNREF (param);
530 MPLIST_KEY (mcharset__cache) = name;
531 MPLIST_VAL (mcharset__cache) = charset;
536 /** Return the character corresponding to code-point CODE in CHARSET.
537 If CODE is invalid for CHARSET, return -1. */
540 mcharset__decode_char (MCharset *charset, unsigned code)
/* Codes below 128 are handled first for ASCII-compatible charsets.  */
544 if (code < 128 && charset->ascii_compatible)
/* A code outside [min_code, max_code] does not belong to CHARSET.  */
546 if (code < charset->min_code || code > charset->max_code)
/* Load the charset's data on first use.  */
549 if (! charset->fully_loaded
550 && load_charset_fully (charset) < 0)
551 MERROR (MERROR_CHARSET, -1);
/* Subset: remove the subset offset and decode through the parent.  */
553 if (charset->method == Msubset)
555 MCharset *parent = charset->parents[0];
557 code -= charset->subset_offset;
558 return DECODE_CHAR (parent, code);
/* Superset: try the parents in order.  */
561 if (charset->method == Msuperset)
565 for (i = 0; i < charset->nparents; i++)
567 MCharset *parent = charset->parents[i];
568 int c = DECODE_CHAR (parent, code);
/* The remaining methods work on the character index of CODE.  */
576 idx = CODE_POINT_TO_INDEX (charset, code);
580 if (charset->method == Mmap)
581 return charset->decoder[idx];
/* Munify: besides the decoder table entry, a character can be computed
   as unified_max + 1 + idx (presumably when the table has no entry for
   idx -- the test is not visible here).  */
583 if (charset->method == Munify)
585 int c = charset->decoder[idx];
588 c = charset->unified_max + 1 + idx;
592 /* Now charset->method should be Moffset. */
593 return (charset->min_char + idx);
597 /** Return the code point of character C in CHARSET. If CHARSET does not
598 contain C, return MCHAR_INVALID_CODE. */
601 mcharset__encode_char (MCharset *charset, int c)
603 if (! charset->fully_loaded
604 && load_charset_fully (charset) < 0)
605 MERROR (MERROR_CHARSET, MCHAR_INVALID_CODE);
607 if (charset->method == Msubset)
609 MCharset *parent = charset->parents[0];
610 unsigned code = ENCODE_CHAR (parent, c);
612 if (code == MCHAR_INVALID_CODE)
614 code += charset->subset_offset;
615 if (code >= charset->min_code && code <= charset->max_code)
617 return MCHAR_INVALID_CODE;
620 if (charset->method == Msuperset)
624 for (i = 0; i < charset->nparents; i++)
626 MCharset *parent = charset->parents[i];
627 unsigned code = ENCODE_CHAR (parent, c);
629 if (code != MCHAR_INVALID_CODE)
632 return MCHAR_INVALID_CODE;
635 if (c < charset->min_char || c > charset->max_char)
636 return MCHAR_INVALID_CODE;
638 if (charset->method == Mmap)
639 return (unsigned) mchartable_lookup (charset->encoder, c);
641 if (charset->method == Munify)
643 if (c > charset->unified_max)
645 c -= charset->unified_max - 1;
646 return INDEX_TO_CODE_POINT (charset, c);
648 return (unsigned) mchartable_lookup (charset->encoder, c);
651 /* Now charset->method should be Moffset */
652 c -= charset->min_char;
653 return INDEX_TO_CODE_POINT (charset, c);
/* Load the "charset-list" data from the m17n database and store each
   charset definition in charset_definition_list under the charset's
   name.  For a definition that contains Mdefine_coding followed by Mt,
   also call mconv__register_charset_coding () with the charset name.
   An entry that is not a plist starting with a symbol makes the
   function report MERROR_CHARSET and return -1.  */
657 mcharset__load_from_database ()
659 MDatabase *mdb = mdatabase_find (msymbol ("charset-list"), Mnil, Mnil, Mnil);
660 MPlist *def_list, *plist;
661 MPlist *definitions = charset_definition_list;
662 int mdebug_mask = MDEBUG_CHARSET;
667 def_list = (MPlist *) mdatabase_load (mdb);
668 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to load data."));
/* NOTE(review): the MERROR returns inside this loop skip the
   M17N_OBJECT_UNREF (def_list) done after it -- possible leak on
   malformed data; confirm.  */
674 MPLIST_DO (plist, def_list)
679 if (! MPLIST_PLIST_P (plist))
680 MERROR (MERROR_CHARSET, -1);
681 pl = MPLIST_PLIST (plist);
682 if (! MPLIST_SYMBOL_P (pl))
683 MERROR (MERROR_CHARSET, -1);
/* The first element is the charset name; the rest is its definition,
   which is stored with a reference of its own.  */
684 name = MPLIST_SYMBOL (pl);
685 pl = MPLIST_NEXT (pl);
686 definitions = mplist_add (definitions, name, pl);
687 M17N_OBJECT_REF (pl);
688 if ((pl = mplist_find_by_value (pl, Mdefine_coding))
689 && (MSymbol) MPLIST_VAL (MPLIST_NEXT (pl)) == Mt)
690 mconv__register_charset_coding (name);
693 M17N_OBJECT_UNREF (def_list);
694 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to parse the loaded data."));
700 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
705 /*** @addtogroup m17nCharset */
711 @brief Invalid code-point.
713 The macro #MCHAR_INVALID_CODE gives the invalid code-point. */
716 @brief 無効なコードポイント
718 マクロ #MCHAR_INVALID_CODE は無効なコードポイントを与える。 */
720 #define MCHAR_INVALID_CODE
724 @brief The symbol @c Mcharset.
726 Any decoded M-text has a text property whose key is the predefined
727 symbol @c Mcharset. The name of @c Mcharset is
728 <tt>"charset"</tt>. */
731 @brief シンボル @c Mcharset
733 デコードされた M-text は、キーが @c Mcharset であるようなテキスト
734 プロパティを持つ。シンボル @c Mcharset は <tt>"charset"</tt> とい
735 う名前であらかじめ定義されている。 */
741 @name Variables: Symbols representing a charset.
743 Each of the following symbols represents a predefined charset. */
746 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤òɽ¤ï¤¹ÄêµÁºÑ¤ß¥·¥ó¥Ü¥ë
748 °Ê²¼¤Î³Æ¥·¥ó¥Ü¥ë¤Ï¡¢¥¡¼¤¬ @c Mcharset ¤Ç¤¢¤ê¡¢Ãͤ¬Âбþ¤¹¤ëʸ»ú¥»¥Ã
749 ¥È¥ª¥Ö¥¸¥§¥¯¥È¡Ê @c MCharset ·¿¡Ë¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¥·¥ó¥Ü¥ë¥×¥í¥Ñ
755 @brief Symbol representing the charset ASCII.
757 The symbol #Mcharset_ascii has name <tt>"ascii"</tt> and represents
758 the charset ISO 646, USA Version X3.4-1968 (ISO-IR-6). */
760 @brief ISO 646, USA Version に対応する文字セットのシンボル
762 シンボル #Mcharset_ascii は <tt>"ascii"</tt> という名前を持ち、
763 ISO 646, USA Version X3.4-1968 (ISO-IR-6) に対応する文字セットを指
764 定するために使われる。 */
766 MSymbol Mcharset_ascii;
770 @brief Symbol representing the charset ISO/IEC 8859/1.
772 The symbol #Mcharset_iso_8859_1 has name <tt>"iso-8859-1"</tt>
773 and represents the charset ISO/IEC 8859-1:1998. */
775 @brief ISO/IEC 8859-1:1998 ¤ËÂбþ¤¹¤ëʸ»ú¥»¥Ã¥È¤Î¥·¥ó¥Ü¥ë
777 ¥·¥ó¥Ü¥ë #Mcharset_iso_8859_1 ¤Ï <tt>"iso-8859-1"</tt> ¤È¤¤¤¦Ì¾
778 Á°¤ò»ý¤Á¡¢ISO/IEC 8859-1:1998 ¤ËÂбþ¤¹¤ëʸ»ú¥»¥Ã¥È¤ò»ØÄꤹ¤ë¤¿¤á¤Ë
781 MSymbol Mcharset_iso_8859_1;
784 @brief Symbol representing the charset Unicode.
786 The symbol #Mcharset_unicode has name <tt>"unicode"</tt> and
787 represents the charset Unicode. */
789 @brief Unicode に対応する文字セットのシンボル
791 シンボル #Mcharset_unicode は <tt>"unicode"</tt> という名前を持
792 ち、Unicode に対応する文字セットを指定するために使われる。 */
794 MSymbol Mcharset_unicode;
798 @brief Symbol representing the largest charset.
800 The symbol #Mcharset_m17n has name <tt>"m17n"</tt> and
801 represents the charset that contains all characters supported by
804 @brief Á´Ê¸»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤Î¥·¥ó¥Ü¥ë
806 ¥·¥ó¥Ü¥ë #Mcharset_m17n ¤Ï <tt>"m17n"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
807 m17n ¥é¥¤¥Ö¥é¥ê¤¬°·¤¦Á´¤Æ¤Îʸ»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤ò»ØÄꤹ¤ë¤¿¤á¤Ë»È
810 MSymbol Mcharset_m17n;
814 @brief Symbol representing the charset for ill-decoded characters.
816 The symbol #Mcharset_binary has name <tt>"binary"</tt> and
817 represents the fake charset which the decoding functions put to an
818 M-text as a text property when they encounter an invalid byte
819 (sequence). See @ref m17nConv @latexonly
820 (P.\pageref{group__m17nConv}) @endlatexonly for more detail. */
822 MSymbol Mcharset_binary;
829 @name Variables: Parameter keys for mchar_define_charset ().
831 These are the predefined symbols to use as parameter keys for the
832 function mchar_define_charset () (which see). */
835 @name ÊÑ¿ô: mchar_define_charset ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼
837 ¤³¤ì¤é¤Ï¡¢´Ø¿ô mchar_define_charset () ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼¤È¤·¤Æ
838 »È¤ï¤ì¤ë¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£ ¾Ü¤·¤¯¤Ï¤³¤Î´Ø¿ô¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£*/
843 Parameter key for mchar_define_charset () (which see). */
851 MSymbol Mascii_compatible;
857 MSymbol Msubset_offset;
858 MSymbol Mdefine_coding;
865 @name Variables: Symbols representing charset methods.
867 These are the predefined symbols that can be a value of the
868 #Mmethod parameter of a charset used in an argument to the
869 mchar_define_charset () function.
871 A method specifies how code-points and character codes are
872 converted. See the documentation of the mchar_define_charset ()
873 function for the details. */
876 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤Î¥á¥½¥Ã¥É»ØÄê¤Ë»È¤ï¤ì¤ë¥·¥ó¥Ü¥ë
878 ¤³¤ì¤é¤Ï¡¢Ê¸»ú¥»¥Ã¥È¤Î @e ¥á¥½¥Ã¥É ¤ò»ØÄꤹ¤ë¤¿¤á¤Î¥·¥ó¥Ü¥ë¤Ç¤¢¤ê¡¢
879 ´Ø¿ô mchar_define_charset () ¤Î°ú¿ô¤È¤·¤Æ»È¤ï¤ì¤ë¡£
881 ¥á¥½¥Ã¥É¤È¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥³¡¼¥É¤òÁê¸ßÊÑ´¹¤¹¤ëºÝ¤ÎÊý¼°¤Î¤³
882 ¤È¤Ç¤¢¤ë¡£¾Ü¤·¤¯¤Ï´Ø¿ô mchar_define_charset () ¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£ */
886 @brief Symbol for the offset type method of charset.
888 The symbol #Moffset has the name <tt>"offset"</tt> and, when used
889 as a value of #Mmethod parameter of a charset, it means that the
890 conversion of code-points and character codes of the charset is
891 done by this calculation:
894 CHARACTER-CODE = CODE-POINT - MIN-CODE + MIN-CHAR
897 where, MIN-CODE is a value of #Mmin_code parameter of the charset,
898 and MIN-CHAR is a value of #Mmin_char parameter. */
901 @brief ¥ª¥Õ¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë
903 ¥·¥ó¥Ü¥ë #Moffset ¤Ï <tt>"offset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
904 mchar_define_charset () ¤Ç¥ª¥Õ¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò»ØÄꤹ¤ë¾ì¹ç¤Î°ú
905 ¿ô¤È¤·¤ÆÍѤ¤¤é¤ì¤ë¡£*/
910 /***en @brief Symbol for the map type method of charset.
912 The symbol #Mmap has the name <tt>"map"</tt> and, when used as a
913 value of #Mmethod parameter of a charset, it means that the
914 conversion of code-points and character codes of the charset is
915 done by map looking up. The map must be given by #Mmapfile
918 /***ja @brief ¥Þ¥Ã¥×·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë
920 ¥·¥ó¥Ü¥ë #Mmap ¤Ï <tt>"map"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
921 mchar_define_charset () ¤Ç¥Þ¥Ã¥×·¿¤Î¥á¥½¥Ã¥É¤ò»ØÄꤹ¤ë¾ì¹ç¤Î°ú¿ô¤È
927 /***en @brief Symbol for the unify type method of charset.
929 The symbol #Munify has the name <tt>"unify"</tt> and, when used as
930 a value of #Mmethod parameter of a charset, it means that the
931 conversion of code-points and character codes of the charset is
932 done by map looking up and offsetting. The map must be given by
933 #Mmapfile parameter. For this kind of charset, a unique
934 contiguous character code space for all characters is assigned.
935 If the map has an entry for a code-point, the conversion is done
936 by looking up the map. Otherwise, the conversion is done by this
940 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
943 where, MIN-CODE is a value of #Mmin_code parameter of the charset,
944 and LOWEST-CHAR-CODE is the lowest character code of the assigned
947 /***ja @brief Áê³·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë
949 ¥·¥ó¥Ü¥ë #Minherit ¤Ï <tt>"inherit"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
950 mchar_define_charset () ¤ÇÁê³·¿¤Î¥á¥½¥Ã¥É¤ò»ØÄꤹ¤ë¾ì¹ç¤Î°ú¿ô¤È¤·
957 @brief Symbol for the subset type method of charset.
959 The symbol #Msubset has the name <tt>"subset"</tt> and, when used
960 as a value of #Mmethod parameter of a charset, it means that the
961 charset is a subset of a parent charset. The parent charset must
962 be given by #Mparents parameter. The conversion of code-points
963 and character codes of the charset is done conceptually by this
967 CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
970 where, PARENT-CODE is a pseudo function that returns a character
971 code of CODE-POINT in the parent charset, and SUBSET-OFFSET is a
972 value given by #Msubset_offset parameter. */
978 @brief Symbol for the superset type method of charset.
980 The symbol #Msuperset has the name <tt>"superset"</tt> and, when
981 used as a value of #Mmethod parameter of a charset, it means that
982 the charset is a superset of parent charsets. The parent charsets
983 must be given by #Mparents parameter. */
990 @brief Define a charset.
992 The mchar_define_charset () function defines a new charset and
993 makes it accessible via a symbol whose name is $NAME. $PLIST
994 specifies parameters of the charset as below:
998 <li> Key is #Mmethod, value is a symbol.
1000 The value specifies the method for decoding/encoding code-points
1001 in the charset. It must be #Moffset, #Mmap (default), #Munify,
1002 #Msubset, or #Msuperset.
1004 <li> Key is #Mdimension, value is an integer
1006 The value specifies the dimension of code-points of the charset.
1007 It must be 1 (default), 2, 3, or 4.
1009 <li> Key is #Mmin_range, value is an unsigned integer
1011 The value specifies the minimum range of a code-point, which means
1012 that the Nth byte of the value is the minimum Nth byte of
1013 code-points of the charset. The default value is 0.
1015 <li> Key is #Mmax_range, value is an unsigned integer
1017 The value specifies the maximum range of a code-point, which means
1018 that the Nth byte of the value is the maximum Nth byte of
1019 code-points of the charset. The default value is 0xFF, 0xFFFF,
1020 0xFFFFFF, or 0xFFFFFFFF if the dimension is 1, 2, 3, or 4
1023 <li> Key is #Mmin_code, value is an unsigned integer
1025 The value specifies the minimum code-point of
1026 the charset. The default value is the minimum range.
1028 <li> Key is #Mmax_code, value is an unsigned integer
1030 The value specifies the maximum code-point of
1031 the charset. The default value is the maximum range.
1033 <li> Key is #Mascii_compatible, value is a symbol
1035 The value specifies whether the charset is ASCII compatible or
1036 not. If the value is #Mnil (default), it is not ASCII
1037 compatible, else compatible.
1039 <li> Key is #Mfinal_byte, value is an integer
1041 The value specifies the @e final @e byte of the charset registered
1042 in The International Registry. It must be 0 (default) or 48..127.
1043 The value 0 means that the charset is not in the registry.
1045 <li> Key is #Mrevision, value is an integer
1047 The value specifies the @e revision @e number of the charset
1048 registered in The International Registry. It must be 0..127. If
1049 the charset is not in The International Registry, the value is
1050 ignored. The value 0 means that the charset has no revision
1053 <li> Key is #Mmin_char, value is an integer
1055 The value specifies the minimum character code of the charset.
1056 The default value is 0.
1058 <li> Key is #Mmapfile, value is an M-text
1060 If the method is #Mmap or #Munify, data that contains
1061 mapping information is added to the m17n database by calling
1062 mdatabase_define () with the value as an argument $EXTRA_INFO,
1063 i.e. the value is used as a file name of the data.
1065 Otherwise, this parameter is ignored.
1067 <li> Key is #Mparents, value is a plist
1069 If the method is #Msubset, the value must be a plist of length
1070 1, and the value of the plist must be a symbol representing a
1073 If the method is #Msuperset, the value must be a plist of length
1074 less than 9, and the values of the plist must be symbols
1075 representing subset charsets.
1077 Otherwise, this parameter is ignored.
1079 <li> Key is #Mdefine_coding, value is a symbol
1081 If the dimension of the charset is 1, the value specifies whether
1082 or not to define a coding system of the same name whose method is
1085 Otherwise, this parameter is ignored.
1090 If the operation was successful, mchar_define_charset () returns a
1091 symbol whose name is $NAME. Otherwise it returns #Mnil and
1092 assigns an error code to the external variable #merror_code. */
1095 @brief ʸ»ú¥»¥Ã¥È¤òÄêµÁ¤¹¤ë.
1097 ´Ø¿ô mchar_define_charset () ¤Ï¿·¤·¤¤Ê¸»ú¥»¥Ã¥È¤òÄêµÁ¤·¡¢¤½¤ì¤ò
1098 $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·¥ó¥Ü¥ë·Ðͳ¤Ç¥¢¥¯¥»¥¹¤Ç¤¤ë¤è¤¦¤Ë¤¹¤ë¡£$METHOD
1099 ¤Ï¤½¤Îʸ»ú¥»¥Ã¥È¤Ë¤ª¤±¤ë¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¥Ç¥³¡¼¥É¡¿¥¨¥ó¥³¡¼¥É¥á¥½¥Ã
1100 ¥É¤ò»ØÄꤹ¤ë¥·¥ó¥Ü¥ë¤Ç¤¢¤ê¡¢#Moffset, #Mmap, #Munify,
1101 #Msubset, #Msuperset ¤Î¤¤¤º¤ì¤«¤ÎÃͤò¤È¤ë¡£
1103 $DIMENSION ¤Ï¤½¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¼¡¸µ¤Ç¤¢¤ê¡¢1, 2, 3,
1104 4¤Î¤¤¤º¤ì¤«¤ÎÃͤò¤È¤ë¡£
1106 $CODE_RANGE ¤ÏÂ礤µ¤¬8¥Ð¥¤¥È¤ÎÇÛÎó¤Ç¤¢¤ê¡¢ÄêµÁ¤µ¤ì¤ëʸ»ú¥»¥Ã¥È¤Î
1107 ¥³¡¼¥É¥Ý¥¤¥ó¥È¶õ´Ö¤òɽ¤ï¤¹¡£Âè1¥Ð¥¤¥È¤ÈÂè2¥Ð¥¤¥È¤ÎÃͤϥ³¡¼¥É¥Ý¥¤¥ó
1108 ¥È¤ÎºÇ½é¤Î¼¡¸µ¤Ç¤ÎºÇ¾®¡¿ºÇÂç¥Ð¥¤¥È¤ÎÃͤǤ¢¤ë¡£Âè3¥Ð¥¤¥È¤ÈÂè4¥Ð¥¤¥È
1109 ¤Ï¡¢2ÈÖÌܤμ¡¸µ¤ÎºÇ¾®¡¿ºÇÂçÃͤǤ¢¤ê¡¢ °Ê²¼Æ±Íͤ˳¤¯¡£°ìÈÌŪ¤Ë¡¢Âè
1110 (2N-1)¥Ð¥¤¥È¤ÈÂè(2N)¥Ð¥¤¥È¤¬NÈÖÌܤμ¡¸µ¤ÎºÇ¾®¡¿ºÇÂçÃͤȤʤë (N =
1111 1, 2, 3, 4)¡£¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î @e ʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹ ¤Ï¤³¤ì¤é¤ÎÃͤ«¤é
1114 $MIN_CODE ¤È $MAX_CODE ¤Ï¡¢¤½¤ì¤¾¤ì¤³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇ¾®¤ª¤è¤ÓºÇÂç
1115 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤òɽ¤ï¤¹¡£0¤¬»ØÄꤵ¤ì¤¿¾ì¹ç¤Ï $CODE_RANGE ¤ÎÃͤ«¤é·×
1118 $FINAL_BYTE ¤Ï The International Registry ¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤ë
1119 @e ½ªÃ¼¥Ð¥¤¥È ¤Ç¤¢¤ë¡£ÅÐÏ¿¤µ¤ì¤Æ¤¤¤Ê¤¤ CCS ¤Î¾ì¹ç¤Ë¤Ï -1 ¤Ç¤Ê¤¯¤Æ¤Ï
1122 $REVISION ¤Ï¡¢The International Registry ¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤ë@e revision
1123 @e number ¤Ç¤¢¤ë¡£¤â¤· revision number ¤¬Â¸ºß¤·¤Ê¤¤¤Ê¤é -1 ¤Ç¤Ê¤¯
1126 @par ¥á¥½¥Ã¥É¤¬ Moffset ¤Î¾ì¹ç
1128 $MIN_CHAR ¤Ë¤ÏºÇ¾®¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤ËÂбþ¤¹¤ëʸ»ú¥³¡¼¥É¤òÍ¿¤¨¤ë¡£
1129 $NPARENTS, $PARENTS, ¤ª¤è¤Ó $SUBSET_OFFSET ¤Ï̵»ë¤µ¤ì¤ë¡£
1131 @par ¥á¥½¥Ã¥É¤¬ Mmap ¤Î¾ì¹ç
1133 m17n ¸À¸ì¾ðÊó¥Ù¡¼¥¹Ãæ¤Ç \<#Mcharset, $NAME\> ¤È¤¤¤¦¥¿¥°¤ÎÉÕ¤¤¤¿
1134 ¥Þ¥Ã¥Ô¥ó¥°¥Æ¡¼¥Ö¥ë¤ò¡¢¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤ËÍѤ¤¤ë¡£$MIN_CHAR,
1135 $NPARENTS, $PARENTS, ¤ª¤è¤Ó $SUBSET_OFFSET ¤Ï̵»ë¤µ¤ì¤ë¡£
1137 @par ¥á¥½¥Ã¥É¤¬ Msubset ¤Î¾ì¹ç
1139 $NPARENTS ¤Ï1¤Ç¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£¤Þ¤¿ $PARENTS ¤Ï·Ñ¾µ¤Î¸µ¤È¤Ê¤ëʸ
1140 »ú¥»¥Ã¥È¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¸µ¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É
1141 ¥Ý¥¤¥ó¥È¤Ë $SUBSET_OFFSET ¤ò²Ã¤¨¤¿¤â¤Î¤¬¡¢¿·¤·¤¤Ê¸»ú¥»¥Ã¥ÈÃæ¤Ç¤Î¥³¡¼
1142 ¥É¥Ý¥¤¥ó¥È¤Ë¤Ê¤ë¡£$MIN_CHAR ¤Ï̵»ë¤µ¤ì¤ë¡£
1144 @par ¥á¥½¥Ã¥É¤¬ Msuperset ¤Î¾ì¹ç
1146 $NPARENTS ¤Ï¿Æ¤È¤Ê¤ëʸ»ú¥»¥Ã¥È¤Î¿ô¡¢$PARENTS ¤Ï¿Æʸ»ú¥»¥Ã¥È¤Î¥·¥ó
1147 ¥Ü¥ë¤ÎÇÛÎó¤òɽ¤ï¤¹¡£$MIN_CHAR ¤ª¤è¤Ó $SUBSET_OFFSET ¤Ï̵»ë¤µ¤ì¤ë¡£
1150 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mchar_define_charset () ¤Ï $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·
1151 ¥ó¥Ü¥ë¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð #Mnil ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô @c
1152 merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£ */
1156 @c MERROR_CHARSET */
/* Define a charset named NAME from the properties in PLIST: allocate an
   MCharset record, fill it from PLIST, and attach it to the symbol for
   NAME (and to any aliases) under the Mcharset key.

   NOTE(review): this listing omits a number of original lines (braces,
   some declarations, `else' keywords and some `if' conditions).  The
   comments below describe only the statements that are visible; where
   a guarding condition is missing it is marked as an assumption.  */
1159 mchar_define_charset (char *name, MPlist *plist)
1161 MSymbol sym = msymbol (name);
1164 unsigned min_range, max_range;
1166 MText *mapfile = (MText *) mplist_get (plist, Mmapfile);
/* Allocate the charset record; MERROR_CHARSET is the error code handed
   to the allocation macro.  */
1168 MSTRUCT_CALLOC (charset, MERROR_CHARSET);
1169 charset->name = sym;
/* Take the method from PLIST.  When none is given, one of Mmap or
   Moffset is chosen; the selecting condition is not visible here
   (presumably whether a map file was supplied -- TODO confirm).  */
1170 charset->method = (MSymbol) mplist_get (plist, Mmethod);
1171 if (! charset->method)
1174 charset->method = Mmap;
1176 charset->method = Moffset;
/* Map-based methods: register the map file data in the database under
   the tag <Mcharset, sym>.  The MERROR below is presumably guarded by
   a check (not visible) that MAPFILE is non-null, since mapfile->data
   is dereferenced right after it.  */
1178 if (charset->method == Mmap || charset->method == Munify)
1181 MERROR (MERROR_CHARSET, Mnil);
1182 mdatabase_define (Mcharset, sym, Mnil, Mnil, NULL, mapfile->data);
/* A missing (zero) dimension defaults to 1.  */
1184 if (! (charset->dimension = (int) mplist_get (plist, Mdimension)))
1185 charset->dimension = 1;
1187 min_range = (unsigned) mplist_get (plist, Mmin_range);
/* If Mmax_range is given, raise the dimension as needed so that it can
   hold that many bytes.  Otherwise derive max_range from the dimension
   (only the 3-byte and 4-byte assignments are visible here).  */
1188 if ((pl = mplist_find_by_key (plist, Mmax_range)))
1190 max_range = (unsigned) MPLIST_VAL (pl);
1191 if (max_range >= 0x1000000)
1192 charset->dimension = 4;
1193 else if (max_range >= 0x10000 && charset->dimension < 3)
1194 charset->dimension = 3;
1195 else if (max_range >= 0x100 && charset->dimension < 2)
1196 charset->dimension = 2;
1198 else if (charset->dimension == 1)
1200 else if (charset->dimension == 2)
1202 else if (charset->dimension == 3)
1203 max_range = 0xFFFFFF;
1205 max_range = 0xFFFFFFFF;
/* code_range uses four slots per dimension; slot 0 receives the minimum
   byte and slot 1 the maximum byte of that dimension, starting from
   the low-order byte.  The remaining slots stay zero here.  */
1207 memset (charset->code_range, 0, sizeof charset->code_range);
1208 for (i = 0; i < charset->dimension; i++, min_range >>= 8, max_range >>= 8)
1210 charset->code_range[i * 4] = min_range & 0xFF;
1211 charset->code_range[i * 4 + 1] = max_range & 0xFF;
/* Clamp min_code / max_code against min_range / max_range.
   NOTE(review): the loop above shifts min_range and max_range right by
   8 bits per dimension, so the values compared here are the shifted
   ones, not the ranges read from PLIST -- confirm this is intended.  */
1213 if ((charset->min_code = (int) mplist_get (plist, Mmin_code)) < min_range)
1214 charset->min_code = min_range;
1215 if ((charset->max_code = (int) mplist_get (plist, Mmax_code)) > max_range)
1216 charset->max_code = max_range;
/* Remaining scalar properties are copied straight from PLIST.  */
1217 charset->ascii_compatible
1218 = (MSymbol) mplist_get (plist, Mascii_compatible) != Mnil;
1219 charset->final_byte = (int) mplist_get (plist, Mfinal_byte);
1220 charset->revision = (int) mplist_get (plist, Mrevision);
1221 charset->min_char = (int) mplist_get (plist, Mmin_char);
/* Parents: at most 8 are used (extra entries are dropped without an
   error).  Each entry must be a symbol that already names a charset,
   otherwise the definition fails with MERROR_CHARSET.  */
1222 pl = (MPlist *) mplist_get (plist, Mparents);
1223 charset->nparents = pl ? mplist_length (pl) : 0;
1224 if (charset->nparents > 8)
1225 charset->nparents = 8;
1226 for (i = 0; i < charset->nparents; i++, pl = MPLIST_NEXT (pl))
1228 MSymbol parent_name;
1230 if (MPLIST_KEY (pl) != Msymbol)
1231 MERROR (MERROR_CHARSET, Mnil);
1232 parent_name = MPLIST_SYMBOL (pl);
1233 if (! (charset->parents[i] = MCHARSET (parent_name)))
1234 MERROR (MERROR_CHARSET, Mnil);
1237 charset->subset_offset = (int) mplist_get (plist, Msubset_offset);
/* Attach the record to SYM, let make_charset () process it (its return
   value replaces CHARSET), then also attach the result to the
   canonicalized form of SYM.  */
1239 msymbol_put (sym, Mcharset, charset);
1240 charset = make_charset (charset);
1243 msymbol_put (msymbol__canonicalize (sym), Mcharset, charset);
/* Register every alias symbol, and its canonicalized form, for the same
   charset.  The walk stops at the first element whose key is not
   Msymbol.  */
1245 for (pl = (MPlist *) mplist_get (plist, Maliases);
1246 pl && MPLIST_KEY (pl) == Msymbol;
1247 pl = MPLIST_NEXT (pl))
1249 MSymbol alias = MPLIST_SYMBOL (pl);
1251 msymbol_put (alias, Mcharset, charset);
1252 msymbol_put (msymbol__canonicalize (alias), Mcharset, charset);
/* When Mdefine_coding is set and the charset is one-dimensional with
   byte range 0..255, also register a coding for SYM.  */
1255 if (mplist_get (plist, Mdefine_coding)
1256 && charset->dimension == 1
1257 && charset->code_range[0] == 0 && charset->code_range[1] == 255)
1258 mconv__register_charset_coding (sym);
1265 @brief Resolve charset name.
1267 The mchar_resolve_charset () function returns $SYMBOL if it
1268 represents a charset. Otherwise, it canonicalizes $SYMBOL as a
1269 charset name and, if the canonicalized name represents a charset,
1270 returns the name of that charset. Otherwise, it returns #Mnil. */
/* Look up the charset attached to SYMBOL under the Mcharset key and
   return that charset's name, or Mnil when no charset is found.
   The lookup is repeated with the canonicalized form of SYMBOL.
   NOTE(review): the condition guarding the second lookup is on a line
   omitted from this listing -- presumably it runs only when the first
   lookup found nothing.  */
1273 mchar_resolve_charset (MSymbol symbol)
1275 MCharset *charset = (MCharset *) msymbol_get (symbol, Mcharset);
1279 symbol = msymbol__canonicalize (symbol);
1280 charset = (MCharset *) msymbol_get (symbol, Mcharset);
1283 return (charset ? charset->name : Mnil);
1289 @brief List symbols representing a charset.
1291 The mchar_list_charset () function makes an array of symbols
1292 representing charsets, stores the pointer to the array in a place
1293 pointed to by $SYMBOLS, and returns the length of the array. */
/* Allocate an array of charset_list.used symbols, store its address in
   *SYMBOLS, and fill it with the name of every charset in charset_list
   (in list order).  MERROR_CHARSET is the error code handed to the
   allocation macro.  The return statement is not visible in this
   listing.  */
1296 mchar_list_charset (MSymbol **symbols)
1300 MTABLE_MALLOC ((*symbols), charset_list.used, MERROR_CHARSET);
1301 for (i = 0; i < charset_list.used; i++)
1302 (*symbols)[i] = charset_list.charsets[i]->name;
1309 @brief Decode a code-point.
1311 The mchar_decode () function decodes code-point $CODE in the
1312 charset represented by the symbol $CHARSET_NAME to get a character
1316 If decoding was successful, mchar_decode () returns the decoded
1317 character code. Otherwise it returns -1. */
1320 @brief コードポイントをデコードする
1322 関数 mchar_decode () は、シンボル $CHARSET_NAME で示される文字セッ
1323 ト内の $CODE というコードポイントをデコードして文字コードを得る。
1326 デコードが成功すれば、mchar_decode () はデコードされた文字コードを
1327 返す。そうでなければ -1 を返す。 */
/* Decode code-point CODE in the charset named CHARSET_NAME.
   The charset is obtained with MCHARSET; the result on the normal path
   is whatever DECODE_CHAR yields for (charset, code).
   NOTE(review): the condition guarding the MCHAR_INVALID_CODE return
   is on a line omitted from this listing -- presumably it is taken
   when no charset is found for CHARSET_NAME.  */
1334 mchar_decode (MSymbol charset_name, unsigned code)
1336 MCharset *charset = MCHARSET (charset_name);
1339 return MCHAR_INVALID_CODE;
1340 return DECODE_CHAR (charset, code);
1346 @brief Encode a character code.
1348 The mchar_encode () function encodes character code $C to get a
1349 code-point in the charset represented by the symbol $CHARSET_NAME.
1352 If encoding was successful, mchar_encode () returns the encoded
1353 code-point. Otherwise it returns #MCHAR_INVALID_CODE. */
1356 @brief 文字コードをエンコードする
1358 関数 mchar_encode () は、文字コード $C をエンコードしてシンボル
1359 $CHARSET_NAME で示される文字セット内におけるコードポイントを得る。
1362 エンコードが成功すれば、mchar_encode () はエンコードされたコードポイ
1363 ントを返す。そうでなければ #MCHAR_INVALID_CODE を返す。 */
/* Encode character code C into a code-point of the charset named
   CHARSET_NAME.  The charset is obtained with MCHARSET; the result on
   the normal path is whatever ENCODE_CHAR yields for (charset, c).
   NOTE(review): the condition guarding the MCHAR_INVALID_CODE return
   is on a line omitted from this listing -- presumably it is taken
   when no charset is found for CHARSET_NAME.  */
1370 mchar_encode (MSymbol charset_name, int c)
1372 MCharset *charset = MCHARSET (charset_name);
1375 return MCHAR_INVALID_CODE;
1376 return ENCODE_CHAR (charset, c);
1382 @brief Call a function for all the characters in a specified charset.
1384 The mchar_map_charset () function calls $FUNC for all the
1385 characters in the charset named $CHARSET_NAME. A call is done for
1386 a chunk of consecutive characters rather than character by
1389 $FUNC receives three arguments: $FROM, $TO, and $ARG. $FROM and
1390 $TO specify the range of character codes in the charset. $ARG is the
1394 If the operation was successful, mchar_map_charset () returns 0.
1395 Otherwise, it returns -1 and assigns an error code to the external
1396 variable #merror_code. */
1399 @brief 指定した文字セットのすべての文字に対して関数を呼ぶ
1401 関数 mchar_map_charset () は $CHARSET_NAME という名前を持つ文字セッ
1402 ト中のすべての文字に対して $FUNC を呼ぶ。呼び出しは一文字毎ではな
1403 く、連続した文字のまとまり単位で行なわれる。
1405 関数 $FUNC には$FROM, $TO, $ARG の３引数が渡される。$FROM と $TO
1406 は文字セット中の文字コードの範囲を指定する。$ARG は $FUNC_ARG と同
1410 処理に成功すれば mchar_map_charset () は 0 を返す。そうでなければ
1411 -1 を返し、外部変数 #merror_code にエラーコードを設定する。 */
1415 @c MERROR_CHARSET */
1418 mchar_map_charset (MSymbol charset_name,
1419 void (*func) (int from, int to, void *arg),
1424 charset = MCHARSET (charset_name);
1426 MERROR (MERROR_CHARSET, -1);
1428 if (charset->encoder)
1430 int c = charset->min_char;
1433 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) < 0)
1435 while (c <= charset->max_char)
1437 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) >= 0)
1438 (*func) (c, next_c - 1, func_arg);
1443 (*func) (charset->min_char, charset->max_char, func_arg);