1 /* charset.c -- charset module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 @addtogroup m17nCharset
24 @brief Charset objects and API for them.
26 The m17n library uses @e charset objects to represent coded
27 character sets (CCS). The m17n library supports many predefined
28 coded character sets. Moreover, application programs can add
29 other charsets. A character can belong to multiple charsets.
31 The m17n library distinguishes the following three concepts:
33 @li A @e code-point is a number assigned by the CCS to each
34 character. Code-points may or may not be continuous. The type
35 @c unsigned is used to represent a code-point. An invalid
36 code-point is represented by the macro @c MCHAR_INVALID_CODE.
38 @li A @e character @e index is the canonical index of a character
39 in a CCS. The character that has the character index N occupies
40 the Nth position when all the characters in the current CCS are
41 sorted by their code-points. Character indices in a CCS are
42 continuous and start with 0.
44 @li A @e character @e code is the internal representation in the
45 m17n library of a character. A character code is a signed integer
48 Each charset object defines how characters are converted between
49 code-points and character codes. To @e decode means converting
50 code-points to character codes and to @e encode means converting
51 character codes to code-points. */
54 @addtogroup m17nCharset
55 @brief ʸ»ú¥»¥Ã¥È¥ª¥Ö¥¸¥§¥¯¥È¤È¤½¤ì¤Ë´Ø¤¹¤ë API.
57 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¡¢Éä¹æ²½Ê¸»ú½¸¹ç (CCS) ¤ò @e ʸ»ú¥»¥Ã¥È
58 ¤È¸Æ¤Ö¥ª¥Ö¥¸¥§¥¯¥È¤Çɽ¸½¤¹¤ë¡£
59 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¿¤¯¤ÎÉä¹æ²½Ê¸»ú½¸¹ç¤ò¤¢¤é¤«¤¸¤á¥µ¥Ý¡¼¥È¤·¤Æ¤¤¤ë¤·¡¢¥¢¥×¥ê¥±¡¼¥·¥ç¥ó¥×¥í¥°¥é¥à¤¬Æȼ«¤Ëʸ»ú¥»¥Ã¥È¤òÄɲ乤뤳¤È¤â²Äǽ¤Ç¤¢¤ë¡£
60 °ì¤Ä¤Îʸ»ú¤ÏÊ£¿ô¤Îʸ»ú¥»¥Ã¥È¤Ë°¤·¤Æ¤â¤è¤¤¡£
62 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¡¢°Ê²¼¤Î³µÇ°¤ò¶èÊ̤·¤Æ¤¤¤ë:
64 @li @e ¥³¡¼¥É¥Ý¥¤¥ó¥È ¤È¤Ï¡¢CCS ¤¬¤½¤ÎÃæ¤Î¸Ä¡¹¤Îʸ»ú¤ËÂФ·¤ÆÄêµÁ¤¹¤ë¿ôÃͤǤ¢¤ë¡£
65 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤ÏϢ³¤·¤Æ¤¤¤ë¤È¤Ï¸Â¤é¤Ê¤¤¡£¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ï
66 @c unsigned ·¿¤Ë¤è¤Ã¤Æɽ¤µ¤ì¤ë¡£Ìµ¸ú¤Ê¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ï¥Þ¥¯¥í
67 @c MCHAR_INVALID_CODE ¤Çɽ¤µ¤ì¤ë¡£
69 @li @e ʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹ ¤È¤Ï¡¢CCS Æâ¤Ç³Æʸ»ú¤Ë³ä¤êÅö¤Æ¤é¤ì¤ëÀµµ¬²½¤µ¤ì¤¿¥¤¥ó¥Ç¥Ã¥¯¥¹¤Ç¤¢¤ë¡£
70 ʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹¤¬ N ¤Îʸ»ú¤Ï¡¢CCS Ãæ¤ÎÁ´Ê¸»ú¤ò¥³¡¼¥É¥Ý¥¤¥ó¥È½ç¤Ëʤ٤¿¤È¤¤Ë N ÈÖÌܤ˸½¤ï¤ì¤ë¡£
71 CCS Ãæ¤Îʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹¤ÏϢ³¤·¤Æ¤ª¤ê¡¢0 ¤«¤é»Ï¤Þ¤ë¡£
73 @li @e ʸ»ú¥³¡¼¥É ¤È¤Ï¡¢m17n ¥é¥¤¥Ö¥é¥êÆâ¤Ë¤ª¤±¤ëʸ»ú¤ÎÆâÉôɽ¸½¤Ç¤¢¤ê¡¢21 ¥Ó¥Ã¥È°Ê¾å¤ÎŤµ¤ò»ý¤ÄÉä¹çÉÕ¤À°¿ô¤Ç¤¢¤ë¡£
75 ³Æʸ»ú¥»¥Ã¥È¥ª¥Ö¥¸¥§¥¯¥È¤Ï¡¢¤½¤Îʸ»ú¥»¥Ã¥È¤Ë°¤¹¤ëʸ»ú¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥³¡¼¥É¤È¤Î´Ö¤ÎÊÑ´¹¤òµ¬Äꤹ¤ë¡£
76 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤«¤éʸ»ú¥³¡¼¥É¤Ø¤ÎÊÑ´¹¤ò @e ¥Ç¥³¡¼¥É
77 ¤È¸Æ¤Ó¡¢Ê¸»ú¥³¡¼¥É¤«¤é¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ø¤ÎÊÑ´¹¤ò @e ¥¨¥ó¥³¡¼¥É ¤È¸Æ¤Ö¡£ */
81 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
82 /*** @addtogroup m17nInternal
92 #include "m17n-misc.h"
101 static int unified_max;
103 /** List of all charsets ever defined. */
111 static struct MCharsetList charset_list;
113 static MPlist *charset_definition_list;
115 /** Make a charset object from the template of MCharset structure
116 CHARSET, and return a pointer to the new charset object.
117 CHARSET->code_range[4N + 2] and CHARSET->code_range[4N + 3] are
/* make_charset: validate the MCharset template CHARSET, fill in the
   derived entries of its code_range table, and append it to
   charset_list.  Every validation failure visible below goes through
   MERROR (MERROR_CHARSET, NULL).  */
121 make_charset (MCharset *charset)
123 unsigned min_code, max_code;
125 int *range = charset->code_range;
/* The dimension must be 1..4, and a positive final_byte must lie in
   '0'..127.  */
127 if (charset->dimension < 1 || charset->dimension > 4)
128 MERROR (MERROR_CHARSET, NULL);
129 if ((charset->final_byte > 0 && charset->final_byte < '0')
130 || charset->final_byte > 127)
131 MERROR (MERROR_CHARSET, NULL);
/* For each of the four byte positions i, range[4i] and range[4i + 1]
   hold the minimum and maximum byte.  range[4i + 2] receives the
   number of byte values at that position and range[4i + 3] the
   running product of those counts, so range[15] ends up as the
   product over all four positions.  */
133 for (i = 0, n = 1; i < 4; i++)
135 if (range[i * 4] > range[i * 4 + 1])
136 MERROR (MERROR_CHARSET, NULL);
137 range[i * 4 + 2] = range[i * 4 + 1] - range[i * 4] + 1;
138 n *= range[i * 4 + 2];
139 range[i * 4 + 3] = n;
/* Pack the per-position minimum bytes into one code-point, position i
   occupying bits 8i and up.
   NOTE(review): range is an int array, so range[12] << 24 is a signed
   shift; if range[12] can exceed 127 the result does not fit in int
   -- confirm whether an unsigned cast is wanted here and for
   range[13] below.  */
142 min_code = range[0] | (range[4] << 8) | (range[8] << 16) | (range[12] << 24);
/* A zero charset->min_code is replaced by the range-derived minimum;
   a nonzero value below that minimum is rejected.  */
143 if (charset->min_code == 0)
144 charset->min_code = min_code;
145 else if (charset->min_code < min_code)
146 MERROR (MERROR_CHARSET, NULL);
/* Same treatment for the maximum: a zero max_code takes the packed
   maximum, a value above it is rejected.  */
147 max_code = range[1] | (range[5] << 8) | (range[9] << 16) | (range[13] << 24);
148 if (charset->max_code == 0)
149 charset->max_code = max_code;
150 else if (charset->max_code > max_code)
151 MERROR (MERROR_CHARSET, NULL);
153 charset->code_range_min_code = min_code;
154 charset->fully_loaded = 0;
/* Subset: exactly one parent, which must not itself be a superset,
   and this charset's code range shifted down by subset_offset must
   lie inside the parent's [min_code, max_code].  */
157 if (charset->method == Msubset)
161 if (charset->nparents != 1)
162 MERROR (MERROR_CHARSET, NULL);
163 parent = charset->parents[0];
164 if (parent->method == Msuperset
165 || charset->min_code - charset->subset_offset < parent->min_code
166 || charset->max_code - charset->subset_offset > parent->max_code)
167 MERROR (MERROR_CHARSET, NULL);
/* Superset: at least two parents, and the code range of every parent
   must lie inside this charset's [min_code, max_code].  */
169 else if (charset->method == Msuperset)
171 if (charset->nparents < 2)
172 MERROR (MERROR_CHARSET, NULL);
173 for (i = 0; i < charset->nparents; i++)
174 if (charset->min_code > charset->parents[i]->min_code
175 || charset->max_code < charset->parents[i]->max_code)
176 MERROR (MERROR_CHARSET, NULL);
/* Only part of this assignment is visible here.  The visible part of
   the right-hand side tests the dimension and whether range[10], the
   byte count of the third position, is 256.  */
181 = (charset->dimension == 1
183 && (charset->dimension == 2
185 && (charset->dimension == 3
186 || range[10] == 256)))));
/* When the code space has gaps, build code_range_mask: bit i of
   code_range_mask[j] is set when byte value j is allowed at byte
   position i.  */
188 if (! charset->no_code_gap)
192 memset (charset->code_range_mask, 0,
193 sizeof charset->code_range_mask);
194 for (i = 0; i < 4; i++)
195 for (j = range[i * 4]; j <= range[i * 4 + 1]; j++)
196 charset->code_range_mask[j] |= (1 << i);
/* Offset method: the characters are the range[15] consecutive codes
   starting at min_char.  Both ends must be non-negative and max_char
   must not exceed unified_max.  Nothing remains to be loaded, so the
   charset is marked fully loaded at once.  */
199 if (charset->method == Moffset)
201 charset->max_char = charset->min_char + range[15] - 1;
202 if (charset->min_char < 0
203 || charset->max_char < 0 || charset->max_char > unified_max)
204 MERROR (MERROR_CHARSET, NULL);
205 charset->simple = charset->no_code_gap;
206 charset->fully_loaded = 1;
/* Unify method: lower the file-wide unified_max by a multiple of
   1 << 12 and record the new value in the charset.  */
208 else if (charset->method == Munify)
210 /* The magic number 12 below is to align to the SUB_BITS_2
211 (defined in chartab.c) boundary in a char-table. */
212 unified_max -= ((range[15] >> 12) + 1) << 12;
213 charset->unified_max = unified_max;
/* Any method reaching this test other than Mmap is rejected.  */
215 else if (charset->method != Mmap)
216 MERROR (MERROR_CHARSET, NULL);
/* Register the charset in the list of all charsets.  */
219 MLIST_APPEND1 (&charset_list, charsets, charset, MERROR_CHARSET);
/* A charset with a positive final byte is also recorded in the
   ISO 2022 table.  chars is the number of byte values at the first
   position.  */
221 if (charset->final_byte > 0)
223 MLIST_APPEND1 (&mcharset__iso_2022_table, charsets, charset,
225 if (charset->revision <= 0)
227 int chars = range[2];
229 if (chars == 128) /* ASCII case */
231 else if (chars == 256) /* ISO-8859-X case */
233 MCHARSET_ISO_2022 (charset->dimension, chars, charset->final_byte)
/* load_charset_fully: compute min_char and max_char of CHARSET (and,
   for the Mmap and Munify methods, fetch its decoder and encoder
   tables from the database), then set fully_loaded.  The failure
   paths visible below go through MERROR (MERROR_CHARSET, -1).  */
242 load_charset_fully (MCharset *charset)
244 if (charset->method == Msubset)
246 MCharset *parent = charset->parents[0];
/* The parent must be loaded first because it is used for decoding
   below.  */
248 if (! parent->fully_loaded
249 && load_charset_fully (parent) < 0)
250 MERROR (MERROR_CHARSET, -1);
/* With an Moffset parent only the two end code-points are decoded to
   obtain min_char and max_char.  */
251 if (parent->method == Moffset)
255 code = charset->min_code - charset->subset_offset;
256 charset->min_char = DECODE_CHAR (parent, code);
257 code = charset->max_code - charset->subset_offset;
258 charset->max_char = DECODE_CHAR (parent, code);
/* Otherwise every code-point of the shifted range is decoded through
   the parent and the extremes of the results are kept.  */
262 unsigned min_code = charset->min_code - charset->subset_offset;
263 unsigned max_code = charset->max_code - charset->subset_offset;
264 int min_char = DECODE_CHAR (parent, min_code);
265 int max_char = min_char;
267 for (++min_code; min_code <= max_code; min_code++)
269 int c = DECODE_CHAR (parent, min_code);
275 else if (c > max_char)
279 charset->min_char = min_char;
280 charset->max_char = max_char;
283 else if (charset->method == Msuperset)
285 int min_char = 0, max_char = 0;
/* Load each parent if necessary and widen [min_char, max_char] to
   cover the parents' character ranges.
   NOTE(review): the two updates inside the loop are chained with
   "else if", so a parent that lowers min_char never gets its max_char
   examined in the same iteration -- confirm whether two independent
   tests were intended.  */
288 for (i = 0; i < charset->nparents; i++)
290 MCharset *parent = charset->parents[i];
292 if (! parent->fully_loaded
293 && load_charset_fully (parent) < 0)
294 MERROR (MERROR_CHARSET, -1);
296 min_char = parent->min_char, max_char = parent->max_char;
297 else if (parent->min_char < min_char)
298 min_char = parent->min_char;
299 else if (parent->max_char > max_char)
300 max_char = parent->max_char;
302 charset->min_char = min_char;
303 charset->max_char = max_char;
305 else /* charset->method is Mmap or Munify */
/* Look up the database entry keyed by Mcharset and the charset name.
   The loaded plist carries the decoder as its first value and the
   encoder as its second.  */
307 MDatabase *mdb = mdatabase_find (Mcharset, charset->name, Mnil, Mnil);
310 if (! mdb || ! (plist = mdatabase_load (mdb)))
311 MERROR (MERROR_CHARSET, -1);
312 charset->decoder = mplist_value (plist);
313 charset->encoder = mplist_value (mplist_next (plist));
314 M17N_OBJECT_UNREF (plist);
/* The character range is taken from the encoder char-table.  */
315 mchartable_range (charset->encoder,
316 &charset->min_char, &charset->max_char);
317 if (charset->method == Mmap)
318 charset->simple = charset->no_code_gap;
320 charset->max_char = charset->unified_max + 1 + charset->code_range[15];
323 charset->fully_loaded = 1;
327 /** Load data of type @c charset from the file FP. */
/* load_charset: read code-point to character mappings for the charset
   named CHARSET_NAME from FP and build two tables from them: a decoder
   array and an encoder char-table.  Both are added to a plist under
   the key Mt, decoder first; load_charset_fully reads them back in
   that order.  Lines are matched against "0xCODE1-0xCODE2 0xCHAR" (a
   range) or "0xCODE 0xCHAR" (a single mapping).  */
330 load_charset (FILE *fp, MSymbol charset_name)
332 MCharset *charset = MCHARSET (charset_name);
341 MERROR (MERROR_DB, NULL);
/* Number of decoder entries: code_range[15] less the distance of
   min_code from code_range_min_code.  */
342 size = (charset->code_range[15]
343 - (charset->min_code - charset->code_range_min_code));
344 MTABLE_MALLOC (decoder, size, MERROR_DB);
345 for (i = 0; i < size; i++)
/* The encoder's default value is MCHAR_INVALID_CODE.  */
347 encoder = mchartable (Minteger, (void *) MCHAR_INVALID_CODE);
349 while ((c = getc (fp)) != EOF)
351 unsigned code1, code2, c1, c2;
/* NOTE(review): the result of fgets is not checked on this line; on a
   read failure buf would keep its previous contents -- confirm this
   is acceptable.  */
356 fgets (buf, 256, fp);
/* Range form: both ends are converted to indices and c2 becomes the
   character paired with the last code-point of the range.  */
359 if (sscanf (buf, "0x%x-0x%x 0x%x", &code1, &code2, &c1) == 3)
361 idx1 = CODE_POINT_TO_INDEX (charset, code1);
364 idx2 = CODE_POINT_TO_INDEX (charset, code2);
367 c2 = c1 + (idx2 - idx1);
/* Single form: a range of length one.  */
369 else if (sscanf (buf, "0x%x 0x%x", &code1, &c1) == 2)
371 idx1 = idx2 = CODE_POINT_TO_INDEX (charset, code1);
/* Record the mapping only when both indices are valid.  The first
   pair is stored, then each following index of the range is paired
   with the next consecutive character.  */
378 if (idx1 >= 0 && idx2 >= 0)
381 mchartable_set (encoder, c1, (void *) code1);
382 for (idx1++, c1++; idx1 <= idx2; idx1++, c1++)
384 code1 = INDEX_TO_CODE_POINT (charset, idx1);
386 mchartable_set (encoder, c1, (void *) code1);
396 M17N_OBJECT_UNREF (encoder);
400 mplist_add (plist, Mt, decoder);
401 mplist_add (plist, Mt, encoder);
408 MPlist *mcharset__cache;
410 /* Predefined charsets. */
411 MCharset *mcharset__ascii;
412 MCharset *mcharset__binary;
413 MCharset *mcharset__m17n;
414 MCharset *mcharset__unicode;
416 MCharsetISO2022Table mcharset__iso_2022_table;
418 /** Initialize charset handler. */
/* Start the unified character space at the top of the character code
   range; make_charset lowers unified_max for each Munify charset.  */
425 unified_max = MCHAR_MAX;
/* Install load_charset as the database loader hook for charset
   data.  */
427 mdatabase__load_charset_func = load_charset;
/* The lookup cache starts with key Mt and a NULL value.  */
428 mcharset__cache = mplist ();
429 mplist_set (mcharset__cache, Mt, NULL);
/* Both charset lists are initialised with the argument 128.  */
431 MLIST_INIT1 (&charset_list, charsets, 128);
432 MLIST_INIT1 (&mcharset__iso_2022_table, charsets, 128);
433 charset_definition_list = mplist ();
435 memset (mcharset__iso_2022_table.classified, 0,
436 sizeof (mcharset__iso_2022_table.classified));
/* Symbols naming the charset methods.  */
438 Mmethod = msymbol ("method");
439 Moffset = msymbol ("offset");
440 Mmap = msymbol ("map");
441 Munify = msymbol ("unify");
442 Msubset = msymbol ("subset");
443 Msuperset = msymbol ("superset");
/* Symbols used as parameter keys of mchar_define_charset.  "mapfile",
   "parents" and "aliases" are created with msymbol_as_managing_key
   rather than msymbol.  */
445 Mdimension = msymbol ("dimension");
446 Mmin_range = msymbol ("min-range");
447 Mmax_range = msymbol ("max-range");
448 Mmin_code = msymbol ("min-code");
449 Mmax_code = msymbol ("max-code");
450 Mascii_compatible = msymbol ("ascii-compatible");
451 Mfinal_byte = msymbol ("final-byte");
452 Mrevision = msymbol ("revision");
453 Mmin_char = msymbol ("min-char");
454 Mmapfile = msymbol_as_managing_key ("mapfile");
455 Mparents = msymbol_as_managing_key ("parents");
456 Msubset_offset = msymbol ("subset-offset");
457 Mdefine_coding = msymbol ("define-coding");
458 Maliases = msymbol_as_managing_key ("aliases");
462 /* Setup predefined charsets. */
/* "ascii": offset method, code-points 0..0x7F, final byte 'B',
   characters starting at 0.  */
463 pl = mplist_add (pl, Mmethod, Moffset);
464 pl = mplist_add (pl, Mmin_range, (void *) 0);
465 pl = mplist_add (pl, Mmax_range, (void *) 0x7F);
466 pl = mplist_add (pl, Mascii_compatible, Mt);
467 pl = mplist_add (pl, Mfinal_byte, (void *) 'B');
468 pl = mplist_add (pl, Mmin_char, (void *) 0);
469 Mcharset_ascii = mchar_define_charset ("ascii", param);
/* The same parameter list is reused for the remaining charsets; in
   the calls shown only max-range changes, plus final-byte which is
   cleared here.  */
471 mplist_put (param, Mmax_range, (void *) 0xFF);
472 mplist_put (param, Mfinal_byte, NULL);
473 Mcharset_iso_8859_1 = mchar_define_charset ("iso-8859-1", param);
475 mplist_put (param, Mmax_range, (void *) 0x10FFFF);
476 Mcharset_unicode = mchar_define_charset ("unicode", param);
478 mplist_put (param, Mmax_range, (void *) MCHAR_MAX);
479 Mcharset_m17n = mchar_define_charset ("m17n", param);
481 mplist_put (param, Mmax_range, (void *) 0xFF);
482 Mcharset_binary = mchar_define_charset ("binary", param);
484 M17N_OBJECT_UNREF (param);
/* Keep direct pointers to the charset objects of four of the
   predefined charsets.  */
486 mcharset__ascii = MCHARSET (Mcharset_ascii);
487 mcharset__binary = MCHARSET (Mcharset_binary);
488 mcharset__m17n = MCHARSET (Mcharset_m17n);
489 mcharset__unicode = MCHARSET (Mcharset_unicode);
/* mcharset__fini: release the tables and lists owned by this
   module.  */
495 mcharset__fini (void)
/* For every registered charset, free its decoder array and drop the
   reference to its encoder char-table, when present.  */
500 for (i = 0; i < charset_list.used; i++)
502 MCharset *charset = charset_list.charsets[i];
504 if (charset->decoder)
505 free (charset->decoder);
506 if (charset->encoder)
507 M17N_OBJECT_UNREF (charset->encoder);
510 M17N_OBJECT_UNREF (mcharset__cache);
511 MLIST_FREE1 (&charset_list, charsets);
512 MLIST_FREE1 (&mcharset__iso_2022_table, charsets);
/* Each stored definition was given an extra reference when it was
   added in mcharset__load_from_database; drop it before releasing the
   list itself.  */
513 MPLIST_DO (plist, charset_definition_list)
514 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
515 M17N_OBJECT_UNREF (charset_definition_list);
/* mcharset__find: look up the charset object attached to the symbol
   NAME, using the parameters kept in charset_definition_list to
   define the charset in the definition path shown below, and record
   the result in the single-entry cache mcharset__cache.  */
520 mcharset__find (MSymbol name)
/* The charset object is stored as the Mcharset property of NAME.  */
524 charset = msymbol_get (name, Mcharset);
527 MPlist *param = mplist_get (charset_definition_list, name);
529 MPLIST_KEY (mcharset__cache) = Mt;
/* Define the charset from a plist built from the stored parameters,
   fetch the resulting object, then release the temporary plist.  */
532 param = mplist__from_plist (param);
533 mchar_define_charset (MSYMBOL_NAME (name), param);
534 charset = msymbol_get (name, Mcharset);
535 M17N_OBJECT_UNREF (param);
/* Remember this lookup: the cache key becomes NAME and its value the
   charset.  */
537 MPLIST_KEY (mcharset__cache) = name;
538 MPLIST_VAL (mcharset__cache) = charset;
543 /** Return the character corresponding to code-point CODE in CHARSET.
544 If CODE is invalid for CHARSET, return -1. */
547 mcharset__decode_char (MCharset *charset, unsigned code)
/* Codes below 128 in an ASCII-compatible charset and codes outside
   [min_code, max_code] are tested before the charset is loaded.  */
551 if (code < 128 && charset->ascii_compatible)
553 if (code < charset->min_code || code > charset->max_code)
/* Load the charset data on first use.  */
556 if (! charset->fully_loaded
557 && load_charset_fully (charset) < 0)
558 MERROR (MERROR_CHARSET, -1);
/* Subset: subtract subset_offset from the code-point and decode the
   result in the single parent.  */
560 if (charset->method == Msubset)
562 MCharset *parent = charset->parents[0];
564 code -= charset->subset_offset;
565 return DECODE_CHAR (parent, code);
/* Superset: decode the code-point in each parent in order.  */
568 if (charset->method == Msuperset)
572 for (i = 0; i < charset->nparents; i++)
574 MCharset *parent = charset->parents[i];
575 int c = DECODE_CHAR (parent, code);
/* The remaining methods work on the character index of CODE.  */
583 idx = CODE_POINT_TO_INDEX (charset, code);
/* Map: the decoder array is indexed directly.  */
587 if (charset->method == Mmap)
588 return charset->decoder[idx];
/* Unify: the decoder array is consulted first; the other visible
   assignment computes unified_max + 1 + idx.  */
590 if (charset->method == Munify)
592 int c = charset->decoder[idx];
595 c = charset->unified_max + 1 + idx;
599 /* Now charset->method should be Moffset. */
600 return (charset->min_char + idx);
604 /** Return the code point of character C in CHARSET. If CHARSET does not
605 contain C, return MCHAR_INVALID_CODE. */
608 mcharset__encode_char (MCharset *charset, int c)
/* Load the charset data on first use.  */
610 if (! charset->fully_loaded
611 && load_charset_fully (charset) < 0)
612 MERROR (MERROR_CHARSET, MCHAR_INVALID_CODE);
/* Subset: encode C in the parent, add subset_offset, and test the
   result against this charset's [min_code, max_code].  */
614 if (charset->method == Msubset)
616 MCharset *parent = charset->parents[0];
617 unsigned code = ENCODE_CHAR (parent, c);
619 if (code == MCHAR_INVALID_CODE)
621 code += charset->subset_offset;
622 if (code >= charset->min_code && code <= charset->max_code)
624 return MCHAR_INVALID_CODE;
/* Superset: encode C in each parent in order; MCHAR_INVALID_CODE is
   returned after the loop.  */
627 if (charset->method == Msuperset)
631 for (i = 0; i < charset->nparents; i++)
633 MCharset *parent = charset->parents[i];
634 unsigned code = ENCODE_CHAR (parent, c);
636 if (code != MCHAR_INVALID_CODE)
639 return MCHAR_INVALID_CODE;
/* For the remaining methods C must lie within
   [min_char, max_char].  */
642 if (c < charset->min_char || c > charset->max_char)
643 return MCHAR_INVALID_CODE;
/* Map: the encoder char-table holds the code-point.  */
645 if (charset->method == Mmap)
646 return (unsigned) mchartable_lookup (charset->encoder, c);
/* Unify: characters above unified_max are turned back into an index
   arithmetically, the others go through the encoder table.
   NOTE(review): mcharset__decode_char computes
   c = unified_max + 1 + idx, whose inverse is
   idx = c - (unified_max + 1); the subtraction below uses
   unified_max - 1, which gives idx + 2 -- confirm.  */
648 if (charset->method == Munify)
650 if (c > charset->unified_max)
652 c -= charset->unified_max - 1;
653 return INDEX_TO_CODE_POINT (charset, c);
655 return (unsigned) mchartable_lookup (charset->encoder, c);
658 /* Now charset->method should be Moffset */
659 c -= charset->min_char;
660 return INDEX_TO_CODE_POINT (charset, c);
/* mcharset__load_from_database: load the "charset-list" database
   entry, store each definition in charset_definition_list, and define
   each charset with mchar_define_charset.  Malformed elements go
   through MERROR (MERROR_CHARSET, -1).  */
664 mcharset__load_from_database ()
666 MDatabase *mdb = mdatabase_find (msymbol ("charset-list"), Mnil, Mnil, Mnil);
667 MPlist *def_list, *plist;
668 MPlist *definitions = charset_definition_list;
669 int mdebug_flag = MDEBUG_CHARSET;
674 def_list = (MPlist *) mdatabase_load (mdb);
675 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to load data."));
/* Each element must be a plist whose first element is a symbol (the
   charset name); the rest of that plist is the parameter list.
   NOTE(review): if MERROR returns from this function, the two error
   exits below leave def_list without the unref done after the loop
   -- confirm.  */
681 MPLIST_DO (plist, def_list)
686 if (! MPLIST_PLIST_P (plist))
687 MERROR (MERROR_CHARSET, -1);
688 pl = MPLIST_PLIST (plist);
689 if (! MPLIST_SYMBOL_P (pl))
690 MERROR (MERROR_CHARSET, -1);
691 name = MPLIST_SYMBOL (pl);
692 pl = MPLIST_NEXT (pl);
/* Keep the parameter list under NAME, taking a reference so that it
   outlives def_list.  */
693 definitions = mplist_add (definitions, name, pl);
694 M17N_OBJECT_REF (pl);
/* Define the charset from a temporary plist built from the
   parameters.  */
695 p = mplist__from_plist (pl);
696 mchar_define_charset (MSYMBOL_NAME (name), p);
697 M17N_OBJECT_UNREF (p);
700 M17N_OBJECT_UNREF (def_list);
701 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to parse the loaded data."));
707 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
712 /*** @addtogroup m17nCharset */
718 @brief Invalid code-point.
720 The macro #MCHAR_INVALID_CODE gives the invalid code-point. */
723 @brief ̵¸ú¤Ê¥³¡¼¥É¥Ý¥¤¥ó¥È.
725 ¥Þ¥¯¥í #MCHAR_INVALID_CODE ¤Ï̵¸ú¤Ê¥³¡¼¥É¥Ý¥¤¥ó¥È¤ò¼¨¤¹¡£ */
727 #define MCHAR_INVALID_CODE
733 @name Variables: Symbols representing a charset.
735 Each of the following symbols represents a predefined charset. */
738 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ëÄêµÁºÑ¤ß¥·¥ó¥Ü¥ë.
740 °Ê²¼¤Î³Æ¥·¥ó¥Ü¥ë¤Ï¡¢ÄêµÁºÑ¤ßʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
745 @brief Symbol representing the charset ASCII.
747 The symbol #Mcharset_ascii has name <tt>"ascii"</tt> and represents
748 the charset ISO 646, USA Version X3.4-1968 (ISO-IR-6). */
750 @brief ASCII ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
752 ¥·¥ó¥Ü¥ë #Mcharset_ascii ¤Ï <tt>"ascii"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
753 ISO 646, USA Version X3.4-1968 (ISO-IR-6) ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
756 MSymbol Mcharset_ascii;
760 @brief Symbol representing the charset ISO/IEC 8859/1.
762 The symbol #Mcharset_iso_8859_1 has name <tt>"iso-8859-1"</tt>
763 and represents the charset ISO/IEC 8859-1:1998. */
765 @brief ISO/IEC 8859-1:1998 ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
767 ¥·¥ó¥Ü¥ë #Mcharset_iso_8859_1 ¤Ï <tt>"iso-8859-1"</tt>
768 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢ISO/IEC 8859-1:1998 ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
771 MSymbol Mcharset_iso_8859_1;
774 @brief Symbol representing the charset Unicode.
776 The symbol #Mcharset_unicode has name <tt>"unicode"</tt> and
777 represents the charset Unicode. */
779 @brief Unicode ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
781 ¥·¥ó¥Ü¥ë #Mcharset_unicode ¤Ï <tt>"unicode"</tt>
782 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Unicode ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
784 MSymbol Mcharset_unicode;
788 @brief Symbol representing the largest charset.
790 The symbol #Mcharset_m17n has name <tt>"m17n"</tt> and
791 represents the charset that contains all characters supported by
794 @brief Á´Ê¸»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
796 ¥·¥ó¥Ü¥ë #Mcharset_m17n ¤Ï <tt>"m17n"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
797 m17n ¥é¥¤¥Ö¥é¥ê¤¬°·¤¦Á´¤Æ¤Îʸ»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
799 MSymbol Mcharset_m17n;
803 @brief Symbol representing the charset for ill-decoded characters.
805 The symbol #Mcharset_binary has name <tt>"binary"</tt> and
806 represents the fake charset which the decoding functions put to an
807 M-text as a text property when they encounter an invalid byte
810 See @ref m17nConv for more details. */
813 @brief Àµ¤·¤¯¥Ç¥³¡¼¥É¤Ç¤¤Ê¤¤Ê¸»ú¤Îʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
815 ¥·¥ó¥Ü¥ë #Mcharset_binary ¤Ï <tt>"binary"</tt>
816 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢µ¶¤Î (fake) ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
817 ¥Ç¥³¡¼¥É´Ø¿ô¤Ï¡¢M-text ¤Î¥Æ¥¥¹¥È¥×¥í¥Ñ¥Æ¥£¤È¤·¤Æ¡¢Ìµ¸ú¤Ê¥Ð¥¤¥È¡Ê¥·¡¼¥¯¥¨¥ó¥¹¡Ë¤ËÁø¶ø¤·¤¿°ÌÃÖ¤òÉղ乤롣
819 ¾ÜºÙ¤Ï @ref m17nConv »²¾È¤Î¤³¤È¡£ */
821 MSymbol Mcharset_binary;
828 @name Variables: Parameter keys for mchar_define_charset ().
830 These are the predefined symbols to use as parameter keys for the
831 function mchar_define_charset () (which see). */
834 @name ÊÑ¿ô: mchar_define_charset ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼
836 ¤³¤ì¤é¤Ï¡¢´Ø¿ô mchar_define_charset () ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
837 ¾Ü¤·¤¯¤Ï¤³¤Î´Ø¿ô¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£*/
847 MSymbol Mascii_compatible;
853 MSymbol Msubset_offset;
854 MSymbol Mdefine_coding;
861 @name Variables: Symbols representing charset methods.
863 These are the predefined symbols that can be a value of the
864 @b Mmethod parameter of a charset used in an argument to the
865 mchar_define_charset () function.
867 A method specifies how code-points and character codes are
868 converted. See the documentation of the mchar_define_charset ()
869 function for the details. */
872 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤Î¥á¥½¥Ã¥É»ØÄê¤Ë»È¤ï¤ì¤ë¥·¥ó¥Ü¥ë
874 ¤³¤ì¤é¤Ï¡¢Ê¸»ú¥»¥Ã¥È¤Î @e ¥á¥½¥Ã¥É ¤ò»ØÄꤹ¤ë¤¿¤á¤ÎÄêµÁºÑ¤ß¥·¥ó¥Ü¥ë¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î
875 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤʤ뤳¤È¤¬¤Ç¤¤ë¡£
876 ¤³¤ÎÃͤϴؿô mchar_define_charset () ¤Î°ú¿ô¤È¤·¤Æ»È¤ï¤ì¤ë¡£
878 ¥á¥½¥Ã¥É¤È¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥³¡¼¥É¤òÁê¸ßÊÑ´¹¤¹¤ëºÝ¤ÎÊý¼°¤Î¤³¤È¤Ç¤¢¤ë¡£
879 ¾Ü¤·¤¯¤Ï´Ø¿ô mchar_define_charset () ¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£ */
883 @brief Symbol for the offset type method of charset.
885 The symbol #Moffset has the name <tt>"offset"</tt> and, when used
886 as a value of @b Mmethod parameter of a charset, it means that the
887 conversion of code-points and character codes of the charset is
888 done by this calculation:
891 CHARACTER-CODE = CODE-POINT - MIN-CODE + MIN-CHAR
894 where, MIN-CODE is a value of @b Mmin_code parameter of the charset,
895 and MIN-CHAR is a value of @b Mmin_char parameter. */
898 @brief ¥ª¥Õ¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
900 ¥·¥ó¥Ü¥ë #Moffset ¤Ï <tt>"offset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
901 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬°Ê²¼¤Î¼°¤Ë½¾¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
904 ʸ»ú¥³¡¼¥É = ¥³¡¼¥É¥Ý¥¤¥ó¥È - MIN-CODE + MIN-CHAR
907 ¤³¤³¤Ç¡¢MIN-CODE ¤Ïʸ»ú¥»¥Ã¥È¤Î @b Mmin_code ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ê¡¢MIN-CHAR ¤Ï
908 @b Mmin_char ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ë¡£ */
913 /***en @brief Symbol for the map type method of charset.
915 The symbol #Mmap has the name <tt>"map"</tt> and, when used as a
916 value of @b Mmethod parameter of a charset, it means that the
917 conversion of code-points and character codes of the charset is
918 done by map looking up. The map must be given by @b Mmapfile
921 /***ja @brief ¥Þ¥Ã¥×·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
923 ¥·¥ó¥Ü¥ë #Mmap ¤Ï <tt>"map"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
924 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬¥Þ¥Ã¥×¤ò»²¾È¤¹¤ë¤³¤È¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
925 ¥Þ¥Ã¥×¤Ï @b Mmapfile ¥Ñ¥é¥á¡¼¥¿¤È¤·¤ÆÍ¿¤¨¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£ */
930 /***en @brief Symbol for the unify type method of charset.
932 The symbol #Munify has the name <tt>"unify"</tt> and, when used as
933 a value of @b Mmethod parameter of a charset, it means that the
934 conversion of code-points and character codes of the charset is
935 done by map looking up and offsetting. The map must be given by
936 @b Mmapfile parameter. For this kind of charset, a unique
937 continuous character code space for all characters is assigned.
939 If the map has an entry for a code-point, the conversion is done
940 by looking up the map. Otherwise, the conversion is done by this
944 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
947 where, MIN-CODE is a value of @b Mmin_code parameter of the charset,
948 and LOWEST-CHAR-CODE is the lowest character code of the assigned
951 /***ja @brief ¥æ¥Ë¥Õ¥¡¥¤·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
953 ¥·¥ó¥Ü¥ë #Munify ¤Ï <tt>"unify"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
954 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬¡¢¥Þ¥Ã¥×¤Î»²¾È¤È¥ª¥Õ¥»¥Ã¥È¤ÎÁȤ߹ç¤ï¤»¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
955 ¥Þ¥Ã¥×¤Ï @b Mmapfile ¥Ñ¥é¥á¡¼¥¿¤È¤·¤ÆÍ¿¤¨¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£
956 ¤³¤Î¼ï¤Î³Æʸ»ú¥»¥Ã¥È¤Ë¤Ï¡¢Á´Ê¸»ú¤ËÂФ·¤ÆϢ³¤¹¤ë¥³¡¼¥É¥¹¥Ú¡¼¥¹¤¬¤½¤ì¤¾¤ì³ä¤êÅö¤Æ¤é¤ì¤ë¡£
958 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤¬¥Þ¥Ã¥×¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ì¤Ð¡¢ÊÑ´¹¤Ï¥Þ¥Ã¥×»²¾È¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¡£
959 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢°Ê²¼¤Î¼°¤Ë½¾¤¦¡£
962 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
965 ¤³¤³¤Ç¡¢MIN-CODE ¤Ïʸ»ú¥»¥Ã¥È¤Î @b Mmin_code ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ê¡¢
966 LOWEST-CHAR-CODE ¤Ï³ä¤êÅö¤Æ¤é¤ì¤¿¥³¡¼¥É¥¹¥Ú¡¼¥¹¤ÎºÇ¤â¾®¤µ¤¤Ê¸»ú¥³¡¼¥É¤Ç¤¢¤ë¡£
973 @brief Symbol for the subset type method of charset.
975 The symbol #Msubset has the name <tt>"subset"</tt> and, when used
976 as a value of @b Mmethod parameter of a charset, it means that the
977 charset is a subset of a parent charset. The parent charset must
978 be given by @b Mparents parameter. The conversion of code-points
979 and character codes of the charset is done conceptually by this
983 CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
986 where, PARENT-CODE is a pseudo function that returns a character
987 code of CODE-POINT in the parent charset, and SUBSET-OFFSET is a
988 value given by @b Msubset_offset parameter. */
990 /***ja @brief ¥µ¥Ö¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
992 ¥·¥ó¥Ü¥ë #Msubset ¤Ï <tt>"subset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
993 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¤³¤Îʸ»ú¥»¥Ã¥È¤¬Ê̤Îʸ»ú¥»¥Ã¥È¡Ê¿Æʸ»ú¥»¥Ã¥È¡Ë¤ÎÉôʬ½¸¹ç¤Ç¤¢¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
994 ¿Æʸ»ú¥»¥Ã¥È¤Ï @b Mparents ¥Ñ¥é¥á¡¼¥¿¤Ë¤è¤Ã¤ÆÍ¿¤¨¤é¤ì¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
995 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤Ï¡¢³µÇ°Åª¤Ë¤Ï°Ê²¼¤Î¼°¤Ë½¾¤¦¡£
998 CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
1001 ¤³¤³¤Ç PARENT-CODE ¤Ï CODE-POINT
1002 ¤Î¿Æʸ»ú¥»¥Ã¥ÈÃæ¤Ç¤Îʸ»ú¥³¡¼¥É¤òÊÖ¤¹µ¼´Ø¿ô¤Ç¤¢¤ê¡¢SUBSET-OFFSET ¤Ï
1003 @b Msubset_offset ¥Ñ¥é¥á¡¼¥¿¤ÇÍ¿¤¨¤é¤ì¤ëÃͤǤ¢¤ë¡£
1010 @brief Symbol for the superset type method of charset.
1012 The symbol #Msuperset has the name <tt>"superset"</tt> and, when
1013 used as a value of @b Mmethod parameter of a charset, it means that
1014 the charset is a superset of parent charsets. The parent charsets
1015 must be given by @b Mparents parameter. */
1018 @brief ¥¹¡¼¥Ñ¡¼¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
1020 ¥·¥ó¥Ü¥ë #Msuperset ¤Ï <tt>"superset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
1021 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¤³¤Îʸ»ú¥»¥Ã¥È¤¬Ê̤Îʸ»ú¥»¥Ã¥È¡Ê¿Æʸ»ú¥»¥Ã¥È¡Ë¤Î¾å°Ì½¸¹ç¤Ç¤¢¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1022 ¿Æʸ»ú¥»¥Ã¥È¤Ï @b Mparents ¥Ñ¥é¥á¡¼¥¿¤Ë¤è¤Ã¤ÆÍ¿¤¨¤é¤ì¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
1030 @brief Define a charset.
1032 The mchar_define_charset () function defines a new charset and
1033 makes it accessible via a symbol whose name is $NAME. $PLIST
1034 specifies parameters of the charset as below:
1038 <li> Key is @b Mmethod, value is a symbol.
1040 The value specifies the method for decoding/encoding code-points
1041 in the charset. It must be #Moffset, #Mmap (default), #Munify,
1042 #Msubset, or #Msuperset.
1044 <li> Key is @b Mdimension, value is an integer
1046 The value specifies the dimension of code-points of the charset.
1047 It must be 1 (default), 2, 3, or 4.
1049 <li> Key is @b Mmin_range, value is an unsigned integer
1051 The value specifies the minimum range of a code-point, which means
1052 that the Nth byte of the value is the minimum Nth byte of
1053 code-points of the charset. The default value is 0.
1055 <li> Key is @b Mmax_range, value is an unsigned integer
1057 The value specifies the maximum range of a code-point, which means
1058 that the Nth byte of the value is the maximum Nth byte of
1059 code-points of the charset. The default value is 0xFF, 0xFFFF,
1060 0xFFFFFF, or 0xFFFFFFFF if the dimension is 1, 2, 3, or 4
1063 <li> Key is @b Mmin_code, value is an unsigned integer
1065 The value specifies the minimum code-point of
1066 the charset. The default value is the minimum range.
1068 <li> Key is @b Mmax_code, value is an unsigned integer
1070 The value specifies the maximum code-point of
1071 the charset. The default value is the maximum range.
1073 <li> Key is @b Mascii_compatible, value is a symbol
1075 The value specifies whether the charset is ASCII compatible or
1076 not. If the value is #Mnil (default), it is not ASCII
1077 compatible, else compatible.
1079 <li> Key is @b Mfinal_byte, value is an integer
1081 The value specifies the @e final @e byte of the charset registered
1082 in The International Registry. It must be 0 (default) or 48..127.
1083 The value 0 means that the charset is not in the registry.
1085 <li> Key is @b Mrevision, value is an integer
1087 The value specifies the @e revision @e number of the charset
1088 registered in The International Registry. It must be 0..127. If
1089 the charset is not in The International Registry, the value is
1090 ignored. The value 0 means that the charset has no revision
1093 <li> Key is @b Mmin_char, value is an integer
1095 The value specifies the minimum character code of the charset.
1096 The default value is 0.
1098 <li> Key is @b Mmapfile, value is an M-text
1100 If the method is #Mmap or #Munify, data that contains
1101 mapping information is added to the m17n database by calling
1102 the function mdatabase_define () with the value as an argument $EXTRA_INFO,
1103 i.e. the value is used as a file name of the data.
1105 Otherwise, this parameter is ignored.
1107 <li> Key is @b Mparents, value is a plist
1109 If the method is #Msubset, the value must be a plist of length
1110 1, and the value of the plist must be a symbol representing a
1113 If the method is #Msuperset, the value must be a plist of length
1114 less than 9, and the values of the plist must be symbols
1115 representing subset charsets.
1117 Otherwise, this parameter is ignored.
1119 <li> Key is @b Mdefine_coding, value is a symbol
1121 If the dimension of the charset is 1, the value specifies whether
1122 or not to define a coding system of the same name whose type is
1123 #Mcharset. A coding system is defined if the value is not #Mnil.
1125 Otherwise, this parameter is ignored.
1130 If the operation was successful, mchar_define_charset () returns a
1131 symbol whose name is $NAME. Otherwise it returns #Mnil and
1132 assigns an error code to the external variable #merror_code. */
1135 @brief ʸ»ú¥»¥Ã¥È¤òÄêµÁ¤¹¤ë.
1137 ´Ø¿ô mchar_define_charset () ¤Ï¿·¤·¤¤Ê¸»ú¥»¥Ã¥È¤òÄêµÁ¤·¡¢¤½¤ì¤ò
1138 $NAME ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Ä¥·¥ó¥Ü¥ë·Ðͳ¤Ç¥¢¥¯¥»¥¹¤Ç¤¤ë¤è¤¦¤Ë¤¹¤ë¡£
1139 $PLIST ¤ÏÄêµÁ¤µ¤ì¤ëʸ»ú¥»¥Ã¥È¤Î¥Ñ¥é¥á¡¼¥¿¤ò°Ê²¼¤Î¤è¤¦¤Ë»ØÄꤹ¤ë¡£
1143 <li> ¥¡¼¤¬ @b Mmethod ¤ÇÃͤ¬¥·¥ó¥Ü¥ë¤Î»þ
1145 Ãͤϡ¢#Moffset, #Mmap (¥Ç¥Õ¥©¥ë¥ÈÃÍ), #Munify, #Msubset,
1146 #Msuperset ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤ò¥Ç¥³¡¼¥É¡¿¥¨¥ó¥³¡¼¥É¤¹¤ëºÝ¤Î¥á¥½¥Ã¥É¤ò»ØÄꤹ¤ë¡£
1148 <li> ¥¡¼¤¬ @b Mdimension ¤ÇÃͤ¬À°¿ôÃͤλþ
1150 Ãͤϡ¢1 (¥Ç¥Õ¥©¥ë¥ÈÃÍ), 2, 3, 4
1151 ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¼¡¸µ¤Ç¤¢¤ë¡£
1153 <li> ¥¡¼¤¬ @b Mmin_range ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1155 Ãͤϥ³¡¼¥É¥Ý¥¤¥ó¥È¤ÎºÇ¾®¤ÎÃͤǤ¢¤ë¡£¤¹¤Ê¤ï¤Á¡¢¤³¤ÎÃͤΠN
1156 ÈÖÌܤΥХ¤¥È¤Ï¤³¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î N ÈÖÌܤΥХ¤¥È¤ÎºÇ¾®¤Î¤â¤Î¤È¤Ê¤ë¡£
1159 <li> ¥¡¼¤¬ @b Mmax_range ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1161 Ãͤϥ³¡¼¥É¥Ý¥¤¥ó¥È¤ÎºÇÂç¤ÎÃͤǤ¢¤ë¡£¤¹¤Ê¤ï¤Á¡¢¤³¤ÎÃͤΠN
1162 ÈÖÌܤΥХ¤¥È¤Ï¤³¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î N ÈÖÌܤΥХ¤¥È¤ÎºÇÂç¤Î¤â¤Î¤È¤Ê¤ë¡£
1163 ¥Ç¥Õ¥©¥ë¥ÈÃͤϡ¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¼¡¸µ¤¬ 1, 2, 3, 4 ¤Î»þ¡¢¤½¤ì¤¾¤ì
1164 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF ¡£
1166 <li> ¥¡¼¤¬ @b Mmin_code ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1168 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇ¾®¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ç¤¢¤ë¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÏ
1169 @b Mmin_range ¤ÎÃÍ¡£
1171 <li> ¥¡¼¤¬ @b Mmax_code ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1173 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇÂç¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ç¤¢¤ë¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÏ
1174 @b Mmax_range ¤ÎÃÍ¡£
1176 <li> ¥¡¼¤¬ @b Mascii_compatible ¤ÇÃͤ¬¥·¥ó¥Ü¥ë¤Î»þ
1178 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤¬ ASCII ¸ß´¹¤Ç¤¢¤ë¤«¤É¤¦¤«¤ò¼¨¤¹¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÎ
1179 #Mnil ¤Ç¤¢¤ì¤Ð¸ß´¹¤Ç¤Ï¤Ê¤¯¡¢¤½¤ì°Ê³°¤Î¾ì¹ç¤Ï¸ß´¹¤Ç¤¢¤ë¡£
1181 <li> ¥¡¼¤¬ @b Mfinal_byte ¤ÇÃͤ¬À°¿ôÃͤλþ
1183 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤Î The International Registry ¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤ë
1184 @e ½ªÃ¼¥Ð¥¤¥È ¤Ç¤¢¤ê¡¢0 (¥Ç¥Õ¥©¥ë¥ÈÃÍ) ¤Ç¤¢¤ë¤« 32..127 ¤Ç¤¢¤ë¡£0
1185 ¤ÏÅÐÏ¿¤µ¤ì¤Æ¤¤¤Ê¤¤¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1187 <li> ¥¡¼¤¬ @b Mrevision ¤ÇÃͤ¬À°¿ôÃͤλþ
1189 ÃÍ¤Ï The International Registry ¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤ë @e revision @e
1190 number ¤Ç¤¢¤ê¡¢0..127 ¤Ç¤¢¤ë¡£
1191 ʸ»ú¥»¥Ã¥È¤¬ÅÐÏ¿¤µ¤ì¤Æ¤¤¤Ê¤¤¾ì¹ç¤Ë¤Ï¤³¤ÎÃͤÏ̵»ë¤µ¤ì¤ë¡£
1192 0 ¤Ï revision number ¤¬Â¸ºß¤·¤Ê¤¤¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1194 <li> キーが @b Mmin_char で値が整数値の時
1196 値はこの文字セットの最小の文字コードである。デフォルト値は 0 。
1198 <li> キーが @b Mmapfile で値が M-text の時
1200 メソッドが #Mmap か #Munify の時、関数 mdatabase_define ()
1201 をこの値を引数 $EXTRA_INFO として呼ぶことによって、マッピングに関するデータが
1202 m17n データベースに追加される。
1203 すなわち、この値はデータファイルの名前である。
1205 そうでなければ、このパラメータは無視される。
1207 <li> キーが @b Mparents で値が plist の時
1209 メソッドが #Msubset ならば、値は長さ 1 の plist
1210 であり、その値はこの文字セットの上位集合となる文字セットを示すシンボルである。
1212 メソッドが #Msuperset ならば、値は長さ 8 以下の plist
1213 であり、それらの値はこの文字セットの下位集合である文字セットを示すシンボルである。
1215 そうでなければ、このパラメータは無視される。
1217 <li> キーが @b Mdefine_coding で値がシンボルの時
1219 文字セットの次元が 1 ならば、値が #Mnil 以外の場合に #Mcharset 型
1220 で同じ名前を持つコード系を定義する。
1222 そうでなければ、このパラメータは無視される。
1227 処理が成功すれば、mchar_define_charset() は $NAME
1228 という名前のシンボルを返す。そうでなければ #Mnil を返し、外部変数
1229 #merror_code にエラーコードを設定する。*/
1233 @c MERROR_CHARSET */
1236 mchar_define_charset (const char *name, MPlist *plist)
1238 MSymbol sym = msymbol (name);
1241 unsigned min_range, max_range;
1243 MText *mapfile = (MText *) mplist_get (plist, Mmapfile);
1245 MSTRUCT_CALLOC (charset, MERROR_CHARSET);
1246 charset->name = sym;
1247 charset->method = (MSymbol) mplist_get (plist, Mmethod);
1248 if (! charset->method)
1251 charset->method = Mmap;
1253 charset->method = Moffset;
1255 if (charset->method == Mmap || charset->method == Munify)
1258 MERROR (MERROR_CHARSET, Mnil);
1259 mdatabase_define (Mcharset, sym, Mnil, Mnil, NULL, mapfile->data);
1261 if (! (charset->dimension = (int) mplist_get (plist, Mdimension)))
1262 charset->dimension = 1;
1264 min_range = (unsigned) mplist_get (plist, Mmin_range);
1265 if ((pl = mplist_find_by_key (plist, Mmax_range)))
1267 max_range = (unsigned) MPLIST_VAL (pl);
1268 if (max_range >= 0x1000000)
1269 charset->dimension = 4;
1270 else if (max_range >= 0x10000 && charset->dimension < 3)
1271 charset->dimension = 3;
1272 else if (max_range >= 0x100 && charset->dimension < 2)
1273 charset->dimension = 2;
1275 else if (charset->dimension == 1)
1277 else if (charset->dimension == 2)
1279 else if (charset->dimension == 3)
1280 max_range = 0xFFFFFF;
1282 max_range = 0xFFFFFFFF;
1284 memset (charset->code_range, 0, sizeof charset->code_range);
1285 for (i = 0; i < charset->dimension; i++, min_range >>= 8, max_range >>= 8)
1287 charset->code_range[i * 4] = min_range & 0xFF;
1288 charset->code_range[i * 4 + 1] = max_range & 0xFF;
1290 if ((charset->min_code = (int) mplist_get (plist, Mmin_code)) < min_range)
1291 charset->min_code = min_range;
1292 if ((charset->max_code = (int) mplist_get (plist, Mmax_code)) > max_range)
1293 charset->max_code = max_range;
1294 charset->ascii_compatible
1295 = (MSymbol) mplist_get (plist, Mascii_compatible) != Mnil;
1296 charset->final_byte = (int) mplist_get (plist, Mfinal_byte);
1297 charset->revision = (int) mplist_get (plist, Mrevision);
1298 charset->min_char = (int) mplist_get (plist, Mmin_char);
1299 pl = (MPlist *) mplist_get (plist, Mparents);
1300 charset->nparents = pl ? mplist_length (pl) : 0;
1301 if (charset->nparents > 8)
1302 charset->nparents = 8;
1303 for (i = 0; i < charset->nparents; i++, pl = MPLIST_NEXT (pl))
1305 MSymbol parent_name;
1307 if (MPLIST_KEY (pl) != Msymbol)
1308 MERROR (MERROR_CHARSET, Mnil);
1309 parent_name = MPLIST_SYMBOL (pl);
1310 if (! (charset->parents[i] = MCHARSET (parent_name)))
1311 MERROR (MERROR_CHARSET, Mnil);
1314 charset->subset_offset = (int) mplist_get (plist, Msubset_offset);
1316 msymbol_put (sym, Mcharset, charset);
1317 charset = make_charset (charset);
1320 msymbol_put (msymbol__canonicalize (sym), Mcharset, charset);
1322 for (pl = (MPlist *) mplist_get (plist, Maliases);
1323 pl && MPLIST_KEY (pl) == Msymbol;
1324 pl = MPLIST_NEXT (pl))
1326 MSymbol alias = MPLIST_SYMBOL (pl);
1328 msymbol_put (alias, Mcharset, charset);
1329 msymbol_put (msymbol__canonicalize (alias), Mcharset, charset);
1332 if (mplist_get (plist, Mdefine_coding)
1333 && charset->dimension == 1
1334 && charset->code_range[0] == 0 && charset->code_range[1] == 255)
1335 mconv__register_charset_coding (sym);
1342 @brief Resolve charset name.
1344 The mchar_resolve_charset () function returns $SYMBOL if it
1345 represents a charset. Otherwise, it canonicalizes $SYMBOL as a
1346 charset name, and if the canonicalized name represents a charset,
1347 returns it. Otherwise, it returns #Mnil. */
1350 @brief 文字セット名を解決する.
1352 関数 mchar_resolve_charset () は $SYMBOL
1353 が文字セットを示していればそれを返す。
1355 そうでなければ、$SYMBOL を文字セット名として正規化し、それが文字セットを示していれば正規化したものを返す。
1356 そうでなければ、#Mnil を返す。 */
1359 mchar_resolve_charset (MSymbol symbol)
1361 MCharset *charset = (MCharset *) msymbol_get (symbol, Mcharset);
1365 symbol = msymbol__canonicalize (symbol);
1366 charset = (MCharset *) msymbol_get (symbol, Mcharset);
1369 return (charset ? charset->name : Mnil);
1375 @brief List symbols representing charsets.
1377 The mchar_list_charset () function makes an array of symbols
1378 representing charsets, stores the pointer to the array in the place
1379 pointed to by $SYMBOLS, and returns the length of the array. */
1382 @brief 文字セットを表わすシンボルを列挙する.
1384 関数 mchar_list_charset ()
1385 は、文字セットを示すシンボルを並べた配列を作り、$SYMBOLS
1386 でポイントされた場所にこの配列へのポインタを置き、配列の長さを返す。 */
1389 mchar_list_charset (MSymbol **symbols)
1393 MTABLE_MALLOC ((*symbols), charset_list.used, MERROR_CHARSET);
1394 for (i = 0; i < charset_list.used; i++)
1395 (*symbols)[i] = charset_list.charsets[i]->name;
1402 @brief Decode a code-point.
1404 The mchar_decode () function decodes code-point $CODE in the
1405 charset represented by the symbol $CHARSET_NAME to get a character code.
1409 If decoding was successful, mchar_decode () returns the decoded
1410 character code. Otherwise it returns -1. */
1413 @brief コードポイントをデコードする.
1415 関数 mchar_decode () は、シンボル $CHARSET_NAME で示される文字セット内の
1416 $CODE というコードポイントをデコードして文字コードを得る。
1419 デコードが成功すれば、mchar_decode () はデコードされた文字コードを返す。
1420 そうでなければ -1 を返す。 */
1427 mchar_decode (MSymbol charset_name, unsigned code)
1429 MCharset *charset = MCHARSET (charset_name);
1432 return MCHAR_INVALID_CODE;
1433 return DECODE_CHAR (charset, code);
1439 @brief Encode a character code.
1441 The mchar_encode () function encodes character code $C to get a
1442 code-point in the charset represented by the symbol $CHARSET_NAME.
1445 If encoding was successful, mchar_encode () returns the encoded
1446 code-point. Otherwise it returns #MCHAR_INVALID_CODE. */
1449 @brief 文字コードをエンコードする.
1451 関数 mchar_encode () は、文字コード $C をエンコードしてシンボル
1452 $CHARSET_NAME で示される文字セット内におけるコードポイントを得る。
1455 エンコードが成功すれば、mchar_encode () はエンコードされたコードポイントを返す。
1456 そうでなければ #MCHAR_INVALID_CODE を返す。 */
1463 mchar_encode (MSymbol charset_name, int c)
1465 MCharset *charset = MCHARSET (charset_name);
1468 return MCHAR_INVALID_CODE;
1469 return ENCODE_CHAR (charset, c);
1475 @brief Call a function for all the characters in a specified charset.
1477 The mchar_map_charset () function calls $FUNC for all the
1478 characters in the charset named $CHARSET_NAME. A call is done for
1479 a chunk of consecutive characters rather than character by character.
1482 $FUNC receives three arguments: $FROM, $TO, and $ARG. $FROM and
1483 $TO specify the range of character codes in $CHARSET_NAME. $ARG is the same as $FUNC_ARG.
1487 If the operation was successful, mchar_map_charset () returns 0.
1488 Otherwise, it returns -1 and assigns an error code to the external
1489 variable #merror_code. */
1492 @brief 指定した文字セットのすべての文字に対して関数を呼ぶ.
1494 関数 mchar_map_charset () は $CHARSET_NAME
1495 という名前を持つ文字セット中のすべての文字に対して $FUNC を呼ぶ。
1496 呼び出しは一文字毎ではなく、連続した文字のまとまり単位で行なわれる。
1498 関数 $FUNC には $FROM, $TO, $ARG の３引数が渡される。$FROM と $TO
1499 は $CHARSET_NAME 中の文字コードの範囲を指定する。$ARG は $FUNC_ARG と同じである。
1503 処理に成功すれば mchar_map_charset () は 0 を返す。
1504 そうでなければ -1 を返し、外部変数 #merror_code にエラーコードを設定する。 */
1508 @c MERROR_CHARSET */
1511 mchar_map_charset (MSymbol charset_name,
1512 void (*func) (int from, int to, void *arg),
1517 charset = MCHARSET (charset_name);
1519 MERROR (MERROR_CHARSET, -1);
1521 if (charset->encoder)
1523 int c = charset->min_char;
1526 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) < 0)
1528 while (c <= charset->max_char)
1530 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) >= 0)
1531 (*func) (c, next_c - 1, func_arg);
1536 (*func) (charset->min_char, charset->max_char, func_arg);