1 /* charset.c -- charset module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 @addtogroup m17nCharset
24 @brief Charset objects and API for them.
26 The m17n library uses @e charset objects to represent coded
27 character sets (CCSs). The m17n library supports many predefined
28 coded character sets. Moreover, application programs can add
29 other charsets. A character can belong to multiple charsets.
31 The m17n library distinguishes the following three concepts:
33 @li A @e code-point is a number assigned by the CCS to each
34 character. Code-points may or may not be continuous. The type
35 @c unsigned is used to represent a code-point. An invalid
36 code-point is represented by the macro @c MCHAR_INVALID_CODE.
38 @li A @e character @e index is the canonical index of a character
39 in a CCS. The character that has the character index N occupies
40 the Nth position when all the characters in the current CCS are
41 sorted by their code-points. Character indices in a CCS are
42 continuous and start with 0.
44 @li A @e character @e code is the internal representation in the
45 m17n library of a character. A character code is a signed integer
48 Each charset object defines how characters are converted between
49 code-points and character codes. To @e decode means converting
50 code-points to character codes and to @e encode means converting
51 character codes to code-points. */
54 @addtogroup m17nCharset
55 @brief ʸ»ú¥»¥Ã¥È¥ª¥Ö¥¸¥§¥¯¥È¤È¤½¤ì¤Ë´Ø¤¹¤ë API.
57 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¡¢Éä¹æ²½Ê¸»ú½¸¹ç (CCS) ¤ò @e ʸ»ú¥»¥Ã¥È
58 ¤È¸Æ¤Ö¥ª¥Ö¥¸¥§¥¯¥È¤Çɽ¸½¤¹¤ë¡£
59 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¿¤¯¤ÎÉä¹æ²½Ê¸»ú½¸¹ç¤ò¤¢¤é¤«¤¸¤á¥µ¥Ý¡¼¥È¤·¤Æ¤¤¤ë¤·¡¢¥¢¥×¥ê¥±¡¼¥·¥ç¥ó¥×¥í¥°¥é¥à¤¬Æȼ«¤Ëʸ»ú¥»¥Ã¥È¤òÄɲ乤뤳¤È¤â²Äǽ¤Ç¤¢¤ë¡£
60 °ì¤Ä¤Îʸ»ú¤ÏÊ£¿ô¤Îʸ»ú¥»¥Ã¥È¤Ë°¤·¤Æ¤â¤è¤¤¡£
62 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¡¢°Ê²¼¤Î³µÇ°¤ò¶èÊ̤·¤Æ¤¤¤ë:
64 @li @e ¥³¡¼¥É¥Ý¥¤¥ó¥È ¤È¤Ï¡¢CCS ¤¬¤½¤ÎÃæ¤Î¸Ä¡¹¤Îʸ»ú¤ËÂФ·¤ÆÄêµÁ¤¹¤ë¿ôÃͤǤ¢¤ë¡£
65 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤ÏϢ³¤·¤Æ¤¤¤ë¤È¤Ï¸Â¤é¤Ê¤¤¡£¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ï
66 @c unsigned ·¿¤Ë¤è¤Ã¤Æɽ¤µ¤ì¤ë¡£Ìµ¸ú¤Ê¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ï¥Þ¥¯¥í
67 @c MCHAR_INVALID_CODE ¤Çɽ¤µ¤ì¤ë¡£
69 @li @e ʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹ ¤È¤Ï¡¢CCS Æâ¤Ç³Æʸ»ú¤Ë³ä¤êÅö¤Æ¤é¤ì¤ëÀµµ¬²½¤µ¤ì¤¿¥¤¥ó¥Ç¥Ã¥¯¥¹¤Ç¤¢¤ë¡£
70 ʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹¤¬ N ¤Îʸ»ú¤Ï¡¢CCS Ãæ¤ÎÁ´Ê¸»ú¤ò¥³¡¼¥É¥Ý¥¤¥ó¥È½ç¤Ëʤ٤¿¤È¤¤Ë N ÈÖÌܤ˸½¤ï¤ì¤ë¡£
71 CCS Ãæ¤Îʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹¤ÏϢ³¤·¤Æ¤ª¤ê¡¢0 ¤«¤é»Ï¤Þ¤ë¡£
73 @li @e ʸ»ú¥³¡¼¥É ¤È¤Ï¡¢m17n ¥é¥¤¥Ö¥é¥êÆâ¤Ë¤ª¤±¤ëʸ»ú¤ÎÆâÉôɽ¸½¤Ç¤¢¤ê¡¢21 ¥Ó¥Ã¥È°Ê¾å¤ÎŤµ¤ò»ý¤ÄÉä¹çÉÕ¤À°¿ô¤Ç¤¢¤ë¡£
75 ³Æʸ»ú¥»¥Ã¥È¥ª¥Ö¥¸¥§¥¯¥È¤Ï¡¢¤½¤Îʸ»ú¥»¥Ã¥È¤Ë°¤¹¤ëʸ»ú¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥³¡¼¥É¤È¤Î´Ö¤ÎÊÑ´¹¤òµ¬Äꤹ¤ë¡£
76 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤«¤éʸ»ú¥³¡¼¥É¤Ø¤ÎÊÑ´¹¤ò @e ¥Ç¥³¡¼¥É
77 ¤È¸Æ¤Ó¡¢Ê¸»ú¥³¡¼¥É¤«¤é¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ø¤ÎÊÑ´¹¤ò @e ¥¨¥ó¥³¡¼¥É ¤È¸Æ¤Ö¡£ */
80 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
81 /*** @addtogroup m17nInternal
91 #include "m17n-misc.h"
100 static int unified_max;
102 /** List of all charsets ever defined. */
110 static struct MCharsetList charset_list;
112 static MPlist *charset_definition_list;
114 /** Make a charset object from the template of MCharset structure
115 CHARSET, and return a pointer to the new charset object.
116 CHARSET->code_range[4N + 2] and CHARSET->code_range[4N + 3] are
/* Validate the template CHARSET, derive its code-range tables and
   method-specific fields, and register it in charset_list (and in
   mcharset__iso_2022_table when it has a positive final byte).  Every
   failed check goes through MERROR (MERROR_CHARSET, NULL).
   NOTE(review): several lines of this function are not visible in this
   view; the comments below describe only the visible statements.  */
120 make_charset (MCharset *charset)
122 unsigned min_code, max_code;
124 int *range = charset->code_range;
/* Only dimensions 1..4 are accepted; a final byte must be 0 (or
   negative) or lie in '0'..127.  */
126 if (charset->dimension < 1 || charset->dimension > 4)
127 MERROR (MERROR_CHARSET, NULL);
128 if ((charset->final_byte > 0 && charset->final_byte < '0')
129 || charset->final_byte > 127)
130 MERROR (MERROR_CHARSET, NULL);
/* For each of the four byte positions, range[4i] and range[4i + 1] are
   the minimum and maximum byte.  Store the number of bytes in
   range[4i + 2] and the running product of those counts in
   range[4i + 3], so range[15] ends up as the product over all four
   positions.  */
132 for (i = 0, n = 1; i < 4; i++)
134 if (range[i * 4] > range[i * 4 + 1])
135 MERROR (MERROR_CHARSET, NULL);
136 range[i * 4 + 2] = range[i * 4 + 1] - range[i * 4] + 1;
137 n *= range[i * 4 + 2];
138 range[i * 4 + 3] = n;
/* Pack the per-position minimum bytes into a single code-point.
   NOTE(review): range[] is int, so `range[12] << 24' (and
   `range[13] << 24' below) overflows a 32-bit signed int when the byte
   is 0x80 or larger; converting to unsigned before shifting would
   avoid that -- confirm.  */
141 min_code = range[0] | (range[4] << 8) | (range[8] << 16) | (range[12] << 24);
/* A min_code (resp. max_code) of 0 is treated as "not specified" and
   replaced by the range-derived value; an explicit value outside the
   range-derived bounds is an error.  */
142 if (charset->min_code == 0)
143 charset->min_code = min_code;
144 else if (charset->min_code < min_code)
145 MERROR (MERROR_CHARSET, NULL);
146 max_code = range[1] | (range[5] << 8) | (range[9] << 16) | (range[13] << 24);
147 if (charset->max_code == 0)
148 charset->max_code = max_code;
149 else if (charset->max_code > max_code)
150 MERROR (MERROR_CHARSET, NULL);
152 charset->code_range_min_code = min_code;
153 charset->fully_loaded = 0;
/* A subset needs exactly one parent, the parent must not be a superset,
   and the subset's code range shifted by subset_offset must fall inside
   the parent's code range.  */
156 if (charset->method == Msubset)
160 if (charset->nparents != 1)
161 MERROR (MERROR_CHARSET, NULL);
162 parent = charset->parents[0];
163 if (parent->method == Msuperset
164 || charset->min_code - charset->subset_offset < parent->min_code
165 || charset->max_code - charset->subset_offset > parent->max_code)
166 MERROR (MERROR_CHARSET, NULL);
/* A superset needs at least two parents, and its code range must
   enclose the code range of every parent.  */
168 else if (charset->method == Msuperset)
170 if (charset->nparents < 2)
171 MERROR (MERROR_CHARSET, NULL);
172 for (i = 0; i < charset->nparents; i++)
173 if (charset->min_code > charset->parents[i]->min_code
174 || charset->max_code < charset->parents[i]->max_code)
175 MERROR (MERROR_CHARSET, NULL);
180 = (charset->dimension == 1
182 && (charset->dimension == 2
184 && (charset->dimension == 3
185 || range[10] == 256)))));
/* When the code space has gaps, build a per-byte mask: bit i of
   code_range_mask[j] is set when byte value j is valid at position i.  */
187 if (! charset->no_code_gap)
191 memset (charset->code_range_mask, 0,
192 sizeof charset->code_range_mask);
193 for (i = 0; i < 4; i++)
194 for (j = range[i * 4]; j <= range[i * 4 + 1]; j++)
195 charset->code_range_mask[j] |= (1 << i);
/* Offset charsets need no map: max_char follows from min_char and the
   total code count, both must lie within 0..unified_max, and the
   charset is marked fully loaded right away.  */
198 if (charset->method == Moffset)
200 charset->max_char = charset->min_char + range[15] - 1;
201 if (charset->min_char < 0
202 || charset->max_char < 0 || charset->max_char > unified_max)
203 MERROR (MERROR_CHARSET, NULL);
204 charset->simple = charset->no_code_gap;
205 charset->fully_loaded = 1;
207 else if (charset->method == Munify)
209 /* The magic number 12 below is to align to the SUB_BITS_2
210 (defined in chartab.c) boundary in a char-table. */
211 unified_max -= ((range[15] >> 12) + 1) << 12;
212 charset->unified_max = unified_max;
/* Any method other than Mmap reaching this point is rejected.  */
214 else if (charset->method != Mmap)
215 MERROR (MERROR_CHARSET, NULL);
/* Register the charset globally; charsets with a positive final byte
   are additionally recorded in the ISO-2022 table.  */
218 MLIST_APPEND1 (&charset_list, charsets, charset, MERROR_CHARSET);
220 if (charset->final_byte > 0)
222 MLIST_APPEND1 (&mcharset__iso_2022_table, charsets, charset,
224 if (charset->revision <= 0)
226 int chars = range[2];
228 if (chars == 128) /* ASCII case */
230 else if (chars == 256) /* ISO-8859-X case */
232 MCHARSET_ISO_2022 (charset->dimension, chars, charset->final_byte)
/* Complete the lazily-initialized part of CHARSET: compute min_char and
   max_char and, for map-based methods, attach the decoder and encoder
   tables.  Failures are reported through MERROR (MERROR_CHARSET, -1);
   on the visible success path fully_loaded is set to 1.
   NOTE(review): several lines of this function are not visible in this
   view; the comments below describe only the visible statements.  */
241 load_charset_fully (MCharset *charset)
243 if (charset->method == Msubset)
245 MCharset *parent = charset->parents[0];
/* The parent must be loaded first because the subset's character range
   is derived by decoding through the parent.  */
247 if (! parent->fully_loaded
248 && load_charset_fully (parent) < 0)
249 MERROR (MERROR_CHARSET, -1);
/* With an offset parent, only the two end points of the shifted code
   range are decoded to obtain min_char and max_char.  */
250 if (parent->method == Moffset)
254 code = charset->min_code - charset->subset_offset;
255 charset->min_char = DECODE_CHAR (parent, code);
256 code = charset->max_code - charset->subset_offset;
257 charset->max_char = DECODE_CHAR (parent, code);
/* Otherwise every code in the shifted range is decoded through the
   parent and the extremes of the results are tracked.  */
261 unsigned min_code = charset->min_code - charset->subset_offset;
262 unsigned max_code = charset->max_code - charset->subset_offset;
263 int min_char = DECODE_CHAR (parent, min_code);
264 int max_char = min_char;
266 for (++min_code; min_code <= max_code; min_code++)
268 int c = DECODE_CHAR (parent, min_code);
274 else if (c > max_char)
278 charset->min_char = min_char;
279 charset->max_char = max_char;
282 else if (charset->method == Msuperset)
284 int min_char = 0, max_char = 0;
287 for (i = 0; i < charset->nparents; i++)
289 MCharset *parent = charset->parents[i];
291 if (! parent->fully_loaded
292 && load_charset_fully (parent) < 0)
293 MERROR (MERROR_CHARSET, -1);
/* NOTE(review): the two updates below form one else-if chain, so a
   parent that both lowers min_char and raises max_char only updates
   min_char, and the resulting max_char can be too small.  The max_char
   comparison looks like it should be an independent `if' -- confirm.  */
295 min_char = parent->min_char, max_char = parent->max_char;
296 else if (parent->min_char < min_char)
297 min_char = parent->min_char;
298 else if (parent->max_char > max_char)
299 max_char = parent->max_char;
301 charset->min_char = min_char;
302 charset->max_char = max_char;
304 else /* charset->method is Mmap or Munify */
/* Look the charset up in the database under the key Mcharset and its
   name.  The loaded plist carries the decoder as its first value and
   the encoder as its second.  */
306 MDatabase *mdb = mdatabase_find (Mcharset, charset->name, Mnil, Mnil);
309 if (! mdb || ! (plist = mdatabase_load (mdb)))
310 MERROR (MERROR_CHARSET, -1);
311 charset->decoder = mplist_value (plist);
312 charset->encoder = mplist_value (mplist_next (plist));
/* NOTE(review): the plist is released while the charset keeps both
   values; presumably unreferencing the plist does not free them
   (mcharset__fini releases them later) -- confirm.  */
313 M17N_OBJECT_UNREF (plist);
314 mchartable_range (charset->encoder,
315 &charset->min_char, &charset->max_char);
316 if (charset->method == Mmap)
317 charset->simple = charset->no_code_gap;
/* NOTE(review): the line introducing this assignment is not visible;
   presumably it is the Munify branch, extending max_char over the
   block reserved above unified_max -- confirm.  */
319 charset->max_char = charset->unified_max + 1 + charset->code_range[15];
322 charset->fully_loaded = 1;
326 /** Load data of type @c charset from the stream FP. */
/* Parse a charset map from FP for the charset named CHARSET_NAME and
   build its two lookup tables: a decoder array indexed by code-point
   index and an encoder char-table keyed by character code.  Both are
   added to a plist, decoder first and encoder second.
   NOTE(review): several lines of this function are not visible in this
   view; the comments below describe only the visible statements.  */
329 load_charset (FILE *fp, MSymbol charset_name)
331 MCharset *charset = MCHARSET (charset_name);
340 MERROR (MERROR_DB, NULL);
341 size = (charset->code_range[15]
342 - (charset->min_code - charset->code_range_min_code));
343 MTABLE_MALLOC (decoder, size, MERROR_DB);
344 for (i = 0; i < size; i++)
/* Characters without a mapping look up as MCHAR_INVALID_CODE.  */
346 encoder = mchartable (Minteger, (void *) MCHAR_INVALID_CODE);
348 while ((c = getc (fp)) != EOF)
350 unsigned code1, code2, c1, c2;
/* NOTE(review): the result of fgets is not checked; if it fails, buf
   is parsed with whatever it held before -- confirm this is harmless.  */
355 fgets (buf, 256, fp);
/* Two line formats are recognized: "0xCODE1-0xCODE2 0xCHAR" maps a run
   of code-points starting at character CHAR, and "0xCODE 0xCHAR" maps
   a single code-point.  */
358 if (sscanf (buf, "0x%x-0x%x 0x%x", &code1, &code2, &c1) == 3)
360 idx1 = CODE_POINT_TO_INDEX (charset, code1);
363 idx2 = CODE_POINT_TO_INDEX (charset, code2);
366 c2 = c1 + (idx2 - idx1);
368 else if (sscanf (buf, "0x%x 0x%x", &code1, &c1) == 2)
370 idx1 = idx2 = CODE_POINT_TO_INDEX (charset, code1);
/* Only entries whose code-points map to valid indices are recorded.
   The encoder stores each code-point as the value for its character,
   walking the run index by index.  */
377 if (idx1 >= 0 && idx2 >= 0)
380 mchartable_set (encoder, c1, (void *) code1);
381 for (idx1++, c1++; idx1 <= idx2; idx1++, c1++)
383 code1 = INDEX_TO_CODE_POINT (charset, idx1);
385 mchartable_set (encoder, c1, (void *) code1);
395 M17N_OBJECT_UNREF (encoder);
/* Order matters: load_charset_fully () reads the decoder from the first
   element and the encoder from the second.  */
399 mplist_add (plist, Mt, decoder);
400 mplist_add (plist, Mt, encoder);
/* Lookup cache held in the head element of a plist: the initializer
   sets its key to Mt with a NULL value, and mcharset__find () stores
   the requested name as the key and the resulting charset as the
   value.  */
407 MPlist *mcharset__cache;
409 /* Predefined charsets. */
/* Each pointer below is assigned in the initializer from
   MCHARSET (Mcharset_<name>) once the predefined charsets exist.  */
410 MCharset *mcharset__ascii;
411 MCharset *mcharset__binary;
412 MCharset *mcharset__m17n;
413 MCharset *mcharset__unicode;
/* Charsets that have a positive final byte; filled by make_charset ().  */
415 MCharsetISO2022Table mcharset__iso_2022_table;
417 /** Initialize charset handler. */
/* Body of the charset initializer.
   NOTE(review): the function signature and the declarations of `param'
   and `pl' are not visible in this view; the name is presumably
   mcharset__init -- confirm.  */
/* Reset the module state: the unify space starts at MCHAR_MAX, the
   database loader for charset maps is installed, and the cache and the
   charset lists are created empty.  */
424 unified_max = MCHAR_MAX;
426 mdatabase__load_charset_func = load_charset;
427 mcharset__cache = mplist ();
428 mplist_set (mcharset__cache, Mt, NULL);
430 MLIST_INIT1 (&charset_list, charsets, 128);
431 MLIST_INIT1 (&mcharset__iso_2022_table, charsets, 128);
432 charset_definition_list = mplist ();
434 memset (mcharset__iso_2022_table.classified, 0,
435 sizeof (mcharset__iso_2022_table.classified));
/* Symbols naming the charset methods.  */
437 Mmethod = msymbol ("method");
438 Moffset = msymbol ("offset");
439 Mmap = msymbol ("map");
440 Munify = msymbol ("unify");
441 Msubset = msymbol ("subset");
442 Msuperset = msymbol ("superset");
/* Symbols used as parameter keys of mchar_define_charset ().  The keys
   "mapfile", "parents" and "aliases" are created with
   msymbol_as_managing_key (), the others with msymbol ().  */
444 Mdimension = msymbol ("dimension");
445 Mmin_range = msymbol ("min-range");
446 Mmax_range = msymbol ("max-range");
447 Mmin_code = msymbol ("min-code");
448 Mmax_code = msymbol ("max-code");
449 Mascii_compatible = msymbol ("ascii-compatible");
450 Mfinal_byte = msymbol ("final-byte");
451 Mrevision = msymbol ("revision");
452 Mmin_char = msymbol ("min-char");
453 Mmapfile = msymbol_as_managing_key ("mapfile");
454 Mparents = msymbol_as_managing_key ("parents");
455 Msubset_offset = msymbol ("subset-offset");
456 Mdefine_coding = msymbol ("define-coding");
457 Maliases = msymbol_as_managing_key ("aliases");
461 /* Setup predefined charsets. */
/* One parameter plist is built for "ascii" and then edited in place
   with mplist_put () for each following charset, so every later
   definition keeps the earlier keys (offset method, min-range 0,
   ascii-compatible, min-char 0) unless they are overwritten here.  */
462 pl = mplist_add (pl, Mmethod, Moffset);
463 pl = mplist_add (pl, Mmin_range, (void *) 0);
464 pl = mplist_add (pl, Mmax_range, (void *) 0x7F);
465 pl = mplist_add (pl, Mascii_compatible, Mt);
466 pl = mplist_add (pl, Mfinal_byte, (void *) 'B');
467 pl = mplist_add (pl, Mmin_char, (void *) 0);
468 Mcharset_ascii = mchar_define_charset ("ascii", param);
/* From here on the final byte is cleared, so only "ascii" is defined
   with one.  */
470 mplist_put (param, Mmax_range, (void *) 0xFF);
471 mplist_put (param, Mfinal_byte, NULL);
472 Mcharset_iso_8859_1 = mchar_define_charset ("iso-8859-1", param);
474 mplist_put (param, Mmax_range, (void *) 0x10FFFF);
475 Mcharset_unicode = mchar_define_charset ("unicode", param);
477 mplist_put (param, Mmax_range, (void *) MCHAR_MAX);
478 Mcharset_m17n = mchar_define_charset ("m17n", param);
480 mplist_put (param, Mmax_range, (void *) 0xFF);
481 Mcharset_binary = mchar_define_charset ("binary", param);
483 M17N_OBJECT_UNREF (param);
/* Cache the charset objects of the predefined charsets.  */
485 mcharset__ascii = MCHARSET (Mcharset_ascii);
486 mcharset__binary = MCHARSET (Mcharset_binary);
487 mcharset__m17n = MCHARSET (Mcharset_m17n);
488 mcharset__unicode = MCHARSET (Mcharset_unicode);
/* Release the module state set up by the initializer: the per-charset
   decoder and encoder tables, the lookup cache, both charset lists and
   the stored charset definitions.
   NOTE(review): some lines of this function are not visible in this
   view; the comments below describe only the visible statements.  */
494 mcharset__fini (void)
499 for (i = 0; i < charset_list.used; i++)
501 MCharset *charset = charset_list.charsets[i];
/* The decoder is a plain allocated array and is freed directly; the
   encoder is a reference-counted object and is unreferenced instead.  */
503 if (charset->decoder)
504 free (charset->decoder);
505 if (charset->encoder)
506 M17N_OBJECT_UNREF (charset->encoder);
509 M17N_OBJECT_UNREF (mcharset__cache);
510 MLIST_FREE1 (&charset_list, charsets);
511 MLIST_FREE1 (&mcharset__iso_2022_table, charsets);
/* Drop the reference that mcharset__load_from_database () took on each
   stored definition before releasing the list itself.  */
512 MPLIST_DO (plist, charset_definition_list)
513 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
514 M17N_OBJECT_UNREF (charset_definition_list);
/* Find the charset object for the symbol NAME.  The charset is read
   from the Mcharset property of NAME; the visible fallback defines it
   from the parameters stored in charset_definition_list and reads the
   property again.  The result is recorded in mcharset__cache.
   NOTE(review): the conditions guarding the fallback and the return
   statements are not visible in this view.  */
519 mcharset__find (MSymbol name)
523 charset = msymbol_get (name, Mcharset);
526 MPlist *param = mplist_get (charset_definition_list, name);
/* Reset the cache key to Mt before defining the charset.  */
528 MPLIST_KEY (mcharset__cache) = Mt;
/* Define the charset from a copy of the stored definition, then fetch
   the object that the definition attached to NAME.  */
531 param = mplist__from_plist (param);
532 mchar_define_charset (MSYMBOL_NAME (name), param);
533 charset = msymbol_get (name, Mcharset);
534 M17N_OBJECT_UNREF (param);
/* Remember this lookup in the cache.  */
536 MPLIST_KEY (mcharset__cache) = name;
537 MPLIST_VAL (mcharset__cache) = charset;
542 /** Return the character corresponding to code-point CODE in CHARSET.
543 If CODE is invalid for CHARSET, return -1. */
/* Decode: map the code-point CODE of CHARSET to a character code,
   dispatching on the charset's method.  A failed lazy load is reported
   through MERROR (MERROR_CHARSET, -1).
   NOTE(review): several lines, including most return statements, are
   not visible in this view.  */
546 mcharset__decode_char (MCharset *charset, unsigned code)
/* Codes below 128 in an ASCII-compatible charset and codes outside
   min_code..max_code are handled before anything is loaded.  */
550 if (code < 128 && charset->ascii_compatible)
552 if (code < charset->min_code || code > charset->max_code)
555 if (! charset->fully_loaded
556 && load_charset_fully (charset) < 0)
557 MERROR (MERROR_CHARSET, -1);
/* Subset: shift the code by subset_offset and decode it in the
   parent.  */
559 if (charset->method == Msubset)
561 MCharset *parent = charset->parents[0];
563 code -= charset->subset_offset;
564 return DECODE_CHAR (parent, code);
/* Superset: try each parent in order.  */
567 if (charset->method == Msuperset)
571 for (i = 0; i < charset->nparents; i++)
573 MCharset *parent = charset->parents[i];
574 int c = DECODE_CHAR (parent, code);
/* The remaining methods work on the code-point index.  */
582 idx = CODE_POINT_TO_INDEX (charset, code);
586 if (charset->method == Mmap)
587 return charset->decoder[idx];
/* Unify: use the map entry, or else the character at offset idx in the
   block that starts at unified_max + 1.  */
589 if (charset->method == Munify)
591 int c = charset->decoder[idx];
594 c = charset->unified_max + 1 + idx;
598 /* Now charset->method should be Moffset. */
599 return (charset->min_char + idx);
603 /** Return the code point of character C in CHARSET. If CHARSET does not
604 contain C, return MCHAR_INVALID_CODE. */
/* Encode: map the character code C to its code-point in CHARSET,
   dispatching on the charset's method.  MCHAR_INVALID_CODE is returned
   when C has no code-point, and a failed lazy load is reported through
   MERROR (MERROR_CHARSET, MCHAR_INVALID_CODE).
   NOTE(review): several lines of this function are not visible in this
   view; the comments below describe only the visible statements.  */
607 mcharset__encode_char (MCharset *charset, int c)
609 if (! charset->fully_loaded
610 && load_charset_fully (charset) < 0)
611 MERROR (MERROR_CHARSET, MCHAR_INVALID_CODE);
/* Subset: encode in the parent, shift the result by subset_offset, and
   accept it only inside this charset's own code range.  */
613 if (charset->method == Msubset)
615 MCharset *parent = charset->parents[0];
616 unsigned code = ENCODE_CHAR (parent, c);
618 if (code == MCHAR_INVALID_CODE)
620 code += charset->subset_offset;
621 if (code >= charset->min_code && code <= charset->max_code)
623 return MCHAR_INVALID_CODE;
/* Superset: try each parent in order, checking for a valid
   code-point.  */
626 if (charset->method == Msuperset)
630 for (i = 0; i < charset->nparents; i++)
632 MCharset *parent = charset->parents[i];
633 unsigned code = ENCODE_CHAR (parent, c);
635 if (code != MCHAR_INVALID_CODE)
638 return MCHAR_INVALID_CODE;
641 if (c < charset->min_char || c > charset->max_char)
642 return MCHAR_INVALID_CODE;
/* NOTE(review): the looked-up value is converted straight to unsigned;
   presumably mchartable_lookup () returns the void * stored by
   load_charset () -- confirm the conversion is intended where pointers
   are wider than unsigned.  */
644 if (charset->method == Mmap)
645 return (unsigned) mchartable_lookup (charset->encoder, c);
647 if (charset->method == Munify)
649 if (c > charset->unified_max)
/* NOTE(review): mcharset__decode_char () computes
   c = charset->unified_max + 1 + idx, whose inverse is
   idx = c - (charset->unified_max + 1).  The statement below yields
   c - charset->unified_max + 1, i.e. idx + 2 -- confirm whether
   `c -= charset->unified_max + 1' was intended.  */
651 c -= charset->unified_max - 1;
652 return INDEX_TO_CODE_POINT (charset, c);
654 return (unsigned) mchartable_lookup (charset->encoder, c);
657 /* Now charset->method should be Moffset */
658 c -= charset->min_char;
659 return INDEX_TO_CODE_POINT (charset, c);
/* Load the "charset-list" entry of the database and process each
   element: remember its definition in charset_definition_list and
   define the charset.  Malformed elements are reported through
   MERROR (MERROR_CHARSET, -1).
   NOTE(review): some lines of this function are not visible in this
   view; the comments below describe only the visible statements.  */
663 mcharset__load_from_database ()
665 MDatabase *mdb = mdatabase_find (msymbol ("charset-list"), Mnil, Mnil, Mnil);
666 MPlist *def_list, *plist;
667 MPlist *definitions = charset_definition_list;
668 int mdebug_flag = MDEBUG_CHARSET;
673 def_list = (MPlist *) mdatabase_load (mdb);
674 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to load data."));
680 MPLIST_DO (plist, def_list)
/* Each element must itself be a plist whose first item is the symbol
   naming the charset; the rest is the parameter list.
   NOTE(review): presumably MERROR returns from the function, in which
   case these two error exits skip the unref of def_list below --
   confirm.  */
685 if (! MPLIST_PLIST_P (plist))
686 MERROR (MERROR_CHARSET, -1);
687 pl = MPLIST_PLIST (plist);
688 if (! MPLIST_SYMBOL_P (pl))
689 MERROR (MERROR_CHARSET, -1);
690 name = MPLIST_SYMBOL (pl);
691 pl = MPLIST_NEXT (pl);
/* Keep the raw definition under the charset's name and take a reference
   on it; mcharset__fini () releases that reference.  */
692 definitions = mplist_add (definitions, name, pl);
693 M17N_OBJECT_REF (pl);
/* Define the charset from a converted copy of the parameters.  */
694 p = mplist__from_plist (pl);
695 mchar_define_charset (MSYMBOL_NAME (name), p);
696 M17N_OBJECT_UNREF (p);
699 M17N_OBJECT_UNREF (def_list);
700 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to parse the loaded data."));
706 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
711 /*** @addtogroup m17nCharset */
717 @brief Invalid code-point.
719 The macro #MCHAR_INVALID_CODE gives the invalid code-point. */
722 @brief ̵¸ú¤Ê¥³¡¼¥É¥Ý¥¤¥ó¥È.
724 ¥Þ¥¯¥í #MCHAR_INVALID_CODE ¤Ï̵¸ú¤Ê¥³¡¼¥É¥Ý¥¤¥ó¥È¤ò¼¨¤¹¡£ */
726 #define MCHAR_INVALID_CODE
732 @name Variables: Symbols representing a charset.
734 Each of the following symbols represents a predefined charset. */
737 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ëÄêµÁºÑ¤ß¥·¥ó¥Ü¥ë.
739 °Ê²¼¤Î³Æ¥·¥ó¥Ü¥ë¤Ï¡¢ÄêµÁºÑ¤ßʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
744 @brief Symbol representing the charset ASCII.
746 The symbol #Mcharset_ascii has name <tt>"ascii"</tt> and represents
747 the charset ISO 646, USA Version X3.4-1968 (ISO-IR-6). */
749 @brief ASCII ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
751 ¥·¥ó¥Ü¥ë #Mcharset_ascii ¤Ï <tt>"ascii"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
752 ISO 646, USA Version X3.4-1968 (ISO-IR-6) ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
755 MSymbol Mcharset_ascii;
759 @brief Symbol representing the charset ISO/IEC 8859/1.
761 The symbol #Mcharset_iso_8859_1 has name <tt>"iso-8859-1"</tt>
762 and represents the charset ISO/IEC 8859-1:1998. */
764 @brief ISO/IEC 8859-1:1998 ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
766 ¥·¥ó¥Ü¥ë #Mcharset_iso_8859_1 ¤Ï <tt>"iso-8859-1"</tt>
767 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢ISO/IEC 8859-1:1998 ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
770 MSymbol Mcharset_iso_8859_1;
773 @brief Symbol representing the charset Unicode.
775 The symbol #Mcharset_unicode has name <tt>"unicode"</tt> and
776 represents the charset Unicode. */
778 @brief Unicode ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
780 ¥·¥ó¥Ü¥ë #Mcharset_unicode ¤Ï <tt>"unicode"</tt>
781 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Unicode ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
783 MSymbol Mcharset_unicode;
787 @brief Symbol representing the largest charset.
789 The symbol #Mcharset_m17n has name <tt>"m17n"</tt> and
790 represents the charset that contains all characters supported by
793 @brief Á´Ê¸»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
795 ¥·¥ó¥Ü¥ë #Mcharset_m17n ¤Ï <tt>"m17n"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
796 m17n ¥é¥¤¥Ö¥é¥ê¤¬°·¤¦Á´¤Æ¤Îʸ»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
798 MSymbol Mcharset_m17n;
802 @brief Symbol representing the charset for ill-decoded characters.
804 The symbol #Mcharset_binary has name <tt>"binary"</tt> and
805 represents the fake charset which the decoding functions put to an
806 M-text as a text property when they encounter an invalid byte
809 See @ref m17nConv for more details. */
812 @brief Àµ¤·¤¯¥Ç¥³¡¼¥É¤Ç¤¤Ê¤¤Ê¸»ú¤Îʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
814 ¥·¥ó¥Ü¥ë #Mcharset_binary ¤Ï <tt>"binary"</tt>
815 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢µ¶¤Î (fake) ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
816 ¥Ç¥³¡¼¥É´Ø¿ô¤Ï¡¢M-text ¤Î¥Æ¥¥¹¥È¥×¥í¥Ñ¥Æ¥£¤È¤·¤Æ¡¢Ìµ¸ú¤Ê¥Ð¥¤¥È¡Ê¥·¡¼¥¯¥¨¥ó¥¹¡Ë¤ËÁø¶ø¤·¤¿°ÌÃÖ¤òÉղ乤롣
818 ¾ÜºÙ¤Ï @ref m17nConv »²¾È¤Î¤³¤È¡£ */
820 MSymbol Mcharset_binary;
827 @name Variables: Parameter keys for mchar_define_charset ().
829 These are the predefined symbols to use as parameter keys for the
830 function mchar_define_charset () (which see). */
833 @name ÊÑ¿ô: mchar_define_charset ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼
835 ¤³¤ì¤é¤Ï¡¢´Ø¿ô mchar_define_charset () ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
836 ¾Ü¤·¤¯¤Ï¤³¤Î´Ø¿ô¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£*/
841 Parameter key for mchar_define_charset () (which see). */
844 ´Ø¿ô mchar_define_charset () ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼.
845 ¾Ü¤·¤¯¤Ï¤³¤Î´Ø¿ô¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£*/
853 MSymbol Mascii_compatible;
859 MSymbol Msubset_offset;
860 MSymbol Mdefine_coding;
867 @name Variables: Symbols representing charset methods.
869 These are the predefined symbols that can be a value of the
870 #Mmethod parameter of a charset used in an argument to the
871 mchar_define_charset () function.
873 A method specifies how code-points and character codes are
874 converted. See the documentation of the mchar_define_charset ()
875 function for the details. */
878 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤Î¥á¥½¥Ã¥É»ØÄê¤Ë»È¤ï¤ì¤ë¥·¥ó¥Ü¥ë
880 ¤³¤ì¤é¤Ï¡¢Ê¸»ú¥»¥Ã¥È¤Î @e ¥á¥½¥Ã¥É ¤ò»ØÄꤹ¤ë¤¿¤á¤ÎÄêµÁºÑ¤ß¥·¥ó¥Ü¥ë¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î
881 #Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤʤ뤳¤È¤¬¤Ç¤¤ë¡£
882 ¤³¤ÎÃͤϴؿô mchar_define_charset () ¤Î°ú¿ô¤È¤·¤Æ»È¤ï¤ì¤ë¡£
884 ¥á¥½¥Ã¥É¤È¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥³¡¼¥É¤òÁê¸ßÊÑ´¹¤¹¤ëºÝ¤ÎÊý¼°¤Î¤³¤È¤Ç¤¢¤ë¡£
885 ¾Ü¤·¤¯¤Ï´Ø¿ô mchar_define_charset () ¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£ */
889 @brief Symbol for the offset type method of charset.
891 The symbol #Moffset has the name <tt>"offset"</tt> and, when used
892 as a value of #Mmethod parameter of a charset, it means that the
893 conversion of code-points and character codes of the charset is
894 done by this calculation:
897 CHARACTER-CODE = CODE-POINT - MIN-CODE + MIN-CHAR
900 where, MIN-CODE is a value of #Mmin_code parameter of the charset,
901 and MIN-CHAR is a value of #Mmin_char parameter. */
904 @brief ¥ª¥Õ¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
906 ¥·¥ó¥Ü¥ë #Moffset ¤Ï <tt>"offset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
907 #Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬°Ê²¼¤Î¼°¤Ë½¾¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
910 ʸ»ú¥³¡¼¥É = ¥³¡¼¥É¥Ý¥¤¥ó¥È - MIN-CODE + MIN-CHAR
913 ¤³¤³¤Ç¡¢MIN-CODE ¤Ïʸ»ú¥»¥Ã¥È¤Î #Mmin_code ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ê¡¢MIN-CHAR ¤Ï
914 #Mmin_char ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ë¡£ */
919 /***en @brief Symbol for the map type method of charset.
921 The symbol #Mmap has the name <tt>"map"</tt> and, when used as a
922 value of #Mmethod parameter of a charset, it means that the
923 conversion of code-points and character codes of the charset is
924 done by map looking up. The map must be given by #Mmapfile
927 /***ja @brief ¥Þ¥Ã¥×·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
929 ¥·¥ó¥Ü¥ë #Mmap ¤Ï <tt>"map"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
930 #Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬¥Þ¥Ã¥×¤ò»²¾È¤¹¤ë¤³¤È¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
931 ¥Þ¥Ã¥×¤Ï #Mmapfile ¥Ñ¥é¥á¡¼¥¿¤È¤·¤ÆÍ¿¤¨¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£ */
936 /***en @brief Symbol for the unify type method of charset.
938 The symbol #Munify has the name <tt>"unify"</tt> and, when used as
939 a value of #Mmethod parameter of a charset, it means that the
940 conversion of code-points and character codes of the charset is
941 done by map looking up and offsetting. The map must be given by
942 #Mmapfile parameter. For this kind of charset, a unique
943 continuous character code space for all characters is assigned.
945 If the map has an entry for a code-point, the conversion is done
946 by looking up the map. Otherwise, the conversion is done by this
950 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
953 where, MIN-CODE is a value of #Mmin_code parameter of the charset,
954 and LOWEST-CHAR-CODE is the lowest character code of the assigned
957 /***ja @brief ¥æ¥Ë¥Õ¥¡¥¤·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
959 ¥·¥ó¥Ü¥ë #Munify ¤Ï <tt>"unify"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
960 #Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬¡¢¥Þ¥Ã¥×¤Î»²¾È¤È¥ª¥Õ¥»¥Ã¥È¤ÎÁȤ߹ç¤ï¤»¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
961 ¥Þ¥Ã¥×¤Ï #Mmapfile ¥Ñ¥é¥á¡¼¥¿¤È¤·¤ÆÍ¿¤¨¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£
962 ¤³¤Î¼ï¤Î³Æʸ»ú¥»¥Ã¥È¤Ë¤Ï¡¢Á´Ê¸»ú¤ËÂФ·¤ÆϢ³¤¹¤ë¥³¡¼¥É¥¹¥Ú¡¼¥¹¤¬¤½¤ì¤¾¤ì³ä¤êÅö¤Æ¤é¤ì¤ë¡£
964 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤¬¥Þ¥Ã¥×¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ì¤Ð¡¢ÊÑ´¹¤Ï¥Þ¥Ã¥×»²¾È¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¡£
965 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢°Ê²¼¤Î¼°¤Ë½¾¤¦¡£
968 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
971 ¤³¤³¤Ç¡¢MIN-CODE ¤Ïʸ»ú¥»¥Ã¥È¤Î #Mmin_code ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ê¡¢
972 LOWEST-CHAR-CODE ¤Ï³ä¤êÅö¤Æ¤é¤ì¤¿¥³¡¼¥É¥¹¥Ú¡¼¥¹¤ÎºÇ¤â¾®¤µ¤¤Ê¸»ú¥³¡¼¥É¤Ç¤¢¤ë¡£
979 @brief Symbol for the subset type method of charset.
981 The symbol #Msubset has the name <tt>"subset"</tt> and, when used
982 as a value of #Mmethod parameter of a charset, it means that the
983 charset is a subset of a parent charset. The parent charset must
984 be given by #Mparents parameter. The conversion of code-points
985 and character codes of the charset is done conceptually by this
989 CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
992 where, PARENT-CODE is a pseudo function that returns the character
993 code of a code-point in the parent charset, and SUBSET-OFFSET is a
994 value given by #Msubset_offset parameter. */
996 /***ja @brief ¥µ¥Ö¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
998 ¥·¥ó¥Ü¥ë #Msubset ¤Ï <tt>"subset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
999 #Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¤³¤Îʸ»ú¥»¥Ã¥È¤¬Ê̤Îʸ»ú¥»¥Ã¥È¡Ê¿Æʸ»ú¥»¥Ã¥È¡Ë¤ÎÉôʬ½¸¹ç¤Ç¤¢¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1000 ¿Æʸ»ú¥»¥Ã¥È¤Ï #Mparents ¥Ñ¥é¥á¡¼¥¿¤Ë¤è¤Ã¤ÆÍ¿¤¨¤é¤ì¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
1001 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤Ï¡¢³µÇ°Åª¤Ë¤Ï°Ê²¼¤Î¼°¤Ë½¾¤¦¡£
1004 CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
1007 ¤³¤³¤Ç PARENT-CODE ¤Ï CODE-POINT
1008 ¤Î¿Æʸ»ú¥»¥Ã¥ÈÃæ¤Ç¤Îʸ»ú¥³¡¼¥É¤òÊÖ¤¹µ¼´Ø¿ô¤Ç¤¢¤ê¡¢SUBSET-OFFSET ¤Ï
1009 #Msubset_offset ¥Ñ¥é¥á¡¼¥¿¤ÇÍ¿¤¨¤é¤ì¤ëÃͤǤ¢¤ë¡£
1016 @brief Symbol for the superset type method of charset.
1018 The symbol #Msuperset has the name <tt>"superset"</tt> and, when
1019 used as a value of #Mmethod parameter of a charset, it means that
1020 the charset is a superset of parent charsets. The parent charsets
1021 must be given by #Mparents parameter. */
1024 @brief ¥¹¡¼¥Ñ¡¼¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
1026 ¥·¥ó¥Ü¥ë #Msuperset ¤Ï <tt>"superset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
1027 #Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¤³¤Îʸ»ú¥»¥Ã¥È¤¬Ê̤Îʸ»ú¥»¥Ã¥È¡Ê¿Æʸ»ú¥»¥Ã¥È¡Ë¤Î¾å°Ì½¸¹ç¤Ç¤¢¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1028 ¿Æʸ»ú¥»¥Ã¥È¤Ï #Mparents ¥Ñ¥é¥á¡¼¥¿¤Ë¤è¤Ã¤ÆÍ¿¤¨¤é¤ì¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
1036 @brief Define a charset.
1038 The mchar_define_charset () function defines a new charset and
1039 makes it accessible via a symbol whose name is $NAME. $PLIST
1040 specifies parameters of the charset as below:
1044 <li> Key is #Mmethod, value is a symbol.
1046 The value specifies the method for decoding/encoding code-points
1047 in the charset. It must be #Moffset, #Mmap (default), #Munify,
1048 #Msubset, or #Msuperset.
1050 <li> Key is #Mdimension, value is an integer
1052 The value specifies the dimension of code-points of the charset.
1053 It must be 1 (default), 2, 3, or 4.
1055 <li> Key is #Mmin_range, value is an unsigned integer
1057 The value specifies the minimum range of a code-point, which means
1058 that the Nth byte of the value is the minimum Nth byte of
1059 code-points of the charset. The default value is 0.
1061 <li> Key is #Mmax_range, value is an unsigned integer
1063 The value specifies the maximum range of a code-point, which means
1064 that the Nth byte of the value is the maximum Nth byte of
1065 code-points of the charset. The default value is 0xFF, 0xFFFF,
1066 0xFFFFFF, or 0xFFFFFFFF if the dimension is 1, 2, 3, or 4
1069 <li> Key is #Mmin_code, value is an unsigned integer
1071 The value specifies the minimum code-point of
1072 the charset. The default value is the minimum range.
1074 <li> Key is #Mmax_code, value is an unsigned integer
1076 The value specifies the maximum code-point of
1077 the charset. The default value is the maximum range.
1079 <li> Key is #Mascii_compatible, value is a symbol
1081 The value specifies whether the charset is ASCII compatible or
1082 not. If the value is #Mnil (default), it is not ASCII
1083 compatible, else compatible.
1085 <li> Key is #Mfinal_byte, value is an integer
1087 The value specifies the @e final @e byte of the charset registered
1088 in The International Registry. It must be 0 (default) or 48..127.
1089 The value 0 means that the charset is not in the registry.
1091 <li> Key is #Mrevision, value is an integer
1093 The value specifies the @e revision @e number of the charset
1094 registered in The International Registry. It must be 0..127. If
1095 the charset is not in The International Registry, the value is
1096 ignored. The value 0 means that the charset has no revision
1099 <li> Key is #Mmin_char, value is an integer
1101 The value specifies the minimum character code of the charset.
1102 The default value is 0.
1104 <li> Key is #Mmapfile, value is an M-text
1106 If the method is #Mmap or #Munify, data that contains
1107 mapping information is added to the m17n database by calling
1108 the function mdatabase_define () with the value as the argument $EXTRA_INFO,
1109 i.e. the value is used as the file name of the data.
1111 Otherwise, this parameter is ignored.
1113 <li> Key is #Mparents, value is a plist
1115 If the method is #Msubset, the value must be a plist of length
1116 1, and the value of the plist must be a symbol representing a
1119 If the method is #Msuperset, the value must be a plist of length
1120 less than 9, and the values of the plist must be symbols
1121 representing subset charsets.
1123 Otherwise, this parameter is ignored.
1125 <li> Key is #Mdefine_coding, value is a symbol
1127 If the dimension of the charset is 1, the value specifies whether
1128 or not to define a coding system of the same name whose type is
1129 #Mcharset. A coding system is defined if the value is not #Mnil.
1131 Otherwise, this parameter is ignored.
1136 If the operation was successful, mchar_define_charset () returns a
1137 symbol whose name is $NAME. Otherwise it returns #Mnil and
1138 assigns an error code to the external variable #merror_code. */
1141 @brief ʸ»ú¥»¥Ã¥È¤òÄêµÁ¤¹¤ë.
1143 ´Ø¿ô mchar_define_charset () ¤Ï¿·¤·¤¤Ê¸»ú¥»¥Ã¥È¤òÄêµÁ¤·¡¢¤½¤ì¤ò
1144 $NAME ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Ä¥·¥ó¥Ü¥ë·Ðͳ¤Ç¥¢¥¯¥»¥¹¤Ç¤¤ë¤è¤¦¤Ë¤¹¤ë¡£
1145 $PLIST ¤ÏÄêµÁ¤µ¤ì¤ëʸ»ú¥»¥Ã¥È¤Î¥Ñ¥é¥á¡¼¥¿¤ò°Ê²¼¤Î¤è¤¦¤Ë»ØÄꤹ¤ë¡£
1149 <li> ¥¡¼¤¬ #Mmethod ¤ÇÃͤ¬¥·¥ó¥Ü¥ë¤Î»þ
1151 Ãͤϡ¢#Moffset, #Mmap (¥Ç¥Õ¥©¥ë¥ÈÃÍ), #Munify, #Msubset,
1152 #Msuperset ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤ò¥Ç¥³¡¼¥É¡¿¥¨¥ó¥³¡¼¥É¤¹¤ëºÝ¤Î¥á¥½¥Ã¥É¤ò»ØÄꤹ¤ë¡£
1154 <li> ¥¡¼¤¬ #Mdimension ¤ÇÃͤ¬À°¿ôÃͤλþ
1156 Ãͤϡ¢1 (¥Ç¥Õ¥©¥ë¥ÈÃÍ), 2, 3, 4
1157 ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¼¡¸µ¤Ç¤¢¤ë¡£
1159 <li> ¥¡¼¤¬ #Mmin_range ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1161 Ãͤϥ³¡¼¥É¥Ý¥¤¥ó¥È¤ÎºÇ¾®¤ÎÃͤǤ¢¤ë¡£¤¹¤Ê¤ï¤Á¡¢¤³¤ÎÃͤΠN
1162 ÈÖÌܤΥХ¤¥È¤Ï¤³¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î N ÈÖÌܤΥХ¤¥È¤ÎºÇ¾®¤Î¤â¤Î¤È¤Ê¤ë¡£
1165 <li> ¥¡¼¤¬ #Mmax_range ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1167 Ãͤϥ³¡¼¥É¥Ý¥¤¥ó¥È¤ÎºÇÂç¤ÎÃͤǤ¢¤ë¡£¤¹¤Ê¤ï¤Á¡¢¤³¤ÎÃͤΠN
1168 ÈÖÌܤΥХ¤¥È¤Ï¤³¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î N ÈÖÌܤΥХ¤¥È¤ÎºÇÂç¤Î¤â¤Î¤È¤Ê¤ë¡£
1169 ¥Ç¥Õ¥©¥ë¥ÈÃͤϡ¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¼¡¸µ¤¬ 1, 2, 3, 4 ¤Î»þ¡¢¤½¤ì¤¾¤ì
1170 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF ¡£
1172 <li> ¥¡¼¤¬ #Mmin_code ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1174 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇ¾®¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ç¤¢¤ë¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÏ
1177 <li> ¥¡¼¤¬ #Mmax_code ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1179 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇÂç¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ç¤¢¤ë¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÏ
1182 <li> ¥¡¼¤¬ #Mascii_compatible ¤ÇÃͤ¬¥·¥ó¥Ü¥ë¤Î»þ
1184 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤¬ ASCII ¸ß´¹¤Ç¤¢¤ë¤«¤É¤¦¤«¤ò¼¨¤¹¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÎ
1185 #Mnil ¤Ç¤¢¤ì¤Ð¸ß´¹¤Ç¤Ï¤Ê¤¯¡¢¤½¤ì°Ê³°¤Î¾ì¹ç¤Ï¸ß´¹¤Ç¤¢¤ë¡£
1187 <li> ¥¡¼¤¬ #Mfinal_byte ¤ÇÃͤ¬À°¿ôÃͤλþ
1189 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤Î The International Registry ¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤ë
1190 @e ½ªÃ¼¥Ð¥¤¥È ¤Ç¤¢¤ê¡¢0 (¥Ç¥Õ¥©¥ë¥ÈÃÍ) ¤Ç¤¢¤ë¤« 32..127 ¤Ç¤¢¤ë¡£0
1191 ¤ÏÅÐÏ¿¤µ¤ì¤Æ¤¤¤Ê¤¤¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1193 <li> キーが #Mrevision で値が整数値の時
1195 値は The International Registry に登録されている @e revision @e
1196 number であり、0..127 である。
1197 文字セットが登録されていない場合にはこの値は無視される。
1198 0 は revision number が存在しないことを意味する。
1200 <li> キーが #Mmin_char で値が整数値の時
1202 値はこの文字セットの最小の文字コードである。デフォルト値は 0 。
1204 <li> キーが #Mmapfile で値が M-text の時
1206 メソッドが #Mmap か #Munify の時、関数 mdatabase_define ()
1207 をこの値を引数 $EXTRA_INFO として呼ぶことによって、マッピングに関するデータが
1208 m17n データベースに追加される。
1209 すなわち、この値はデータファイルの名前である。
1211 そうでなければ、このパラメータは無視される。
1213 <li> キーが #Mparents で値が plist の時
1215 メソッドが #Msubset ならば、値は長さ 1 の plist
1216 であり、その値はこの文字セットの上位集合となる文字セットを示すシンボルである。
1218 メソッドが #Msuperset ならば、値は長さ 8 以下の plist
1219 であり、それらの値はこの文字セットの下位集合である文字セットを示すシンボルである。
1221 そうでなければ、このパラメータは無視される。
1223 <li> キーが #Mdefine_coding で値がシンボルの時
1225 文字セットの次元が 1 ならば、値が #Mnil 以外の場合に #Mcharset 型
1226 で同じ名前を持つコード系を定義する。
1228 そうでなければ、このパラメータは無視される。
1233 処理が成功すれば、mchar_define_charset() は $NAME
1234 という名前のシンボルを返す。そうでなければ #Mnil を返し、外部変数
1235 #merror_code にエラーコードを設定する。*/
1239 @c MERROR_CHARSET */
/* Define a new charset named NAME, configured by the property list
   PLIST, and attach it to the symbol for NAME (plus its canonicalized
   form and any aliases) under the Mcharset key.

   NOTE: this excerpt does not show every original line (braces, a few
   declarations and some conditions are absent), so the comments below
   describe only the statements that are visible.  */
1242 mchar_define_charset (const char *name, MPlist *plist)
/* Symbol that will name the charset.  */
1244 MSymbol sym = msymbol (name);
1247 unsigned min_range, max_range;
/* Value stored under Mmapfile in PLIST; used below when the mapping
   data is registered for the map-based methods.  */
1249 MText *mapfile = (MText *) mplist_get (plist, Mmapfile);
/* Allocate the charset object; MERROR_CHARSET is the error code handed
   to the allocation macro.  */
1251 MSTRUCT_CALLOC (charset, MERROR_CHARSET);
1252 charset->name = sym;
1253 charset->method = (MSymbol) mplist_get (plist, Mmethod);
/* No method given in PLIST: fall back to Mmap or Moffset.  The
   condition selecting between the two is on a line not shown here.  */
1254 if (! charset->method)
1257 charset->method = Mmap;
1259 charset->method = Moffset;
/* Mmap and Munify charsets pass the contents of MAPFILE to
   mdatabase_define ().  The guard in front of the MERROR call is not
   visible in this excerpt.  */
1261 if (charset->method == Mmap || charset->method == Munify)
1264 MERROR (MERROR_CHARSET, Mnil);
1265 mdatabase_define (Mcharset, sym, Mnil, Mnil, NULL, mapfile->data);
/* A zero result for Mdimension is replaced by dimension 1.  */
1267 if (! (charset->dimension = (int) mplist_get (plist, Mdimension)))
1268 charset->dimension = 1;
1270 min_range = (unsigned) mplist_get (plist, Mmin_range);
/* When PLIST has an Mmax_range entry, its value may widen the
   dimension: to 4 for values >= 0x1000000, to 3 or 2 when the value
   needs that many bytes and the dimension is currently smaller.  */
1271 if ((pl = mplist_find_by_key (plist, Mmax_range)))
1273 max_range = (unsigned) MPLIST_VAL (pl);
1274 if (max_range >= 0x1000000)
1275 charset->dimension = 4;
1276 else if (max_range >= 0x10000 && charset->dimension < 3)
1277 charset->dimension = 3;
1278 else if (max_range >= 0x100 && charset->dimension < 2)
1279 charset->dimension = 2;
/* Without an Mmax_range entry the maximum is chosen from the
   dimension.  Only the assignments for dimension 3 and for the final
   fallback are visible in this excerpt.  */
1281 else if (charset->dimension == 1)
1283 else if (charset->dimension == 2)
1285 else if (charset->dimension == 3)
1286 max_range = 0xFFFFFF;
1288 max_range = 0xFFFFFFFF;
/* Start from an all-zero code_range table, then store, for each byte
   position I of a code-point, the minimum byte at index I * 4 and the
   maximum byte at index I * 4 + 1.  The loop header shifts MIN_RANGE
   and MAX_RANGE right by 8 bits on every iteration.  */
1290 memset (charset->code_range, 0, sizeof charset->code_range);
1291 for (i = 0; i < charset->dimension; i++, min_range >>= 8, max_range >>= 8)
1293 charset->code_range[i * 4] = min_range & 0xFF;
1294 charset->code_range[i * 4 + 1] = max_range & 0xFF;
/* Clamp the requested code-point bounds against the ranges.
   NOTE(review): MIN_RANGE and MAX_RANGE were already shifted by the
   loop above, so these comparisons use the shifted values rather than
   the ones read from PLIST -- confirm this is intended.  */
1296 if ((charset->min_code = (int) mplist_get (plist, Mmin_code)) < min_range)
1297 charset->min_code = min_range;
1298 if ((charset->max_code = (int) mplist_get (plist, Mmax_code)) > max_range)
1299 charset->max_code = max_range;
/* Any Mascii_compatible value other than Mnil sets the flag.  */
1300 charset->ascii_compatible
1301 = (MSymbol) mplist_get (plist, Mascii_compatible) != Mnil;
1302 charset->final_byte = (int) mplist_get (plist, Mfinal_byte);
1303 charset->revision = (int) mplist_get (plist, Mrevision);
1304 charset->min_char = (int) mplist_get (plist, Mmin_char);
/* Take at most 8 parents from the plist stored under Mparents.  */
1305 pl = (MPlist *) mplist_get (plist, Mparents);
1306 charset->nparents = pl ? mplist_length (pl) : 0;
1307 if (charset->nparents > 8)
1308 charset->nparents = 8;
1309 for (i = 0; i < charset->nparents; i++, pl = MPLIST_NEXT (pl))
1311 MSymbol parent_name;
/* Every parent element must have the key Msymbol and a symbol that
   MCHARSET can resolve; otherwise MERROR is invoked.
   NOTE(review): if MERROR leaves the function (its definition is not
   visible here), CHARSET allocated above does not appear to be
   released on these paths -- confirm.  */
1313 if (MPLIST_KEY (pl) != Msymbol)
1314 MERROR (MERROR_CHARSET, Mnil);
1315 parent_name = MPLIST_SYMBOL (pl);
1316 if (! (charset->parents[i] = MCHARSET (parent_name)))
1317 MERROR (MERROR_CHARSET, Mnil);
1320 charset->subset_offset = (int) mplist_get (plist, Msubset_offset);
/* Attach the charset to the name symbol, then pass it to
   make_charset (); CHARSET is replaced by that function's result.
   The lines between this call and the next msymbol_put are not
   visible in this excerpt.  */
1322 msymbol_put (sym, Mcharset, charset);
1323 charset = make_charset (charset);
1326 msymbol_put (msymbol__canonicalize (sym), Mcharset, charset);
/* Attach the charset to every alias and to each alias's canonicalized
   form; the walk stops at the first element whose key is not
   Msymbol.  */
1328 for (pl = (MPlist *) mplist_get (plist, Maliases);
1329 pl && MPLIST_KEY (pl) == Msymbol;
1330 pl = MPLIST_NEXT (pl))
1332 MSymbol alias = MPLIST_SYMBOL (pl);
1334 msymbol_put (alias, Mcharset, charset);
1335 msymbol_put (msymbol__canonicalize (alias), Mcharset, charset);
/* Hand the name to mconv__register_charset_coding () only when
   Mdefine_coding is set, the charset is one-dimensional, and its
   first byte range is exactly 0..255.  */
1338 if (mplist_get (plist, Mdefine_coding)
1339 && charset->dimension == 1
1340 && charset->code_range[0] == 0 && charset->code_range[1] == 255)
1341 mconv__register_charset_coding (sym);
1348 @brief Resolve charset name.
1350 The mchar_resolve_charset () function returns $SYMBOL if it
1351 represents a charset. Otherwise, it canonicalizes $SYMBOL as a
1352 charset name and, if the canonicalized name represents a charset,
1353 returns it. Otherwise, it returns #Mnil. */
1356 @brief 文字セット名を解決する.
1358 関数 mchar_resolve_charset () は $SYMBOL
1359 が文字セットを示していればそれを返す。
1361 そうでなければ、$SYMBOL を文字セット名として正規化し、それが文字セットを示していれば正規化したものを返す。
1362 そうでなければ、#Mnil を返す。 */
/* Return the name of the charset that SYMBOL refers to, or Mnil when
   no charset is found.  Some original lines (including the condition
   guarding the second lookup) are not visible in this excerpt.  */
1365 mchar_resolve_charset (MSymbol symbol)
/* First lookup: the Mcharset property of SYMBOL itself.  */
1367 MCharset *charset = (MCharset *) msymbol_get (symbol, Mcharset);
/* Second lookup: the same property on the canonicalized symbol.  */
1371 symbol = msymbol__canonicalize (symbol);
1372 charset = (MCharset *) msymbol_get (symbol, Mcharset);
/* The result is the charset's own name field, which need not be the
   symbol that was passed in.  */
1375 return (charset ? charset->name : Mnil);
1381 @brief List symbols representing charsets.
1383 The mchar_list_charset () function makes an array of symbols
1384 representing charsets, stores the pointer to the array in a place
1385 pointed to by $SYMBOLS, and returns the length of the array. */
1388 @brief 文字セットを表わすシンボルを列挙する.
1390 関数 mchar_list_charset ()
1391 は、文字セットを示すシンボルを並べた配列を作り、$SYMBOLS
1392 でポイントされた場所にこの配列へのポインタを置き、配列の長さを返す。 */
/* Store in *SYMBOLS an array holding the name symbol of every entry of
   charset_list.  The declaration of I and the return statement are
   not visible in this excerpt.  */
1395 mchar_list_charset (MSymbol **symbols)
/* Room for charset_list.used symbols; MERROR_CHARSET is the error code
   passed to the allocation macro.  Presumably the caller owns the
   array afterwards -- TODO confirm.  */
1399 MTABLE_MALLOC ((*symbols), charset_list.used, MERROR_CHARSET);
/* Copy the names in charset_list order.  */
1400 for (i = 0; i < charset_list.used; i++)
1401 (*symbols)[i] = charset_list.charsets[i]->name;
1408 @brief Decode a code-point.
1410 The mchar_decode () function decodes code-point $CODE in the
1411 charset represented by the symbol $CHARSET_NAME to get a character
1415 If decoding was successful, mchar_decode () returns the decoded
1416 character code. Otherwise it returns -1. */
1419 @brief コードポイントをデコードする.
1421 関数 mchar_decode () は、シンボル $CHARSET_NAME で示される文字セット内の
1422 $CODE というコードポイントをデコードして文字コードを得る。
1425 デコードが成功すれば、mchar_decode () はデコードされた文字コードを返す。
1426 そうでなければ -1 を返す。 */
/* Decode code-point CODE of the charset named CHARSET_NAME with
   DECODE_CHAR and return the result.  The condition guarding the
   early return is not visible in this excerpt.  */
1433 mchar_decode (MSymbol charset_name, unsigned code)
/* Resolve the symbol to its charset object.  */
1435 MCharset *charset = MCHARSET (charset_name);
/* NOTE(review): the API comment promises -1 on failure, but this path
   returns the code-point sentinel MCHAR_INVALID_CODE -- confirm the
   two are equivalent for the function's return type.  */
1438 return MCHAR_INVALID_CODE;
1439 return DECODE_CHAR (charset, code);
1445 @brief Encode a character code.
1447 The mchar_encode () function encodes character code $C to get a
1448 code-point in the charset represented by the symbol $CHARSET_NAME.
1451 If encoding was successful, mchar_encode () returns the encoded
1452 code-point. Otherwise it returns #MCHAR_INVALID_CODE. */
1455 @brief 文字コードをエンコードする.
1457 関数 mchar_encode () は、文字コード $C をエンコードしてシンボル
1458 $CHARSET_NAME で示される文字セット内におけるコードポイントを得る。
1461 エンコードが成功すれば、mchar_encode () はエンコードされたコードポイントを返す。
1462 そうでなければ #MCHAR_INVALID_CODE を返す。 */
/* Encode character code C into a code-point of the charset named
   CHARSET_NAME with ENCODE_CHAR and return the result.  The condition
   guarding the early return is not visible in this excerpt.  */
1469 mchar_encode (MSymbol charset_name, int c)
/* Resolve the symbol to its charset object.  */
1471 MCharset *charset = MCHARSET (charset_name);
/* Failure path: the invalid code-point sentinel.  */
1474 return MCHAR_INVALID_CODE;
1475 return ENCODE_CHAR (charset, c);
1481 @brief Call a function for all the characters in a specified charset.
1483 The mchar_map_charset () function calls $FUNC for all the
1484 characters in the charset named $CHARSET_NAME. A call is done for
1485 a chunk of consecutive characters rather than character by
1488 $FUNC receives three arguments: $FROM, $TO, and $ARG. $FROM and
1489 $TO specify the range of character codes in $CHARSET. $ARG is the
1493 If the operation was successful, mchar_map_charset () returns 0.
1494 Otherwise, it returns -1 and assigns an error code to the external
1495 variable #merror_code. */
1498 @brief 指定した文字セットのすべての文字に対して関数を呼ぶ.
1500 関数 mchar_map_charset () は $CHARSET_NAME
1501 という名前を持つ文字セット中のすべての文字に対して $FUNC を呼ぶ。
1502 呼び出しは一文字毎ではなく、連続した文字のまとまり単位で行なわれる。
1504 関数 $FUNC には$FROM, $TO, $ARG の３引数が渡される。$FROM と $TO
1505 は $CHARSET 中の文字コードの範囲を指定する。$ARG は $FUNC_ARG
1509 処理に成功すれば mchar_map_charset () は 0 を返す。
1510 そうでなければ -1 を返し、外部変数 #merror_code にエラーコードを設定する。 */
1514 @c MERROR_CHARSET */
1517 mchar_map_charset (MSymbol charset_name,
1518 void (*func) (int from, int to, void *arg),
1523 charset = MCHARSET (charset_name);
1525 MERROR (MERROR_CHARSET, -1);
1527 if (charset->encoder)
1529 int c = charset->min_char;
1532 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) < 0)
1534 while (c <= charset->max_char)
1536 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) >= 0)
1537 (*func) (c, next_c - 1, func_arg);
1542 (*func) (charset->min_char, charset->max_char, func_arg);