1 /* charset.c -- charset module.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 @addtogroup m17nCharset
24 @brief Charset objects and API for them.
26 The m17n library uses @e charset objects to represent coded
27 character sets (CCS). The m17n library supports many predefined
28 coded character sets. Moreover, application programs can add
29 other charsets. A character can belong to multiple charsets.
31 The m17n library distinguishes the following three concepts:
33 @li A @e code-point is a number assigned by the CCS to each
34 character. Code-points may or may not be continuous. The type
35 @c unsigned is used to represent a code-point. An invalid
36 code-point is represented by the macro @c MCHAR_INVALID_CODE.
38 @li A @e character @e index is the canonical index of a character
39 in a CCS. The character that has the character index N occupies
40 the Nth position when all the characters in the current CCS are
41 sorted by their code-points. Character indices in a CCS are
42 continuous and start with 0.
44 @li A @e character @e code is the internal representation in the
45 m17n library of a character. A character code is a signed integer of 21 bits or longer.
48 Each charset object defines how characters are converted between
49 code-points and character codes. To @e decode means converting
50 code-points to character codes and to @e encode means converting
51 character codes to code-points. */
54 @addtogroup m17nCharset
55 @brief 文字セットオブジェクトとそれに関する API.
57 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¡¢Éä¹æ²½Ê¸»ú½¸¹ç (CCS) ¤ò @e ʸ»ú¥»¥Ã¥È
58 ¤È¸Æ¤Ö¥ª¥Ö¥¸¥§¥¯¥È¤Çɽ¸½¤¹¤ë¡£
59 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¿¤¯¤ÎÉä¹æ²½Ê¸»ú½¸¹ç¤ò¤¢¤é¤«¤¸¤á¥µ¥Ý¡¼¥È¤·¤Æ¤¤¤ë¤·¡¢¥¢¥×¥ê¥±¡¼¥·¥ç¥ó¥×¥í¥°¥é¥à¤¬Æȼ«¤Ëʸ»ú¥»¥Ã¥È¤òÄɲ乤뤳¤È¤â²Äǽ¤Ç¤¢¤ë¡£
60 °ì¤Ä¤Îʸ»ú¤ÏÊ£¿ô¤Îʸ»ú¥»¥Ã¥È¤Ë°¤·¤Æ¤â¤è¤¤¡£
62 m17n ¥é¥¤¥Ö¥é¥ê¤Ï¡¢°Ê²¼¤Î³µÇ°¤ò¶èÊ̤·¤Æ¤¤¤ë:
64 @li @e ¥³¡¼¥É¥Ý¥¤¥ó¥È ¤È¤Ï¡¢CCS ¤¬¤½¤ÎÃæ¤Î¸Ä¡¹¤Îʸ»ú¤ËÂФ·¤ÆÄêµÁ¤¹¤ë¿ôÃͤǤ¢¤ë¡£
65 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤ÏϢ³¤·¤Æ¤¤¤ë¤È¤Ï¸Â¤é¤Ê¤¤¡£¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ï
66 @c unsigned ·¿¤Ë¤è¤Ã¤Æɽ¤µ¤ì¤ë¡£Ìµ¸ú¤Ê¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ï¥Þ¥¯¥í
67 @c MCHAR_INVALID_CODE ¤Çɽ¤µ¤ì¤ë¡£
69 @li @e ʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹ ¤È¤Ï¡¢CCS Æâ¤Ç³Æʸ»ú¤Ë³ä¤êÅö¤Æ¤é¤ì¤ëÀµµ¬²½¤µ¤ì¤¿¥¤¥ó¥Ç¥Ã¥¯¥¹¤Ç¤¢¤ë¡£
70 ʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹¤¬ N ¤Îʸ»ú¤Ï¡¢CCS Ãæ¤ÎÁ´Ê¸»ú¤ò¥³¡¼¥É¥Ý¥¤¥ó¥È½ç¤Ëʤ٤¿¤È¤¤Ë N ÈÖÌܤ˸½¤ï¤ì¤ë¡£
71 CCS Ãæ¤Îʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹¤ÏϢ³¤·¤Æ¤ª¤ê¡¢0 ¤«¤é»Ï¤Þ¤ë¡£
73 @li @e ʸ»ú¥³¡¼¥É ¤È¤Ï¡¢m17n ¥é¥¤¥Ö¥é¥êÆâ¤Ë¤ª¤±¤ëʸ»ú¤ÎÆâÉôɽ¸½¤Ç¤¢¤ê¡¢21 ¥Ó¥Ã¥È°Ê¾å¤ÎŤµ¤ò»ý¤ÄÉä¹çÉÕ¤À°¿ô¤Ç¤¢¤ë¡£
75 ³Æʸ»ú¥»¥Ã¥È¥ª¥Ö¥¸¥§¥¯¥È¤Ï¡¢¤½¤Îʸ»ú¥»¥Ã¥È¤Ë°¤¹¤ëʸ»ú¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥³¡¼¥É¤È¤Î´Ö¤ÎÊÑ´¹¤òµ¬Äꤹ¤ë¡£
76 コードポイントから文字コードへの変換を @e デコード
77 と呼び、文字コードからコードポイントへの変換を @e エンコード と呼ぶ。 */
81 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
82 /*** @addtogroup m17nInternal
92 #include "m17n-misc.h"
101 static int unified_max;
103 /** List of all charsets ever defined. */
111 static struct MCharsetList charset_list;
113 static MPlist *charset_definition_list;
115 /** Make a charset object from the template of MCharset structure
116 CHARSET, and return a pointer to the new charset object.
117 CHARSET->code_range[4N + 2] and CHARSET->code_range[4N + 3] are computed here. */
/* Validate the charset template CHARSET, fill in the derived parts of
   its code range and code-point bounds, and append it to the global
   charset list.  Every rejected template goes through MERROR with
   NULL.
   NOTE(review): this listing omits some lines of the function (braces,
   a few declarations and statements); the comments below describe only
   the statements that are visible.  */
121 make_charset (MCharset *charset)
123 unsigned min_code, max_code;
/* code_range is indexed as four slots for each of four byte positions:
   slot 4i is the minimum byte, slot 4i+1 the maximum byte.  */
125 int *range = charset->code_range;
/* The dimension must be 1, 2, 3 or 4.  */
127 if (charset->dimension < 1 || charset->dimension > 4)
128 MERROR (MERROR_CHARSET, NULL);
/* A nonzero final byte must lie between the digit zero (0x30) and 127;
   zero itself passes this test.  */
129 if ((charset->final_byte > 0 && charset->final_byte < '0')
130 || charset->final_byte > 127)
131 MERROR (MERROR_CHARSET, NULL);
/* For each byte position, store the number of byte values in slot 4i+2
   and the running product of those counts in slot 4i+3, so that
   range[15] ends up as the product over all four positions.  */
133 for (i = 0, n = 1; i < 4; i++)
135 if (range[i * 4] > range[i * 4 + 1])
136 MERROR (MERROR_CHARSET, NULL);
137 range[i * 4 + 2] = range[i * 4 + 1] - range[i * 4] + 1;
138 n *= range[i * 4 + 2];
139 range[i * 4 + 3] = n;
/* Pack the four per-byte minimums into one code-point, byte i at bit
   8*i.  A template min_code of 0 is replaced by this value; a nonzero
   one below it is rejected.
   NOTE(review): range is an int pointer, so range[12] << 24 is a shift
   on a signed int; with a 32-bit int and a byte value of 128 or more
   this overflows - confirm whether that case can occur.  */
142 min_code = range[0] | (range[4] << 8) | (range[8] << 16) | (range[12] << 24);
143 if (charset->min_code == 0)
144 charset->min_code = min_code;
145 else if (charset->min_code < min_code)
146 MERROR (MERROR_CHARSET, NULL);
/* Same packing for the per-byte maximums; a template max_code of 0 is
   replaced, a nonzero one above the packed maximum is rejected.  */
147 max_code = range[1] | (range[5] << 8) | (range[9] << 16) | (range[13] << 24);
148 if (charset->max_code == 0)
149 charset->max_code = max_code;
150 else if (charset->max_code > max_code)
151 MERROR (MERROR_CHARSET, NULL);
/* Remember the packed range minimum separately from min_code, and
   start in the not-fully-loaded state.  */
153 charset->code_range_min_code = min_code;
154 charset->fully_loaded = 0;
/* A subset must have exactly one parent, that parent must not be a
   superset, and the subset code range shifted by subset_offset must
   fit inside the parent code range.  */
157 if (charset->method == Msubset)
161 if (charset->nparents != 1)
162 MERROR (MERROR_CHARSET, NULL);
163 parent = charset->parents[0];
164 if (parent->method == Msuperset
165 || charset->min_code - charset->subset_offset < parent->min_code
166 || charset->max_code - charset->subset_offset > parent->max_code)
167 MERROR (MERROR_CHARSET, NULL);
/* A superset must have at least two parents, and its code range must
   cover the code range of every parent.  */
169 else if (charset->method == Msuperset)
171 if (charset->nparents < 2)
172 MERROR (MERROR_CHARSET, NULL);
173 for (i = 0; i < charset->nparents; i++)
174 if (charset->min_code > charset->parents[i]->min_code
175 || charset->max_code < charset->parents[i]->max_code)
176 MERROR (MERROR_CHARSET, NULL);
/* NOTE(review): the assignment target and part of this condition are
   missing from the listing.  The test that follows suggests the result
   is stored in charset->no_code_gap, and the method chain further down
   presumably sits in the else branch of the subset/superset test above
   - TODO confirm both against the full file.  */
181 = (charset->dimension == 1
183 && (charset->dimension == 2
185 && (charset->dimension == 3
186 || range[10] == 256)))));
/* When the code space has gaps, build code_range_mask: bit i of entry
   j is set when byte value j is within the range of byte position i.  */
188 if (! charset->no_code_gap)
192 memset (charset->code_range_mask, 0,
193 sizeof charset->code_range_mask);
194 for (i = 0; i < 4; i++)
195 for (j = range[i * 4]; j <= range[i * 4 + 1]; j++)
196 charset->code_range_mask[j] |= (1 << i);
/* Offset method: max_char follows from min_char and the count in
   range[15]; both ends must be non-negative and max_char must not
   exceed unified_max.  The charset is marked fully loaded here.  */
199 if (charset->method == Moffset)
201 charset->max_char = charset->min_char + range[15] - 1;
202 if (charset->min_char < 0
203 || charset->max_char < 0 || charset->max_char > unified_max)
204 MERROR (MERROR_CHARSET, NULL);
205 charset->simple = charset->no_code_gap;
206 charset->fully_loaded = 1;
/* Unify method: lower the file-wide unified_max by a block sized from
   range[15] and record the new value in the charset.  */
208 else if (charset->method == Munify)
210 /* The magic number 12 below is to align to the SUB_BITS_2
211 (defined in chartab.c) boundary in a char-table. */
212 unified_max -= ((range[15] >> 12) + 1) << 12;
213 charset->unified_max = unified_max;
/* Any method other than map is rejected at this point.  */
215 else if (charset->method != Mmap)
216 MERROR (MERROR_CHARSET, NULL);
/* Append the charset to the file-wide list of charsets.  */
219 MLIST_APPEND1 (&charset_list, charsets, charset, MERROR_CHARSET);
/* A charset with a final byte is also appended to the ISO 2022 table.
   NOTE(review): the classification statements that follow are only
   partly visible; chars is the byte count of the first byte position.  */
221 if (charset->final_byte > 0)
223 MLIST_APPEND1 (&mcharset__iso_2022_table, charsets, charset,
225 if (charset->revision <= 0)
227 int chars = range[2];
229 if (chars == 128) /* ASCII case */
231 else if (chars == 256) /* ISO-8859-X case */
233 MCHARSET_ISO_2022 (charset->dimension, chars, charset->final_byte)
/* Finish the deferred part of setting up CHARSET: compute min_char and
   max_char and, for map and unify charsets, fetch the decoder and
   encoder tables from the database; then set fully_loaded.  Error
   paths go through MERROR with -1, and callers in this file test the
   result with < 0.
   NOTE(review): some lines of this function are missing from the
   listing; comments describe only the visible statements.  */
242 load_charset_fully (MCharset *charset)
244 if (charset->method == Msubset)
246 MCharset *parent = charset->parents[0];
/* The parent is loaded first (recursively) when needed.  */
248 if (! parent->fully_loaded
249 && load_charset_fully (parent) < 0)
250 MERROR (MERROR_CHARSET, -1);
/* With an offset parent only the two end code-points, shifted by
   subset_offset, are decoded to obtain min_char and max_char.  */
251 if (parent->method == Moffset)
255 code = charset->min_code - charset->subset_offset;
256 charset->min_char = DECODE_CHAR (parent, code);
257 code = charset->max_code - charset->subset_offset;
258 charset->max_char = DECODE_CHAR (parent, code);
/* Otherwise every code-point of the shifted range is decoded through
   the parent and the smallest and largest results are kept (part of
   the comparison is not visible in this listing).  */
262 unsigned min_code = charset->min_code - charset->subset_offset;
263 unsigned max_code = charset->max_code - charset->subset_offset;
264 int min_char = DECODE_CHAR (parent, min_code);
265 int max_char = min_char;
267 for (++min_code; min_code <= max_code; min_code++)
269 int c = DECODE_CHAR (parent, min_code);
275 else if (c > max_char)
279 charset->min_char = min_char;
280 charset->max_char = max_char;
/* Superset: the character range is derived from the ranges of the
   parents, each loaded first when needed.  */
283 else if (charset->method == Msuperset)
285 int min_char = 0, max_char = 0;
288 for (i = 0; i < charset->nparents; i++)
290 MCharset *parent = charset->parents[i];
292 if (! parent->fully_loaded
293 && load_charset_fully (parent) < 0)
294 MERROR (MERROR_CHARSET, -1);
/* NOTE(review): the max_char test is an else branch of the min_char
   test, so a parent that has both a smaller min_char and a larger
   max_char updates only min_char.  This looks like a latent bug -
   confirm, and consider two independent tests.  */
296 min_char = parent->min_char, max_char = parent->max_char;
297 else if (parent->min_char < min_char)
298 min_char = parent->min_char;
299 else if (parent->max_char > max_char)
300 max_char = parent->max_char;
302 charset->min_char = min_char;
303 charset->max_char = max_char;
305 else /* charset->method is Mmap or Munify */
/* Look up the database entry keyed by Mcharset and the charset name.
   The loaded plist carries the decoder as its first value and the
   encoder as its second; the plist itself is released afterwards.  */
307 MDatabase *mdb = mdatabase_find (Mcharset, charset->name, Mnil, Mnil);
310 if (! mdb || ! (plist = mdatabase_load (mdb)))
311 MERROR (MERROR_CHARSET, -1);
312 charset->decoder = mplist_value (plist);
313 charset->encoder = mplist_value (mplist_next (plist));
314 M17N_OBJECT_UNREF (plist);
/* min_char and max_char are taken from the encoder char-table.  */
315 mchartable_range (charset->encoder,
316 &charset->min_char, &charset->max_char);
317 if (charset->method == Mmap)
318 charset->simple = charset->no_code_gap;
/* Presumably the unify case (the else line is not visible): max_char
   is replaced by a value derived from unified_max and the code-point
   count in code_range[15] - TODO confirm.  */
320 charset->max_char = charset->unified_max + 1 + charset->code_range[15];
323 charset->fully_loaded = 1;
327 /** Load data of type @c charset for the charset CHARSET_NAME from the stream FP. */
/* Read the mapping data of the charset named CHARSET_NAME from FP and
   build two tables: a decoder array indexed by code-point index and an
   encoder char-table keyed by character.  Both are added to a plist,
   decoder first.  Error paths go through MERROR with MERROR_DB.
   NOTE(review): several lines of this function are missing from the
   listing; comments describe only the visible statements.  */
330 load_charset (FILE *fp, MSymbol charset_name)
332 MCharset *charset = MCHARSET (charset_name);
341 MERROR (MERROR_DB, NULL);
/* The decoder size is the code-point count in code_range[15] minus the
   distance between min_code and the packed range minimum.  */
342 size = (charset->code_range[15]
343 - (charset->min_code - charset->code_range_min_code));
344 MTABLE_MALLOC (decoder, size, MERROR_DB);
/* The body of this initialising loop is not visible in the listing.  */
345 for (i = 0; i < size; i++)
/* Characters without an entry map to MCHAR_INVALID_CODE.  */
347 encoder = mchartable (Minteger, (void *) MCHAR_INVALID_CODE);
349 while ((c = getc (fp)) != EOF)
351 unsigned code1, code2, c1, c2;
/* Lines are read into buf with a limit of 256 bytes.  */
356 if (! fgets (buf, 256, fp))
/* Range form: two hexadecimal code-points joined by a hyphen, then
   the hexadecimal character for the first code-point.  c2 is the
   character for the last code-point of the range.  */
360 if (sscanf (buf, "0x%x-0x%x 0x%x", &code1, &code2, &c1) == 3)
362 idx1 = CODE_POINT_TO_INDEX (charset, code1);
365 idx2 = CODE_POINT_TO_INDEX (charset, code2);
368 c2 = c1 + (idx2 - idx1);
/* Single form: one hexadecimal code-point and its character.  */
370 else if (sscanf (buf, "0x%x 0x%x", &code1, &c1) == 2)
372 idx1 = idx2 = CODE_POINT_TO_INDEX (charset, code1);
/* Only entries whose indices are both non-negative are recorded.  The
   encoder gets one entry per character of the range, with the
   code-point recomputed from each successive index.  */
379 if (idx1 >= 0 && idx2 >= 0)
382 mchartable_set (encoder, c1, (void *) code1);
383 for (idx1++, c1++; idx1 <= idx2; idx1++, c1++)
385 code1 = INDEX_TO_CODE_POINT (charset, idx1);
387 mchartable_set (encoder, c1, (void *) code1);
/* NOTE(review): the condition guarding this unref is not visible.  */
397 M17N_OBJECT_UNREF (encoder);
/* Result plist: decoder first, encoder second.  */
401 mplist_add (plist, Mt, decoder);
402 mplist_add (plist, Mt, encoder);
409 MPlist *mcharset__cache;
411 /* Predefined charsets. */
412 MCharset *mcharset__ascii;
413 MCharset *mcharset__binary;
414 MCharset *mcharset__m17n;
415 MCharset *mcharset__unicode;
417 MCharsetISO2022Table mcharset__iso_2022_table;
419 /** Initialize charset handler. */
/* Body of the charset initializer.  NOTE(review): the function header
   and local declarations are not visible in this listing; presumably
   this is the counterpart of mcharset__fini - TODO confirm.  */
/* Unify charsets take their blocks downward from MCHAR_MAX.  */
426 unified_max = MCHAR_MAX;
/* Install load_charset as the charset loading hook of the database
   module.  */
428 mdatabase__load_charset_func = load_charset;
/* The cache plist starts with key Mt and a NULL value.  */
429 mcharset__cache = mplist ();
430 mplist_set (mcharset__cache, Mt, NULL);
/* Both charset lists are initialised with the value 128.  */
432 MLIST_INIT1 (&charset_list, charsets, 128);
433 MLIST_INIT1 (&mcharset__iso_2022_table, charsets, 128);
434 charset_definition_list = mplist ();
436 memset (mcharset__iso_2022_table.classified, 0,
437 sizeof (mcharset__iso_2022_table.classified));
/* Symbols naming the charset methods.  */
439 Mmethod = msymbol ("method");
440 Moffset = msymbol ("offset");
441 Mmap = msymbol ("map");
442 Munify = msymbol ("unify");
443 Msubset = msymbol ("subset");
444 Msuperset = msymbol ("superset");
/* Symbols used as parameter keys; mapfile, parents and aliases are
   created as managing keys.  */
446 Mdimension = msymbol ("dimension");
447 Mmin_range = msymbol ("min-range");
448 Mmax_range = msymbol ("max-range");
449 Mmin_code = msymbol ("min-code");
450 Mmax_code = msymbol ("max-code");
451 Mascii_compatible = msymbol ("ascii-compatible");
452 Mfinal_byte = msymbol ("final-byte");
453 Mrevision = msymbol ("revision");
454 Mmin_char = msymbol ("min-char");
455 Mmapfile = msymbol_as_managing_key ("mapfile");
456 Mparents = msymbol_as_managing_key ("parents");
457 Msubset_offset = msymbol ("subset-offset");
458 Mdefine_coding = msymbol ("define-coding");
459 Maliases = msymbol_as_managing_key ("aliases");
/* The five predefined charsets are defined from one parameter list
   that uses the offset method with min-range 0 and min-char 0.  Only
   max-range is changed between definitions, and final-byte is cleared
   after ascii.  */
463 /* Setup predefined charsets. */
464 pl = mplist_add (pl, Mmethod, Moffset);
465 pl = mplist_add (pl, Mmin_range, (void *) 0);
466 pl = mplist_add (pl, Mmax_range, (void *) 0x7F);
467 pl = mplist_add (pl, Mascii_compatible, Mt);
468 pl = mplist_add (pl, Mfinal_byte, (void *) 'B');
469 pl = mplist_add (pl, Mmin_char, (void *) 0);
470 Mcharset_ascii = mchar_define_charset ("ascii", param);
472 mplist_put (param, Mmax_range, (void *) 0xFF);
473 mplist_put (param, Mfinal_byte, NULL);
474 Mcharset_iso_8859_1 = mchar_define_charset ("iso-8859-1", param);
476 mplist_put (param, Mmax_range, (void *) 0x10FFFF);
477 Mcharset_unicode = mchar_define_charset ("unicode", param);
479 mplist_put (param, Mmax_range, (void *) MCHAR_MAX);
480 Mcharset_m17n = mchar_define_charset ("m17n", param);
482 mplist_put (param, Mmax_range, (void *) 0xFF);
483 Mcharset_binary = mchar_define_charset ("binary", param);
485 M17N_OBJECT_UNREF (param);
/* Keep direct pointers to four of the predefined charset objects.  */
487 mcharset__ascii = MCHARSET (Mcharset_ascii);
488 mcharset__binary = MCHARSET (Mcharset_binary);
489 mcharset__m17n = MCHARSET (Mcharset_m17n);
490 mcharset__unicode = MCHARSET (Mcharset_unicode);
/* Release the resources held by the charset module: the per-charset
   decoder and encoder tables, the cache, both charset lists and the
   stored charset definitions.
   NOTE(review): some lines of the loop body are not visible in this
   listing, so whether each charset object itself is freed cannot be
   told from here.  */
496 mcharset__fini (void)
501 for (i = 0; i < charset_list.used; i++)
503 MCharset *charset = charset_list.charsets[i];
/* The decoder is released with free, the encoder is a managed object
   and is unreferenced instead.  */
505 if (charset->decoder)
506 free (charset->decoder);
507 if (charset->encoder)
508 M17N_OBJECT_UNREF (charset->encoder);
511 M17N_OBJECT_UNREF (mcharset__cache);
512 MLIST_FREE1 (&charset_list, charsets);
513 MLIST_FREE1 (&mcharset__iso_2022_table, charsets);
/* Drop the reference held on each stored definition before dropping
   the list that holds them.  */
514 MPLIST_DO (plist, charset_definition_list)
515 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
516 M17N_OBJECT_UNREF (charset_definition_list);
/* Find the charset object attached to the symbol NAME, defining it on
   demand from the stored definitions, and remember the result in the
   one-entry cache mcharset__cache.
   NOTE(review): the conditions between these statements and the return
   statement are not visible in this listing.  */
521 mcharset__find (MSymbol name)
/* The charset object is stored as the Mcharset property of NAME.  */
525 charset = msymbol_get (name, Mcharset);
/* Fetch the stored definition for NAME.  */
528 MPlist *param = mplist_get (charset_definition_list, name);
/* Set the cache key to Mt before the definition step.  */
530 MPLIST_KEY (mcharset__cache) = Mt;
/* Define the charset from a converted copy of the stored definition,
   look the property up again, and release the copy.  */
533 param = mplist__from_plist (param);
534 mchar_define_charset (MSYMBOL_NAME (name), param);
535 charset = msymbol_get (name, Mcharset);
536 M17N_OBJECT_UNREF (param);
/* Record the name and the charset in the cache.  */
538 MPLIST_KEY (mcharset__cache) = name;
539 MPLIST_VAL (mcharset__cache) = charset;
544 /** Return the character corresponding to code-point CODE in CHARSET.
545 If CODE is invalid for CHARSET, return -1. */
/* Decode the code-point CODE of CHARSET into a character, dispatching
   on the charset method.
   NOTE(review): the statements taken by several of the tests below
   (ASCII shortcut, range check, superset loop, unify fallback) are not
   visible in this listing.  */
548 mcharset__decode_char (MCharset *charset, unsigned code)
/* Shortcut test for code-points below 128 in an ASCII compatible
   charset.  */
552 if (code < 128 && charset->ascii_compatible)
/* Test for code-points outside min_code..max_code.  */
554 if (code < charset->min_code || code > charset->max_code)
/* Load the charset data on first use.  */
557 if (! charset->fully_loaded
558 && load_charset_fully (charset) < 0)
559 MERROR (MERROR_CHARSET, -1);
/* Subset: subtract subset_offset and decode through the parent.  */
561 if (charset->method == Msubset)
563 MCharset *parent = charset->parents[0];
565 code -= charset->subset_offset;
566 return DECODE_CHAR (parent, code);
/* Superset: each parent is asked to decode the unchanged code-point.  */
569 if (charset->method == Msuperset)
573 for (i = 0; i < charset->nparents; i++)
575 MCharset *parent = charset->parents[i];
576 int c = DECODE_CHAR (parent, code);
/* The remaining methods work on the code-point index.  */
584 idx = CODE_POINT_TO_INDEX (charset, code);
/* Map: the decoder array gives the character directly.  */
588 if (charset->method == Mmap)
589 return charset->decoder[idx];
/* Unify: the decoder array is consulted first; the character
   unified_max + 1 + idx is the alternative (its guard is not visible
   here).  */
591 if (charset->method == Munify)
593 int c = charset->decoder[idx];
596 c = charset->unified_max + 1 + idx;
600 /* Now charset->method should be Moffset. */
601 return (charset->min_char + idx);
605 /** Return the code point of character C in CHARSET. If CHARSET does not
606 contain C, return MCHAR_INVALID_CODE. */
/* Encode the character C into a code-point of CHARSET, dispatching on
   the charset method; MCHAR_INVALID_CODE is the failure value.
   NOTE(review): a few statements (the returns taken on success in the
   subset and superset branches) are not visible in this listing.  */
609 mcharset__encode_char (MCharset *charset, int c)
/* Load the charset data on first use.  */
611 if (! charset->fully_loaded
612 && load_charset_fully (charset) < 0)
613 MERROR (MERROR_CHARSET, MCHAR_INVALID_CODE);
/* Subset: encode through the parent, add subset_offset, and accept the
   result only when it lies in min_code..max_code.  */
615 if (charset->method == Msubset)
617 MCharset *parent = charset->parents[0];
618 unsigned code = ENCODE_CHAR (parent, c);
620 if (code == MCHAR_INVALID_CODE)
622 code += charset->subset_offset;
623 if (code >= charset->min_code && code <= charset->max_code)
625 return MCHAR_INVALID_CODE;
/* Superset: try the parents in order; the result is invalid when no
   parent yields a valid code-point.  */
628 if (charset->method == Msuperset)
632 for (i = 0; i < charset->nparents; i++)
634 MCharset *parent = charset->parents[i];
635 unsigned code = ENCODE_CHAR (parent, c);
637 if (code != MCHAR_INVALID_CODE)
640 return MCHAR_INVALID_CODE;
/* For the remaining methods C must lie in min_char..max_char.  */
643 if (c < charset->min_char || c > charset->max_char)
644 return MCHAR_INVALID_CODE;
/* Map: the encoder char-table holds the code-point.  */
646 if (charset->method == Mmap)
647 return (unsigned) mchartable_lookup (charset->encoder, c);
649 if (charset->method == Munify)
/* Characters above unified_max are converted back to an index.
   NOTE(review): mcharset__decode_char produces unified_max + 1 + idx
   for this case, so the inverse would subtract unified_max + 1.  The
   statement below subtracts unified_max - 1, which gives idx + 2.
   This looks like an off-by-two - confirm before relying on the
   round trip.  */
651 if (c > charset->unified_max)
653 c -= charset->unified_max - 1;
654 return INDEX_TO_CODE_POINT (charset, c);
656 return (unsigned) mchartable_lookup (charset->encoder, c);
659 /* Now charset->method should be Moffset */
660 c -= charset->min_char;
661 return INDEX_TO_CODE_POINT (charset, c);
/* Load the list of charset definitions from the database entry tagged
   charset-list, store each definition in charset_definition_list, and
   define each charset.  Malformed entries go through MERROR with -1.
   NOTE(review): some lines (local declarations, debug timing pushes,
   the check of the load result, the final return) are not visible in
   this listing.  */
665 mcharset__load_from_database ()
667 MDatabase *mdb = mdatabase_find (msymbol ("charset-list"), Mnil, Mnil, Mnil);
668 MPlist *def_list, *plist;
/* definitions tracks the position used when adding to the stored
   definition list.  */
669 MPlist *definitions = charset_definition_list;
670 int mdebug_flag = MDEBUG_CHARSET;
675 def_list = (MPlist *) mdatabase_load (mdb);
676 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to load data."));
/* Each element must itself be a plist whose first element is the
   charset name symbol; the rest of that plist is the definition.
   NOTE(review): the MERROR exits inside this loop come before the
   unref of def_list below, so if MERROR returns from the function the
   loaded list is presumably not released on malformed input - confirm.  */
682 MPLIST_DO (plist, def_list)
687 if (! MPLIST_PLIST_P (plist))
688 MERROR (MERROR_CHARSET, -1);
689 pl = MPLIST_PLIST (plist);
690 if (! MPLIST_SYMBOL_P (pl))
691 MERROR (MERROR_CHARSET, -1);
692 name = MPLIST_SYMBOL (pl);
693 pl = MPLIST_NEXT (pl);
/* The definition is stored under NAME and a reference is taken on it,
   since def_list is released at the end.  */
694 definitions = mplist_add (definitions, name, pl);
695 M17N_OBJECT_REF (pl);
/* Define the charset from a converted copy of the definition, then
   release the copy.  */
696 p = mplist__from_plist (pl);
697 mchar_define_charset (MSYMBOL_NAME (name), p);
698 M17N_OBJECT_UNREF (p);
701 M17N_OBJECT_UNREF (def_list);
702 MDEBUG_PRINT_TIME ("CHARSET", (stderr, " to parse the loaded data."));
708 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
713 /*** @addtogroup m17nCharset */
719 @brief Invalid code-point.
721 The macro #MCHAR_INVALID_CODE gives the invalid code-point. */
724 @brief 無効なコードポイント.
726 マクロ #MCHAR_INVALID_CODE は無効なコードポイントを示す。 */
728 #define MCHAR_INVALID_CODE
734 @name Variables: Symbols representing a charset.
736 Each of the following symbols represents a predefined charset. */
739 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ëÄêµÁºÑ¤ß¥·¥ó¥Ü¥ë.
741 °Ê²¼¤Î³Æ¥·¥ó¥Ü¥ë¤Ï¡¢ÄêµÁºÑ¤ßʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
746 @brief Symbol representing the charset ASCII.
748 The symbol #Mcharset_ascii has name <tt>"ascii"</tt> and represents
749 the charset ISO 646, USA Version X3.4-1968 (ISO-IR-6). */
751 @brief ASCII ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
753 ¥·¥ó¥Ü¥ë #Mcharset_ascii ¤Ï <tt>"ascii"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
754 ISO 646, USA Version X3.4-1968 (ISO-IR-6) ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
757 MSymbol Mcharset_ascii;
761 @brief Symbol representing the charset ISO/IEC 8859/1.
763 The symbol #Mcharset_iso_8859_1 has name <tt>"iso-8859-1"</tt>
764 and represents the charset ISO/IEC 8859-1:1998. */
766 @brief ISO/IEC 8859-1:1998 ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
768 ¥·¥ó¥Ü¥ë #Mcharset_iso_8859_1 ¤Ï <tt>"iso-8859-1"</tt>
769 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢ISO/IEC 8859-1:1998 ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
772 MSymbol Mcharset_iso_8859_1;
775 @brief Symbol representing the charset Unicode.
777 The symbol #Mcharset_unicode has name <tt>"unicode"</tt> and
778 represents the charset Unicode. */
780 @brief Unicode ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
782 ¥·¥ó¥Ü¥ë #Mcharset_unicode ¤Ï <tt>"unicode"</tt>
783 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Unicode ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
785 MSymbol Mcharset_unicode;
789 @brief Symbol representing the largest charset.
791 The symbol #Mcharset_m17n has name <tt>"m17n"</tt> and
792 represents the charset that contains all characters supported by the m17n library.
795 @brief Á´Ê¸»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
797 ¥·¥ó¥Ü¥ë #Mcharset_m17n ¤Ï <tt>"m17n"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
798 m17n ¥é¥¤¥Ö¥é¥ê¤¬°·¤¦Á´¤Æ¤Îʸ»ú¤ò´Þ¤àʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£ */
800 MSymbol Mcharset_m17n;
804 @brief Symbol representing the charset for ill-decoded characters.
806 The symbol #Mcharset_binary has name <tt>"binary"</tt> and
807 represents the fake charset which the decoding functions put to an
808 M-text as a text property when they encounter an invalid byte (sequence).
811 See @ref m17nConv for more details. */
814 @brief Àµ¤·¤¯¥Ç¥³¡¼¥É¤Ç¤¤Ê¤¤Ê¸»ú¤Îʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¥·¥ó¥Ü¥ë.
816 ¥·¥ó¥Ü¥ë #Mcharset_binary ¤Ï <tt>"binary"</tt>
817 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢µ¶¤Î (fake) ʸ»ú¥»¥Ã¥È¤òɽ¸½¤¹¤ë¡£
818 ¥Ç¥³¡¼¥É´Ø¿ô¤Ï¡¢M-text ¤Î¥Æ¥¥¹¥È¥×¥í¥Ñ¥Æ¥£¤È¤·¤Æ¡¢Ìµ¸ú¤Ê¥Ð¥¤¥È¡Ê¥·¡¼¥¯¥¨¥ó¥¹¡Ë¤ËÁø¶ø¤·¤¿°ÌÃÖ¤òÉղ乤롣
820 ¾ÜºÙ¤Ï @ref m17nConv »²¾È¤Î¤³¤È¡£ */
822 MSymbol Mcharset_binary;
829 @name Variables: Parameter keys for mchar_define_charset ().
831 These are the predefined symbols to use as parameter keys for the
832 function mchar_define_charset () (which see). */
835 @name ÊÑ¿ô: mchar_define_charset ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼
837 ¤³¤ì¤é¤Ï¡¢´Ø¿ô mchar_define_charset () ÍѤΥѥé¥á¡¼¥¿¡¦¥¡¼¤È¤·¤Æ»È¤ï¤ì¤ë¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
838 ¾Ü¤·¤¯¤Ï¤³¤Î´Ø¿ô¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£*/
848 MSymbol Mascii_compatible;
854 MSymbol Msubset_offset;
855 MSymbol Mdefine_coding;
862 @name Variables: Symbols representing charset methods.
864 These are the predefined symbols that can be a value of the
865 @b Mmethod parameter of a charset used in an argument to the
866 mchar_define_charset () function.
868 A method specifies how code-points and character codes are
869 converted. See the documentation of the mchar_define_charset ()
870 function for the details. */
873 @name ÊÑ¿ô: ʸ»ú¥»¥Ã¥È¤Î¥á¥½¥Ã¥É»ØÄê¤Ë»È¤ï¤ì¤ë¥·¥ó¥Ü¥ë
875 ¤³¤ì¤é¤Ï¡¢Ê¸»ú¥»¥Ã¥È¤Î @e ¥á¥½¥Ã¥É ¤ò»ØÄꤹ¤ë¤¿¤á¤ÎÄêµÁºÑ¤ß¥·¥ó¥Ü¥ë¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î
876 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤʤ뤳¤È¤¬¤Ç¤¤ë¡£
877 ¤³¤ÎÃͤϴؿô mchar_define_charset () ¤Î°ú¿ô¤È¤·¤Æ»È¤ï¤ì¤ë¡£
879 ¥á¥½¥Ã¥É¤È¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥³¡¼¥É¤òÁê¸ßÊÑ´¹¤¹¤ëºÝ¤ÎÊý¼°¤Î¤³¤È¤Ç¤¢¤ë¡£
880 ¾Ü¤·¤¯¤Ï´Ø¿ô mchar_define_charset () ¤Î²òÀâ¤ò»²¾È¤Î¤³¤È¡£ */
884 @brief Symbol for the offset type method of charset.
886 The symbol #Moffset has the name <tt>"offset"</tt> and, when used
887 as a value of @b Mmethod parameter of a charset, it means that the
888 conversion of code-points and character codes of the charset is
889 done by this calculation:
892 CHARACTER-CODE = CODE-POINT - MIN-CODE + MIN-CHAR
895 where, MIN-CODE is a value of @b Mmin_code parameter of the charset,
896 and MIN-CHAR is a value of @b Mmin_char parameter. */
899 @brief ¥ª¥Õ¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
901 ¥·¥ó¥Ü¥ë #Moffset ¤Ï <tt>"offset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
902 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬°Ê²¼¤Î¼°¤Ë½¾¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
905 ʸ»ú¥³¡¼¥É = ¥³¡¼¥É¥Ý¥¤¥ó¥È - MIN-CODE + MIN-CHAR
908 ¤³¤³¤Ç¡¢MIN-CODE ¤Ïʸ»ú¥»¥Ã¥È¤Î @b Mmin_code ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ê¡¢MIN-CHAR ¤Ï
909 @b Mmin_char ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ë¡£ */
914 /***en @brief Symbol for the map type method of charset.
916 The symbol #Mmap has the name <tt>"map"</tt> and, when used as a
917 value of @b Mmethod parameter of a charset, it means that the
918 conversion of code-points and character codes of the charset is
919 done by map looking up. The map must be given by @b Mmapfile parameter.
922 /***ja @brief ¥Þ¥Ã¥×·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
924 ¥·¥ó¥Ü¥ë #Mmap ¤Ï <tt>"map"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
925 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬¥Þ¥Ã¥×¤ò»²¾È¤¹¤ë¤³¤È¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
926 ¥Þ¥Ã¥×¤Ï @b Mmapfile ¥Ñ¥é¥á¡¼¥¿¤È¤·¤ÆÍ¿¤¨¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£ */
931 /***en @brief Symbol for the unify type method of charset.
933 The symbol #Munify has the name <tt>"unify"</tt> and, when used as
934 a value of @b Mmethod parameter of a charset, it means that the
935 conversion of code-points and character codes of the charset is
936 done by map looking up and offsetting. The map must be given by
937 @b Mmapfile parameter. For this kind of charset, a unique
938 continuous character code space for all characters is assigned.
940 If the map has an entry for a code-point, the conversion is done
941 by looking up the map. Otherwise, the conversion is done by this calculation:
945 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
948 where MIN-CODE is a value of @b Mmin_code parameter of the charset,
949 and LOWEST-CHAR-CODE is the lowest character code of the assigned code space.
952 /***ja @brief ¥æ¥Ë¥Õ¥¡¥¤·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
954 ¥·¥ó¥Ü¥ë #Munify ¤Ï <tt>"unify"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
955 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤¬¡¢¥Þ¥Ã¥×¤Î»²¾È¤È¥ª¥Õ¥»¥Ã¥È¤ÎÁȤ߹ç¤ï¤»¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
956 ¥Þ¥Ã¥×¤Ï @b Mmapfile ¥Ñ¥é¥á¡¼¥¿¤È¤·¤ÆÍ¿¤¨¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£
957 ¤³¤Î¼ï¤Î³Æʸ»ú¥»¥Ã¥È¤Ë¤Ï¡¢Á´Ê¸»ú¤ËÂФ·¤ÆϢ³¤¹¤ë¥³¡¼¥É¥¹¥Ú¡¼¥¹¤¬¤½¤ì¤¾¤ì³ä¤êÅö¤Æ¤é¤ì¤ë¡£
959 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤¬¥Þ¥Ã¥×¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ì¤Ð¡¢ÊÑ´¹¤Ï¥Þ¥Ã¥×»²¾È¤Ë¤è¤Ã¤Æ¹Ô¤ï¤ì¤ë¡£
960 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢°Ê²¼¤Î¼°¤Ë½¾¤¦¡£
963 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
966 ¤³¤³¤Ç¡¢MIN-CODE ¤Ïʸ»ú¥»¥Ã¥È¤Î @b Mmin_code ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤǤ¢¤ê¡¢
967 LOWEST-CHAR-CODE ¤Ï³ä¤êÅö¤Æ¤é¤ì¤¿¥³¡¼¥É¥¹¥Ú¡¼¥¹¤ÎºÇ¤â¾®¤µ¤¤Ê¸»ú¥³¡¼¥É¤Ç¤¢¤ë¡£
974 @brief Symbol for the subset type method of charset.
976 The symbol #Msubset has the name <tt>"subset"</tt> and, when used
977 as a value of @b Mmethod parameter of a charset, it means that the
978 charset is a subset of a parent charset. The parent charset must
979 be given by @b Mparents parameter. The conversion of code-points
980 and character codes of the charset is done conceptually by this calculation:
984 CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
987 where PARENT-CODE is a pseudo function that returns the character
988 code of a given code-point in the parent charset, and SUBSET-OFFSET is a
989 value given by @b Msubset_offset parameter. */
991 /***ja @brief ¥µ¥Ö¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
993 ¥·¥ó¥Ü¥ë #Msubset ¤Ï <tt>"subset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
994 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¤³¤Îʸ»ú¥»¥Ã¥È¤¬Ê̤Îʸ»ú¥»¥Ã¥È¡Ê¿Æʸ»ú¥»¥Ã¥È¡Ë¤ÎÉôʬ½¸¹ç¤Ç¤¢¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
995 ¿Æʸ»ú¥»¥Ã¥È¤Ï @b Mparents ¥Ñ¥é¥á¡¼¥¿¤Ë¤è¤Ã¤ÆÍ¿¤¨¤é¤ì¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
996 ¥³¡¼¥É¥Ý¥¤¥ó¥È¤Èʸ»ú¥»¥Ã¥È¤Îʸ»ú¥³¡¼¥É¤Î´Ö¤ÎÊÑ´¹¤Ï¡¢³µÇ°Åª¤Ë¤Ï°Ê²¼¤Î¼°¤Ë½¾¤¦¡£
999 CHARACTER-CODE = PARENT-CODE (CODE-POINT) + SUBSET-OFFSET
1002 ¤³¤³¤Ç PARENT-CODE ¤Ï CODE-POINT
1003 ¤Î¿Æʸ»ú¥»¥Ã¥ÈÃæ¤Ç¤Îʸ»ú¥³¡¼¥É¤òÊÖ¤¹µ¼´Ø¿ô¤Ç¤¢¤ê¡¢SUBSET-OFFSET ¤Ï
1004 @b Msubset_offset ¥Ñ¥é¥á¡¼¥¿¤ÇÍ¿¤¨¤é¤ì¤ëÃͤǤ¢¤ë¡£
1011 @brief Symbol for the superset type method of charset.
1013 The symbol #Msuperset has the name <tt>"superset"</tt> and, when
1014 used as a value of @b Mmethod parameter of a charset, it means that
1015 the charset is a superset of parent charsets. The parent charsets
1016 must be given by @b Mparents parameter. */
1019 @brief ¥¹¡¼¥Ñ¡¼¥»¥Ã¥È·¿¤Î¥á¥½¥Ã¥É¤ò¼¨¤¹¥·¥ó¥Ü¥ë.
1021 ¥·¥ó¥Ü¥ë #Msuperset ¤Ï <tt>"superset"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢Ê¸»ú¥»¥Ã¥È¤Î
1022 @b Mmethod ¥Ñ¥é¥á¡¼¥¿¤ÎÃͤȤ·¤ÆÍѤ¤¤é¤ì¤¿¾ì¹ç¤Ë¤Ï¡¢¤³¤Îʸ»ú¥»¥Ã¥È¤¬Ê̤Îʸ»ú¥»¥Ã¥È¡Ê¿Æʸ»ú¥»¥Ã¥È¡Ë¤Î¾å°Ì½¸¹ç¤Ç¤¢¤ë¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1023 ¿Æʸ»ú¥»¥Ã¥È¤Ï @b Mparents ¥Ñ¥é¥á¡¼¥¿¤Ë¤è¤Ã¤ÆÍ¿¤¨¤é¤ì¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
1031 @brief Define a charset.
1033 The mchar_define_charset () function defines a new charset and
1034 makes it accessible via a symbol whose name is $NAME. $PLIST
1035 specifies parameters of the charset as below:
1039 <li> Key is @b Mmethod, value is a symbol.
1041 The value specifies the method for decoding/encoding code-points
1042 in the charset. It must be #Moffset, #Mmap (default), #Munify,
1043 #Msubset, or #Msuperset.
1045 <li> Key is @b Mdimension, value is an integer
1047 The value specifies the dimension of code-points of the charset.
1048 It must be 1 (default), 2, 3, or 4.
1050 <li> Key is @b Mmin_range, value is an unsigned integer
1052 The value specifies the minimum range of a code-point, which means
1053 that the Nth byte of the value is the minimum Nth byte of
1054 code-points of the charset. The default value is 0.
1056 <li> Key is @b Mmax_range, value is an unsigned integer
1058 The value specifies the maximum range of a code-point, which means
1059 that the Nth byte of the value is the maximum Nth byte of
1060 code-points of the charset. The default value is 0xFF, 0xFFFF,
1061 0xFFFFFF, or 0xFFFFFFFF if the dimension is 1, 2, 3, or 4, respectively.
1064 <li> Key is @b Mmin_code, value is an unsigned integer
1066 The value specifies the minimum code-point of
1067 the charset. The default value is the minimum range.
1069 <li> Key is @b Mmax_code, value is an unsigned integer
1071 The value specifies the maximum code-point of
1072 the charset. The default value is the maximum range.
1074 <li> Key is @b Mascii_compatible, value is a symbol
1076 The value specifies whether the charset is ASCII compatible or
1077 not. If the value is #Mnil (default), it is not ASCII
1078 compatible, else compatible.
1080 <li> Key is @b Mfinal_byte, value is an integer
1082 The value specifies the @e final @e byte of the charset registered
1083 in The International Registry. It must be 0 (default) or 48..127.
1084 The value 0 means that the charset is not in the registry.
1086 <li> Key is @b Mrevision, value is an integer
1088 The value specifies the @e revision @e number of the charset
1089 registered in The International Registry. It must be 0..127. If
1090 the charset is not in The International Registry, the value is
1091 ignored. The value 0 means that the charset has no revision number.
1094 <li> Key is @b Mmin_char, value is an integer
1096 The value specifies the minimum character code of the charset.
1097 The default value is 0.
1099 <li> Key is @b Mmapfile, value is an M-text
1101 If the method is #Mmap or #Munify, data that contains
1102 mapping information is added to the m17n database by calling
1103 the function mdatabase_define () with the value as an argument $EXTRA_INFO,
1104 i.e. the value is used as a file name of the data.
1106 Otherwise, this parameter is ignored.
1108 <li> Key is @b Mparents, value is a plist
1110 If the method is #Msubset, the value must be a plist of length
1111 1, and the value of the plist must be a symbol representing the superset (parent) charset.
1114 If the method is #Msuperset, the value must be a plist of length
1115 less than 9, and the values of the plist must be symbols
1116 representing subset charsets.
1118 Otherwise, this parameter is ignored.
1120 <li> Key is @b Mdefine_coding, value is a symbol
1122 If the dimension of the charset is 1, the value specifies whether
1123 or not to define a coding system of the same name whose type is
1124 #Mcharset. A coding system is defined if the value is not #Mnil.
1126 Otherwise, this parameter is ignored.
1131 If the operation was successful, mchar_define_charset () returns a
1132 symbol whose name is $NAME. Otherwise it returns #Mnil and
1133 assigns an error code to the external variable #merror_code. */
1136 @brief ʸ»ú¥»¥Ã¥È¤òÄêµÁ¤¹¤ë.
1138 ´Ø¿ô mchar_define_charset () ¤Ï¿·¤·¤¤Ê¸»ú¥»¥Ã¥È¤òÄêµÁ¤·¡¢¤½¤ì¤ò
1139 $NAME ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Ä¥·¥ó¥Ü¥ë·Ðͳ¤Ç¥¢¥¯¥»¥¹¤Ç¤¤ë¤è¤¦¤Ë¤¹¤ë¡£
1140 $PLIST ¤ÏÄêµÁ¤µ¤ì¤ëʸ»ú¥»¥Ã¥È¤Î¥Ñ¥é¥á¡¼¥¿¤ò°Ê²¼¤Î¤è¤¦¤Ë»ØÄꤹ¤ë¡£
1144 <li> ¥¡¼¤¬ @b Mmethod ¤ÇÃͤ¬¥·¥ó¥Ü¥ë¤Î»þ
1146 Ãͤϡ¢#Moffset, #Mmap (¥Ç¥Õ¥©¥ë¥ÈÃÍ), #Munify, #Msubset,
1147 #Msuperset ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤ò¥Ç¥³¡¼¥É¡¿¥¨¥ó¥³¡¼¥É¤¹¤ëºÝ¤Î¥á¥½¥Ã¥É¤ò»ØÄꤹ¤ë¡£
1149 <li> ¥¡¼¤¬ @b Mdimension ¤ÇÃͤ¬À°¿ôÃͤλþ
1151 Ãͤϡ¢1 (¥Ç¥Õ¥©¥ë¥ÈÃÍ), 2, 3, 4
1152 ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢Ê¸»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¼¡¸µ¤Ç¤¢¤ë¡£
1154 <li> ¥¡¼¤¬ @b Mmin_range ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1156 Ãͤϥ³¡¼¥É¥Ý¥¤¥ó¥È¤ÎºÇ¾®¤ÎÃͤǤ¢¤ë¡£¤¹¤Ê¤ï¤Á¡¢¤³¤ÎÃͤΠN
1157 ÈÖÌܤΥХ¤¥È¤Ï¤³¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î N ÈÖÌܤΥХ¤¥È¤ÎºÇ¾®¤Î¤â¤Î¤È¤Ê¤ë¡£
1160 <li> ¥¡¼¤¬ @b Mmax_range ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1162 Ãͤϥ³¡¼¥É¥Ý¥¤¥ó¥È¤ÎºÇÂç¤ÎÃͤǤ¢¤ë¡£¤¹¤Ê¤ï¤Á¡¢¤³¤ÎÃͤΠN
1163 ÈÖÌܤΥХ¤¥È¤Ï¤³¤Îʸ»ú¥»¥Ã¥È¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î N ÈÖÌܤΥХ¤¥È¤ÎºÇÂç¤Î¤â¤Î¤È¤Ê¤ë¡£
1164 ¥Ç¥Õ¥©¥ë¥ÈÃͤϡ¢¥³¡¼¥É¥Ý¥¤¥ó¥È¤Î¼¡¸µ¤¬ 1, 2, 3, 4 ¤Î»þ¡¢¤½¤ì¤¾¤ì
1165 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF ¡£
1167 <li> ¥¡¼¤¬ @b Mmin_code ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1169 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇ¾®¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ç¤¢¤ë¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÏ
1170 @b Mmin_range ¤ÎÃÍ¡£
1172 <li> ¥¡¼¤¬ @b Mmax_code ¤ÇÃͤ¬ÈóÉéÀ°¿ôÃͤλþ
1174 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇÂç¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È¤Ç¤¢¤ë¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÏ
1175 @b Mmax_range ¤ÎÃÍ¡£
1177 <li> ¥¡¼¤¬ @b Mascii_compatible ¤ÇÃͤ¬¥·¥ó¥Ü¥ë¤Î»þ
1179 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤¬ ASCII ¸ß´¹¤Ç¤¢¤ë¤«¤É¤¦¤«¤ò¼¨¤¹¡£¥Ç¥Õ¥©¥ë¥ÈÃͤÎ
1180 #Mnil ¤Ç¤¢¤ì¤Ð¸ß´¹¤Ç¤Ï¤Ê¤¯¡¢¤½¤ì°Ê³°¤Î¾ì¹ç¤Ï¸ß´¹¤Ç¤¢¤ë¡£
1182 <li> ¥¡¼¤¬ @b Mfinal_byte ¤ÇÃͤ¬À°¿ôÃͤλþ
1184 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤Î The International Registry ¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤ë
1185 @e ½ªÃ¼¥Ð¥¤¥È ¤Ç¤¢¤ê¡¢0 (¥Ç¥Õ¥©¥ë¥ÈÃÍ) ¤Ç¤¢¤ë¤« 32..127 ¤Ç¤¢¤ë¡£0
1186 ¤ÏÅÐÏ¿¤µ¤ì¤Æ¤¤¤Ê¤¤¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1188 <li> ¥¡¼¤¬ @b Mrevision ¤ÇÃͤ¬À°¿ôÃͤλþ
1190 ÃÍ¤Ï The International Registry ¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤ë @e revision @e
1191 number ¤Ç¤¢¤ê¡¢0..127 ¤Ç¤¢¤ë¡£
1192 ʸ»ú¥»¥Ã¥È¤¬ÅÐÏ¿¤µ¤ì¤Æ¤¤¤Ê¤¤¾ì¹ç¤Ë¤Ï¤³¤ÎÃͤÏ̵»ë¤µ¤ì¤ë¡£
1193 0 ¤Ï revision number ¤¬Â¸ºß¤·¤Ê¤¤¤³¤È¤ò°ÕÌ£¤¹¤ë¡£
1195 <li> ¥¡¼¤¬ @b Mmin_char ¤ÇÃͤ¬À°¿ôÃͤλþ
1197 ÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤ÎºÇ¾®¤Îʸ»ú¥³¡¼¥É¤Ç¤¢¤ë¡£¥Ç¥Õ¥©¥ë¥ÈÃÍ¤Ï 0 ¡£
1199 <li> ¥¡¼¤¬ @b Mmapfile ¤ÇÃͤ¬ M-text ¤Î»þ
1201 ¥á¥½¥Ã¥É¤¬ #Mmap ¤« #Munify ¤Î»þ¡¢´Ø¿ô mdatabase_define ()
1202 ¤ò¤³¤ÎÃͤò°ú¿ô $EXTRA_INFO ¤È¤·¤Æ¸Æ¤Ö¤³¤È¤Ë¤è¤Ã¤Æ¡¢¥Þ¥Ã¥Ô¥ó¥°¤Ë´Ø¤¹¤ë¥Ç¡¼¥¿¤¬
1203 m17n ¥Ç¡¼¥¿¥Ù¡¼¥¹¤ËÄɲ䵤ì¤ë¡£
1204 ¤¹¤Ê¤ï¤Á¡¢¤³¤ÎÃͤϥǡ¼¥¿¥Õ¥¡¥¤¥ë¤Î̾Á°¤Ç¤¢¤ë¡£
1206 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢¤³¤Î¥Ñ¥é¥á¡¼¥¿¤Ï̵»ë¤µ¤ì¤ë¡£
1208 <li> ¥¡¼¤¬ @b Mparents ¤ÇÃͤ¬ plist ¤Î»þ
1210 ¥á¥½¥Ã¥É¤¬ #Msubset ¤Ê¤é¤Ð¡¢ÃͤÏŤµ 1 ¤Î plist
1211 ¤Ç¤¢¤ê¡¢¤½¤ÎÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤Î¾å°Ì½¸¹ç¤È¤Ê¤ëʸ»ú¥»¥Ã¥È¤ò¼¨¤¹¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
1213 ¥á¥½¥Ã¥É¤¬ #Msuperset ¤Ê¤é¤Ð¡¢ÃͤÏŤµ 8 °Ê²¼¤Î plist
1214 ¤Ç¤¢¤ê¡¢¤½¤ì¤é¤ÎÃͤϤ³¤Îʸ»ú¥»¥Ã¥È¤Î²¼°Ì½¸¹ç¤Ç¤¢¤ëʸ»ú¥»¥Ã¥È¤ò¼¨¤¹¥·¥ó¥Ü¥ë¤Ç¤¢¤ë¡£
1216 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢¤³¤Î¥Ñ¥é¥á¡¼¥¿¤Ï̵»ë¤µ¤ì¤ë¡£
1218 <li> ¥¡¼¤¬ @b Mdefine_coding ¤ÇÃͤ¬¥·¥ó¥Ü¥ë¤Î»þ
1220 ʸ»ú¥»¥Ã¥È¤Î¼¡¸µ¤¬ 1 ¤Ê¤é¤Ð¡¢Ãͤ¬ #Mnil °Ê³°¤Î¾ì¹ç¤Ë #Mcharset ·¿
1221 ¤ÇƱ¤¸Ì¾Á°¤ò»ý¤Ä¥³¡¼¥É·Ï¤òÄêµÁ¤¹¤ë¡£
1223 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢¤³¤Î¥Ñ¥é¥á¡¼¥¿¤Ï̵»ë¤µ¤ì¤ë¡£
1228 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mchar_define_charset() ¤Ï $NAME
1229 ¤È¤¤¤¦Ì¾Á°¤Î¥·¥ó¥Ü¥ë¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð #Mnil ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô
1230 #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£*/
1234 @c MERROR_CHARSET */
/* Define a charset named NAME from the parameter list PLIST.

   The visible code allocates an MCharset, fills its fields from the
   values stored in PLIST under the keys Mmethod, Mdimension,
   Mmin_range, Mmax_range, Mmin_code, Mmax_code, Mascii_compatible,
   Mfinal_byte, Mrevision, Mmin_char, Mparents and Msubset_offset,
   hands it to make_charset (), and attaches the result to the symbol
   for NAME, to its canonicalized form, and to every symbol listed
   under Maliases.

   NOTE(review): the numbering embedded in these lines skips, so some
   lines of this function (declarations, guards, braces, the final
   return) are not visible in this copy.  The comments below describe
   only what is shown.  */
1237 mchar_define_charset (const char *name, MPlist *plist)
1239 MSymbol sym = msymbol (name);
1242 unsigned min_range, max_range;
1244 MText *mapfile = (MText *) mplist_get (plist, Mmapfile);
1246 MSTRUCT_CALLOC (charset, MERROR_CHARSET);
1247 charset->name = sym;
1248 charset->method = (MSymbol) mplist_get (plist, Mmethod);
/* No Mmethod value in PLIST: a default method is chosen.  Both an
   Mmap and an Moffset assignment follow; the condition selecting
   between them is not visible here.  */
1249 if (! charset->method)
1252 charset->method = Mmap;
1254 charset->method = Moffset;
/* For the Mmap and Munify methods the mapping is registered with
   mdatabase_define (), passing mapfile->data as the last argument.
   NOTE(review): the condition guarding the MERROR below is not
   visible; presumably it rejects a missing Mmapfile value before
   mapfile is dereferenced -- confirm.  */
1256 if (charset->method == Mmap || charset->method == Munify)
1259 MERROR (MERROR_CHARSET, Mnil);
1260 mdatabase_define (Mcharset, sym, Mnil, Mnil, NULL, mapfile->data);
/* A zero Mdimension value falls back to dimension 1.  */
1262 if (! (charset->dimension = (int) mplist_get (plist, Mdimension)))
1263 charset->dimension = 1;
1265 min_range = (unsigned) mplist_get (plist, Mmin_range);
/* When PLIST has an Mmax_range entry, its value may raise the
   dimension: >= 0x1000000 forces 4, >= 0x10000 makes it at least 3,
   >= 0x100 makes it at least 2.  */
1266 if ((pl = mplist_find_by_key (plist, Mmax_range)))
1268 max_range = (unsigned) MPLIST_VAL (pl);
1269 if (max_range >= 0x1000000)
1270 charset->dimension = 4;
1271 else if (max_range >= 0x10000 && charset->dimension < 3)
1272 charset->dimension = 3;
1273 else if (max_range >= 0x100 && charset->dimension < 2)
1274 charset->dimension = 2;
/* Without an Mmax_range entry, max_range is derived from the
   dimension.  The assignments for dimensions 1 and 2 are not visible
   here; 0xFFFFFF is used for dimension 3 and 0xFFFFFFFF presumably
   for the remaining case.  */
1276 else if (charset->dimension == 1)
1278 else if (charset->dimension == 2)
1280 else if (charset->dimension == 3)
1281 max_range = 0xFFFFFF;
1283 max_range = 0xFFFFFFFF;
1285 memset (charset->code_range, 0, sizeof charset->code_range);
/* Byte I of min_range / max_range is stored in code_range[I * 4] and
   code_range[I * 4 + 1].  The loop shifts min_range and max_range
   right by 8 bits per iteration, so both variables hold the shifted
   values once the loop ends.  */
1286 for (i = 0; i < charset->dimension; i++, min_range >>= 8, max_range >>= 8)
1288 charset->code_range[i * 4] = min_range & 0xFF;
1289 charset->code_range[i * 4 + 1] = max_range & 0xFF;
/* NOTE(review): the two clamps below compare against min_range and
   max_range as left by the loop above (already shifted), not against
   the values read from PLIST -- confirm this is intended.  */
1291 if ((charset->min_code = (int) mplist_get (plist, Mmin_code)) < min_range)
1292 charset->min_code = min_range;
1293 if ((charset->max_code = (int) mplist_get (plist, Mmax_code)) > max_range)
1294 charset->max_code = max_range;
/* Any Mascii_compatible value other than Mnil sets the flag.  */
1295 charset->ascii_compatible
1296 = (MSymbol) mplist_get (plist, Mascii_compatible) != Mnil;
1297 charset->final_byte = (int) mplist_get (plist, Mfinal_byte);
1298 charset->revision = (int) mplist_get (plist, Mrevision);
1299 charset->min_char = (int) mplist_get (plist, Mmin_char);
/* At most 8 elements of the Mparents list are used.  Each must have
   the key Msymbol and name a charset that MCHARSET () resolves;
   otherwise MERROR (MERROR_CHARSET, Mnil) is invoked.
   NOTE(review): CHARSET was allocated above and no release of it is
   visible before these MERROR calls -- check the error paths for a
   leak.  */
1300 pl = (MPlist *) mplist_get (plist, Mparents);
1301 charset->nparents = pl ? mplist_length (pl) : 0;
1302 if (charset->nparents > 8)
1303 charset->nparents = 8;
1304 for (i = 0; i < charset->nparents; i++, pl = MPLIST_NEXT (pl))
1306 MSymbol parent_name;
1308 if (MPLIST_KEY (pl) != Msymbol)
1309 MERROR (MERROR_CHARSET, Mnil);
1310 parent_name = MPLIST_SYMBOL (pl);
1311 if (! (charset->parents[i] = MCHARSET (parent_name)))
1312 MERROR (MERROR_CHARSET, Mnil);
1315 charset->subset_offset = (int) mplist_get (plist, Msubset_offset);
/* The structure is attached to SYM before make_charset () is called.
   The value make_charset () returns replaces CHARSET and is what the
   canonicalized name and the aliases are bound to.  */
1317 msymbol_put (sym, Mcharset, charset);
1318 charset = make_charset (charset);
1321 msymbol_put (msymbol__canonicalize (sym), Mcharset, charset);
/* Bind each alias symbol in the Maliases list, and its canonicalized
   form, to the same charset.  The walk stops at the first element
   whose key is not Msymbol.  */
1323 for (pl = (MPlist *) mplist_get (plist, Maliases);
1324 pl && MPLIST_KEY (pl) == Msymbol;
1325 pl = MPLIST_NEXT (pl))
1327 MSymbol alias = MPLIST_SYMBOL (pl);
1329 msymbol_put (alias, Mcharset, charset);
1330 msymbol_put (msymbol__canonicalize (alias), Mcharset, charset);
/* A coding is registered for SYM only when Mdefine_coding is set and
   the charset is one-dimensional with the byte range 0..255.  */
1333 if (mplist_get (plist, Mdefine_coding)
1334 && charset->dimension == 1
1335 && charset->code_range[0] == 0 && charset->code_range[1] == 255)
1336 mconv__register_charset_coding (sym);
1343 @brief Resolve charset name.
1345 The mchar_resolve_charset () function returns $SYMBOL if it
1346 represents a charset. Otherwise, it canonicalizes $SYMBOL as a
1347 charset name and, if the canonicalized name represents a charset,
1348 returns the name of that charset. Otherwise, it returns #Mnil. */
1351 @brief 文字セット名を解決する.
1353 ´Ø¿ô mchar_resolve_charset () ¤Ï $SYMBOL
1354 ¤¬Ê¸»ú¥»¥Ã¥È¤ò¼¨¤·¤Æ¤¤¤ì¤Ð¤½¤ì¤òÊÖ¤¹¡£
1356 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢$SYMBOL ¤òʸ»ú¥»¥Ã¥È̾¤È¤·¤ÆÀµµ¬²½¤·¡¢¤½¤ì¤¬Ê¸»ú¥»¥Ã¥È¤ò¼¨¤·¤Æ¤¤¤Æ¤¤¤ì¤ÐÀµµ¬²½¤·¤¿¤â¤Î¤òÊÖ¤¹¡£
1357 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð¡¢#Mnil ¤òÊÖ¤¹¡£ */
/* Return the name of the charset attached to SYMBOL, or Mnil when no
   charset is found.  The charset is looked up through the Mcharset
   property of SYMBOL and, in the second lookup below, through the
   same property of the canonicalized symbol.  */
1360 mchar_resolve_charset (MSymbol symbol)
1362 MCharset *charset = (MCharset *) msymbol_get (symbol, Mcharset);
/* NOTE(review): the condition guarding this retry is not visible in
   this copy; presumably it runs only when the first lookup found
   nothing -- confirm.  */
1366 symbol = msymbol__canonicalize (symbol);
1367 charset = (MCharset *) msymbol_get (symbol, Mcharset);
1370 return (charset ? charset->name : Mnil);
1376 @brief List symbols representing charsets.
1378 The mchar_list_charset () function makes an array of symbols
1379 representing charsets, stores the pointer to the array in a place
1380 pointed to by $SYMBOLS, and returns the length of the array. */
1383 @brief 文字セットを表わすシンボルを列挙する.
1385 ´Ø¿ô mchar_list_charsets ()
1386 ¤Ï¡¢Ê¸»ú¥»¥Ã¥È¤ò¼¨¤¹¥·¥ó¥Ü¥ë¤òʤ٤¿ÇÛÎó¤òºî¤ê¡¢$SYMBOLS
1387 ¤Ç¥Ý¥¤¥ó¥È¤µ¤ì¤¿¾ì½ê¤Ë¤³¤ÎÇÛÎó¤Ø¤Î¥Ý¥¤¥ó¥¿¤òÃÖ¤¡¢ÇÛÎó¤ÎŤµ¤òÊÖ¤¹¡£ */
/* Store in *SYMBOLS a newly allocated array with room for
   charset_list.used symbols and fill it with the name of every
   charset in charset_list.charsets, in list order.  MTABLE_MALLOC ()
   is given MERROR_CHARSET as its error code.
   NOTE(review): the local declarations and the return statement are
   not visible in this copy, and no release of the array is shown
   here, so the caller presumably owns it -- confirm.  */
1390 mchar_list_charset (MSymbol **symbols)
1394 MTABLE_MALLOC ((*symbols), charset_list.used, MERROR_CHARSET);
1395 for (i = 0; i < charset_list.used; i++)
1396 (*symbols)[i] = charset_list.charsets[i]->name;
1403 @brief Decode a code-point.
1405 The mchar_decode () function decodes code-point $CODE in the
1406 charset represented by the symbol $CHARSET_NAME to get a character code.
1410 If decoding was successful, mchar_decode () returns the decoded
1411 character code. Otherwise it returns -1. */
1414 @brief コードポイントをデコードする.
1416 ´Ø¿ô mchar_decode () ¤Ï¡¢¥·¥ó¥Ü¥ë $CHARSET_NAME ¤Ç¼¨¤µ¤ì¤ëʸ»ú¥»¥Ã¥ÈÆâ¤Î
1417 $CODE ¤È¤¤¤¦¥³¡¼¥É¥Ý¥¤¥ó¥È¤ò¥Ç¥³¡¼¥É¤·¤Æʸ»ú¥³¡¼¥É¤òÆÀ¤ë¡£
1420 ¥Ç¥³¡¼¥É¤¬À®¸ù¤¹¤ì¤Ð¡¢mchar_decode () ¤Ï¥Ç¥³¡¼¥É¤µ¤ì¤¿Ê¸»ú¥³¡¼¥É¤òÊÖ¤¹¡£
1421 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð -1 ¤òÊÖ¤¹¡£ */
/* Decode code-point CODE in the charset named CHARSET_NAME by
   resolving the name with MCHARSET () and applying DECODE_CHAR ().
   NOTE(review): the condition guarding the MCHAR_INVALID_CODE return
   is not visible in this copy; presumably it is taken when
   MCHARSET () finds no charset -- confirm.  */
1428 mchar_decode (MSymbol charset_name, unsigned code)
1430 MCharset *charset = MCHARSET (charset_name);
1433 return MCHAR_INVALID_CODE;
1434 return DECODE_CHAR (charset, code);
1440 @brief Encode a character code.
1442 The mchar_encode () function encodes character code $C to get a
1443 code-point in the charset represented by the symbol $CHARSET_NAME.
1446 If encoding was successful, mchar_encode () returns the encoded
1447 code-point. Otherwise it returns #MCHAR_INVALID_CODE. */
1450 @brief 文字コードをエンコードする.
1452 ´Ø¿ô mchar_encode () ¤Ï¡¢Ê¸»ú¥³¡¼¥É $C ¤ò¥¨¥ó¥³¡¼¥É¤·¤Æ¥·¥ó¥Ü¥ë
1453 $CHARSET_NAME ¤Ç¼¨¤µ¤ì¤ëʸ»ú¥»¥Ã¥ÈÆâ¤Ë¤ª¤±¤ë¥³¡¼¥É¥Ý¥¤¥ó¥È¤òÆÀ¤ë¡£
1456 ¥¨¥ó¥³¡¼¥É¤¬À®¸ù¤¹¤ì¤Ð¡¢mchar_encode () ¤Ï¥¨¥ó¡¼¥É¤µ¤ì¤¿¥³¡¼¥É¥Ý¥¤¥ó¥È¤òÊÖ¤¹¡£
1457 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð #MCHAR_INVALID_CODE ¤òÊÖ¤¹¡£ */
/* Encode character code C into a code-point of the charset named
   CHARSET_NAME by resolving the name with MCHARSET () and applying
   ENCODE_CHAR ().
   NOTE(review): the condition guarding the MCHAR_INVALID_CODE return
   is not visible in this copy; presumably it is taken when
   MCHARSET () finds no charset -- confirm.  */
1464 mchar_encode (MSymbol charset_name, int c)
1466 MCharset *charset = MCHARSET (charset_name);
1469 return MCHAR_INVALID_CODE;
1470 return ENCODE_CHAR (charset, c);
1476 @brief Call a function for all the characters in a specified charset.
1478 The mchar_map_charset () function calls $FUNC for all the
1479 characters in the charset named $CHARSET_NAME. A call is done for
1480 a chunk of consecutive characters rather than character by character.
1483 $FUNC receives three arguments: $FROM, $TO, and $ARG. $FROM and
1484 $TO specify the range of character codes in $CHARSET_NAME. $ARG is the same as $FUNC_ARG.
1488 If the operation was successful, mchar_map_charset () returns 0.
1489 Otherwise, it returns -1 and assigns an error code to the external
1490 variable #merror_code. */
1493 @brief 指定した文字セットのすべての文字に対して関数を呼ぶ.
1495 ´Ø¿ô mcharset_map_chars () ¤Ï $CHARSET_NAME
1496 ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Äʸ»ú¥»¥Ã¥ÈÃæ¤Î¤¹¤Ù¤Æ¤Îʸ»ú¤ËÂФ·¤Æ $FUNC ¤ò¸Æ¤Ö¡£
1497 ¸Æ¤Ó½Ð¤·¤Ï°ìʸ»úËè¤Ç¤Ï¤Ê¤¯¡¢Ï¢Â³¤·¤¿Ê¸»ú¤Î¤Þ¤È¤Þ¤êñ°Ì¤Ç¹Ô¤Ê¤ï¤ì¤ë¡£
1499 ´Ø¿ô $FUNC ¤Ë¤Ï$FROM, $TO, $ARG ¤Î£³°ú¿ô¤¬ÅϤµ¤ì¤ë¡£$FROM ¤È $TO
1500 ¤Ï $CHARSET Ãæ¤Îʸ»ú¥³¡¼¥É¤ÎÈϰϤò»ØÄꤹ¤ë¡£$ARG ¤Ï $FUNC_ARG
1504 ½èÍý¤ËÀ®¸ù¤¹¤ì¤Ð mcharset_map_chars () ¤Ï 0 ¤òÊÖ¤¹¡£
1505 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð -1 ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£ */
1509 @c MERROR_CHARSET */
/* Call FUNC on ranges of character codes of the charset named
   CHARSET_NAME, passing func_arg as the last argument of each call.
   NOTE(review): the third parameter line, the local declarations, the
   guard of the MERROR call, the statements that advance C, and the
   end of the function are not visible in this copy.  The comments
   below describe only what is shown.  */
1512 mchar_map_charset (MSymbol charset_name,
1513 void (*func) (int from, int to, void *arg),
1518 charset = MCHARSET (charset_name);
/* Presumably reached when MCHARSET () finds no charset -- confirm.  */
1520 MERROR (MERROR_CHARSET, -1);
/* With an encoder table, scan from min_char up to max_char.  Each
   time mchartable__lookup () yields a non-negative value for C, FUNC
   is called for the range [C, next_c - 1], next_c being the value the
   lookup stored.  */
1522 if (charset->encoder)
1524 int c = charset->min_char;
1527 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) < 0)
1529 while (c <= charset->max_char)
1531 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) >= 0)
1532 (*func) (c, next_c - 1, func_arg);
/* One call covering the whole [min_char, max_char] range; presumably
   the branch taken when there is no encoder table (the `else' is not
   visible here) -- confirm.  */
1537 (*func) (charset->min_char, charset->max_char, func_arg);