1 /* coding.c -- code conversion module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25 @brief Coding system objects and API for them.
27 The m17n library represents a character encoding scheme (CES) of
28 coded character sets (CCS) as an object called @e coding @e
29 system. Application programs can add original coding systems.
31 To @e encode means converting code-points to character codes and
32 to @e decode means converting character codes back to code-points.
34 Application programs can decode a byte sequence with a specified
35 coding system into an M-text, and inversely, can encode an M-text
36 into a byte sequence. */
40 @brief コード系オブジェクトとそれに関する API
42 m17n ライブラリは、符号化文字集合 (coded character sets; CCS) の文
43 字符号化方式 (character encoding scheme; CES) を @e コード系 と呼
44 ぶオブジェクトで表現する。m17n ライブラリがサポートするCES は、
45 UTF-8, UTF-16, ISO-2022, DIRECT-CHARSET, その他、に大別される。ア
46 プリケーションプログラムが独自にコード系を追加することも可能である。
48 コードポイントから文字コードへの変換を @e エンコード と呼び、文字
49 コードからコードポイントへの変換を @e デコード と呼ぶ。
51 アプリケーションプログラムは、指定されたコード系でバイト列をデコー
52 ドすることで M-text を得ることができる。また逆に、指定されたコード
53 系で M-text をエンコードすることでバイト列を得ることができる。 */
57 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
58 /*** @addtogroup m17nInternal
66 #include <sys/types.h>
71 #include "m17n-misc.h"
74 #include "character.h"
81 #define NUM_SUPPORTED_CHARSETS 32
83 /** Structure for coding system object. */
87 /** Name of the coding system. */
90 /** Type of the coding system. */
93 /* Number of supported charsets. */
96 /** Array of supported charsets. */
97 MCharset *charsets[NUM_SUPPORTED_CHARSETS];
99 /** If non-NULL, function to call at the time of creating and
100 resetting a converter. */
101 int (*resetter) (MConverter *converter);
103 int (*decoder) (unsigned char *str, int str_bytes, MText *mt,
104 MConverter *converter);
106 int (*encoder) (MText *mt, int from, int to,
107 unsigned char *str, int str_bytes,
108 MConverter *converter);
110 /** If non-zero, the coding system decode/encode ASCII characters as
112 int ascii_compatible;
114 /** Pointer to extra information given when the coding system is
115 defined. The meaning depends on <type>. */
118 /** Pointer to information referred on conversion. The meaning
119 depends on <type>. The value NULL means that the coding system
129 MCodingSystem **codings;
132 static struct MCodingList coding_list;
134 static MPlist *coding_definition_list;
138 Pointer to a structure of a coding system. */
140 コード系を表わすデータ構造へのポインタ */
141 MCodingSystem *coding;
144 Buffer for carryover bytes generated while decoding. */
146 デコード中のキャリィオーバーバイト用バッファ */
147 unsigned char carryover[256];
150 Number of carryover bytes. */
152 キャリィオーバーバイト数 */
156 Beginning of the byte sequence bound to this converter. */
158 このコンバータに結び付けられたバイト列の先頭位置 */
168 Number of bytes already consumed in buf. */
170 buf 内ですでに消費されたバイト数 */
174 Stream bound to this converter. */
176 このコンバータに結び付けられたストリーム */
180 Which of the above two is in use. */
182 上記2者のいずれが使われているか */
202 /* Local macros and functions. */
204 /** At first, set SRC_BASE to SRC. Then check if we have already
205 produced AT_MOST chars. If so, set SRC_END to SRC, and jump to
206 source_end. Otherwise, get one more byte C from SRC. In that
207 case, if SRC == SRC_END, jump to the label source_end. */
209 #define ONE_MORE_BASE_BYTE(c) \
212 if (nchars == at_most) \
217 if (src == src_stop) \
219 if (src == src_end) \
221 src_base = src = source; \
222 if (src == src_end) \
224 src_stop = src_end; \
230 /** Get one more byte C from SRC. If SRC == SRC_END, jump to the
233 #define ONE_MORE_BYTE(c) \
235 if (src == src_stop) \
237 if (src == src_end) \
240 if (src == src_end) \
242 src_stop = src_end; \
248 #define REWIND_SRC_TO_BASE() \
250 if (src_base < source || src_base >= src_end) \
251 src_stop = internal->carryover + internal->carryover_bytes; \
256 /** Push back byte C to SRC. */
258 #define UNGET_ONE_BYTE(c) \
264 internal->carryover[0] = c; \
265 internal->carryover_bytes = 1; \
266 src = internal->carryover; \
267 src_stop = src + 1; \
272 /** Store multibyte representation of character C at DST and increment
273 DST to the next of the produced bytes. DST must be a pointer to
274 data area of M-text MT. If the produced bytes are going to exceed
275 DST_END, enlarge the data area of MT. */
277 #define EMIT_CHAR(c) \
279 int bytes = CHAR_BYTES (c); \
282 if (dst + bytes + 1 > dst_end) \
284 len = dst - mt->data; \
285 bytes = mt->allocated + bytes + (src_stop - src); \
286 mtext__enlarge (mt, bytes); \
287 dst = mt->data + len; \
288 dst_end = mt->data + mt->allocated; \
290 dst += CHAR_STRING (c, dst); \
295 /* Check if there is enough room to produce LEN bytes at DST. If not,
296 go to the label insufficient_destination. */
298 #define CHECK_DST(len) \
300 if (dst + (len) > dst_end) \
301 goto insufficient_destination; \
305 /** Take NUM_CHARS characters (NUM_BYTES bytes) already stored at
306 (MT->data + MT->nbytes) into MT, and put charset property on
307 them with CHARSET->name. */
309 #define TAKEIN_CHARS(mt, num_chars, num_bytes, charset) \
311 int chars = (num_chars); \
315 mtext__takein ((mt), chars, (num_bytes)); \
317 mtext_put_prop ((mt), (mt)->nchars - chars, (mt)->nchars, \
318 Mcharset, (void *) ((charset)->name)); \
323 #define SET_SRC(mt, format, from, to) \
325 if (format <= MTEXT_FORMAT_UTF_8) \
327 src = mt->data + POS_CHAR_TO_BYTE (mt, from); \
328 src_end = mt->data + POS_CHAR_TO_BYTE (mt, to); \
330 else if (format <= MTEXT_FORMAT_UTF_16BE) \
333 = mt->data + (sizeof (short)) * POS_CHAR_TO_BYTE (mt, from); \
335 = mt->data + (sizeof (short)) * POS_CHAR_TO_BYTE (mt, to); \
339 src = mt->data + (sizeof (int)) * from; \
340 src_end = mt->data + (sizeof (int)) * to; \
345 #define ONE_MORE_CHAR(c, bytes, format) \
347 if (src == src_end) \
349 if (format <= MTEXT_FORMAT_UTF_8) \
350 c = STRING_CHAR_AND_BYTES (src, bytes); \
351 else if (format <= MTEXT_FORMAT_UTF_16BE) \
353 c = mtext_ref_char (mt, from++); \
354 bytes = (sizeof (short)) * CHAR_UNITS_UTF16 (c); \
358 c = ((unsigned *) (mt->data))[from++]; \
359 bytes = sizeof (int); \
365 encode_unsupporeted_char (int c, unsigned char *dst, unsigned char *dst_end,
371 len = c < 0x10000 ? 8 : 10;
372 if (dst + len > dst_end)
375 mtext_put_prop (mt, pos, pos + 1, Mcoding, Mnil);
376 format = (c < 0xD800 ? "<U+%04X>"
377 : c < 0xE000 ? "<M+%04X>"
378 : c < 0x10000 ? "<U+%04X>"
379 : c < 0x110000 ? "<U+%06X>"
381 sprintf ((char *) dst, format, c);
387 /** Finish decoding of bytes at SOURCE (ending at SRC_END) into NCHARS
388 characters by CONVERTER into M-text MT. SRC is a pointer to the
389 not-yet processed bytes. ERROR is 1 iff an invalid byte was
393 finish_decoding (MText *mt, MConverter *converter, int nchars,
394 unsigned char *source, unsigned char *src_end,
398 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
401 internal->carryover_bytes = 0;
403 || (converter->last_block
404 && ! converter->lenient))
405 converter->result = MCONVERSION_RESULT_INVALID_BYTE;
406 else if (! converter->last_block)
408 unsigned char *dst = internal->carryover;
410 if (src < source || src > src_end)
412 dst += internal->carryover_bytes;
415 while (src < src_end)
417 internal->carryover_bytes = dst - internal->carryover;
418 converter->result = MCONVERSION_RESULT_INSUFFICIENT_SRC;
422 unsigned char *dst = mt->data + mt->nbytes;
423 unsigned char *dst_end = mt->data + mt->allocated;
424 unsigned char *src_stop = src_end;
426 int last_nchars = nchars;
428 if (src < source || src > src_end)
429 src_stop = internal->carryover + internal->carryover_bytes;
432 if (converter->at_most && nchars == converter->at_most)
446 TAKEIN_CHARS (mt, nchars - last_nchars, dst - (mt->data + mt->nbytes),
448 internal->carryover_bytes = 0;
451 converter->nchars += nchars;
452 converter->nbytes += ((src < source || src > src_end) ? 0 : src - source);
453 return (converter->result == MCONVERSION_RESULT_INVALID_BYTE ? -1 : 0);
458 /* Stuff for coding-systems of type MCODING_TYPE_CHARSET. */
461 setup_coding_charset (MCodingSystem *coding)
463 int ncharsets = coding->ncharsets;
464 unsigned *code_charset_table;
468 /* At first, reorder charset list by dimensions (a charset of
469 smaller dimension comes first). As the number of charsets is
470 usually very small (at most 32), we do a simple sort. */
475 MTABLE_ALLOCA (charsets, NUM_SUPPORTED_CHARSETS, MERROR_CODING);
476 memcpy (charsets, coding->charsets,
477 sizeof (MCharset *) * NUM_SUPPORTED_CHARSETS);
478 for (i = 0; i < 4; i++)
479 for (j = 0; j < ncharsets; j++)
480 if (charsets[j]->dimension == i)
481 coding->charsets[idx++] = charsets[j];
484 MTABLE_CALLOC (code_charset_table, 256, MERROR_CODING);
487 int dim = coding->charsets[ncharsets]->dimension;
488 int from = coding->charsets[ncharsets]->code_range[(dim - 1) * 4];
489 int to = coding->charsets[ncharsets]->code_range[(dim - 1) * 4 + 1];
491 if (coding->charsets[ncharsets]->ascii_compatible)
492 coding->ascii_compatible = 1;
494 code_charset_table[from++] |= 1 << ncharsets;
497 coding->extra_spec = (void *) code_charset_table;
502 reset_coding_charset (MConverter *converter)
504 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
505 MCodingSystem *coding = internal->coding;
508 && setup_coding_charset (coding) < 0)
515 decode_coding_charset (unsigned char *source, int src_bytes, MText *mt,
516 MConverter *converter)
518 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
519 MCodingSystem *coding = internal->coding;
520 unsigned char *src = internal->carryover;
521 unsigned char *src_stop = src + internal->carryover_bytes;
522 unsigned char *src_end = source + src_bytes;
523 unsigned char *src_base;
524 unsigned char *dst = mt->data + mt->nbytes;
525 unsigned char *dst_end = mt->data + mt->allocated;
528 int at_most = converter->at_most > 0 ? converter->at_most : -1;
530 unsigned *code_charset_table = (unsigned *) coding->extra_spec;
531 MCharset **charsets = coding->charsets;
532 MCharset *charset = mcharset__ascii;
537 MCharset *this_charset = NULL;
541 ONE_MORE_BASE_BYTE (c);
542 mask = code_charset_table[c];
552 while (! (mask & 1)) mask >>= 1, idx++;
553 this_charset = charsets[idx];
554 dim = this_charset->dimension;
558 code = (code << 8) | c;
561 c = DECODE_CHAR (this_charset, code);
568 if (! converter->lenient)
570 REWIND_SRC_TO_BASE ();
572 this_charset = mcharset__binary;
575 if (this_charset != mcharset__ascii
576 && this_charset != charset)
578 TAKEIN_CHARS (mt, nchars - last_nchars,
579 dst - (mt->data + mt->nbytes), charset);
580 charset = this_charset;
581 last_nchars = nchars;
585 /* We reach here because of an invalid byte. */
589 TAKEIN_CHARS (mt, nchars - last_nchars,
590 dst - (mt->data + mt->nbytes), charset);
591 return finish_decoding (mt, converter, nchars,
592 source, src_end, src_base, error);
596 encode_coding_charset (MText *mt, int from, int to,
597 unsigned char *destination, int dst_bytes,
598 MConverter *converter)
600 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
601 MCodingSystem *coding = internal->coding;
602 unsigned char *src, *src_end;
603 unsigned char *dst = destination;
604 unsigned char *dst_end = dst + dst_bytes;
606 int ncharsets = coding->ncharsets;
607 MCharset **charsets = coding->charsets;
608 int ascii_compatible = coding->ascii_compatible;
609 enum MTextFormat format = mt->format;
611 SET_SRC (mt, format, from, to);
616 ONE_MORE_CHAR (c, bytes, format);
618 if (c < 0x80 && ascii_compatible)
626 MCharset *charset = NULL;
631 charset = charsets[i];
632 code = ENCODE_CHAR (charset, c);
633 if (code != MCHAR_INVALID_CODE)
635 if (++i == ncharsets)
636 goto unsupported_char;
639 CHECK_DST (charset->dimension);
640 if (charset->dimension == 1)
644 else if (charset->dimension == 2)
647 *dst++ = code & 0xFF;
649 else if (charset->dimension == 3)
652 *dst++ = (code >> 8) & 0xFF;
653 *dst++ = code & 0xFF;
658 *dst++ = (code >> 16) & 0xFF;
659 *dst++ = (code >> 8) & 0xFF;
660 *dst++ = code & 0xFF;
671 if (! converter->lenient)
673 len = encode_unsupporeted_char (c, dst, dst_end, mt, from + nchars);
675 goto insufficient_destination;
681 /* We reach here because of an unsupported char. */
682 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
685 insufficient_destination:
686 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
689 converter->nchars += nchars;
690 converter->nbytes += dst - destination;
691 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
695 /* Stuff for coding-systems of type MCODING_TYPE_UTF (8). */
/** Classify the multibyte sequence starting at P by inspecting its
    leading bytes, and yield one of mcharset__unicode, mcharset__m17n
    or mcharset__binary.

    The head byte P[0] is tested bit by bit (0x80, 0x20, 0x10, 0x08,
    0x04, 0x02) to find the sequence length.  Before each further
    length is considered, the next expected trailing byte is checked
    with CHAR_HEAD_P (defined elsewhere; presumably true when the byte
    starts a new character -- confirm against its definition); if it
    is a head, the sequence is cut short and the result is
    mcharset__binary.

      - 1, 2 or 3 bytes:  mcharset__unicode.
      - 4 bytes:          mcharset__unicode when the plane number
			  (bits 16..20 of the character code) is at
			  most 0x10, i.e. the code is below 0x110000;
			  otherwise mcharset__m17n.
      - 5 or 6 bytes:     mcharset__m17n.
      - anything else:    mcharset__binary.

    For the 4-byte form the plane number is assembled from the low
    three bits of P[0] (shifted left by 2) and bits 0x30 of P[1]
    (shifted right by 4).  These two fields occupy disjoint bit
    positions, so they must be merged with bitwise OR; merging them
    with bitwise AND always yields 0, which would classify every
    4-byte sequence as Unicode.

    P is evaluated several times and must have no side effects.  */

#define UTF8_CHARSET(p)							\
  (! ((p)[0] & 0x80) ? (mcharset__unicode)				\
   : CHAR_HEAD_P ((p) + 1) ? (mcharset__binary)				\
   : ! ((p)[0] & 0x20) ? (mcharset__unicode)				\
   : CHAR_HEAD_P ((p) + 2) ? (mcharset__binary)				\
   : ! ((p)[0] & 0x10) ? (mcharset__unicode)				\
   : CHAR_HEAD_P ((p) + 3) ? (mcharset__binary)				\
   : ! ((p)[0] & 0x08) ? ((((((p)[0] & 0x07) << 2)			\
			    | (((p)[1] & 0x30) >> 4)) <= 0x10)		\
			  ? (mcharset__unicode)				\
			  : (mcharset__m17n))				\
   : CHAR_HEAD_P ((p) + 4) ? (mcharset__binary)				\
   : ! ((p)[0] & 0x04) ? (mcharset__m17n)				\
   : CHAR_HEAD_P ((p) + 5) ? (mcharset__binary)				\
   : ! ((p)[0] & 0x02) ? (mcharset__m17n)				\
   : (mcharset__binary))
716 decode_coding_utf_8 (unsigned char *source, int src_bytes, MText *mt,
717 MConverter *converter)
719 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
720 MCodingSystem *coding = internal->coding;
721 unsigned char *src = internal->carryover;
722 unsigned char *src_stop = src + internal->carryover_bytes;
723 unsigned char *src_end = source + src_bytes;
724 unsigned char *src_base;
725 unsigned char *dst = mt->data + mt->nbytes;
726 unsigned char *dst_end = mt->data + mt->allocated;
729 int at_most = converter->at_most > 0 ? converter->at_most : -1;
731 int full = converter->lenient || (coding->charsets[0] == mcharset__m17n);
732 MCharset *charset = NULL;
737 MCharset *this_charset = NULL;
739 ONE_MORE_BASE_BYTE (c);
743 else if (!(c & 0x40))
745 else if (!(c & 0x20))
746 bytes = 2, c &= 0x1F;
747 else if (!(c & 0x10))
748 bytes = 3, c &= 0x0F;
749 else if (!(c & 0x08))
750 bytes = 4, c &= 0x07;
751 else if (!(c & 0x04))
752 bytes = 5, c &= 0x03;
753 else if (!(c & 0x02))
754 bytes = 6, c &= 0x01;
761 if ((c1 & 0xC0) != 0x80)
763 c = (c << 6) | (c1 & 0x3F);
767 || c < 0xD800 || (c >= 0xE000 && c < 0x110000))
771 if (! converter->lenient)
773 REWIND_SRC_TO_BASE ();
775 this_charset = mcharset__binary;
778 if (this_charset != charset)
780 TAKEIN_CHARS (mt, nchars - last_nchars,
781 dst - (mt->data + mt->nbytes), charset);
782 charset = this_charset;
783 last_nchars = nchars;
787 /* We reach here because of an invalid byte. */
791 TAKEIN_CHARS (mt, nchars - last_nchars,
792 dst - (mt->data + mt->nbytes), charset);
793 return finish_decoding (mt, converter, nchars,
794 source, src_end, src_base, error);
798 encode_coding_utf_8 (MText *mt, int from, int to,
799 unsigned char *destination, int dst_bytes,
800 MConverter *converter)
802 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
803 MCodingSystem *coding = internal->coding;
804 unsigned char *src, *src_end;
805 unsigned char *dst = destination;
806 unsigned char *dst_end = dst + dst_bytes;
808 enum MTextFormat format = mt->format;
810 SET_SRC (mt, format, from, to);
812 if (format <= MTEXT_FORMAT_UTF_8
813 && (converter->lenient
814 || coding->charsets[0] == mcharset__m17n))
816 if (dst_bytes < src_end - src)
818 int byte_pos = (src + dst_bytes) - mt->data;
820 to = POS_BYTE_TO_CHAR (mt, byte_pos);
821 byte_pos = POS_CHAR_TO_BYTE (mt, to);
822 src_end = mt->data + byte_pos;
823 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
825 memcpy (destination, src, src_end - src);
827 dst += src_end - src;
835 ONE_MORE_CHAR (c, bytes, format);
837 if ((c >= 0xD800 && c < 0xE000) || c >= 0x110000)
840 dst += CHAR_STRING (c, dst);
844 /* We reach here because of an unsupported char. */
845 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
848 insufficient_destination:
849 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
852 converter->nchars += nchars;
853 converter->nbytes += dst - destination;
854 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
858 /* Stuff for coding-systems of type MCODING_TYPE_UTF (16 & 32). */
879 enum utf_endian endian;
883 setup_coding_utf (MCodingSystem *coding)
885 MCodingInfoUTF *info = (MCodingInfoUTF *) (coding->extra_info);
886 MCodingInfoUTF *spec;
888 if (info->code_unit_bits == 8)
889 coding->ascii_compatible = 1;
890 else if (info->code_unit_bits == 16
891 || info->code_unit_bits == 32)
893 if (info->bom < 0 || info->bom > 2
894 || info->endian < 0 || info->endian > 1)
895 MERROR (MERROR_CODING, -1);
900 MSTRUCT_CALLOC (spec, MERROR_CODING);
902 coding->extra_spec = (void *) (spec);
907 reset_coding_utf (MConverter *converter)
909 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
910 MCodingSystem *coding = internal->coding;
911 struct utf_status *status = (struct utf_status *) &(converter->status);
914 && setup_coding_utf (coding) < 0)
918 status->surrogate = 0;
919 status->bom = ((MCodingInfoUTF *) (coding->extra_spec))->bom;
920 status->endian = ((MCodingInfoUTF *) (coding->extra_spec))->endian;
925 decode_coding_utf_16 (unsigned char *source, int src_bytes, MText *mt,
926 MConverter *converter)
928 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
929 unsigned char *src = internal->carryover;
930 unsigned char *src_stop = src + internal->carryover_bytes;
931 unsigned char *src_end = source + src_bytes;
932 unsigned char *src_base;
933 unsigned char *dst = mt->data + mt->nbytes;
934 unsigned char *dst_end = mt->data + mt->allocated;
937 int at_most = converter->at_most > 0 ? converter->at_most : -1;
938 struct utf_status *status = (struct utf_status *) &(converter->status);
939 unsigned char b1, b2;
940 MCharset *charset = NULL;
943 if (status->bom != UTF_BOM_NO)
947 ONE_MORE_BASE_BYTE (b1);
951 status->endian = UTF_BIG_ENDIAN;
952 else if (c == 0xFFFE)
953 status->endian = UTF_LITTLE_ENDIAN;
954 else if (status->bom == UTF_BOM_MAYBE
955 || converter->lenient)
957 status->endian = UTF_BIG_ENDIAN;
958 REWIND_SRC_TO_BASE ();
965 status->bom = UTF_BOM_NO;
971 MCharset *this_charset = NULL;
973 ONE_MORE_BASE_BYTE (b1);
975 if (status->endian == UTF_BIG_ENDIAN)
976 c = ((b1 << 8) | b2);
978 c = ((b2 << 8) | b1);
979 if (c < 0xD800 || c >= 0xE000)
985 if (status->endian == UTF_BIG_ENDIAN)
986 c1 = ((b1 << 8) | b2);
988 c1 = ((b2 << 8) | b1);
989 if (c1 < 0xDC00 || c1 >= 0xE000)
991 c = 0x10000 + ((c - 0xD800) << 10) + (c1 - 0xDC00);
996 if (! converter->lenient)
998 REWIND_SRC_TO_BASE ();
1001 if (status->endian == UTF_BIG_ENDIAN)
1002 c = ((b1 << 8) | b2);
1004 c = ((b2 << 8) | b1);
1005 this_charset = mcharset__binary;
1008 if (this_charset != charset)
1010 TAKEIN_CHARS (mt, nchars - last_nchars,
1011 dst - (mt->data + mt->nbytes), charset);
1012 charset = this_charset;
1013 last_nchars = nchars;
1017 /* We reach here because of an invalid byte. */
1021 TAKEIN_CHARS (mt, nchars - last_nchars,
1022 dst - (mt->data + mt->nbytes), charset);
1023 return finish_decoding (mt, converter, nchars,
1024 source, src_end, src_base, error);
1029 decode_coding_utf_32 (unsigned char *source, int src_bytes, MText *mt,
1030 MConverter *converter)
1032 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1033 unsigned char *src = internal->carryover;
1034 unsigned char *src_stop = src + internal->carryover_bytes;
1035 unsigned char *src_end = source + src_bytes;
1036 unsigned char *src_base;
1037 unsigned char *dst = mt->data + mt->nbytes;
1038 unsigned char *dst_end = mt->data + mt->allocated;
1040 int last_nchars = 0;
1041 int at_most = converter->at_most > 0 ? converter->at_most : -1;
1042 struct utf_status *status = (struct utf_status *) &(converter->status);
1043 unsigned char b1, b2, b3, b4;
1044 MCharset *charset = NULL;
1047 if (status->bom != UTF_BOM_NO)
1051 ONE_MORE_BASE_BYTE (b1);
1055 c = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
1056 if (c == 0x0000FEFF)
1057 status->endian = UTF_BIG_ENDIAN;
1058 else if (c == 0xFFFE0000)
1059 status->endian = UTF_LITTLE_ENDIAN;
1060 else if (status->bom == UTF_BOM_MAYBE
1061 || converter->lenient)
1063 status->endian = UTF_BIG_ENDIAN;
1064 REWIND_SRC_TO_BASE ();
1071 status->bom = UTF_BOM_NO;
1077 MCharset *this_charset = NULL;
1079 ONE_MORE_BASE_BYTE (b1);
1083 if (status->endian == UTF_BIG_ENDIAN)
1084 c = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
1086 c = (b4 << 24) | (b3 << 16) | (b2 << 8) | b1;
1087 if (c < 0xD800 || (c >= 0xE000 && c < 0x110000))
1090 if (! converter->lenient)
1092 REWIND_SRC_TO_BASE ();
1094 this_charset = mcharset__binary;
1097 if (this_charset != charset)
1099 TAKEIN_CHARS (mt, nchars - last_nchars,
1100 dst - (mt->data + mt->nbytes), charset);
1101 charset = this_charset;
1102 last_nchars = nchars;
1106 /* We reach here because of an invalid byte. */
1110 TAKEIN_CHARS (mt, nchars - last_nchars,
1111 dst - (mt->data + mt->nbytes), charset);
1112 return finish_decoding (mt, converter, nchars,
1113 source, src_end, src_base, error);
1118 encode_coding_utf_16 (MText *mt, int from, int to,
1119 unsigned char *destination, int dst_bytes,
1120 MConverter *converter)
1122 unsigned char *src, *src_end;
1123 unsigned char *dst = destination;
1124 unsigned char *dst_end = dst + dst_bytes;
1126 struct utf_status *status = (struct utf_status *) &(converter->status);
1127 int big_endian = status->endian == UTF_BIG_ENDIAN;
1128 enum MTextFormat format = mt->format;
1130 SET_SRC (mt, format, from, to);
1132 if (status->bom != UTF_BOM_NO)
1136 *dst++ = 0xFE, *dst++ = 0xFF;
1138 *dst++ = 0xFF, *dst++ = 0xFE;
1139 status->bom = UTF_BOM_NO;
1146 ONE_MORE_CHAR (c, bytes, format);
1148 if (c < 0xD800 || (c >= 0xE000 && c < 0x10000))
1152 *dst++ = c >> 8, *dst++ = c & 0xFF;
1154 *dst++ = c & 0xFF, *dst++ = c >> 8;
1156 else if (c >= 0x10000 && c < 0x110000)
1162 c1 = (c >> 10) + 0xD800;
1163 c2 = (c & 0x3FF) + 0xDC00;
1165 *dst++ = c1 >> 8, *dst++ = c1 & 0xFF,
1166 *dst++ = c2 >> 8, *dst++ = c2 & 0xFF;
1168 *dst++ = c1 & 0xFF, *dst++ = c1 >> 8,
1169 *dst++ = c2 & 0xFF, *dst++ = c2 >> 8;
1173 unsigned char buf[11];
1176 if (! converter->lenient)
1178 len = encode_unsupporeted_char (c, buf, buf + (dst_end - dst),
1181 goto insufficient_destination;
1183 for (i = 0; i < len; i++)
1184 *dst++ = 0, *dst++ = buf[i];
1186 for (i = 0; i < len; i++)
1187 *dst++ = buf[i], *dst++ = 0;
1192 /* We reach here because of an unsupported char. */
1193 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
1196 insufficient_destination:
1197 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
1200 converter->nchars += nchars;
1201 converter->nbytes += dst - destination;
1202 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
1206 encode_coding_utf_32 (MText *mt, int from, int to,
1207 unsigned char *destination, int dst_bytes,
1208 MConverter *converter)
1210 unsigned char *src, *src_end;
1211 unsigned char *dst = destination;
1212 unsigned char *dst_end = dst + dst_bytes;
1214 struct utf_status *status = (struct utf_status *) &(converter->status);
1215 int big_endian = status->endian == UTF_BIG_ENDIAN;
1216 enum MTextFormat format = mt->format;
1218 SET_SRC (mt, format, from, to);
1220 if (status->bom != UTF_BOM_NO)
1224 *dst++ = 0x00, *dst++ = 0x00, *dst++ = 0xFE, *dst++ = 0xFF;
1226 *dst++ = 0xFF, *dst++ = 0xFE, *dst++ = 0x00, *dst++ = 0x00;
1227 status->bom = UTF_BOM_NO;
1234 ONE_MORE_CHAR (c, bytes, format);
1236 if (c < 0xD800 || (c >= 0xE000 && c < 0x110000))
1240 *dst++ = 0x00, *dst++ = c >> 16,
1241 *dst++ = (c >> 8) & 0xFF, *dst++ = c & 0xFF;
1243 *dst++ = c & 0xFF, *dst++ = (c >> 8) & 0xFF,
1244 *dst++ = c >> 16, *dst++ = 0x00;
1248 unsigned char buf[11];
1251 if (! converter->lenient)
1253 len = encode_unsupporeted_char (c, buf, buf + (dst_end - dst),
1256 goto insufficient_destination;
1258 for (i = 0; i < len; i++)
1259 *dst++ = 0, *dst++ = buf[i];
1261 for (i = 0; i < len; i++)
1262 *dst++ = buf[i], *dst++ = 0;
1267 /* We reach here because of an unsupported char. */
1268 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
1271 insufficient_destination:
1272 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
1275 converter->nchars += nchars;
1276 converter->nbytes += dst - destination;
1277 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
1281 /* Stuff for coding-systems of type MCODING_TYPE_ISO_2022. */
/* Byte values of the control codes referenced by the ISO-2022
   coding-system code.  The 0x00..0x1F values are 7-bit controls;
   0x8E and 0x8F are the 8-bit single-shift codes. */
1283 #define ISO_CODE_STX 0x02 /* start text */
1284 #define ISO_CODE_SO 0x0E /* shift-out */
1285 #define ISO_CODE_SI 0x0F /* shift-in */
1286 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
1287 #define ISO_CODE_ESC 0x1B /* escape */
1288 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
1289 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
1291 /** Structure pointed by MCodingSystem.extra_spec. */
1293 struct iso_2022_spec
1297 /** Initial graphic registers (0..3) invoked to each graphic
1298 plane left and right. */
1299 int initial_invocation[2];
1301 /** Initially designated charsets for each graphic register. */
1302 MCharset *initial_designation[4];
1310 struct iso_2022_status
1313 MCharset *designation[4];
1314 unsigned single_shifting : 1;
1317 unsigned utf8_shifting : 1;
1318 MCharset *non_standard_charset;
1319 int non_standard_charset_bytes;
1320 int non_standard_encoding;
/* Classes into which byte values are sorted for ISO-2022 processing.
   The array declared together with the enum has 256 entries;
   presumably it is indexed by byte value and filled in by
   initialization code outside this definition -- TODO confirm. */
1323 enum iso_2022_code_class {
1324 ISO_control_0, /* Control codes in the range
1325 0x00..0x1F and 0x7F, except for the
1326 following 4 codes. */
1327 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
1328 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
1329 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
1330 ISO_escape, /* ISO_CODE_ESC (0x1B) */
1331 ISO_control_1, /* Control codes in the range
1332 0x80..0x9F, except for the
1333 following 3 codes. */
1334 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
1335 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
1336 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
1337 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
1338 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
1339 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
1340 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
1341 } iso_2022_code_class[256];
/* Bitwise OR of the four designation flags (G0, G1, CTEXT and
   CTEXT_EXT).  ANDing a flags word with this mask leaves only the
   designation bits; a zero result means none of them is set. */
1344 #define MCODING_ISO_DESIGNATION_MASK \
1345 (MCODING_ISO_DESIGNATION_G0 \
1346 | MCODING_ISO_DESIGNATION_G1 \
1347 | MCODING_ISO_DESIGNATION_CTEXT \
1348 | MCODING_ISO_DESIGNATION_CTEXT_EXT)
1351 setup_coding_iso_2022 (MCodingSystem *coding)
1353 MCodingInfoISO2022 *info = (MCodingInfoISO2022 *) (coding->extra_info);
1354 int ncharsets = coding->ncharsets;
1355 struct iso_2022_spec *spec;
1356 int designation_policy = info->flags & MCODING_ISO_DESIGNATION_MASK;
1359 coding->ascii_compatible = 0;
1361 MSTRUCT_CALLOC (spec, MERROR_CODING);
1363 spec->flags = info->flags;
1364 spec->initial_invocation[0] = info->initial_invocation[0];
1365 spec->initial_invocation[1] = info->initial_invocation[1];
1366 for (i = 0; i < 4; i++)
1367 spec->initial_designation[i] = NULL;
1368 if (designation_policy)
1370 spec->n_designations = ncharsets;
1371 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1372 spec->n_designations += mcharset__iso_2022_table.used;
1373 MTABLE_CALLOC (spec->designations, spec->n_designations, MERROR_CODING);
1374 for (i = 0; i < spec->n_designations; i++)
1375 spec->designations[i] = -1;
1379 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1380 MERROR (MERROR_CODING, -1);
1381 spec->designations = NULL;
1384 for (i = 0; i < ncharsets; i++)
1386 int reg = info->designations[i];
1389 && coding->charsets[i]->final_byte > 0
1390 && (reg < -4 || reg > 3))
1391 MERROR (MERROR_CODING, -1);
1394 if (spec->initial_designation[reg])
1395 MERROR (MERROR_CODING, -1);
1396 spec->initial_designation[reg] = coding->charsets[i];
1400 if (! designation_policy
1401 && ! (spec->flags & MCODING_ISO_EUC_TW_SHIFT))
1402 MERROR (MERROR_CODING, -1);
1406 if (designation_policy)
1407 spec->designations[i] = reg;
1408 if (coding->charsets[i] == mcharset__ascii)
1409 coding->ascii_compatible = 1;
1412 if (coding->ascii_compatible
1413 && (spec->flags & (MCODING_ISO_DESIGNATION_G0
1414 | MCODING_ISO_DESIGNATION_CTEXT
1415 | MCODING_ISO_DESIGNATION_CTEXT_EXT
1416 | MCODING_ISO_LOCKING_SHIFT)))
1417 coding->ascii_compatible = 0;
1419 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1420 for (i = 0; i < mcharset__iso_2022_table.used; i++)
1422 MCharset *charset = mcharset__iso_2022_table.charsets[i];
1424 spec->designations[ncharsets + i]
1425 = ((designation_policy == MCODING_ISO_DESIGNATION_CTEXT
1426 || designation_policy == MCODING_ISO_DESIGNATION_CTEXT_EXT)
1427 ? (charset->code_range[0] == 32
1428 || charset->code_range[1] == 255)
1429 : designation_policy == MCODING_ISO_DESIGNATION_G1);
1432 spec->use_esc = ((spec->flags & MCODING_ISO_DESIGNATION_MASK)
1433 || ((spec->flags & MCODING_ISO_LOCKING_SHIFT)
1434 && (spec->initial_designation[2]
1435 || spec->initial_designation[3]))
1436 || (! (spec->flags & MCODING_ISO_EIGHT_BIT)
1437 && (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1438 || (spec->flags & MCODING_ISO_ISO6429));
1440 coding->extra_spec = (void *) spec;
1446 reset_coding_iso_2022 (MConverter *converter)
1448 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1449 MCodingSystem *coding = internal->coding;
1450 struct iso_2022_status *status
1451 = (struct iso_2022_status *) &(converter->status);
1452 struct iso_2022_spec *spec;
1456 && setup_coding_iso_2022 (coding) < 0)
1460 spec = (struct iso_2022_spec *) coding->extra_spec;
1461 status->invocation[0] = spec->initial_invocation[0];
1462 status->invocation[1] = spec->initial_invocation[1];
1463 for (i = 0; i < 4; i++)
1464 status->designation[i] = spec->initial_designation[i];
1465 status->single_shifting = 0;
/* Decoder helper: designate the charset identified by dimension DIM,
   size CHARS (94 or 96), final byte FINAL and revision REV to graphic
   register REG, i.e. store it in STATUS->designation[REG].

   A FINAL below '0' or at/above 128 jumps to `invalid_byte'.  One
   path looks the charset up with MCHARSET_ISO_2022 and, unless
   MCODING_ISO_FULL_SUPPORT is set in SPEC->flags, requires it to be
   one of CODING->charsets.  Another path scans
   mcharset__iso_2022_table for an entry matching REV, DIM, FINAL and
   CHARS (a 96-charset also matches when code_range[1] is 255).
   Either path jumps to `invalid_byte' when nothing is found.

   The expansion uses `spec', `coding', `status', `i' and the label
   `invalid_byte' from the enclosing function.

   NOTE(review): the lines choosing between the two lookup paths are
   missing from this listing; presumably the choice depends on REV --
   confirm against the full source.  */
1472 #define ISO2022_DECODE_DESIGNATION(reg, dim, chars, final, rev) \
1474 MCharset *charset; \
1476 if ((final) < '0' || (final) >= 128) \
1477 goto invalid_byte; \
1480 charset = MCHARSET_ISO_2022 ((dim), (chars), (final)); \
1481 if (! (spec->flags & MCODING_ISO_FULL_SUPPORT)) \
1485 for (i = 0; i < coding->ncharsets; i++) \
1486 if (charset == coding->charsets[i]) \
1488 if (i == coding->ncharsets) \
1489 goto invalid_byte; \
1496 for (i = 0; i < mcharset__iso_2022_table.used; i++) \
1498 charset = mcharset__iso_2022_table.charsets[i]; \
1499 if (charset->revision == (rev) \
1500 && charset->dimension == (dim) \
1501 && charset->final_byte == (final) \
1502 && (charset->code_range[1] == (chars) \
1503 || ((chars) == 96 && charset->code_range[1] == 255))) \
1506 if (i == mcharset__iso_2022_table.used) \
1507 goto invalid_byte; \
1509 status->designation[reg] = charset; \
/* Map CHARSET_NAME, the charset name carried by a compound-text
   extended segment, to a charset object: "koi8-r" maps to the charset
   named koi8-r and "big5-0" to the charset named big5.  The name is
   compared with strcmp, so it must match exactly.

   NOTE(review): the handling of any other name is on lines missing
   from this listing.  */
1514 find_ctext_non_standard_charset (char *charset_name)
1518 if (! strcmp (charset_name, "koi8-r"))
1519 charset = MCHARSET (msymbol ("koi8-r"));
1520 else if (! strcmp (charset_name, "big5-0"))
1521 charset = MCHARSET (msymbol ("big5"));
/* Decode SRC_BYTES bytes at SOURCE, encoded in an ISO-2022 based
   coding system, and append the result to MT.  Bytes carried over
   from a previous call (INTERNAL->carryover) are consumed first.

   The designation/invocation state lives in CONVERTER->status and is
   updated as escape sequences, shift codes and compound-text
   extensions are met.  Each run of characters belonging to one
   charset is handed to TAKEIN_CHARS together with that charset.

   Returns whatever finish_decoding returns.

   NOTE(review): many lines (braces, `break', `else', some
   declarations) are missing from this listing, so comments below
   describe only what the surviving lines show.  */
1528 decode_coding_iso_2022 (unsigned char *source, int src_bytes, MText *mt,
1529 MConverter *converter)
1531 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1532 MCodingSystem *coding = internal->coding;
1533 unsigned char *src = internal->carryover;
1534 unsigned char *src_stop = src + internal->carryover_bytes;
1535 unsigned char *src_end = source + src_bytes;
1536 unsigned char *src_base;
1537 unsigned char *dst = mt->data + mt->nbytes;
1538 unsigned char *dst_end = mt->data + mt->allocated;
1540 int last_nchars = 0;
1541 int at_most = converter->at_most > 0 ? converter->at_most : -1;
1542 struct iso_2022_spec *spec = (struct iso_2022_spec *) coding->extra_spec;
1543 struct iso_2022_status *status
1544 = (struct iso_2022_status *) &(converter->status);
1545 MCharset *charset0, *charset1, *charset;
1547 MCharset *cns_charsets[15];
/* CHARSET0 and CHARSET1 cache the charsets currently invoked through
   STATUS->invocation[0] and [1]; a negative invocation yields NULL.  */
1549 charset0 = (status->invocation[0] >= 0
1550 ? status->designation[status->invocation[0]] : NULL);
1551 charset1 = (status->invocation[1] >= 0
1552 ? status->designation[status->invocation[1]] : NULL);
1553 charset = mcharset__ascii;
/* For EUC-TW style shifting, index the 2-dimensional charsets whose
   code_range[1] is 126 by final byte: 'G'..'M' go to slots 0..6.
   Slot 14 is filled by a branch whose condition is missing from this
   listing.  */
1555 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
1559 memset (cns_charsets, 0, sizeof (cns_charsets));
1560 for (i = 0; i < coding->ncharsets; i++)
1561 if (coding->charsets[i]->dimension == 2
1562 && coding->charsets[i]->code_range[1] == 126)
1564 int final = coding->charsets[i]->final_byte;
1566 if (final >= 'G' && final <= 'M')
1567 cns_charsets[final - 'G'] = coding->charsets[i];
1569 cns_charsets[14] = coding->charsets[i];
1575 MCharset *this_charset = NULL;
1578 ONE_MORE_BASE_BYTE (c1);
/* While UTF-8 shifting is active, one UTF-8 sequence is collected
   and decoded directly.  */
1580 if (status->utf8_shifting)
1583 int bytes = CHAR_BYTES_BY_HEAD (c1);
1587 for (i = 1; i < bytes; i++)
1592 this_charset = UTF8_CHARSET (buf);
1593 c1 = STRING_CHAR_UTF8 (buf);
/* Inside a compound-text extended segment: combine
   non_standard_charset_bytes bytes, most significant first, and
   decode the code with the segment's charset.  */
1597 if (status->non_standard_encoding > 0)
1601 this_charset = status->non_standard_charset;
1602 for (i = 1; i < status->non_standard_charset_bytes; i++)
1605 c1 = (c1 << 8) | c2;
1607 c1 = DECODE_CHAR (this_charset, c1);
1611 switch (iso_2022_code_class[c1])
1613 case ISO_graphic_plane_0:
1614 this_charset = charset0;
1617 case ISO_0x20_or_0x7F:
1619 || (charset0->code_range[0] != 32
1620 && charset0->code_range[1] != 255))
1621 /* This is SPACE or DEL. */
1622 this_charset = mcharset__ascii;
1624 /* This is a graphic character of plane 0. */
1625 this_charset = charset0;
1628 case ISO_graphic_plane_1:
1631 this_charset = charset1;
1634 case ISO_0xA0_or_0xFF:
1636 || charset1->code_range[0] == 33
1637 || ! (spec->flags & MCODING_ISO_EIGHT_BIT))
1639 /* This is a graphic character of plane 1. */
1642 this_charset = charset1;
1646 this_charset = mcharset__ascii;
1653 if ((spec->flags & MCODING_ISO_LOCKING_SHIFT)
1654 && status->designation[1])
1656 status->invocation[0] = 1;
1657 charset0 = status->designation[1];
1660 this_charset = mcharset__ascii;
1664 if (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1666 status->invocation[0] = 0;
1667 charset0 = status->designation[0];
1670 this_charset = mcharset__ascii;
1673 case ISO_single_shift_2_7:
1674 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT_7))
1676 this_charset = mcharset__ascii;
1680 goto label_escape_sequence;
1682 case ISO_single_shift_2:
1683 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
1686 if (c1 < 0xA1 || (c1 > 0xA7 && c1 < 0xAF) || c1 > 0xAF
1687 || ! cns_charsets[c1 - 0xA1])
1689 status->designation[2] = cns_charsets[c1 - 0xA1];
1691 else if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1693 /* SS2 is handled as an escape sequence of ESC 'N' */
1695 goto label_escape_sequence;
1697 case ISO_single_shift_3:
1698 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1700 /* SS3 is handled as an escape sequence of ESC 'O' */
1702 goto label_escape_sequence;
1704 case ISO_control_sequence_introducer:
1705 /* CSI is handled as an escape sequence of ESC '[' ... */
1707 goto label_escape_sequence;
1710 if (! spec->use_esc)
1712 this_charset = mcharset__ascii;
1716 label_escape_sequence:
1717 /* Escape sequences handled here are invocation,
1718 designation, and direction specification. */
1721 case '&': /* revision of following character set */
1722 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1723 goto unused_escape_sequence;
1725 if (c1 < '@' || c1 > '~')
1728 if (c1 != ISO_CODE_ESC)
1731 goto label_escape_sequence;
1733 case '$': /* designation of 2-byte character set */
1734 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1735 goto unused_escape_sequence;
1737 if (c1 >= '@' && c1 <= 'B')
1738 { /* designation of JISX0208.1978, GB2312.1980, or
1740 ISO2022_DECODE_DESIGNATION (0, 2, 94, c1, -1);
1742 else if (c1 >= 0x28 && c1 <= 0x2B)
1743 { /* designation of (dimension 2, chars 94) character set */
1745 ISO2022_DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2, -1);
1747 else if (c1 >= 0x2C && c1 <= 0x2F)
1748 { /* designation of (dimension 2, chars 96) character set */
1750 ISO2022_DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2, -1);
1754 /* We must update these variables now. */
1755 charset0 = status->designation[status->invocation[0]];
1756 charset1 = status->designation[status->invocation[1]];
1759 case 'n': /* invocation of locking-shift-2 */
1760 if (! (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1761 || ! status->designation[2])
1763 status->invocation[0] = 2;
1764 charset0 = status->designation[2];
1767 case 'o': /* invocation of locking-shift-3 */
1768 if (! (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1769 || ! status->designation[3])
1771 status->invocation[0] = 3;
1772 charset0 = status->designation[3];
1775 case 'N': /* invocation of single-shift-2 */
1776 if (! ((spec->flags & MCODING_ISO_SINGLE_SHIFT)
1777 || (spec->flags & MCODING_ISO_EUC_TW_SHIFT))
1778 || ! status->designation[2])
1780 this_charset = status->designation[2];
1782 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1786 case 'O': /* invocation of single-shift-3 */
1787 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT)
1788 || ! status->designation[3])
1790 this_charset = status->designation[3];
1792 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1796 case '[': /* specification of direction */
1797 if (! (spec->flags & MCODING_ISO_ISO6429))
1799 /* For the moment, nested direction is not supported.
1800 So, (coding->mode & CODING_MODE_DIRECTION) zero means
1801 left-to-right, and nonzero means right-to-left. */
1805 case ']': /* end of the current direction */
1806 case '0': /* end of the current direction */
1810 case '1': /* start of left-to-right direction */
1817 case '2': /* start of right-to-left direction */
1831 char charset_name[16];
/* The flag test must be parenthesized.  `! spec->flags & FLAG' parses
   as `(! spec->flags) & FLAG', which is zero whenever any flag is
   set, so the check for a coding system without the compound-text
   extension never fired.  The statement guarded by this test is on a
   line missing from this listing.  */
1835 if (! (spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT))
1837 /* Compound-text uses these escape sequences:
1839 ESC % G -- utf-8 bytes -- ESC % @
1840 ESC % / 1 M L -- charset name -- STX -- bytes --
1841 ESC % / 2 M L -- charset name -- STX -- bytes --
1842 ESC % / 3 M L -- charset name -- STX -- bytes --
1843 ESC % / 4 M L -- charset name -- STX -- bytes --
1845 It also uses this sequence but that is not yet
1848 ESC % / 0 M L -- charset name -- STX -- bytes -- */
1853 status->utf8_shifting = 1;
1858 if (! status->utf8_shifting)
1860 status->utf8_shifting = 0;
1866 if (c1 < '1' || c1 > '4')
1868 status->non_standard_charset_bytes = c1 - '0';
1871 if (c1 < 128 || c2 < 128)
1873 bytes = (c1 - 128) * 128 + (c2 - 128);
1874 for (i = 0; i < 16; i++)
1877 if (c1 == ISO_CODE_STX)
1879 charset_name[i] = TOLOWER (c1);
1883 charset_name[i++] = '\0';
1884 this_charset = find_ctext_non_standard_charset (charset_name);
1887 status->non_standard_charset = this_charset;
1888 status->non_standard_encoding = bytes - i;
1893 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1894 goto unused_escape_sequence;
1895 if (c1 >= 0x28 && c1 <= 0x2B)
1896 { /* designation of (dimension 1, chars 94) charset */
1898 ISO2022_DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2, -1);
1900 else if (c1 >= 0x2C && c1 <= 0x2F)
1901 { /* designation of (dimension 1, chars 96) charset */
1903 ISO2022_DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2, -1);
1907 /* We must update these variables now. */
1908 charset0 = status->designation[status->invocation[0]];
1909 charset1 = status->designation[status->invocation[1]];
1912 unused_escape_sequence:
1913 UNGET_ONE_BYTE (c1);
1915 this_charset = mcharset__ascii;
1919 if (this_charset->dimension == 1)
1921 if (this_charset->code_range[1] <= 128)
1924 else if (this_charset->dimension == 2)
1927 c1 = ((c1 & 0x7F) << 8) | (c2 & 0x7F);
1929 else /* i.e. (dimension == 3) */
1933 c1 = ((c1 & 0x7F) << 16) | ((c2 & 0x7F) << 8) | (c3 & 0x7F);
1935 c1 = DECODE_CHAR (this_charset, c1);
1939 if (! converter->lenient)
1941 REWIND_SRC_TO_BASE ();
1943 this_charset = mcharset__binary;
/* When the charset changes (ASCII never forces a change), hand the
   run decoded so far to TAKEIN_CHARS under the previous charset.  */
1946 if (this_charset != mcharset__ascii
1947 && this_charset != charset)
1949 TAKEIN_CHARS (mt, nchars - last_nchars,
1950 dst - (mt->data + mt->nbytes), charset);
1951 charset = this_charset;
1952 last_nchars = nchars;
/* Count down the bytes still belonging to the current extended
   segment.  */
1955 if (status->non_standard_encoding > 0)
1956 status->non_standard_encoding -= status->non_standard_charset_bytes;
1958 /* We reach here because of an invalid byte. */
1964 TAKEIN_CHARS (mt, nchars - last_nchars,
1965 dst - (mt->data + mt->nbytes), charset);
1966 return finish_decoding (mt, converter, nchars,
1967 source, src_end, src_base, error);
/* The expansion uses `dst', `dst_end' and the label `memory_shortage'
   from the enclosing function; it jumps to `memory_shortage' when
   fewer than four bytes are left at DST.  A charset counts as a
   94-charset when code_range[0] is not 32 and code_range[1] is not
   255; otherwise the 96-charset intermediate byte is used.  */
1971 /* Produce codes (escape sequence) for designating CHARSET to graphic
1972 register REG at DST, and increment DST. If CHARSET->final_byte is
1973 '@', 'A', or 'B' and SPEC->flags lacks MCODING_ISO_LONG_FORM, the
1974 short form may be produced (one more condition is missing from this
   listing). Update STATUS->designation. */
1976 #define ISO2022_ENCODE_DESIGNATION(reg, charset, spec, status) \
1978 char *intermediate_char_94 = "()*+"; \
1979 char *intermediate_char_96 = ",-./"; \
1981 if (dst + 4 > dst_end) \
1982 goto memory_shortage; \
1983 *dst++ = ISO_CODE_ESC; \
1984 if (charset->dimension == 1) \
1986 if (charset->code_range[0] != 32 \
1987 && charset->code_range[1] != 255) \
1988 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1990 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1995 if (charset->code_range[0] != 32 \
1996 && charset->code_range[1] != 255) \
1998 if (spec->flags & MCODING_ISO_LONG_FORM \
2000 || charset->final_byte < '@' || charset->final_byte > 'B') \
2001 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
2004 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
2006 *dst++ = charset->final_byte; \
2008 status->designation[reg] = charset; \
/* Encoder helpers for single shifts.  Each macro jumps to
   `memory_shortage' unless two bytes are free at DST.  When
   MCODING_ISO_EIGHT_BIT is not set in SPEC->flags it writes ESC 'N'
   (single-shift-2) or ESC 'O' (single-shift-3); the single control
   byte ISO_CODE_SS2 / ISO_CODE_SS3 is presumably the alternative (the
   `else' line is missing from this listing).  Finally it sets
   STATUS->single_shifting to 1.  The expansion uses `dst' and
   `dst_end' from the enclosing function.  */
2012 /* The following two macros produce codes (control character or escape
2013 sequence) for ISO-2022 single-shift functions (single-shift-2 and
2016 #define ISO2022_ENCODE_SINGLE_SHIFT_2(spec, status) \
2018 if (dst + 2 > dst_end) \
2019 goto memory_shortage; \
2020 if (! (spec->flags & MCODING_ISO_EIGHT_BIT)) \
2021 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
2023 *dst++ = ISO_CODE_SS2; \
2024 status->single_shifting = 1; \
2028 #define ISO2022_ENCODE_SINGLE_SHIFT_3(spec, status) \
2030 if (dst + 2 > dst_end) \
2031 goto memory_shortage; \
2032 if (! (spec->flags & MCODING_ISO_EIGHT_BIT)) \
2033 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
2035 *dst++ = ISO_CODE_SS3; \
2036 status->single_shifting = 1; \
2040 /* The following four macros produce codes (control character or
2041 escape sequence) for ISO-2022 locking-shift functions (shift-in,
2042 shift-out, locking-shift-2, and locking-shift-3). */
/* Each macro jumps to `memory_shortage' when DST lacks room (one byte
   for SI/SO, two for ESC 'n' / ESC 'o'), writes the code, and records
   the newly invoked register (0..3) in STATUS->invocation[0].  The
   expansion uses `dst' and `dst_end' from the enclosing function.  */
2044 #define ISO2022_ENCODE_SHIFT_IN(status) \
2046 if (dst + 1 > dst_end) \
2047 goto memory_shortage; \
2048 *dst++ = ISO_CODE_SI; \
2049 status->invocation[0] = 0; \
2053 #define ISO2022_ENCODE_SHIFT_OUT(status) \
2055 if (dst + 1 > dst_end) \
2056 goto memory_shortage; \
2057 *dst++ = ISO_CODE_SO; \
2058 status->invocation[0] = 1; \
2062 #define ISO2022_ENCODE_LOCKING_SHIFT_2(status) \
2064 if (dst + 2 > dst_end) \
2065 goto memory_shortage; \
2066 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
2067 status->invocation[0] = 2; \
2071 #define ISO2022_ENCODE_LOCKING_SHIFT_3(status) \
2073 if (dst + 2 > dst_end) \
2074 goto memory_shortage; \
2075 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
2076 status->invocation[0] = 3; \
/* Encoder helpers that open and close a UTF-8 section.  START checks
   through CHECK_DST that 3 + LEN bytes fit, writes an escape sequence
   beginning with ESC, and sets status->utf8_shifting to 1.  END
   writes an escape sequence beginning with ESC and clears the flag.
   Both use `dst' and `status' from the enclosing function.

   NOTE(review): the bytes following ESC are on lines missing from
   this listing; the decoder's comment names the sequences as
   "ESC % G" and "ESC % @" -- confirm against the full source.  */
2079 #define ISO2022_ENCODE_UTF8_SHIFT_START(len) \
2081 CHECK_DST (3 + len); \
2082 *dst++ = ISO_CODE_ESC; \
2085 status->utf8_shifting = 1; \
2089 #define ISO2022_ENCODE_UTF8_SHIFT_END() \
2092 *dst++ = ISO_CODE_ESC; \
2095 status->utf8_shifting = 0; \
/* Encoder helper that starts a compound-text extended segment for the
   charset called NAME (LEN bytes long).  It checks through CHECK_DST
   that the header, the name, the STX byte and one character fit,
   remembers the start of the header in `non_standard_begin', and
   writes ESC ..., the digit for `non_standard_charset_bytes', two
   placeholder length bytes, NAME and ISO_CODE_STX.
   `non_standard_bytes' starts at LEN + 1 so that the length bytes can
   be filled in later as characters are added.

   The expansion uses `dst', `non_standard_begin',
   `non_standard_bytes' and `non_standard_charset_bytes' from the
   enclosing function.  Some lines of the macro are missing from this
   listing.  */
2099 #define ISO2022_ENCODE_NON_STANDARD(name, len) \
2101 CHECK_DST (6 + len + 1 + non_standard_charset_bytes); \
2102 non_standard_begin = dst; \
2103 *dst++ = ISO_CODE_ESC; \
2106 *dst++ = '0' + non_standard_charset_bytes; \
2107 *dst++ = 0, *dst++ = 0; /* filled later */ \
2108 memcpy (dst, name, len); \
2110 *dst++ = ISO_CODE_STX; \
2111 non_standard_bytes = len + 1; \
/* Look up the compound-text extended-segment name for CHARSET,
   starting from the name of CHARSET's symbol.  For "big5" the name
   becomes "big5-0" and *BYTES is set to 2.  "koi8-r" is also
   recognized.

   NOTE(review): the value stored for "koi8-r", the handling of other
   charsets and the return statement are on lines missing from this
   listing.  */
2116 find_ctext_non_standard_name (MCharset *charset, int *bytes)
2118 char *name = msymbol_name (charset->name);
2120 if (! strcmp (name, "koi8-r"))
2122 else if (! strcmp (name, "big5"))
2123 name = "big5-0", *bytes = 2;
2129 /* Designate CHARSET to a graphic register specified in
2130 SPEC->designations. If the register is not yet invoked to graphic
2131 left nor right, invoke it to graphic left. DSTP points to a
2132 variable containing a memory address where the output must go.
2133 DST_END is the limit of that memory.
2135 Return 0 if it succeeds. Return -1 otherwise, which means that the
2136 memory area is too short. By side effect, update the variable that
2140 iso_2022_designate_invoke_charset (MCodingSystem *coding,
2142 struct iso_2022_spec *spec,
2143 struct iso_2022_status *status,
2144 unsigned char **dstp,
2145 unsigned char *dst_end)
2148 unsigned char *dst = *dstp;
2150 for (i = 0; i < 4; i++)
2151 if (charset == status->designation[i])
2156 /* CHARSET is not yet designated to any graphic registers. */
2157 for (i = 0; i < coding->ncharsets; i++)
2158 if (charset == coding->charsets[i])
2160 if (i == coding->ncharsets)
2162 for (i = 0; i < mcharset__iso_2022_table.used; i++)
2163 if (charset == mcharset__iso_2022_table.charsets[i])
/* Charsets found only in the global table are indexed after the
   coding system's own charsets in SPEC->designations.  */
2165 i += coding->ncharsets;
2167 i = spec->designations[i];
2168 ISO2022_ENCODE_DESIGNATION (i, charset, spec, status);
2171 if (status->invocation[0] != i
2172 && status->invocation[1] != i)
2174 /* Graphic register I is not yet invoked. */
2177 case 0: /* graphic register 0 */
2178 ISO2022_ENCODE_SHIFT_IN (status);
2181 case 1: /* graphic register 1 */
2182 ISO2022_ENCODE_SHIFT_OUT (status);
/* G2 and G3 are reached by a single shift when the coding system
   allows it; the locking-shift lines below are the other branch.  */
2185 case 2: /* graphic register 2 */
2186 if (spec->flags & MCODING_ISO_SINGLE_SHIFT)
2187 ISO2022_ENCODE_SINGLE_SHIFT_2 (spec, status);
2189 ISO2022_ENCODE_LOCKING_SHIFT_2 (status);
2192 case 3: /* graphic register 3 */
2193 if (spec->flags & MCODING_ISO_SINGLE_SHIFT)
2194 ISO2022_ENCODE_SINGLE_SHIFT_3 (spec, status);
2196 ISO2022_ENCODE_LOCKING_SHIFT_3 (status);
/* Summary of the visible code: when the register invoked in
   STATUS->invocation[0] differs from a non-negative
   SPEC->initial_invocation[0], the matching SI, SO, LS2 or LS3 code
   is written.  After that, every register G0..G3 whose current
   designation differs from a non-NULL initial designation is
   re-designated.  */
2209 /* Reset the invocation/designation status to the initial one. SPEC
2210 and STATUS contain information about the initial and current
2211 invocation/designation status respectively. DSTP points to a
2212 variable containing a memory address where the output must go.
2213 DST_END is the limit of that memory.
2215 Return 0 if it succeeds. Return -1 otherwise, which means that the
2216 memory area is too short. By side effect, update the variable that
2220 iso_2022_reset_invocation_designation (struct iso_2022_spec *spec,
2221 struct iso_2022_status *status,
2222 unsigned char **dstp,
2223 unsigned char *dst_end)
2225 unsigned char *dst = *dstp;
2228 /* Reset the invocation status of GL. We have not yet supported GR
2230 if (status->invocation[0] != spec->initial_invocation[0]
2231 && spec->initial_invocation[0] >= 0)
2233 if (spec->initial_invocation[0] == 0)
2234 ISO2022_ENCODE_SHIFT_IN (status);
2235 else if (spec->initial_invocation[0] == 1)
2236 ISO2022_ENCODE_SHIFT_OUT (status);
2237 else if (spec->initial_invocation[0] == 2)
2238 ISO2022_ENCODE_LOCKING_SHIFT_2 (status);
2239 else /* i.e. spec->initial_invocation[0] == 3 */
2240 ISO2022_ENCODE_LOCKING_SHIFT_3 (status);
2243 /* Reset the designation status of G0..G3. */
2244 for (i = 0; i < 4; i++)
2245 if (status->designation[i] != spec->initial_designation[i]
2246 && spec->initial_designation[i])
2248 MCharset *charset = spec->initial_designation[i];
2250 ISO2022_ENCODE_DESIGNATION (i, charset, spec, status);
/* Encode characters of MT, in the range given by FROM and TO, into
   DESTINATION (DST_BYTES bytes long) using an ISO-2022 based coding
   system.

   For each character a charset is chosen: the charset recorded in the
   Mcharset text property when usable, otherwise the first of the
   coding system's charsets (and, with MCODING_ISO_FULL_SUPPORT, of
   mcharset__iso_2022_table) that can encode it.  Designation and
   invocation codes are emitted as needed.  With
   MCODING_ISO_DESIGNATION_CTEXT_EXT, compound-text extended segments
   and UTF-8 sections are used as well.

   CONVERTER->nchars and CONVERTER->nbytes are advanced by the amounts
   handled.  Returns -1 when CONVERTER->result is
   MCONVERSION_RESULT_INVALID_CHAR, 0 otherwise.

   NOTE(review): many lines (braces, `else', `break', some
   declarations and arguments) are missing from this listing, so
   comments below describe only what the surviving lines show.  */
2263 encode_coding_iso_2022 (MText *mt, int from, int to,
2264 unsigned char *destination, int dst_bytes,
2265 MConverter *converter)
2267 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2268 MCodingSystem *coding = internal->coding;
2269 unsigned char *src, *src_end;
2270 unsigned char *dst = destination;
2271 unsigned char *dst_end = dst + dst_bytes;
2273 unsigned char *dst_base;
2274 struct iso_2022_spec *spec = (struct iso_2022_spec *) coding->extra_spec;
2275 int full_support = spec->flags & MCODING_ISO_FULL_SUPPORT;
2276 struct iso_2022_status *status
2277 = (struct iso_2022_status *) &(converter->status);
2278 MCharset *primary, *charset0, *charset1;
2279 int next_primary_change;
2280 int ncharsets = coding->ncharsets;
2281 MCharset **charsets = coding->charsets;
2282 MCharset *cns_charsets[15];
2283 int ascii_compatible = coding->ascii_compatible;
2284 MCharset *non_standard_charset = NULL;
2285 int non_standard_charset_bytes = 0;
2286 int non_standard_bytes = 0;
2287 unsigned char *non_standard_begin = NULL;
2288 enum MTextFormat format = mt->format;
2290 SET_SRC (mt, format, from, to);
/* For EUC-TW style shifting, index the 2-dimensional charsets by
   final byte: 'G'..'M' go to slots 0..6.  Slot 14 is filled by a
   branch whose condition is missing from this listing.  */
2292 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
2296 memset (cns_charsets, 0, sizeof (cns_charsets));
2297 for (i = 0; i < ncharsets; i++)
2298 if (charsets[i]->dimension == 2)
2300 int final = charsets[i]->final_byte;
2302 if (final >= 'G' && final <= 'M')
2303 cns_charsets[final - 'G'] = charsets[i];
2305 cns_charsets[14] = charsets[i];
2309 next_primary_change = from;
/* CHARSET0 and CHARSET1 cache the charsets currently invoked through
   STATUS->invocation[0] and [1]; CHARSET1 is NULL when the second
   invocation is negative.  */
2311 charset0 = status->designation[status->invocation[0]];
2312 charset1 = (status->invocation[1] < 0 ? NULL
2313 : status->designation[status->invocation[1]]);
2320 ONE_MORE_CHAR (c, bytes, format);
2322 if (c < 128 && ascii_compatible)
2324 if (status->utf8_shifting)
2325 ISO2022_ENCODE_UTF8_SHIFT_END ();
2329 else if (c <= 32 || c == 127)
2331 if (status->utf8_shifting)
2332 ISO2022_ENCODE_UTF8_SHIFT_END ();
2333 if (spec->flags & MCODING_ISO_RESET_AT_CNTL
2334 || (c == '\n' && spec->flags & MCODING_ISO_RESET_AT_EOL))
2336 if (iso_2022_reset_invocation_designation (spec, status,
2338 goto insufficient_destination;
2339 charset0 = status->designation[status->invocation[0]];
2340 charset1 = (status->invocation[1] < 0 ? NULL
2341 : status->designation[status->invocation[1]]);
2348 unsigned code = MCHAR_INVALID_CODE;
2349 MCharset *charset = NULL;
2351 int pos = from + nchars;
/* Re-read the Mcharset text property only when POS has reached the
   end of the range the previous value covered.  */
2353 if (pos >= next_primary_change)
2355 MSymbol primary_charset
2356 = (MSymbol) mtext_get_prop (mt, pos, Mcharset);
2357 primary = MCHARSET (primary_charset);
2358 if (primary && primary != mcharset__binary)
2360 if (primary->final_byte <= 0)
2362 else if (! full_support)
2366 for (i = 0; i < ncharsets; i++)
2367 if (primary == charsets[i])
2374 mtext_prop_range (mt, Mcharset, pos,
2375 NULL, &next_primary_change, 0);
2378 if (primary && primary != mcharset__binary)
2380 code = ENCODE_CHAR (primary, c);
2381 if (code != MCHAR_INVALID_CODE)
2386 if (c <= 32 || c == 127)
2389 charset = mcharset__ascii;
2395 for (i = 0; i < ncharsets; i++)
2397 charset = charsets[i];
2398 code = ENCODE_CHAR (charset, c);
2399 if (code != MCHAR_INVALID_CODE)
2404 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
2406 for (i = 0; i < mcharset__iso_2022_table.used; i++)
2408 charset = mcharset__iso_2022_table.charsets[i];
2409 code = ENCODE_CHAR (charset, c);
2410 if (code != MCHAR_INVALID_CODE)
2413 if (i == mcharset__iso_2022_table.used)
2415 if (spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT)
2416 goto unsupported_char;
2417 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2422 goto unsupported_char;
2428 && (charset->final_byte >= 0
2429 || spec->flags & MCODING_ISO_EUC_TW_SHIFT))
2431 if (code >= 0x80 && code < 0xA0)
2432 goto unsupported_char;
2434 if (status->utf8_shifting)
2435 ISO2022_ENCODE_UTF8_SHIFT_END ();
2436 if (charset == charset0)
2438 else if (charset == charset1)
2442 unsigned char *p = NULL;
2444 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
2448 if (cns_charsets[0] == charset)
2454 for (i = 1; i < 15; i++)
2455 if (cns_charsets[i] == charset)
2458 *dst++ = ISO_CODE_SS2;
2461 status->single_shifting = 1;
2466 if (iso_2022_designate_invoke_charset
2467 (coding, charset, spec, status, &dst, dst_end) < 0)
2468 goto insufficient_destination;
2469 charset0 = status->designation[status->invocation[0]];
2470 charset1 = (status->invocation[1] < 0 ? NULL
2471 : status->designation[status->invocation[1]]);
2473 if (status->single_shifting)
2475 = (spec->flags & MCODING_ISO_EIGHT_BIT) ? 0x80 : 0;
2476 else if (charset == charset0)
/* Write the code one byte per dimension, most significant byte
   first, each OR-ed with GR_MASK.  */
2481 if (charset->dimension == 1)
2484 *dst++ = code | gr_mask;
2486 else if (charset->dimension == 2)
2489 *dst++ = (code >> 8) | gr_mask;
2490 *dst++ = (code & 0xFF) | gr_mask;
2495 *dst++ = (code >> 16) | gr_mask;
2496 *dst++ = ((code >> 8) & 0xFF) | gr_mask;
2497 *dst++ = (code & 0xFF) | gr_mask;
2499 status->single_shifting = 0;
2501 else if (charset && spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT)
2503 if (charset != non_standard_charset)
2505 char *name = (find_ctext_non_standard_name
2506 (charset, &non_standard_charset_bytes));
2510 int len = strlen (name);
2512 ISO2022_ENCODE_NON_STANDARD (name, len);
2513 non_standard_charset = charset;
2516 non_standard_charset = NULL;
2519 if (non_standard_charset)
2521 if (dst + non_standard_charset_bytes > dst_end)
2522 goto insufficient_destination;
/* Update the segment length stored in the header: two bytes of seven
   bits each, both with the high bit set.  */
2523 non_standard_bytes += non_standard_charset_bytes;
2524 non_standard_begin[4] = (non_standard_bytes / 128) | 0x80;
2525 non_standard_begin[5] = (non_standard_bytes % 128) | 0x80;
2526 if (non_standard_charset_bytes == 1)
2528 else if (non_standard_charset_bytes == 2)
2529 *dst++ = code >> 8, *dst++ = code & 0xFF;
2530 else if (non_standard_charset_bytes == 3)
2531 *dst++ = code >> 16, *dst++ = (code >> 8) & 0xFF,
2532 *dst++ = code & 0xFF;
2533 else /* i.e. non_standard_charset_bytes == 4 */
2534 *dst++ = code >> 24, *dst++ = (code >> 16) & 0xFF,
2535 *dst++ = (code >> 8) & 0xFF, *dst++ = code & 0xFF;
2539 int len = CHAR_BYTES (c);
2542 goto unsupported_char;
2543 if (! status->utf8_shifting)
2544 ISO2022_ENCODE_UTF8_SHIFT_START (len);
2547 CHAR_STRING (c, dst);
2551 goto unsupported_char;
2561 if (iso_2022_designate_invoke_charset (coding, mcharset__ascii,
2564 goto insufficient_destination;
2565 if (! converter->lenient)
2567 len = encode_unsupporeted_char (c, dst, dst_end, mt, from + nchars);
2569 goto insufficient_destination;
2575 /* We reach here because of an unsupported char. */
2576 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2579 insufficient_destination:
2581 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
/* After a successful last block, close any open UTF-8 section.  With
   MCODING_ISO_RESET_AT_EOL, also reset the state when CHARSET0 is
   not the initial G0 designation.  */
2584 if (converter->result == MCONVERSION_RESULT_SUCCESS
2585 && converter->last_block)
2587 if (status->utf8_shifting)
2589 ISO2022_ENCODE_UTF8_SHIFT_END ();
2592 if (spec->flags & MCODING_ISO_RESET_AT_EOL
2593 && charset0 != spec->initial_designation[0])
2595 if (iso_2022_reset_invocation_designation (spec, status,
2597 goto insufficient_destination;
2600 converter->nchars += nchars;
2601 converter->nbytes += dst - destination;
2602 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
2606 /* Stuff for coding systems of type MCODING_TYPE_MISC. */
2608 /* For SJIS handling... */
/* SJIS_TO_JIS converts a Shift_JIS byte pair (S1, S2) to a two-byte
   code packed as (high << 8) | low.  The arguments are expanded
   several times and without parentheses, so pass plain variables.
   The selecting condition is on a line missing from this listing.  */
2610 #define SJIS_TO_JIS(s1, s2) \
2612 ? (((s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0)) << 8) \
2614 : (((s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1)) << 8) \
2615 | (s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F))))
/* JIS_TO_SJIS is the reverse mapping, from the two code bytes
   (C1, C2) to a Shift_JIS pair packed the same way.  The same caveat
   about argument expansion applies.  */
2617 #define JIS_TO_SJIS(c1, c2) \
2619 ? (((c1 / 2 + ((c1 < 0x5F) ? 0x71 : 0xB1)) << 8) \
2620 | (c2 + ((c2 >= 0x60) ? 0x20 : 0x1F))) \
2621 : (((c1 / 2 + ((c1 < 0x5F) ? 0x70 : 0xB0)) << 8) \
/* Prepare the SJIS coding system of CONVERTER for use.  On first use
   (CODING->ready is zero) the charsets "jisx0208.1983" and
   "jisx0201-kana" are looked up and stored in CODING->charsets[1]
   and [2], and CODING->ncharsets is set to 3.

   The failure test checks the charset objects.  The symbols returned
   by msymbol () do not tell whether a charset of that name exists,
   so testing them let a NULL charset be stored and dereferenced
   later by the decoder and the encoder.

   NOTE(review): the statements following the test and the end of the
   function are on lines missing from this listing.  */
2626 reset_coding_sjis (MConverter *converter)
2628 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2629 MCodingSystem *coding = internal->coding;
2631 if (! coding->ready)
2633 MSymbol kanji_sym = msymbol ("jisx0208.1983");
2634 MCharset *kanji = MCHARSET (kanji_sym);
2635 MSymbol kana_sym = msymbol ("jisx0201-kana");
2636 MCharset *kana = MCHARSET (kana_sym);
2638 if (! kanji || ! kana)
2640 coding->ncharsets = 3;
2641 coding->charsets[1] = kanji;
2642 coding->charsets[2] = kana;
/* Decode SRC_BYTES bytes of Shift_JIS text at SOURCE and append the
   result to MT.  Bytes carried over from a previous call
   (INTERNAL->carryover) are consumed first.

   A lead byte 0x81..0x9F or 0xE0..0xEF followed by a valid trail byte
   is converted with SJIS_TO_JIS and decoded with CODING->charsets[1]
   (kanji).  A byte 0xA1..0xDF is decoded with CODING->charsets[2]
   (kana).  Each run of characters belonging to one charset is handed
   to TAKEIN_CHARS together with that charset.

   Returns whatever finish_decoding returns.

   NOTE(review): several lines are missing from this listing, among
   them the handling of single bytes below 0x80.  */
2649 decode_coding_sjis (unsigned char *source, int src_bytes, MText *mt,
2650 MConverter *converter)
2652 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2653 MCodingSystem *coding = internal->coding;
2654 unsigned char *src = internal->carryover;
2655 unsigned char *src_stop = src + internal->carryover_bytes;
2656 unsigned char *src_end = source + src_bytes;
2657 unsigned char *src_base;
2658 unsigned char *dst = mt->data + mt->nbytes;
2659 unsigned char *dst_end = mt->data + mt->allocated - MAX_UTF8_CHAR_BYTES;
2661 int last_nchars = 0;
2662 int at_most = converter->at_most > 0 ? converter->at_most : -1;
2664 MCharset *charset_roman = coding->charsets[0];
2665 MCharset *charset_kanji = coding->charsets[1];
2666 MCharset *charset_kana = coding->charsets[2];
2667 MCharset *charset = mcharset__ascii;
2672 MCharset *this_charset;
2675 ONE_MORE_BASE_BYTE (c1);
2680 this_charset = ((c1 <= 0x20 || c1 == 0x7F)
2684 else if ((c1 >= 0x81 && c1 <= 0x9F) || (c1 >= 0xE0 && c1 <= 0xEF))
/* Trail byte ranges.  The second bound used to read `80' (decimal)
   where 0x80 was meant.  Because the first range already reaches
   0x7F, the accepted set 0x40..0xFC is the same either way; the
   literal now says what is intended.
   NOTE(review): 0x7F is not a valid Shift_JIS trail byte, so the
   first range arguably should end at 0x7E.  It is left unchanged
   here to keep the decoder's behaviour.  */
2687 if ((c2 >= 0x40 && c2 <= 0x7F) || (c2 >= 0x80 && c2 <= 0xFC))
2689 this_charset = charset_kanji;
2690 c1 = SJIS_TO_JIS (c1, c2);
2695 else if (c1 >= 0xA1 && c1 <= 0xDF)
2697 this_charset = charset_kana;
2703 c = DECODE_CHAR (this_charset, c1);
2708 if (! converter->lenient)
2710 REWIND_SRC_TO_BASE ();
2712 this_charset = mcharset__binary;
/* When the charset changes (ASCII never forces a change), hand the
   run decoded so far to TAKEIN_CHARS under the previous charset.  */
2715 if (this_charset != mcharset__ascii
2716 && this_charset != charset)
2718 TAKEIN_CHARS (mt, nchars - last_nchars,
2719 dst - (mt->data + mt->nbytes), charset);
2720 charset = this_charset;
2721 last_nchars = nchars;
2725 /* We reach here because of an invalid byte. */
2729 TAKEIN_CHARS (mt, nchars - last_nchars,
2730 dst - (mt->data + mt->nbytes), charset);
2731 return finish_decoding (mt, converter, nchars,
2732 source, src_end, src_base, error);
/* Encode characters of MT, in the range given by FROM and TO, into
   DESTINATION (DST_BYTES bytes long) as Shift_JIS.

   A character is tried against CODING->charsets[0] (roman), then
   [1] (kanji, whose code is converted with JIS_TO_SJIS), then [2]
   (kana, written with the high bit set).  A character none of them
   can encode goes through encode_unsupporeted_char or ends the
   conversion, depending on CONVERTER->lenient.

   CONVERTER->nchars and CONVERTER->nbytes are advanced by the amounts
   handled.  Returns -1 when CONVERTER->result is
   MCONVERSION_RESULT_INVALID_CHAR, 0 otherwise.

   NOTE(review): several lines are missing from this listing.  */
2736 encode_coding_sjis (MText *mt, int from, int to,
2737 unsigned char *destination, int dst_bytes,
2738 MConverter *converter)
2740 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2741 MCodingSystem *coding = internal->coding;
2742 unsigned char *src, *src_end;
2743 unsigned char *dst = destination;
2744 unsigned char *dst_end = dst + dst_bytes;
2746 MCharset *charset_roman = coding->charsets[0];
2747 MCharset *charset_kanji = coding->charsets[1];
2748 MCharset *charset_kana = coding->charsets[2];
2749 enum MTextFormat format = mt->format;
2751 SET_SRC (mt, format, from, to);
2758 ONE_MORE_CHAR (c, bytes, format);
2760 if (c <= 0x20 || c == 0x7F)
2767 if ((code = ENCODE_CHAR (charset_roman, c)) != MCHAR_INVALID_CODE)
2772 else if ((code = ENCODE_CHAR (charset_kanji, c))
2773 != MCHAR_INVALID_CODE)
/* Split the two-byte code and convert it to a Shift_JIS pair.  */
2775 int c1 = code >> 8, c2 = code & 0xFF;
2776 code = JIS_TO_SJIS (c1, c2);
2779 *dst++ = code & 0xFF;
2781 else if ((code = ENCODE_CHAR (charset_kana, c))
2782 != MCHAR_INVALID_CODE)
2785 *dst++ = code | 0x80;
2789 if (! converter->lenient)
2791 len = encode_unsupporeted_char (c, dst, dst_end,
2794 goto insufficient_destination;
2801 /* We reach here because of an unsupported char. */
2802 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2805 insufficient_destination:
2806 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
2809 converter->nchars += nchars;
2810 converter->nbytes += dst - destination;
2811 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
/* Return the coding system stored as the Mcoding property of the
   symbol NAME.  The visible lines also show a second route:
   parameters recorded under NAME in coding_definition_list are
   turned into a plist, the coding system is defined from them with
   mconv_define_coding, and the property is read again.

   NOTE(review): the conditions guarding that second route and the
   return statements are on lines missing from this listing;
   presumably it runs only when the first lookup finds nothing.  */
2815 static MCodingSystem *
2816 find_coding (MSymbol name)
2818 MCodingSystem *coding = (MCodingSystem *) msymbol_get (name, Mcoding);
2822 MPlist *param = mplist_get (coding_definition_list, name);
2826 param = mplist__from_plist (param);
2827 mconv_define_coding (MSYMBOL_NAME (name), param, NULL, NULL, NULL, NULL);
2828 coding = (MCodingSystem *) msymbol_get (name, Mcoding);
/* Release the plist obtained from mplist__from_plist above.  */
2829 M17N_OBJECT_UNREF (param);
/* NOTE(review): the uses of these constants are outside this chunk.
   The BINDING_* values presumably say whether a converter is bound to
   nothing, a buffer or a stream -- confirm against their users.  */
2834 #define BINDING_NONE 0
2835 #define BINDING_BUFFER 1
2836 #define BINDING_STREAM 2
/* 0x10000 is 64 KiB; presumably the size of a conversion work
   buffer -- confirm against its users.  */
2838 #define CONVERT_WORKSIZE 0x10000
/* Initialize the coding module: create the coding-system list and the
   definition plist, fill the ISO-2022 byte classification table,
   intern the symbols used in coding-system definitions, and define
   the predefined coding systems (us-ascii, iso-8859-1, utf-8-full,
   utf-8, utf-16, utf-32 and their BE/LE variants, sjis).

   NOTE(review): some lines, among them the creation of PARAM and the
   return statement, are missing from this listing.  */
2844 mcoding__init (void)
2847 MPlist *param, *charsets, *pl;
2849 MLIST_INIT1 (&coding_list, codings, 128);
2850 coding_definition_list = mplist ();
2852 /* ISO-2022 specific initialize routine. */
/* Classify each byte value by range first: 0x00..0x1F, 0x21..0x7E,
   0x80..0x9F and 0xA1..0xFE.  The boundary bytes 0x20/0x7F and
   0xA0/0xFF get classes of their own, and individual control codes
   are overridden afterwards.  */
2853 for (i = 0; i < 0x20; i++)
2854 iso_2022_code_class[i] = ISO_control_0;
2855 for (i = 0x21; i < 0x7F; i++)
2856 iso_2022_code_class[i] = ISO_graphic_plane_0;
2857 for (i = 0x80; i < 0xA0; i++)
2858 iso_2022_code_class[i] = ISO_control_1;
2859 for (i = 0xA1; i < 0xFF; i++)
2860 iso_2022_code_class[i] = ISO_graphic_plane_1;
2861 iso_2022_code_class[0x20] = iso_2022_code_class[0x7F] = ISO_0x20_or_0x7F;
2862 iso_2022_code_class[0xA0] = iso_2022_code_class[0xFF] = ISO_0xA0_or_0xFF;
2863 iso_2022_code_class[0x0E] = ISO_shift_out;
2864 iso_2022_code_class[0x0F] = ISO_shift_in;
2865 iso_2022_code_class[0x19] = ISO_single_shift_2_7;
2866 iso_2022_code_class[0x1B] = ISO_escape;
2867 iso_2022_code_class[0x8E] = ISO_single_shift_2;
2868 iso_2022_code_class[0x8F] = ISO_single_shift_3;
2869 iso_2022_code_class[0x9B] = ISO_control_sequence_introducer;
/* Intern the symbols used as keys and values in definitions.  */
2871 Mcoding = msymbol ("coding");
2873 Mutf = msymbol ("utf");
2874 Miso_2022 = msymbol ("iso-2022");
2876 Mreset_at_eol = msymbol ("reset-at-eol");
2877 Mreset_at_cntl = msymbol ("reset-at-cntl");
2878 Meight_bit = msymbol ("eight-bit");
2879 Mlong_form = msymbol ("long-form");
2880 Mdesignation_g0 = msymbol ("designation-g0");
2881 Mdesignation_g1 = msymbol ("designation-g1");
2882 Mdesignation_ctext = msymbol ("designation-ctext");
2883 Mdesignation_ctext_ext = msymbol ("designation-ctext-ext");
2884 Mlocking_shift = msymbol ("locking-shift");
2885 Msingle_shift = msymbol ("single-shift");
2886 Msingle_shift_7 = msymbol ("single-shift-7");
2887 Meuc_tw_shift = msymbol ("euc-tw-shift");
2888 Miso_6429 = msymbol ("iso-6429");
2889 Mrevision_number = msymbol ("revision-number");
2890 Mfull_support = msymbol ("full-support");
2891 Mmaybe = msymbol ("maybe");
2893 Mtype = msymbol ("type");
2894 Mcharsets = msymbol_as_managing_key ("charsets");
2895 Mflags = msymbol_as_managing_key ("flags");
2896 Mdesignation = msymbol_as_managing_key ("designation");
2897 Minvocation = msymbol_as_managing_key ("invocation");
2898 Mcode_unit = msymbol ("code-unit");
2899 Mbom = msymbol ("bom");
2900 Mlittle_endian = msymbol ("little-endian");
2903 charsets = mplist ();
2905 /* Setup predefined codings. */
/* PARAM and CHARSETS are reused for every definition below; only the
   entries that differ are changed between calls.  */
2906 mplist_set (charsets, Msymbol, Mcharset_ascii);
2907 pl = mplist_add (pl, Mtype, Mcharset);
2908 pl = mplist_add (pl, Mcharsets, charsets);
2909 Mcoding_us_ascii = mconv_define_coding ("us-ascii", param,
2910 NULL, NULL, NULL, NULL);
/* Register "ANSI_X3.4-1968" as an alias of us-ascii, under both the
   name as written and its canonicalized form.  */
2913 MSymbol alias = msymbol ("ANSI_X3.4-1968");
2914 MCodingSystem *coding
2915 = (MCodingSystem *) msymbol_get (Mcoding_us_ascii, Mcoding);
2917 msymbol_put (alias, Mcoding, coding);
2918 alias = msymbol__canonicalize (alias);
2919 msymbol_put (alias, Mcoding, coding);
2922 mplist_set (charsets, Msymbol, Mcharset_iso_8859_1);
2923 Mcoding_iso_8859_1 = mconv_define_coding ("iso-8859-1", param,
2924 NULL, NULL, NULL, NULL);
2926 mplist_set (charsets, Msymbol, Mcharset_m17n);
2927 mplist_put (param, Mtype, Mutf);
2928 mplist_put (param, Mcode_unit, (void *) 8);
2929 Mcoding_utf_8_full = mconv_define_coding ("utf-8-full", param,
2930 NULL, NULL, NULL, NULL);
2932 mplist_set (charsets, Msymbol, Mcharset_unicode);
2933 Mcoding_utf_8 = mconv_define_coding ("utf-8", param,
2934 NULL, NULL, NULL, NULL);
/* Plain utf-16 and utf-32 get a `maybe' BOM setting, and are marked
   little-endian when WORDS_BIGENDIAN is not defined.  */
2936 mplist_put (param, Mcode_unit, (void *) 16);
2937 mplist_put (param, Mbom, Mmaybe);
2938 #ifndef WORDS_BIGENDIAN
2939 mplist_put (param, Mlittle_endian, Mt);
2941 Mcoding_utf_16 = mconv_define_coding ("utf-16", param,
2942 NULL, NULL, NULL, NULL);
2944 mplist_put (param, Mcode_unit, (void *) 32);
2945 Mcoding_utf_32 = mconv_define_coding ("utf-32", param,
2946 NULL, NULL, NULL, NULL);
/* The explicit BE and LE variants have the BOM setting cleared and a
   fixed byte order.  */
2948 mplist_put (param, Mcode_unit, (void *) 16);
2949 mplist_put (param, Mbom, Mnil);
2950 mplist_put (param, Mlittle_endian, Mnil);
2951 Mcoding_utf_16be = mconv_define_coding ("utf-16be", param,
2952 NULL, NULL, NULL, NULL);
2954 mplist_put (param, Mcode_unit, (void *) 32);
2955 Mcoding_utf_32be = mconv_define_coding ("utf-32be", param,
2956 NULL, NULL, NULL, NULL);
2958 mplist_put (param, Mcode_unit, (void *) 16);
2959 mplist_put (param, Mlittle_endian, Mt);
2960 Mcoding_utf_16le = mconv_define_coding ("utf-16le", param,
2961 NULL, NULL, NULL, NULL);
2963 mplist_put (param, Mcode_unit, (void *) 32);
2964 Mcoding_utf_32le = mconv_define_coding ("utf-32le", param,
2965 NULL, NULL, NULL, NULL);
/* sjis has a nil type and is given conversion functions explicitly;
   some of the arguments are on lines missing from this listing.  */
2967 mplist_put (param, Mtype, Mnil);
2968 mplist_set (charsets, Msymbol, Mcharset_ascii);
2969 Mcoding_sjis = mconv_define_coding ("sjis", param,
2972 encode_coding_sjis, NULL);
2974 M17N_OBJECT_UNREF (charsets);
2975 M17N_OBJECT_UNREF (param);
/* Finalize the coding module: walk every coding system recorded in
   coding_list and free its extra_info and extra_spec buffers when they
   are set, release the list via MLIST_FREE1, then drop one reference on
   each value stored in coding_definition_list and on the list itself.  */
2981 mcoding__fini (void)
2986 for (i = 0; i < coding_list.used; i++)
2988 MCodingSystem *coding = coding_list.codings[i];
/* Per-type data attached by mconv_define_coding ().  */
2990 if (coding->extra_info)
2991 free (coding->extra_info);
2992 if (coding->extra_spec)
2993 free (coding->extra_spec);
/* NOTE(review): MLIST_FREE1 presumably frees the `codings' array of
   coding_list -- confirm against its macro definition.  */
2996 MLIST_FREE1 (&coding_list, codings);
/* The values of coding_definition_list are plists; each one is
   unreferenced before the list that holds them.  */
2997 MPLIST_DO (plist, coding_definition_list)
2998 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
2999 M17N_OBJECT_UNREF (coding_definition_list);
/* Define a coding system of type Mcharset that is named after the
   charset symbol SYM and whose charset list contains only SYM.  The
   symbol returned by mconv_define_coding () is discarded, and both
   temporary plists are unreferenced afterwards.  */
3003 mconv__define_coding_from_charset (MSymbol sym)
3005 MPlist *param = mplist (), *charsets = mplist ();
/* CHARSETS becomes the one-element list (Msymbol . SYM).  */
3007 mplist_set (charsets, Msymbol, sym);
3008 mplist_add (param, Mtype, Mcharset);
3009 mplist_add (param, Mcharsets, charsets);
/* No custom resetter, decoder, encoder or extra info is supplied.  */
3010 mconv_define_coding (msymbol_name (sym), param, NULL, NULL, NULL, NULL);
3011 M17N_OBJECT_UNREF (charsets);
3012 M17N_OBJECT_UNREF (param);
/* Record a definition for a charset-based coding system named SYM in
   coding_definition_list, unless SYM already has an entry there.  Only
   the definition is stored; mconv_define_coding () is not called
   here.  */
3016 mconv__register_charset_coding (MSymbol sym)
3018 if (! mplist_find_by_key (coding_definition_list, sym))
3020 MPlist *param = mplist (), *charsets = mplist ();
3022 mplist_set (charsets, Msymbol, sym);
/* The definition is stored as a flat sequence of elements:
   the symbols Mtype, Mcharset, Mcharsets, followed by the plist
   CHARSETS.  */
3023 mplist_add (param, Msymbol, Mtype);
3024 mplist_add (param, Msymbol, Mcharset);
3025 mplist_add (param, Msymbol, Mcharsets);
3026 mplist_add (param, Mplist, charsets);
/* PARAM is not unreferenced here; it stays reachable through
   coding_definition_list.  */
3027 mplist_put (coding_definition_list, sym, param);
3028 M17N_OBJECT_UNREF (charsets);
/* Load the "coding-list" database and append each definition found in
   it to coding_definition_list.  Every element of the loaded list must
   be a plist whose first element is a symbol (the coding system name);
   the remainder of that plist is stored as the definition under that
   name.  */
3034 mcoding__load_from_database ()
3036 MDatabase *mdb = mdatabase_find (msymbol ("coding-list"), Mnil, Mnil, Mnil);
3037 MPlist *def_list, *plist;
3038 MPlist *definitions = coding_definition_list;
3039 int mdebug_mask = MDEBUG_CODING;
3043 MDEBUG_PUSH_TIME ();
3044 def_list = (MPlist *) mdatabase_load (mdb);
3045 MDEBUG_PRINT_TIME ("CODING", (stderr, " to load the data."));
3050 MDEBUG_PUSH_TIME ();
3051 MPLIST_DO (plist, def_list)
/* NOTE(review): the two checks below report MERROR_CHARSET although
   this is the coding module -- confirm whether MERROR_CODING was
   intended.  MERROR presumably returns immediately, in which case
   DEF_LIST is not unreferenced on this path -- verify.  */
3056 if (! MPLIST_PLIST_P (plist))
3057 MERROR (MERROR_CHARSET, -1);
3058 pl = MPLIST_PLIST (plist);
3059 if (! MPLIST_SYMBOL_P (pl))
3060 MERROR (MERROR_CHARSET, -1);
3061 name = MPLIST_SYMBOL (pl);
/* Skip the name; what follows is the definition itself.  */
3062 pl = MPLIST_NEXT (pl);
3063 definitions = mplist_add (definitions, name, pl);
/* Take a reference on the stored tail before DEF_LIST is
   unreferenced below.  */
3064 M17N_OBJECT_REF (pl);
3067 M17N_OBJECT_UNREF (def_list);
3068 MDEBUG_PRINT_TIME ("CODING", (stderr, " to parse the loaded data."));
3074 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
3078 /*** @addtogroup m17nConv */
3082 /***en @name Variables: Symbols representing a coding system */
3083 /***ja @name ÊÑ¿ô: ÄêµÁºÑ¤ß¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë¤¿¤á¤Î¥·¥ó¥Ü¥ë */
3088 @brief Symbol for the coding system US-ASCII
3090 The symbol #Mcoding_us_ascii has name <tt>"us-ascii"</tt> and
3091 represents a coding system for the CES US-ASCII. */
3094 @brief MIME charset "US-ASCII" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3096 ¥·¥ó¥Ü¥ë @c Mcoding_us_ascii ¤Ï <tt>"us-ascii"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3097 MIME charset <tt>"US-ASCII"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë¤¿¤á
3100 MSymbol Mcoding_us_ascii;
3104 @brief Symbol for the coding system ISO-8859-1
3106 The symbol #Mcoding_iso_8859_1 has name <tt>"iso-8859-1"</tt> and
3107 represents a coding system for the CES ISO-8859-1. */
3110 @brief MIME charset "ISO-8859-1" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3112 ¥·¥ó¥Ü¥ë @c Mcoding_iso_8859_1 ¤Ï <tt>"iso-8859-1"</tt> ¤È¤¤¤¦Ì¾Á°
3113 ¤ò»ý¤Á¡¢MIME charset <tt>"ISO-8859-1"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»Ø
3114 Äꤹ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£ */
3116 MSymbol Mcoding_iso_8859_1;
3120 @brief Symbol for the coding system UTF-8
3122 The symbol #Mcoding_utf_8 has name <tt>"utf-8"</tt> and represents
3123 a coding system for the CES UTF-8. */
3126 @brief RFC 2279 ¤Î "UTF-8" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë¡ÊUnicode ÍÑ¡Ë
3128 ¥·¥ó¥Ü¥ë @c Mcoding_utf_8 ¤Ï <tt>"utf-8"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3129 RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-8"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë
3130 ¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï Unicode ¤ÎÁ´¤Æ¤Îʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹¤ë¡£
3133 MSymbol Mcoding_utf_8;
3140 The symbol #Mcoding_utf_8_full has name <tt>"utf-8-full"</tt> and
3141 represents a coding system that is an extension of UTF-8. This
3142 coding system uses the same encoding algorithm as UTF-8 but is not
3143 limited to the Unicode characters. It can encode all characters
3144 supported by the m17n library. */
3147 @brief RFC 2279 ¤Î "UTF-8" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë¡ÊÁ´Ê¸»úÍÑ¡Ë
3149 ¥·¥ó¥Ü¥ë @c Mcoding_utf_8_full ¤Ï <tt>"utf-8-full"</tt> ¤È¤¤¤¦Ì¾Á°
3150 ¤ò»ý¤Á¡¢RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-8"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò
3151 »ØÄꤹ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï m17n ¥é¥¤¥Ö¥é¥ê¤¬°·¤¦Á´¤Æ¤Î
3152 ʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹¤ë¡£ */
3154 MSymbol Mcoding_utf_8_full;
3160 The symbol #Mcoding_utf_16 has name <tt>"utf-16"</tt> and
3161 represents a coding system for the CES UTF-16 (RFC 2781). */
3163 @brief RFC 2781 ¤Î "UTF-16" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3165 ¥·¥ó¥Ü¥ë @c Mcoding_utf_16 ¤Ï <tt>"utf-16"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3166 RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-16"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ
3167 ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï Unicode ¤ÎÁ´¤Æ¤Îʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹
3170 MSymbol Mcoding_utf_16;
3176 The symbol #Mcoding_utf_16be has name <tt>"utf-16be"</tt> and
3177 represents a coding system for the CES UTF-16BE (RFC 2781). */
3179 MSymbol Mcoding_utf_16be;
3185 The symbol #Mcoding_utf_16le has name <tt>"utf-16le"</tt> and
3186 represents a coding system for the CES UTF-16LE (RFC 2781). */
3188 MSymbol Mcoding_utf_16le;
3194 The symbol #Mcoding_utf_32 has name <tt>"utf-32"</tt> and
3195 represents a coding system for the CES UTF-32 (defined by the Unicode Standard). */
3197 MSymbol Mcoding_utf_32;
3203 The symbol #Mcoding_utf_32be has name <tt>"utf-32be"</tt> and
3204 represents a coding system for the CES UTF-32BE (defined by the Unicode Standard). */
3206 MSymbol Mcoding_utf_32be;
3212 The symbol #Mcoding_utf_32le has name <tt>"utf-32le"</tt> and
3213 represents a coding system for the CES UTF-32LE (defined by the Unicode Standard). */
3214 MSymbol Mcoding_utf_32le;
3220 The symbol #Mcoding_sjis has name <tt>"sjis"</tt> and represents a coding
3221 system for the CES Shift-JIS. */
3223 MSymbol Mcoding_sjis;
3228 @name Variables: Parameter keys for mconv_define_coding (). */
3233 Parameter key for mconv_define_coding () (which see). */
3239 MSymbol Mdesignation;
3240 MSymbol Minvocation;
3243 MSymbol Mlittle_endian;
3248 @name Variables: Symbols representing coding system type. */
3253 Symbol that can be a value of the #Mtype parameter of a coding
3254 system used in an argument to the mconv_define_coding () function
3265 @name Variables: Symbols appearing in the value of #Mflags parameter. */
3270 Symbol that can be a value of the #Mflags parameter of a coding
3271 system used in an argument to the mconv_define_coding () function
3273 MSymbol Mreset_at_eol;
3275 MSymbol Mreset_at_cntl;
3278 MSymbol Mdesignation_g0;
3279 MSymbol Mdesignation_g1;
3280 MSymbol Mdesignation_ctext;
3281 MSymbol Mdesignation_ctext_ext;
3282 MSymbol Mlocking_shift;
3283 MSymbol Msingle_shift;
3284 MSymbol Msingle_shift_7;
3285 MSymbol Meuc_tw_shift;
3287 MSymbol Mrevision_number;
3288 MSymbol Mfull_support;
3293 @name Variables: etc
3295 Remaining variables. */
3296 /***ja @name ÊÑ¿ô: ¤½¤Î¾ */
3300 @brief Symbol whose name is "maybe".
3302 The variable #Mmaybe is a symbol of name <tt>"maybe"</tt>. It is
3303 used as a value of the #Mbom parameter of the function
3304 mconv_define_coding () (which see). */
3310 @brief The symbol @c Mcoding
3312 Any decoded M-text has a text property whose key is the predefined
3313 symbol @c Mcoding. The name of @c Mcoding is
3314 <tt>"coding"</tt>. */
3317 @brief ¥·¥ó¥Ü¥ë @c Mcoding
3319 ¥Ç¥³¡¼¥É¤µ¤ì¤¿ M-text ¤Ï¡¢¥¡¼¤¬ @c Mcoding ¤Ç¤¢¤ë¤è¤¦¤Ê¥Æ¥¥¹¥È¥×
3320 ¥í¥Ñ¥Æ¥£¤ò»ý¤Ä¡£¥·¥ó¥Ü¥ë @c Mcoding ¤Ï <tt>"coding"</tt> ¤È¤¤¤¦Ì¾
3321 Á°¤Ç¤¢¤é¤«¤¸¤áÄêµÁ¤µ¤ì¤Æ¤¤¤ë¡£ */
3329 @brief Define a coding system
3331 The mconv_define_coding () function defines a new coding system
3332 and makes it accessible via a symbol whose name is $NAME. $PLIST
3333 specifies parameters of the coding system as below:
3337 <li> Key is @c Mtype, value is a symbol
3339 The value specifies the type of the coding system. It must be
3340 #Mcharset, #Mutf, #Miso_2022, or #Mnil.
3342 If the type is #Mcharset, $EXTRA_INFO is ignored.
3344 If the type is #Miso_2022, $EXTRA_INFO must be a pointer to
3345 #MCodingInfoISO2022.
3347 If the type is #Mutf, $EXTRA_INFO must be a pointer to
3350 If the type is #Mnil, the argument $RESETTER, $DECODER, and
3351 $ENCODER must be supplied. $EXTRA_INFO is ignored. Otherwise,
3352 they can be @c NULL and the m17n library provides proper defaults.
3354 <li> Key is #Mcharsets, value is a plist
3356 The value specifies a list of charsets supported by the coding
3357 system. The keys of the plist must be #Msymbol, and the values
3358 must be symbols representing charsets.
3360 <li> Key is #Mflags, value is a plist
3362 If the type is #Miso_2022, the value specifies flags to control
3363 the ISO 2022 interpreter. The keys of the plist must be @c
3364 Msymbol, and values must be one of the following.
3370 If this flag exists, designation and invocation status is reset to
3371 the initial state at the end of line.
3373 <li> #Mreset_at_cntl
3375 If this flag exists, designation and invocation status is reset to
3376 the initial state at a control character.
3380 If this flag exists, the graphic plane right is used.
3384 If this flag exists, the over-long escape sequences (ESC '$' '('
3385 <final_byte>) are used for designating the charsets JISX0208.1978,
3386 GB2312, and JISX0208.
3388 <li> #Mdesignation_g0
3390 If this flag and #Mfull_support exists, designates charsets not
3391 listed in the charset list to the graphic register G0.
3393 <li> #Mdesignation_g1
3395 If this flag and #Mfull_support exists, designates charsets not
3396 listed in the charset list to the graphic register G1.
3398 <li> #Mdesignation_ctext
3400 If this flag and #Mfull_support exists, designates charsets not
3401 listed in the charset list to a graphic register G0 or G1 based on
3402 the criteria of the Compound Text.
3404 <li> #Mdesignation_ctext_ext
3406 If this flag and #Mfull_support exists, designates charsets not
3407 listed in the charset list to a graphic register G0 or G1, or use
3408 extended segment for such charsets based on the criteria of the
3411 <li> #Mlocking_shift
3413 If this flag exists, use locking shift.
3417 If this flag exists, use single shift.
3419 <li> #Msingle_shift_7
3421 If this flag exists, use 7-bit single shift code (0x19).
3423 <li> #Meuc_tw_shift
3425 If this flag exists, use a special shifting according to EUC-TW.
3429 This flag is currently ignored.
3431 <li> #Mrevision_number
3433 If this flag exists, use a revision number escape sequence to
3434 designate a charset that has a revision number.
3438 If this flag exists, support all charsets registered in the
3439 International Registry.
3443 <li> Key is #Mdesignation, value is a plist
3445 If the type is #Miso_2022, the value specifies how to designate
3446 each supported charset. The keys of the plist must be @c
3447 Minteger, and the values must be numbers indicating a graphic
3448 register. The Nth element value is for the Nth charset of the
3449 charset list. The value 0..3 means that it is assumed that a
3450 charset is already designated to the graphic register 0..3. The
3451 negative value G (-4..-1) means that a charset is not designated
3452 to any register at first, and if necessary, is designated to the
3453 (G+4) graphic register.
3455 <li> Key is #Minvocation, value is a plist
3457 If the type is #Miso_2022, the value specifies how to invoke
3458 each graphic register. The plist length must be one or two. The
3459 keys of the plist must be #Minteger, and the values must be
3460 numbers indicating a graphic register. The value of the first
3461 element specifies which graphic register is invoked to the
3462 graphic plane left. If the length is one, no graphic register is
3463 invoked to the graphic plane right. Otherwise, the value of the
3464 second element specifies which graphic register is invoked to
3465 the graphic plane right.
3467 <li> Key is #Mcode_unit, value is an integer
3469 If the type is #Mutf, the value specifies the bit length of a
3470 code-unit. It must be 8, 16, or 32.
3472 <li> Key is #Mbom, value is a symbol
3474 If the type is #Mutf and the code-unit bit length is 16 or 32,
3475 it specifies whether or not to use BOM (Byte Order Mark). If the
3476 value is #Mnil (default), BOM is not used, else if the value is
3477 #Mmaybe, the existence of BOM is detected at decoding time, else
3480 <li> Key is #Mlittle_endian, value is a symbol
3482 If the type is #Mutf and the code-unit bit length is 16 or 32,
3483 it specifies whether or not the encoding is little endian. If the
3484 value is #Mnil (default), it is big endian, else it is little
3489 $RESETTER is a pointer to a function that resets a converter for
3490 the coding system to the initial status. The pointed function is
3491 called with one argument, a pointer to a converter object.
3493 $DECODER is a pointer to a function that decodes a byte sequence
3494 according to the coding system. The pointed function is called
3495 with four arguments:
3497 @li A pointer to the byte sequence to decode.
3498 @li The number of bytes to decode.
3499 @li A pointer to an M-text to which the decoded characters are appended.
3500 @li A pointer to a converter object.
3502 $DECODER must return 0 if it succeeds. Otherwise it must return -1.
3504 $ENCODER is a pointer to a function that encodes an M-text
3505 according to the coding system. The pointed function is called
3508 @li A pointer to the M-text to encode.
3509 @li The starting position of the encoding.
3510 @li The ending position of the encoding.
3511 @li A pointer to a memory area where the produced bytes are stored.
3512 @li The size of the memory area.
3513 @li A pointer to a converter object.
3515 $ENCODER must return 0 if it succeeds. Otherwise it must return -1.
3517 $EXTRA_INFO is a pointer to a data structure that contains extra
3518 information about the coding system. The type of the data
3519 structure depends on $TYPE.
3523 If the operation was successful, mconv_define_coding () returns a
3524 symbol whose name is $NAME. If an error is detected, it returns
3525 #Mnil and assigns an error code to the external variable @c
3529 @brief ¥³¡¼¥É·Ï¤ÎÄêµÁ
3531 ´Ø¿ô mconv_define_coding () ¤Ï¡¢¿·¤·¤¤¥³¡¼¥É·Ï¤òÄêµÁ¤·¡¢¤½¤ì¤ò
3532 $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·¥ó¥Ü¥ë·Ðͳ¤Ç¥¢¥¯¥»¥¹¤Ç¤¤ë¤è¤¦¤Ë¤¹¤ë¡£
3534 $TYPE ¤Ï Îóµó·¿ #MCodingType ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢¥³¡¼¥É·Ï¤Î¹½Â¤¤ò
3537 $CHARSET_NAMES ¤Ï¥µ¥Ý¡¼¥È¤¹¤ëʸ»ú¥»¥Ã¥È¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤ÎÇÛÎó¤Ç¤¢¤ê¡¢
3538 $NCHARSETS ¤Ï¤½¤ÎÍ×ÁÇ¿ô¤Ç¤¢¤ë¡£
3540 $TYPE ¤¬ #MCODING_TYPE_MISC ¤Ç¤¢¤ë¾ì¹ç¤Ë¤Ï¡¢$RESETTER, $DECODER,
3541 $ENCODER ¤òÍ¿¤¨¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£¤½¤ì°Ê³°¤Î¾ì¹ç¤Ë¤Ï¤³¤ì¤é¤Ï @c
3542 NULL ¤Ç¹½¤ï¤Ê¤¤¡£¤½¤ÎºÝ¤Ë¤Ï m17n ¥é¥¤¥Ö¥é¥ê¤¬Å¬Àڤʥǥե©¥ë¥ÈÃͤò
3545 $RESETTER ¤Ï¤³¤Î¥³¡¼¥É·ÏÍѤΥ³¥ó¥Ð¡¼¥¿¤ò½é´ü¾õÂ֤˥ꥻ¥Ã¥È¤¹¤ë´Ø¿ô
3546 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿¤È
3549 $DECODER ¤Ï¥Ð¥¤¥ÈÎó¤ò¤³¤Î¥³¡¼¥É·Ï¤Ë½¾¤Ã¤Æ¥Ç¥³¡¼¥É¤¹¤ë´Ø¿ô¤Ø¤Î¥Ý¥¤
3550 ¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï°Ê²¼¤Î4°ú¿ô¤ò¤È¤ë¡£
3552 @li ¥Ð¥¤¥ÈÎó¤Ø¤Î¥Ý¥¤¥ó¥¿
3553 @li ¥Ç¥³¡¼¥É¤¹¤Ù¤¥Ð¥¤¥È¿ô
3554 @li ¥Ç¥³¡¼¥É·ë²Ì¤Îʸ»ú¤òÉղ乤ë M-text ¤Ø¤Î¥Ý¥¤¥ó¥¿
3555 @li ¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿
3557 $DECODER ¤ÏÀ®¸ù¤·¤¿¤È¤¤Ë¤Ï0¤ò¡¢¼ºÇÔ¤·¤¿¤È¤¤Ë¤Ï-1¤òÊÖ¤µ¤Ê¤¯¤Æ¤Ï¤Ê
3560 $ENCODER ¤Ï M-text ¤ò¤³¤Î¥³¡¼¥É·Ï¤Ë½¾¤Ã¤Æ¥¨¥ó¥³¡¼¥É¤¹
3561 ¤ë´Ø¿ô¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï°Ê²¼¤Î6°ú¿ô¤ò¤È¤ë¡£
3563 @li M-text ¤Ø¤Î¥Ý¥¤¥ó¥¿
3564 @li M-text ¤Î¥¨¥ó¥³¡¼¥É³«»Ï°ÌÃÖ
3565 @li M-text ¤Î¥¨¥ó¥³¡¼¥É½ªÎ»°ÌÃÖ
3566 @li À¸À®¤·¤¿¥Ð¥¤¥È¤òÊÝ»ý¤¹¤ë¥á¥â¥êÎΰè¤Ø¤Î¥Ý¥¤¥ó¥¿
3567 @li ¥á¥â¥êÎΰè¤Î¥µ¥¤¥º
3568 @li ¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿
3570 $ENCODER ¤ÏÀ®¸ù¤·¤¿¤È¤¤Ë¤Ï0¤ò¡¢¼ºÇÔ¤·¤¿¤È¤¤Ë¤Ï-1¤òÊÖ¤µ¤Ê¤¯¤Æ¤Ï¤Ê
3573 $EXTRA_INFO ¤Ï¥³¡¼¥Ç¥£¥°¥·¥¹¥Æ¥à¤Ë´Ø¤¹¤ëÄɲþðÊó¤ò´Þ¤à¥Ç¡¼¥¿¹½Â¤¤Ø
3574 ¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î¥Ç¡¼¥¿¹½Â¤¤Î¥¿¥¤¥×¤Ï $TYPE ¤Ë°Í¸¤¹¤ë¡£
3576 $TYPE ¤¬ #MCODING_TYPE_ISO_2022 ¤Ç¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï @c
3577 MCodingInfoISO2022 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
3579 $TYPE ¤¬ #MCODING_TYPE_UTF ¤Ç¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï @c
3580 MCodingInfoUTF ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
3582 $TYPE ¤¬ #MCODING_TYPE_CHARSET, #MCODING_TYPE_MISC ¤Î¤É¤ì¤«¤Ç
3583 ¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï̵»ë¤µ¤ì¤ë¡£
3587 ½èÍý¤ËÀ®¸ù¤¹¤ì¤Ð mconv_define_coding () ¤Ï $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·
3588 ¥ó¥Ü¥ë¤òÊÖ¤¹¡£¤³¤Î¥·¥ó¥Ü¥ë¤Ï¡¢¥¡¼¤¬ $Mcoding ¤Ç¡¢ºî¤é¤ì¤¿¥³¡¼¥É·Ï
3589 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤òÃͤȤ¹¤ë¥·¥ó¥Ü¥ë¥×¥í¥Ñ¥Æ¥£¤ò»ý¤Ä¡£ ¥¨¥é¡¼¤¬¸¡½Ð¤µ¤ì
3590 ¤¿¾ì¹ç¤Ï Mnil ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3598 mconv_define_coding (char *name, MPlist *plist,
3599 int (*resetter) (MConverter *),
3600 int (*decoder) (unsigned char *, int, MText *,
3602 int (*encoder) (MText *, int, int,
3603 unsigned char *, int,
3607 MSymbol sym = msymbol (name);
3609 MCodingSystem *coding;
3612 MSTRUCT_MALLOC (coding, MERROR_CODING);
3614 if ((coding->type = (MSymbol) mplist_get (plist, Mtype)) == Mnil)
3615 coding->type = Mcharset;
3616 pl = (MPlist *) mplist_get (plist, Mcharsets);
3618 MERROR (MERROR_CODING, Mnil);
3619 coding->ncharsets = mplist_length (pl);
3620 if (coding->ncharsets > NUM_SUPPORTED_CHARSETS)
3621 coding->ncharsets = NUM_SUPPORTED_CHARSETS;
3622 for (i = 0; i < coding->ncharsets; i++, pl = MPLIST_NEXT (pl))
3624 MSymbol charset_name;
3626 if (MPLIST_KEY (pl) != Msymbol)
3627 MERROR (MERROR_CODING, Mnil);
3628 charset_name = MPLIST_SYMBOL (pl);
3629 if (! (coding->charsets[i] = MCHARSET (charset_name)))
3630 MERROR (MERROR_CODING, Mnil);
3633 coding->resetter = resetter;
3634 coding->decoder = decoder;
3635 coding->encoder = encoder;
3636 coding->ascii_compatible = 0;
3637 coding->extra_info = extra_info;
3638 coding->extra_spec = NULL;
3641 if (coding->type == Mcharset)
3643 if (! coding->resetter)
3644 coding->resetter = reset_coding_charset;
3645 if (! coding->decoder)
3646 coding->decoder = decode_coding_charset;
3647 if (! coding->encoder)
3648 coding->encoder = encode_coding_charset;
3650 else if (coding->type == Mutf)
3652 MCodingInfoUTF *info = malloc (sizeof (MCodingInfoUTF));
3655 if (! coding->resetter)
3656 coding->resetter = reset_coding_utf;
3658 info->code_unit_bits = (int) mplist_get (plist, Mcode_unit);
3659 if (info->code_unit_bits == 8)
3661 if (! coding->decoder)
3662 coding->decoder = decode_coding_utf_8;
3663 if (! coding->encoder)
3664 coding->encoder = encode_coding_utf_8;
3666 else if (info->code_unit_bits == 16)
3668 if (! coding->decoder)
3669 coding->decoder = decode_coding_utf_16;
3670 if (! coding->encoder)
3671 coding->encoder = encode_coding_utf_16;
3673 else if (info->code_unit_bits == 32)
3675 if (! coding->decoder)
3676 coding->decoder = decode_coding_utf_32;
3677 if (! coding->encoder)
3678 coding->encoder = encode_coding_utf_32;
3681 MERROR (MERROR_CODING, Mnil);
3682 val = (MSymbol) mplist_get (plist, Mbom);
3685 else if (val == Mmaybe)
3690 info->endian = (mplist_get (plist, Mlittle_endian) ? 1 : 0);
3691 coding->extra_info = info;
3693 else if (coding->type == Miso_2022)
3695 MCodingInfoISO2022 *info = malloc (sizeof (MCodingInfoISO2022));
3697 if (! coding->resetter)
3698 coding->resetter = reset_coding_iso_2022;
3699 if (! coding->decoder)
3700 coding->decoder = decode_coding_iso_2022;
3701 if (! coding->encoder)
3702 coding->encoder = encode_coding_iso_2022;
3704 info->initial_invocation[0] = 0;
3705 info->initial_invocation[1] = -1;
3706 pl = (MPlist *) mplist_get (plist, Minvocation);
3709 if (MPLIST_KEY (pl) != Minteger)
3710 MERROR (MERROR_CODING, Mnil);
3711 info->initial_invocation[0] = MPLIST_INTEGER (pl);
3712 if (! MPLIST_TAIL_P (pl))
3714 pl = MPLIST_NEXT (pl);
3715 if (MPLIST_KEY (pl) != Minteger)
3716 MERROR (MERROR_CODING, Mnil);
3717 info->initial_invocation[1] = MPLIST_INTEGER (pl);
3720 memset (info->designations, 0, sizeof (info->designations));
3721 for (i = 0, pl = (MPlist *) mplist_get (plist, Mdesignation);
3722 i < 32 && pl && MPLIST_KEY (pl) == Minteger;
3723 i++, pl = MPLIST_NEXT (pl))
3724 info->designations[i] = MPLIST_INTEGER (pl);
3727 MPLIST_DO (pl, (MPlist *) mplist_get (plist, Mflags))
3731 if (MPLIST_KEY (pl) != Msymbol)
3732 MERROR (MERROR_CODING, Mnil);
3733 val = MPLIST_SYMBOL (pl);
3734 if (val == Mreset_at_eol)
3735 info->flags |= MCODING_ISO_RESET_AT_EOL;
3736 else if (val == Mreset_at_cntl)
3737 info->flags |= MCODING_ISO_RESET_AT_CNTL;
3738 else if (val == Meight_bit)
3739 info->flags |= MCODING_ISO_EIGHT_BIT;
3740 else if (val == Mlong_form)
3741 info->flags |= MCODING_ISO_LOCKING_SHIFT;
3742 else if (val == Mdesignation_g0)
3743 info->flags |= MCODING_ISO_DESIGNATION_G0;
3744 else if (val == Mdesignation_g1)
3745 info->flags |= MCODING_ISO_DESIGNATION_G1;
3746 else if (val == Mdesignation_ctext)
3747 info->flags |= MCODING_ISO_DESIGNATION_CTEXT;
3748 else if (val == Mdesignation_ctext_ext)
3749 info->flags |= MCODING_ISO_DESIGNATION_CTEXT_EXT;
3750 else if (val == Mlocking_shift)
3751 info->flags |= MCODING_ISO_LOCKING_SHIFT;
3752 else if (val == Msingle_shift)
3753 info->flags |= MCODING_ISO_SINGLE_SHIFT;
3754 else if (val == Msingle_shift_7)
3755 info->flags |= MCODING_ISO_SINGLE_SHIFT_7;
3756 else if (val == Meuc_tw_shift)
3757 info->flags |= MCODING_ISO_EUC_TW_SHIFT;
3758 else if (val == Miso_6429)
3759 info->flags |= MCODING_ISO_ISO6429;
3760 else if (val == Mrevision_number)
3761 info->flags |= MCODING_ISO_REVISION_NUMBER;
3762 else if (val == Mfull_support)
3763 info->flags |= MCODING_ISO_FULL_SUPPORT;
3766 coding->extra_info = info;
3770 if (! coding->decoder || ! coding->encoder)
3771 MERROR (MERROR_CODING, Mnil);
3772 if (! coding->resetter)
3776 msymbol_put (sym, Mcoding, coding);
3777 msymbol_put (msymbol__canonicalize (sym), Mcoding, coding);
3778 plist = (MPlist *) mplist_get (plist, Maliases);
3781 MPLIST_DO (pl, plist)
3785 if (MPLIST_KEY (pl) != Msymbol)
3787 alias = MPLIST_SYMBOL (pl);
3788 msymbol_put (alias, Mcoding, coding);
3789 msymbol_put (msymbol__canonicalize (alias), Mcoding, coding);
3793 MLIST_APPEND1 (&coding_list, codings, coding, MERROR_CODING);
3801 @brief Resolve coding system name.
3803 The mconv_resolve_coding () function returns $SYMBOL if it
3804 represents a coding system. Otherwise, canonicalize $SYMBOL as to
3805 a coding system name, and if the canonicalized name represents a
3806 coding system, return it. Otherwise, return Mnil. */
/* Look SYMBOL up with find_coding (); the visible lines also
   canonicalize SYMBOL and look it up again.  The result is the `name'
   member of whichever coding system was found, or Mnil when none
   was.  */
3810 mconv_resolve_coding (MSymbol symbol)
3812 MCodingSystem *coding = find_coding (symbol);
/* Second attempt, with the canonicalized form of the name.  */
3816 symbol = msymbol__canonicalize (symbol);
3817 coding = find_coding (symbol);
3819 return (coding ? coding->name : Mnil);
3826 @brief List symbols representing a coding system.
3828 The mconv_list_codings () function makes an array of symbols
3829 representing a coding system, stores the pointer to the array in a
3830 place pointed to by $SYMBOLS, and returns the length of the array. */
/* Allocate an array in *SYMBOLS and fill it with coding system names:
   first every key of coding_definition_list, then the name of each
   coding system in coding_list that has no entry in
   coding_definition_list.  */
3833 mconv_list_codings (MSymbol **symbols)
/* Upper bound on the element count: a name present in both lists is
   counted twice here but stored only once below.  */
3835 int i = coding_list.used + mplist_length (coding_definition_list);
3839 MTABLE_MALLOC ((*symbols), i, MERROR_CODING);
3841 MPLIST_DO (plist, coding_definition_list)
3842 (*symbols)[i++] = MPLIST_KEY (plist);
/* Add the already-defined coding systems that were not listed
   above.  */
3843 for (j = 0; j < coding_list.used; j++)
3844 if (! mplist_find_by_key (coding_definition_list,
3845 coding_list.codings[j]->name))
3846 (*symbols)[i++] = coding_list.codings[j]->name;
3853 @brief Create a code converter bound to a buffer.
3855 The mconv_buffer_converter () function creates a pointer to a code
3856 converter for coding system $CODING. The code converter is bound
3857 to buffer area of $N bytes pointed to by $BUF. Subsequent
3858 decodings and encodings are done to/from this buffer area.
3860 $CODING can be #Mnil. In this case, a coding system associated
3861 with the current locale (LC_CTYPE) is used.
3864 If the operation was successful, mconv_buffer_converter () returns
3865 the created code converter. Otherwise it returns @c NULL and
3866 assigns an error code to the external variable #merror_code. */
3869 @brief ¥Ð¥Ã¥Õ¥¡¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤òºî¤ë
3871 ´Ø¿ô mconv_buffer_converter () ¤Ï¡¢¥³¡¼¥É·Ï $CODING ÍѤΥ³¡¼¥É¥³¥ó
3872 ¥Ð¡¼¥¿¤òºî¤ë¡£¤³¤Î¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ï¡¢$BUF ¤Ç¼¨¤µ¤ì¤ëÂ礤µ $N ¥Ð
3873 ¥¤¥È¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó
3874 ¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¡£
3876 $CODING ¤Ï #Mnil ¤Ç¤¢¤Ã¤Æ¤â¤è¤¤¡£¤³¤Î¾ì¹ç¤Ï¸½ºß¤Î¥í¥±¡¼¥ë
3877 (LC_CTYPE) ¤Ë´ØÏ¢ÉÕ¤±¤é¤ì¤¿¥³¡¼¥É·Ï¤¬»È¤ï¤ì¤ë¡£
3880 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð mconv_buffer_converter () ¤Ï ºî¤é¤ì¤¿¥³¡¼¥É¥³
3881 ¥ó¥Ð¡¼¥¿¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code
3882 ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3884 @latexonly \IPAlabel{mconverter} @endlatexonly */
3888 @c MERROR_SYMBOL, @c MERROR_CODING
3891 mconv_stream_converter () */
/* Create a converter for the coding system NAME and bind it to the
   N-byte buffer BUF.  Failures are reported with
   MERROR (MERROR_CODING, NULL).  */
3894 mconv_buffer_converter (MSymbol name, unsigned char *buf, int n)
3896 MCodingSystem *coding;
3897 MConverter *converter;
3898 MConverterStatus *internal;
/* Coding system taken from the LC_CTYPE locale; per the function's
   documentation this applies when NAME is Mnil.  */
3901 name = mlocale_get_prop (mlocale__ctype, Mcoding);
3902 coding = find_coding (name);
3904 MERROR (MERROR_CODING, NULL);
/* Allocate the public converter object and its private status.  */
3905 MSTRUCT_CALLOC (converter, MERROR_CODING);
3906 MSTRUCT_CALLOC (internal, MERROR_CODING);
3907 converter->internal_info = internal;
3908 internal->coding = coding;
/* Run the coding system's resetter, if it has one; a negative result
   is treated as failure.  */
3909 if (coding->resetter
3910 && (*coding->resetter) (converter) < 0)
3914 MERROR (MERROR_CODING, NULL);
/* Two work M-texts: `unread' and `work_mt', the latter enlarged by
   MAX_UTF8_CHAR_BYTES.  */
3917 internal->unread = mtext ();
3918 internal->work_mt = mtext ();
3919 mtext__enlarge (internal->work_mt, MAX_UTF8_CHAR_BYTES);
/* Record the caller's buffer and mark the converter
   buffer-bound.  */
3920 internal->buf = buf;
3922 internal->bufsize = n;
3923 internal->binding = BINDING_BUFFER;
3931 @brief Create a code converter bound to a stream.
3933 The mconv_stream_converter () function creates a pointer to a code
3934 converter for coding system $CODING. The code converter is bound
3935 to stream $FP. Subsequent decodings and encodings are done
3936 to/from this stream.
3938 $CODING can be #Mnil. In this case, a coding system associated
3939 with the current locale (LC_CTYPE) is used.
3941 @return If the operation was successful, mconv_stream_converter ()
3942 returns the created code converter. Otherwise it returns @c NULL
3943 and assigns an error code to the external variable @c
3947 @brief ¥¹¥È¥ê¡¼¥à¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤òºî¤ë
3949 ´Ø¿ô mconv_stream_converter () ¤Ï¡¢¥³¡¼¥É·Ï $CODING ÍѤΥ³¡¼¥É¥³¥ó
3950 ¥Ð¡¼¥¿¤òºî¤ë¡£¤³¤Î¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ï¡¢¥¹¥È¥ê¡¼¥à $FP ¤Ë·ë¤ÓÉÕ¤±¤é
3951 ¤ì¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¥¹¥È¥ê¡¼¥à¤ËÂФ·¤Æ
3954 $CODING ¤Ï #Mnil ¤Ç¤¢¤Ã¤Æ¤â¤è¤¤¡£¤³¤Î¾ì¹ç¤Ï¸½ºß¤Î¥í¥±¡¼¥ë
3955 (LC_CTYPE) ¤Ë´ØÏ¢ÉÕ¤±¤é¤ì¤¿¥³¡¼¥É·Ï¤¬»È¤ï¤ì¤ë¡£
3958 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_stream_converter () ¤Ïºî¤é¤ì¤¿¥³¡¼¥É¥³
3959 ¥ó¥Ð¡¼¥¿¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code
3960 ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3962 @latexonly \IPAlabel{mconverter} @endlatexonly */
3966 @c MERROR_SYMBOL, @c MERROR_CODING
3969 mconv_buffer_converter () */
/* Create a converter for the coding system NAME and bind it to the
   stream FP.  Failures are reported with
   MERROR (MERROR_CODING, NULL).  */
3972 mconv_stream_converter (MSymbol name, FILE *fp)
3974 MCodingSystem *coding;
3975 MConverter *converter;
3976 MConverterStatus *internal;
/* Coding system taken from the LC_CTYPE locale; per the function's
   documentation this applies when NAME is Mnil.  */
3979 name = mlocale_get_prop (mlocale__ctype, Mcoding);
3980 coding = find_coding (name);
3982 MERROR (MERROR_CODING, NULL);
/* Allocate the public converter object and its private status.  */
3983 MSTRUCT_CALLOC (converter, MERROR_CODING);
3984 MSTRUCT_CALLOC (internal, MERROR_CODING);
3985 converter->internal_info = internal;
3986 internal->coding = coding;
/* Run the coding system's resetter, if it has one; a negative result
   is treated as failure.  */
3987 if (coding->resetter
3988 && (*coding->resetter) (converter) < 0)
3992 MERROR (MERROR_CODING, NULL);
/* A zero-offset fseek () probes whether FP supports seeking; the
   outcome is recorded in internal->seekable.  */
3995 if (fseek (fp, 0, SEEK_CUR) < 0)
4003 internal->seekable = 0;
4006 internal->seekable = 1;
/* Two work M-texts: `unread' and `work_mt', the latter enlarged by
   MAX_UTF8_CHAR_BYTES.  */
4007 internal->unread = mtext ();
4008 internal->work_mt = mtext ();
4009 mtext__enlarge (internal->work_mt, MAX_UTF8_CHAR_BYTES);
4011 internal->binding = BINDING_STREAM;
4019 @brief Reset a code converter.
4021 The mconv_reset_converter () function resets code converter
4022 $CONVERTER to the initial state.
4025 If $CONVERTER->coding has its own resetter function,
4026 mconv_reset_converter () returns the result of that function
4027 applied to $CONVERTER. Otherwise it returns 0. */
4030 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò¥ê¥»¥Ã¥È¤¹¤ë
4032 ´Ø¿ô mconv_reset_converter () ¤Ï¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤ò½é´ü
4036 ¤â¤· $CONVERTER->coding ¤Ë¥ê¥»¥Ã¥ÈÍѤδؿô¤¬ÄêµÁ¤µ¤ì¤Æ¤¤¤ë¤Ê¤é¤Ð¡¢
4037 mconv_reset_converter () ¤Ï¤½¤Î´Ø¿ô¤Ë $CONVERTER ¤òŬÍѤ·¤¿·ë²Ì¤ò
4038 ÊÖ¤·¡¢¤½¤¦¤Ç¤Ê¤±¤ì¤Ð0¤òÊÖ¤¹¡£ */
/* Put CONVERTER back into its initial state: clear the character and
   byte counters, the result code, the carry-over byte count and the
   `unread' M-text, then hand over to the coding system's own resetter
   when it has one.  */
4041 mconv_reset_converter (MConverter *converter)
4043 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
4045 converter->nchars = converter->nbytes = 0;
4046 converter->result = MCONVERSION_RESULT_SUCCESS;
4047 internal->carryover_bytes = 0;
4048 mtext_reset (internal->unread);
/* The resetter's return value becomes this function's result.  */
4049 if (internal->coding->resetter)
4050 return (*internal->coding->resetter) (converter);
4057 @brief Free a code converter.
4059 The mconv_free_converter () function frees the code converter
4063 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò²òÊü¤¹¤ë
4065 ´Ø¿ô mconv_free_converter () ¤Ï¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤ò²òÊü
/* Release CONVERTER.  The visible statements drop the references held
   on the two work M-texts of its internal status.  */
4069 mconv_free_converter (MConverter *converter)
4071 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
4073 M17N_OBJECT_UNREF (internal->work_mt);
4074 M17N_OBJECT_UNREF (internal->unread);
4082 @brief Bind a buffer to a code converter.
4084 The mconv_rebind_buffer () function binds buffer area of $N bytes
4085 pointed to by $BUF to code converter $CONVERTER. Subsequent
4086 decodings and encodings are done to/from this newly bound buffer
4090 This function always returns $CONVERTER. */
4093 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ë¥Ð¥Ã¥Õ¥¡Îΰè¤ò·ë¤ÓÉÕ¤±¤ë
4095 ´Ø¿ô mconv_rebind_buffer () ¤Ï¡¢$BUF ¤Ë¤è¤Ã¤Æ»Ø¤µ¤ì¤¿Â礤µ $N ¥Ð
4096 ¥¤¥È¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤ò¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤Ë·ë¤ÓÉÕ¤±¤ë¡£¤³¤ì
4097 °Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¿·¤¿¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥Ð¥Ã¥Õ¥¡
4098 Îΰè¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¤è¤¦¤Ë¤Ê¤ë¡£
4101 ¤³¤Î´Ø¿ô¤Ï¾ï¤Ë $CONVERTER ¤òÊÖ¤¹¡£
4103 @latexonly \IPAlabel{mconv_rebind_buffer} @endlatexonly */
4107 mconv_rebind_stream () */
/* Point CONVERTER at the N-byte buffer BUF and mark it
   buffer-bound.  */
4110 mconv_rebind_buffer (MConverter *converter, unsigned char *buf, int n)
4112 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
4114 internal->buf = buf;
4116 internal->bufsize = n;
4117 internal->binding = BINDING_BUFFER;
4124 @brief Bind a stream to a code converter.
4126 The mconv_rebind_stream () function binds stream $FP to code
4127 converter $CONVERTER. Following decodings and encodings are done
4128 to/from this newly bound stream.
4131 This function always returns $CONVERTER. */
4134 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ë¥¹¥È¥ê¡¼¥à¤ò·ë¤ÓÉÕ¤±¤ë
4136 ´Ø¿ô mconv_rebind_stream () ¤Ï¡¢¥¹¥È¥ê¡¼¥à $FP ¤ò¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿
4137 $CONVERTER ¤Ë·ë¤ÓÉÕ¤±¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢
4138 ¤³¤Î¿·¤¿¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥¹¥È¥ê¡¼¥à¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¤è¤¦¤Ë¤Ê¤ë¡£
4141 ¤³¤Î´Ø¿ô¤Ï¾ï¤Ë $CONVERTER ¤òÊÖ¤¹¡£
4143 @latexonly \IPAlabel{mconv_rebind_stream} @endlatexonly */
4147 mconv_rebind_buffer () */
/* Bind CONVERTER to the stream FP and mark it stream-bound.  */
4150 mconv_rebind_stream (MConverter *converter, FILE *fp)
4152 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* A zero-offset fseek () probes whether FP supports seeking; the
   outcome is recorded in internal->seekable.  */
4154 if (fseek (fp, 0, SEEK_CUR) < 0)
4158 internal->seekable = 0;
4161 internal->seekable = 1;
4163 internal->binding = BINDING_STREAM;
4170 @brief Decode a byte sequence into an M-text.
4172 The mconv_decode () function decodes a byte sequence and appends
4173 the result at the end of M-text $MT. The source byte sequence is
4174 taken from the currently bound buffer area or stream.
4177 If the operation was successful, mconv_decode () returns updated
4178 $MT. Otherwise it returns @c NULL and assigns an error code to
4179 the external variable #merror_code. */
4182 @brief バイト列を M-text にデコードする
4184 関数 mconv_decode () は、バイト列をデコードしてその結果を M-text
4185 $MT の末尾に追加する。デコード元のバイト列は、現在結び付けられている
4186 バッファ領域あるいはストリームから取られる。
4189 もし処理が成功すれば、mconv_decode () は更新された $MT を返す。そ
4190 うでなければ @c NULL を返し、外部変数 #merror_code にエラーコードを
4195 @c MERROR_IO, @c MERROR_CODING
4198 mconv_rebind_buffer (), mconv_rebind_stream (),
4199 mconv_encode (), mconv_encode_range (),
4200 mconv_decode_buffer (), mconv_decode_stream () */
/* Decode bytes from the source currently bound to CONVERTER and append
   the resulting characters to MT.  Characters held in the converter's
   unread text are delivered first; after that the bound buffer or
   stream is passed to the coding system's decoder.
   NOTE(review): the embedded line numbers jump, so several lines of
   this definition (braces, else arms, some statements) are not present
   in this listing.  */
4203 mconv_decode (MConverter *converter, MText *mt)
4205 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Local copy of the character limit; any non-positive value is
   normalized to -1.  */
4206 int at_most = converter->at_most > 0 ? converter->at_most : -1;
4209 M_CHECK_READONLY (mt, NULL);
4212 mtext__enlarge (mt, MAX_UTF8_CHAR_BYTES);
/* Each call starts with zeroed counters and a success status.  */
4214 converter->nchars = converter->nbytes = 0;
4215 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Number of characters waiting in the unread text.  */
4217 n = mtext_nchars (internal->unread);
4223 if (at_most > 0 && at_most < limit)
/* Append unread characters to MT, walking from the last index
   backwards and counting each one in converter->nchars.  */
4226 for (i = 0, n -= 1; i < limit; i++, converter->nchars++, n--)
4227 mtext_cat_char (mt, mtext_ref_char (internal->unread, n));
/* Drop the characters just delivered from the tail of the unread
   text.  */
4228 mtext_del (internal->unread, n + 1, internal->unread->nchars);
4231 if (at_most == limit)
/* Reduce the converter's limit by the characters already delivered.  */
4233 converter->at_most -= converter->nchars;
4237 if (internal->binding == BINDING_BUFFER)
/* Pass the not-yet-consumed part of the bound buffer to the decoder,
   then advance the consumed offset by the byte count it reports.  */
4239 (*internal->coding->decoder) (internal->buf + internal->used,
4240 internal->bufsize - internal->used,
4242 internal->used += converter->nbytes;
4244 else if (internal->binding == BINDING_STREAM)
4246 unsigned char work[CONVERT_WORKSIZE];
/* last_block is saved here, cleared below, and restored further
   down.  */
4247 int last_block = converter->last_block;
/* fread is selected only when there is no character limit and the
   stream is seekable.  */
4248 int use_fread = at_most < 0 && internal->seekable;
4250 converter->last_block = 0;
4253 int nbytes, prev_nbytes;
4255 if (feof (internal->fp))
4258 nbytes = fread (work, sizeof (unsigned char), CONVERT_WORKSIZE,
/* Single-byte path: one byte is read and stored in work[0].  */
4262 int c = getc (internal->fp);
4265 work[0] = c, nbytes = 1;
4270 if (ferror (internal->fp))
4272 converter->result = MCONVERSION_RESULT_IO_ERROR;
4277 converter->last_block = last_block;
/* Remember the byte count before this decoder call so the bytes it
   consumes can be computed afterwards.  */
4278 prev_nbytes = converter->nbytes;
4279 (*internal->coding->decoder) (work, nbytes, mt, converter);
/* The decoder consumed fewer bytes than were read: the remainder is
   handed back, either by an fseek with the (negative) offset
   consumed - read, or by ungetc of the single byte.  */
4280 if (converter->nbytes - prev_nbytes < nbytes)
4283 fseek (internal->fp, converter->nbytes - prev_nbytes - nbytes,
4286 ungetc (work[0], internal->fp);
4290 || (converter->at_most > 0
4291 && converter->nchars == converter->at_most))
4294 converter->last_block = last_block;
4296 else /* internal->binding == BINDING_NONE */
4297 MERROR (MERROR_CODING, NULL);
/* Store the normalized limit back; it may have been reduced while
   delivering unread characters.  */
4299 converter->at_most = at_most;
/* SUCCESS and INSUFFICIENT_SRC are the two results tested here; the
   values selected by the test are on a line not shown.  */
4300 return ((converter->result == MCONVERSION_RESULT_SUCCESS
4301 || converter->result == MCONVERSION_RESULT_INSUFFICIENT_SRC)
4308 @brief Decode a buffer area based on a coding system.
4310 The mconv_decode_buffer () function decodes $N bytes of buffer
4311 area pointed to by $BUF based on the coding system $NAME. A
4312 temporary code converter for decoding is automatically created
4316 If the operation was successful, mconv_decode_buffer () returns
4317 the resulting M-text. Otherwise it returns NULL and assigns an
4318 error code to the external variable #merror_code. */
4321 @brief コード系に基づいてバッファ領域をデコードする
4323 関数 mconv_decode_buffer () は、$BUF によって指された $N バイトの
4324 バッファ領域を、コード系 $NAME に基づいてデコードする。デコードに
4325 必要なコードコンバータの作成と解放は自動的に行なわれる。
4328 もし処理が成功すれば、mconv_decode_buffer () は得られた M-text を
4329 返す。そうでなければ @c NULL を返し、外部変数 #merror_code にエラー
4330 コードを設定する。 */
4334 @c MERROR_IO, @c MERROR_CODING
4337 mconv_decode (), mconv_decode_stream () */
/* One-shot decode of the N bytes at BUF using coding system NAME.
   NOTE(review): the embedded line numbers jump, so parts of this
   definition (including where MT is created) are not shown.  */
4340 mconv_decode_buffer (MSymbol name, unsigned char *buf, int n)
/* Converter created for this call, bound to BUF.  */
4342 MConverter *converter = mconv_buffer_converter (name, buf, n);
/* When mconv_decode () returns a null result, the reference held on
   MT is released.  */
4348 if (! mconv_decode (converter, mt))
4350 M17N_OBJECT_UNREF (mt);
/* The converter created above is freed here.  */
4353 mconv_free_converter (converter);
4360 @brief Decode a stream input based on a coding system.
4362 The mconv_decode_stream () function decodes the entire byte
4363 sequence read in from stream $FP based on the coding system $NAME.
4364 A code converter for decoding is automatically created and freed.
4367 If the operation was successful, mconv_decode_stream () returns
4368 the resulting M-text. Otherwise it returns NULL and assigns an
4369 error code to the external variable #merror_code. */
4372 @brief コード系に基づいてストリーム入力をデコードする
4374 関数 mconv_decode_stream () は、ストリーム $FP から読み込まれるバ
4375 イト列全体を、コード系 $NAME に基づいてデコードする。デコードに必
4376 要なコードコンバータの作成と解放は自動的に行なわれる。
4379 もし処理が成功すれば、mconv_decode_stream () は得られた M-text を返
4380 す。そうでなければ @c NULL を返し、外部変数 #merror_code にエラーコー
4385 @c MERROR_IO, @c MERROR_CODING
4388 mconv_decode (), mconv_decode_buffer () */
/* One-shot decode of the bytes read from stream FP using coding system
   NAME.
   NOTE(review): the embedded line numbers jump, so parts of this
   definition (including where MT is created) are not shown.  */
4391 mconv_decode_stream (MSymbol name, FILE *fp)
/* Converter created for this call, bound to FP.  */
4393 MConverter *converter = mconv_stream_converter (name, fp);
/* When mconv_decode () returns a null result, the reference held on
   MT is released.  */
4399 if (! mconv_decode (converter, mt))
4401 M17N_OBJECT_UNREF (mt);
/* The converter created above is freed here.  */
4404 mconv_free_converter (converter);
4410 /***en @brief Encode an M-text into a byte sequence.
4412 The mconv_encode () function encodes M-text $MT and writes the
4413 resulting byte sequence into the buffer area or the stream that is
4414 currently bound to code converter $CONVERTER.
4417 If the operation was successful, mconv_encode () returns the
4418 number of written bytes. Otherwise it returns -1 and assigns an
4419 error code to the external variable #merror_code. */
4422 @brief M-text をバイト列にエンコードする
4424 関数 mconv_encode () は、M-text $MT をエンコードして、コードコンバー
4425 タ $CONVERTER に現在結び付けられているバッファ領域あるいはストリー
4429 もし処理が成功すれば、mconv_encode () は書き込まれたバイト数を返す。
4430 そうでなければ -1 を返し、外部変数 #merror_code にエラーコードを設定
4435 @c MERROR_IO, @c MERROR_CODING
4438 mconv_rebind_buffer (), mconv_rebind_stream(),
4439 mconv_decode (), mconv_encode_range () */
/* Encode the whole of MT through CONVERTER: this simply forwards to
   mconv_encode_range () with the range 0 .. mtext_nchars (MT) and
   returns its result.  */
4442 mconv_encode (MConverter *converter, MText *mt)
4444 return mconv_encode_range (converter, mt, 0, mtext_nchars (mt));
4450 @brief Encode a part of an M-text
4452 The mconv_encode_range () function encodes the text between $FROM
4453 (inclusive) and $TO (exclusive) in M-text $MT and writes the
4454 resulting byte sequence into the buffer area or the stream that is
4455 currently bound to code converter $CONVERTER.
4458 If the operation was successful, mconv_encode_range () returns the
4459 number of written bytes. Otherwise it returns -1 and assigns an
4460 error code to the external variable #merror_code. */
4463 @brief M-text の一部をバイト列にエンコードする
4465 関数 mconv_encode_range () は、M-text $MT の $FROM （含む）から
4466 $TO （含まない）までの範囲のテキストをエンコードして、コードコンバー
4467 タ $CONVERTER に現在結び付けられているバッファ領域あるいはストリー
4471 もし処理が成功すれば、mconv_encode_range () は書き込まれたバイト数
4472 を返す。そうでなければ -1 を返し、外部変数 #merror_code にエラーコー
4477 @c MERROR_RANGE, @c MERROR_IO, @c MERROR_CODING
4480 mconv_rebind_buffer (), mconv_rebind_stream(),
4481 mconv_decode (), mconv_encode () */
/* Encode the characters of MT between FROM and TO through CONVERTER
   and write the bytes to the bound buffer or stream.
   NOTE(review): the embedded line numbers jump, so several lines of
   this definition (braces, loop headers, some declarations) are not
   present in this listing.  */
4484 mconv_encode_range (MConverter *converter, MText *mt, int from, int to)
4486 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Position checks on FROM and TO; -1 is the value supplied for the
   failure case (presumably the error return -- the macro is defined
   elsewhere).  */
4488 M_CHECK_POS_X (mt, from, -1);
4489 M_CHECK_POS_X (mt, to, -1);
/* A positive at_most caps the range at at_most characters counted
   from FROM.  */
4493 if (converter->at_most > 0 && from + converter->at_most < to)
4494 to = from + converter->at_most;
/* Each call starts with zeroed counters and a success status.  */
4496 converter->nchars = converter->nbytes = 0;
4497 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Attach the coding system's name to the range as an Mcoding text
   property.  */
4499 mtext_put_prop (mt, from, to, Mcoding, internal->coding->name);
4500 if (internal->binding == BINDING_BUFFER)
/* Encode into the unused tail of the bound buffer, then advance the
   used offset by the number of bytes produced.  */
4502 (*internal->coding->encoder) (mt, from, to,
4503 internal->buf + internal->used,
4504 internal->bufsize - internal->used,
4506 internal->used += converter->nbytes;
4508 else if (internal->binding == BINDING_STREAM)
4510 unsigned char work[CONVERT_WORKSIZE];
4515 int prev_nbytes = converter->nbytes;
/* Encode into the local work area; the bytes produced by this call
   alone are the difference from prev_nbytes.  */
4518 (*internal->coding->encoder) (mt, from, to, work,
4519 CONVERT_WORKSIZE, converter);
4520 this_nbytes = converter->nbytes - prev_nbytes;
/* Keep calling fwrite until all this_nbytes bytes are written; a
   stream error is checked after each call.  */
4521 while (written < this_nbytes)
4523 int wrtn = fwrite (work + written, sizeof (unsigned char),
4524 this_nbytes - written, internal->fp);
4526 if (ferror (internal->fp))
/* Fewer bytes written than produced: report an I/O error.  */
4530 if (written < this_nbytes)
4532 converter->result = MCONVERSION_RESULT_IO_ERROR;
/* NOTE(review): nbytes is treated as a running total (prev_nbytes is
   subtracted above) while nchars is added to FROM in full.  If this
   statement sits in a loop over encoder calls and nchars is also a
   running total, FROM would over-advance -- confirm against the
   encoder implementations.  */
4535 from += converter->nchars;
4538 else /* fail safe */
4539 MERROR (MERROR_CODING, -1);
/* The byte count is returned when the result is SUCCESS or
   INSUFFICIENT_DST; otherwise -1.  */
4541 return ((converter->result == MCONVERSION_RESULT_SUCCESS
4542 || converter->result == MCONVERSION_RESULT_INSUFFICIENT_DST)
4543 ? converter->nbytes : -1);
4549 @brief Encode an M-text into a buffer area.
4551 The mconv_encode_buffer () function encodes M-text $MT based on
4552 coding system $NAME and writes the resulting byte sequence into the
4553 buffer area pointed to by $BUF. At most $N bytes are written. A
4554 temporary code converter for encoding is automatically created
4558 If the operation was successful, mconv_encode_buffer () returns
4559 the number of written bytes. Otherwise it returns -1 and assigns
4560 an error code to the external variable #merror_code. */
4563 @brief M-text をエンコードしてバッファ領域に書き込む
4565 関数 mconv_encode_buffer () はM-text $MT をコード系 $NAME に基づい
4566 てエンコードし、得られたバイト列を $BUF の指すバッファ領域に書き込
4567 む。$N は書き込む最大バイト数である。エンコードに必要なコードコン
4568 バータの作成と解放は自動的に行なわれる。
4571 もし処理が成功すれば、mconv_encode_buffer () は書き込まれたバイト
4572 数を返す。そうでなければ-1を返し、外部変数 #merror_code にエラーコー
4577 @c MERROR_IO, @c MERROR_CODING
4580 mconv_encode (), mconv_encode_stream () */
/* One-shot encode of MT into the N-byte area at BUF using coding
   system NAME.
   NOTE(review): the embedded line numbers jump, so parts of this
   definition are not present in this listing.  */
4583 mconv_encode_buffer (MSymbol name, MText *mt, unsigned char *buf, int n)
/* Converter created for this call, bound to BUF.  */
4585 MConverter *converter = mconv_buffer_converter (name, buf, n);
/* Keep the encode result in RET so the converter can be freed
   afterwards.  */
4590 ret = mconv_encode (converter, mt);
4591 mconv_free_converter (converter);
4598 @brief Encode an M-text to write to a stream.
4600 The mconv_encode_stream () function encodes M-text $MT based on
4601 coding system $NAME and writes the resulting byte sequence to
4602 stream $FP. A temporary code converter for encoding is
4603 automatically created and freed.
4606 If the operation was successful, mconv_encode_stream () returns
4607 the number of written bytes. Otherwise it returns -1 and assigns
4608 an error code to the external variable #merror_code. */
4611 @brief M-text をエンコードしてストリームに書き込む
4613 関数 mconv_encode_stream () はM-text $MT をコード系 $NAME に基づい
4614 てエンコードし、得られたバイト列をストリーム $FP に書き出す。エン
4615 コードに必要なコードコンバータの作成と解放は自動的に行なわれる。
4618 もし処理が成功すれば、mconv_encode_stream () は書き込まれたバイト数
4619 を返す。そうでなければ-1を返し、外部変数 #merror_code にエラーコード
4624 @c MERROR_IO, @c MERROR_CODING
4627 mconv_encode (), mconv_encode_buffer (), mconv_encode_file () */
/* One-shot encode of MT to stream FP using coding system NAME.
   NOTE(review): the embedded line numbers jump, so parts of this
   definition are not present in this listing.  */
4630 mconv_encode_stream (MSymbol name, MText *mt, FILE *fp)
/* Converter created for this call, bound to FP.  */
4632 MConverter *converter = mconv_stream_converter (name, fp);
/* Keep the encode result in RET so the converter can be freed
   afterwards.  */
4637 ret = mconv_encode (converter, mt);
4638 mconv_free_converter (converter);
4645 @brief Read a character via a code converter.
4647 The mconv_getc () function reads one character from the buffer
4648 area or the stream that is currently bound to code converter
4649 $CONVERTER. The decoder of $CONVERTER is used to decode the byte
4650 sequence. The internal status of $CONVERTER is updated
4654 If the operation was successful, mconv_getc () returns the
4655 character read in. If the input source reaches EOF, it returns @c
4656 EOF without changing the external variable #merror_code. If an
4657 error is detected, it returns @c EOF and assigns an error code to
4661 @brief コードコンバータ経由で1文字読む
4663 関数 mconv_getc () は、コードコンバータ $CONVERTER に現在結び付け
4664 られているバッファ領域あるいはストリームから1文字を読み込む。バイ
4665 ト列のデコードには $CONVERTER のデコーダが用いられる。$CONVERTER
4666 の内部状態は必要に応じて更新される。
4669 処理が成功すれば、mconv_getc () は読み込まれた文字を返す。入力源が
4670 EOF に達した場合は、外部変数 #merror_code を変えずに @c EOF を返す。
4671 エラーが検出された場合は @c EOF を返し、#merror_code にエラーコード
4679 mconv_ungetc (), mconv_putc (), mconv_gets () */
/* Read one character through CONVERTER by running mconv_decode ()
   with a one-character limit into the converter's work M-text.
   NOTE(review): the embedded line numbers jump, so the tail of the
   return expression is not present in this listing.  */
4682 mconv_getc (MConverter *converter)
4684 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Save the caller's character limit so it can be put back after the
   decode.  */
4685 int at_most = converter->at_most;
/* Reset the work text and force a limit of one character for this
   decode.  */
4687 mtext_reset (internal->work_mt);
4688 converter->at_most = 1;
4689 mconv_decode (converter, internal->work_mt);
4690 converter->at_most = at_most;
/* When exactly one character was decoded, it is extracted from the
   work text's data; the alternative value is on a line not shown.  */
4691 return (converter->nchars == 1
4692 ? STRING_CHAR (internal->work_mt->data)
4699 @brief Push a character back to a code converter.
4701 The mconv_ungetc () function pushes character $C back to code
4702 converter $CONVERTER. Any number of characters can be pushed
4703 back. The most recently pushed-back character is read first by the
4704 subsequent mconv_getc () call. The characters pushed back are
4705 registered only in $CONVERTER; they are not written to the input
4706 source. The internal status of $CONVERTER is updated
4710 If the operation was successful, mconv_ungetc () returns $C.
4711 Otherwise it returns @c EOF and assigns an error code to the
4712 external variable #merror_code. */
4715 @brief コードコンバータに1文字戻す
4717 関数 mconv_ungetc () は、コードコンバータ $CONVERTER に文字 $C を
4718 押し戻す。押し戻せる文字数に制限はない。この後に mconv_getc () を
4719 呼び出すと、最後に戻された文字が最初に読まれる。押し戻された文字は
4720 $CONVERTER の内部に蓄えられるだけであり、実際に入力源に書き込まれ
4721 るわけではない。$CONVERTER の内部状態は必要に応じて更新される。
4724 処理が成功すれば、mconv_ungetc () は $C を返す。そうでなければ @c
4725 EOF を返し、外部変数 #merror_code にエラーコードを設定する。 */
4729 @c MERROR_CODING, @c MERROR_CHAR
4732 mconv_getc (), mconv_putc (), mconv_gets () */
/* Push character C back onto CONVERTER.
   NOTE(review): the embedded line numbers jump, so the return
   statement is not present in this listing.  */
4735 mconv_ungetc (MConverter *converter, int c)
4737 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Check on C; EOF is the value supplied for the failure case (the
   macro is defined elsewhere).  */
4739 M_CHECK_CHAR (c, EOF);
4741 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Append C to the unread text.  mconv_decode () consumes that text
   from its last character backwards, so the character pushed last is
   read first.  */
4742 mtext_cat_char (internal->unread, c);
4749 @brief Write a character via a code converter.
4751 The mconv_putc () function writes character $C to the buffer area
4752 or the stream that is currently bound to code converter
4753 $CONVERTER. The encoder of $CONVERTER is used to encode the
4754 character. The number of bytes actually written is set to the @c
4755 nbytes member of $CONVERTER. The internal status of $CONVERTER
4756 is updated appropriately.
4759 If the operation was successful, mconv_putc () returns $C.
4760 If an error is detected, it returns @c EOF and assigns
4761 an error code to the external variable #merror_code. */
4764 @brief コードコンバータを経由で1文字書く
4766 関数 mconv_putc () は、コードコンバータ $CONVERTER に現在結び付け
4767 られているバッファ領域あるいはストリームに文字 $C を書き出す。文字
4768 のエンコードには $CONVERTER のエンコーダが用いられる。実際に書き出
4769 されたバイト数は、$CONVERTER の メンバー @c nbytes にセットされる。
4770 $CONVERTER の内部状態は必要に応じて更新される。
4773 処理が成功すれば、mconv_putc () は $C を返す。エラーが検出された場合
4774 は @c EOF を返し、外部変数 #merror_code にエラーコードを設定する。 */
4778 @c MERROR_CODING, @c MERROR_IO, @c MERROR_CHAR
4781 mconv_getc (), mconv_ungetc (), mconv_gets () */
/* Write character C through CONVERTER: C is placed alone in the
   converter's work M-text and the one-character range 0 .. 1 of that
   text is encoded.
   NOTE(review): the embedded line numbers jump, so the return
   statements are not present in this listing.  */
4784 mconv_putc (MConverter *converter, int c)
4786 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Check on C; EOF is the value supplied for the failure case (the
   macro is defined elsewhere).  */
4788 M_CHECK_CHAR (c, EOF);
/* Make the work text hold exactly the one character C.  */
4789 mtext_reset (internal->work_mt);
4790 mtext_cat_char (internal->work_mt, c);
/* A negative result from mconv_encode_range () is the failure case.  */
4791 if (mconv_encode_range (converter, internal->work_mt, 0, 1) < 0)
4799 @brief Read a line using a code converter.
4801 The mconv_gets () function reads one line from the buffer area or
4802 the stream that is currently bound to code converter $CONVERTER.
4803 The decoder of $CONVERTER is used for decoding. The decoded
4804 character sequence is appended at the end of M-text $MT. The
4805 final newline character in the original byte sequence is not
4806 appended. The internal status of $CONVERTER is updated
4810 If the operation was successful, mconv_gets () returns the
4811 modified $MT. If it encounters EOF without reading a single
4812 character, it returns $MT without changing it. If an error is
4813 detected, it returns @c NULL and assigns an error code to @c
4817 @brief コードコンバータを使って1行読む
4819 関数 mconv_gets () は、コードコンバータ $CONVERTER に現在結び付け
4820 られているバッファ領域あるいはストリームから1行を読み込む。バイト
4821 列のデコードには $CONVERTER のデコーダが用いられる。デコードされた
4822 文字列は M-text $MT の末尾に追加される。元のバイト列の終端改行文字
4823 は追加されない。$CONVERTER の内部状態は必要に応じて更新される。
4826 処理が成功すれば、mconv_gets () は変更された $MT を返す。もし1文字
4827 も読まずに EOF に当たった場合は、$MT を変更せずにそのまま返す。エ
4828 ラーが検出された場合は @c NULL を返し、#merror_code にエラーコードを
4836 mconv_getc (), mconv_ungetc (), mconv_putc () */
4839 mconv_gets (MConverter *converter, MText *mt)
4843 M_CHECK_READONLY (mt, NULL);
4846 c = mconv_getc (converter);
4847 if (c == EOF || c == '\n')
4849 mtext_cat_char (mt, c);
4851 if (c == EOF && converter->result != MCONVERSION_RESULT_SUCCESS)
4852 /* mconv_getc () sets merror_code */