1 /* coding.c -- code conversion module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25 @brief Coding system objects and API for them.
27 The m17n library represents a character encoding scheme (CES) of
28 coded character sets (CCS) as an object called @e coding @e
29 system. Application programs can add original coding systems.
31 To @e encode means converting code-points to character codes and
32 to @e decode means converting character codes back to code-points.
34 Application programs can decode a byte sequence with a specified
35 coding system into an M-text, and inversely, can encode an M-text
36 into a byte sequence. */
40 @brief コード系オブジェクトとそれに関する API
42 m17n ライブラリは、符号化文字集合 (coded character sets; CCS) の文
43 字符号化方式 (character encoding scheme; CES) を @e コード系 と呼
44 ぶオブジェクトで表現する。m17n ライブラリがサポートする CES は、
45 UTF-8, UTF-16, ISO-2022, DIRECT-CHARSET, その他、に大別される。ア
46 プリケーションプログラムが独自にコード系を追加することも可能である。
48 コードポイントから文字コードへの変換を @e エンコード と呼び、文字
49 コードからコードポイントへの変換を @e デコード と呼ぶ。
51 アプリケーションプログラムは、指定されたコード系でバイト列をデコー
52 ドすることで M-text を得ることができる。また逆に、指定されたコード
53 系で M-text をエンコードすることでバイト列を得ることができる。 */
57 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
58 /*** @addtogroup m17nInternal
66 #include <sys/types.h>
71 #include "m17n-misc.h"
74 #include "character.h"
81 #define NUM_SUPPORTED_CHARSETS 32
83 /** Structure for coding system object. */
87 /** Name of the coding system. */
90 /** Type of the coding system. */
93 /* Number of supported charsets. */
96 /** Array of supported charsets. */
97 MCharset *charsets[NUM_SUPPORTED_CHARSETS];
99 /** If non-NULL, function to call at the time of creating and
100 resetting a converter. */
101 int (*resetter) (MConverter *converter);
103 int (*decoder) (unsigned char *str, int str_bytes, MText *mt,
104 MConverter *converter);
106 int (*encoder) (MText *mt, int from, int to,
107 unsigned char *str, int str_bytes,
108 MConverter *converter);
110 /** If non-zero, the coding system decode/encode ASCII characters as
112 int ascii_compatible;
114 /** Pointer to extra information given when the coding system is
115 defined. The meaning depends on <type>. */
118 /** Pointer to information referred on conversion. The meaning
119 depends on <type>. The value NULL means that the coding system
129 MCodingSystem **codings;
132 static struct MCodingList coding_list;
134 static MPlist *coding_definition_list;
138 Pointer to a structure of a coding system. */
140 コード系を表わすデータ構造へのポインタ */
141 MCodingSystem *coding;
144 Buffer for carryover bytes generated while decoding. */
146 デコード中のキャリィオーバーバイト用バッファ */
147 unsigned char carryover[256];
150 Number of carryover bytes. */
152 キャリィオーバーバイト数 */
156 Beginning of the byte sequence bound to this converter. */
158 このコンバータに結び付けられたバイト列の先頭位置 */
168 Number of bytes already consumed in buf. */
170 buf 内ですでに消費されたバイト数 */
174 Stream bound to this converter. */
176 このコンバータに結び付けられたストリーム */
180 Which of above two is in use. */
182 上記2者のいずれが使われているか */
202 /* Local macros and functions. */
204 /** Fetch the first byte of a new source unit into C.

    At first, set SRC_BASE to SRC.  Then check if we have already
    produced AT_MOST chars.  If so, set SRC_END to SRC, and jump to
    source_end.  Otherwise, get one more byte C from SRC.  In that
    case, if SRC == SRC_END, jump to the label source_end.

    The macro works on the locals of the function it is expanded in
    (nchars, at_most, src, src_stop, src_end, src_base, source).  When
    SRC has reached SRC_STOP but not SRC_END, reading restarts at
    SOURCE with SRC_STOP moved to SRC_END.  The decoders in this file
    start with SRC pointing at internal->carryover, so this is
    presumably the hand-over from carried-over bytes to the caller's
    buffer.  */
209 #define ONE_MORE_BASE_BYTE(c) \
212 if (nchars == at_most) \
217 if (src == src_stop) \
219 if (src == src_end) \
221 src_base = src = source; \
222 if (src == src_end) \
224 src_stop = src_end; \
230 /** Get one more byte C from SRC. If SRC == SRC_END, jump to the label source_end. */
/* Read a following byte of the current unit into C.  It works on the
   same src / src_stop / src_end locals as ONE_MORE_BASE_BYTE; the
   visible lines contain no AT_MOST test and do not assign SRC_BASE.
   When SRC reaches SRC_STOP before SRC_END, SRC_STOP is moved to
   SRC_END.  */
233 #define ONE_MORE_BYTE(c) \
235 if (src == src_stop) \
237 if (src == src_end) \
240 if (src == src_end) \
242 src_stop = src_end; \
/* Judging by its name, move SRC back to SRC_BASE, the start of the
   unit being decoded (that assignment is not among the visible
   lines).  A SRC_BASE outside [source, src_end) is treated as lying
   in internal->carryover, in which case SRC_STOP becomes the end of
   the carried-over bytes.  */
248 #define REWIND_SRC_TO_BASE() \
250 if (src_base < source || src_base >= src_end) \
251 src_stop = internal->carryover + internal->carryover_bytes; \
256 /** Push back byte C to SRC.  In the visible branch the byte becomes
    the sole content of internal->carryover (carryover_bytes is set to
    1) and SRC / SRC_STOP are pointed at it, so that it is the next
    byte read.  NOTE(review): the condition selecting this branch is
    not visible here; confirm when the carryover buffer is used.  */
258 #define UNGET_ONE_BYTE(c) \
264 internal->carryover[0] = c; \
265 internal->carryover_bytes = 1; \
266 src = internal->carryover; \
267 src_stop = src + 1; \
272 /** Store multibyte representation of character C at DST and increment
    DST to the next of the produced bytes.  DST must be a pointer to
    data area of M-text MT.  If the produced bytes are going to exceed
    DST_END, enlarge the data area of MT.

    The room test reserves one byte beyond the character itself
    (dst + bytes + 1).  Enlarging asks mtext__enlarge () for the
    current allocation plus the bytes of C plus the unread bytes of
    the current source segment (src_stop - src).  DST and DST_END are
    recomputed from mt->data afterwards because both are derived from
    it.  */
277 #define EMIT_CHAR(c) \
279 int bytes = CHAR_BYTES (c); \
282 if (dst + bytes + 1 > dst_end) \
284 len = dst - mt->data; \
285 bytes = mt->allocated + bytes + (src_stop - src); \
286 mtext__enlarge (mt, bytes); \
287 dst = mt->data + len; \
288 dst_end = mt->data + mt->allocated; \
290 dst += CHAR_STRING (c, dst); \
295 /* Check if there is enough room to produce LEN bytes at DST.  If not,
   go to the label insufficient_destination.  The macro uses the
   locals dst and dst_end of the function it is expanded in, and that
   function must define the label.  */
298 #define CHECK_DST(len) \
300 if (dst + (len) > dst_end) \
301 goto insufficient_destination; \
305 /** Take NUM_CHARS characters (NUM_BYTES bytes) already stored at
    (MT->data + MT->nbytes) into MT, and put charset property on
    them with CHARSET->name.

    NUM_CHARS is evaluated once into a local.  The text is taken in
    with mtext__takein (), and the property (key Mcharset) is put on
    the last CHARS characters of MT, i.e. the ones just taken in.
    NOTE(review): any guards around the two calls are not among the
    visible lines.  */
309 #define TAKEIN_CHARS(mt, num_chars, num_bytes, charset) \
311 int chars = (num_chars); \
315 mtext__takein ((mt), chars, (num_bytes)); \
317 mtext_put_prop ((mt), (mt)->nchars - chars, (mt)->nchars, \
318 Mcharset, (void *) ((charset)->name)); \
/* Point SRC and SRC_END (locals of the expanding function) at the
   storage of characters FROM and TO of MT, according to FORMAT:

   - up to MTEXT_FORMAT_UTF_8: mt->data plus POS_CHAR_TO_BYTE ();
   - up to MTEXT_FORMAT_UTF_16BE: the same position scaled by
     sizeof (short);
   - otherwise: the character index scaled by sizeof (int).

   NOTE(review): the macro arguments are used without parentheses, so
   callers should pass plain identifiers.  */
323 #define SET_SRC(mt, format, from, to) \
325 if (format <= MTEXT_FORMAT_UTF_8) \
327 src = mt->data + POS_CHAR_TO_BYTE (mt, from); \
328 src_end = mt->data + POS_CHAR_TO_BYTE (mt, to); \
330 else if (format <= MTEXT_FORMAT_UTF_16BE) \
333 = mt->data + (sizeof (short)) * POS_CHAR_TO_BYTE (mt, from); \
335 = mt->data + (sizeof (short)) * POS_CHAR_TO_BYTE (mt, to); \
339 src = mt->data + (sizeof (int)) * from; \
340 src_end = mt->data + (sizeof (int)) * to; \
/* Fetch the next character of the M-text being encoded into C and the
   size of its stored form into BYTES, according to FORMAT.  The
   SRC == SRC_END test handles the end of input (its action is not
   among the visible lines).

   For formats up to MTEXT_FORMAT_UTF_8 the character is parsed from
   SRC.  For the other formats it is read through the locals `mt' and
   `from' of the expanding function, and `from' is incremented as a
   side effect.  */
345 #define ONE_MORE_CHAR(c, bytes, format) \
347 if (src == src_end) \
349 if (format <= MTEXT_FORMAT_UTF_8) \
350 c = STRING_CHAR_AND_BYTES (src, bytes); \
351 else if (format <= MTEXT_FORMAT_UTF_16BE) \
353 c = mtext_ref_char (mt, from++); \
354 bytes = (sizeof (short)) * CHAR_UNITS_UTF16 (c); \
358 c = ((unsigned *) (mt->data))[from++]; \
359 bytes = sizeof (int); \
/* Write a printable placeholder for character C, which the target
   coding system cannot represent, at DST.

   The placeholder is text of the form "<U+XXXX>" or "<M+XXXX>": 8
   bytes when C < 0x10000 and 10 bytes otherwise.  The format is
   chosen by the range of C; the format for C >= 0x110000 is not among
   the visible lines.  The room test compares DST + LEN against
   DST_END.  A Mcoding property with value Mnil is put on the single
   character at position POS of MT (the callers in this file pass the
   M-text and a character position as the last two arguments).

   Callers use the result as the number of bytes produced.
   NOTE(review): the value returned when there is no room is not
   visible here; callers jump to insufficient_destination after
   testing it.  */
365 encode_unsupporeted_char (int c, unsigned char *dst, unsigned char *dst_end,
371 len = c < 0x10000 ? 8 : 10;
372 if (dst + len > dst_end)
375 mtext_put_prop (mt, pos, pos + 1, Mcoding, Mnil);
376 format = (c < 0xD800 ? "<U+%04X>"
377 : c < 0xE000 ? "<M+%04X>"
378 : c < 0x10000 ? "<U+%04X>"
379 : c < 0x110000 ? "<U+%06X>"
/* NOTE(review): sprintf () also stores a terminating NUL, i.e. LEN + 1
   bytes, while only LEN bytes were checked against DST_END above.  */
381 sprintf ((char *) dst, format, c);
387 /** Finish decoding of bytes at SOURCE (ending at SRC_END) into NCHARS
388 characters by CONVERTER into M-text MT. SRC is a pointer to the
389 not-yet processed bytes. ERROR is 1 iff an invalid byte was found. */
/* Common tail of the decoders in this file.  It records the outcome
   of a decoding run in CONVERTER:

   - converter->result becomes MCONVERSION_RESULT_INVALID_BYTE or
     MCONVERSION_RESULT_INSUFFICIENT_SRC in the branches below;
   - when this is not the last block, the unprocessed bytes are kept
     in internal->carryover for the next call;
   - converter->nchars grows by NCHARS, and converter->nbytes by the
     number of bytes of SOURCE consumed (0 when SRC lies outside
     [source, src_end], i.e. is still inside the carryover buffer).

   Returns -1 iff the result is MCONVERSION_RESULT_INVALID_BYTE, and 0
   otherwise.  */
393 finish_decoding (MText *mt, MConverter *converter, int nchars,
394 unsigned char *source, unsigned char *src_end,
398 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
401 internal->carryover_bytes = 0;
403 || (converter->last_block
404 && ! converter->lenient))
405 converter->result = MCONVERSION_RESULT_INVALID_BYTE;
406 else if (! converter->last_block)
/* More input may follow: save the unread bytes as carryover.  */
408 unsigned char *dst = internal->carryover;
/* SRC outside [source, src_end] means that it still points into the
   carryover buffer, so the bytes already there are kept and the new
   ones are appended after them.  */
410 if (src < source || src > src_end)
412 dst += internal->carryover_bytes;
415 while (src < src_end)
417 internal->carryover_bytes = dst - internal->carryover;
418 converter->result = MCONVERSION_RESULT_INSUFFICIENT_SRC;
/* Remaining branch (last block; the check above implies lenient
   mode): the leftover bytes are turned into characters appended to
   MT.  The loop body is not among the visible lines.  */
422 unsigned char *dst = mt->data + mt->nbytes;
423 unsigned char *dst_end = mt->data + mt->allocated;
424 unsigned char *src_stop = src_end;
426 int last_nchars = nchars;
428 if (src < source || src > src_end)
429 src_stop = internal->carryover + internal->carryover_bytes;
432 if (converter->at_most && nchars == converter->at_most)
446 TAKEIN_CHARS (mt, nchars - last_nchars, dst - (mt->data + mt->nbytes),
448 internal->carryover_bytes = 0;
451 converter->nchars += nchars;
452 converter->nbytes += ((src < source || src > src_end) ? 0 : src - source);
453 return (converter->result == MCONVERSION_RESULT_INVALID_BYTE ? -1 : 0);
458 /* Stuff for coding-systems of type MCODING_TYPE_CHARSET. */
/* Prepare a charset-type coding system for conversion.

   The charset list of CODING is reordered by dimension, and a
   256-entry table is allocated and stored in coding->extra_spec.
   For each charset N the table entries FROM.. (taken from the
   code_range slots selected by the charset's dimension, presumably
   the range of its leading byte) get bit N set.
   coding->ascii_compatible is set when any charset is marked
   ascii_compatible.  Allocation failures are reported through the
   MERROR_CODING argument of the allocation macros.  */
461 setup_coding_charset (MCodingSystem *coding)
463 int ncharsets = coding->ncharsets;
464 unsigned *code_charset_table;
468 /* At first, reorder charset list by dimensions (a charset of
469 smaller dimension comes first). As the number of charsets is
470 usually very small (at most 32), we do a simple sort. */
475 MTABLE_ALLOCA (charsets, NUM_SUPPORTED_CHARSETS, MERROR_CODING);
476 memcpy (charsets, coding->charsets,
477 sizeof (MCharset *) * NUM_SUPPORTED_CHARSETS);
/* NOTE(review): I runs over 0..3 only.  encode_coding_charset () has
   a branch for dimensions other than 1..3, so if 4-dimension
   charsets can occur here they are never copied back and the tail of
   coding->charsets keeps its old order.  Confirm whether the bound
   should be `i <= 4'.  */
478 for (i = 0; i < 4; i++)
479 for (j = 0; j < ncharsets; j++)
480 if (charsets[j]->dimension == i)
481 coding->charsets[idx++] = charsets[j];
484 MTABLE_CALLOC (code_charset_table, 256, MERROR_CODING);
/* From here NCHARSETS is used as the index of the charset being
   registered (the enclosing loop header is not visible).  */
487 int dim = coding->charsets[ncharsets]->dimension;
488 int from = coding->charsets[ncharsets]->code_range[(dim - 1) * 4];
489 int to = coding->charsets[ncharsets]->code_range[(dim - 1) * 4 + 1];
491 if (coding->charsets[ncharsets]->ascii_compatible)
492 coding->ascii_compatible = 1;
/* NOTE(review): with up to 32 charsets the shift count can be 31, and
   `1 << 31' overflows a signed int; `1U << ncharsets' would match the
   unsigned table.  */
494 code_charset_table[from++] |= 1 << ncharsets;
497 coding->extra_spec = (void *) code_charset_table;
/* Resetter for charset-type coding systems.  It runs
   setup_coding_charset () on the coding system of CONVERTER under a
   condition whose first half is not visible here (presumably only
   while the coding system has not been set up yet), and handles a
   negative result as failure.  */
502 reset_coding_charset (MConverter *converter)
504 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
505 MCodingSystem *coding = internal->coding;
508 && setup_coding_charset (coding) < 0)
/* Decoder for charset-type coding systems: decode SRC_BYTES bytes at
   SOURCE into characters appended to MT.

   Input is read first from internal->carryover (bytes kept from a
   previous call) and then from SOURCE.  The first byte of each
   character indexes the 256-entry table in coding->extra_spec built
   by setup_coding_charset (); the lowest set bit of the entry selects
   the charset, and DECODE_CHAR () maps the assembled code to a
   character.  Whenever the charset changes, the characters produced
   so far are handed to MT with TAKEIN_CHARS so that each run gets its
   own charset property.

   The return value is that of finish_decoding ().  */
515 decode_coding_charset (unsigned char *source, int src_bytes, MText *mt,
516 MConverter *converter)
518 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
519 MCodingSystem *coding = internal->coding;
/* Start with the carried-over bytes; the ONE_MORE_*_BYTE macros switch
   to SOURCE once these are used up.  */
520 unsigned char *src = internal->carryover;
521 unsigned char *src_stop = src + internal->carryover_bytes;
522 unsigned char *src_end = source + src_bytes;
523 unsigned char *src_base;
524 unsigned char *dst = mt->data + mt->nbytes;
525 unsigned char *dst_end = mt->data + mt->allocated;
/* at_most <= 0 means no limit; -1 never equals the character count.  */
528 int at_most = converter->at_most > 0 ? converter->at_most : -1;
530 unsigned *code_charset_table = (unsigned *) coding->extra_spec;
531 MCharset **charsets = coding->charsets;
532 MCharset *charset = mcharset__ascii;
537 MCharset *this_charset = NULL;
541 ONE_MORE_BASE_BYTE (c);
542 mask = code_charset_table[c];
/* Pick the first charset whose bit is set in MASK.  */
552 while (! (mask & 1)) mask >>= 1, idx++;
553 this_charset = charsets[idx];
554 dim = this_charset->dimension;
/* Code points are assembled most significant byte first.  */
558 code = (code << 8) | c;
561 c = DECODE_CHAR (this_charset, code);
/* In lenient mode the source is rewound to the start of the unit and
   the character is attributed to mcharset__binary.  */
568 if (! converter->lenient)
570 REWIND_SRC_TO_BASE ();
572 this_charset = mcharset__binary;
575 if (this_charset != mcharset__ascii
576 && this_charset != charset)
578 TAKEIN_CHARS (mt, nchars - last_nchars,
579 dst - (mt->data + mt->nbytes), charset);
580 charset = this_charset;
581 last_nchars = nchars;
585 /* We reach here because of an invalid byte. */
589 TAKEIN_CHARS (mt, nchars - last_nchars,
590 dst - (mt->data + mt->nbytes), charset);
591 return finish_decoding (mt, converter, nchars,
592 source, src_end, src_base, error);
/* Encoder for charset-type coding systems: encode the characters FROM
   to TO of MT into at most DST_BYTES bytes at DESTINATION.

   A character below 0x80 takes a separate path when the coding system
   is ascii_compatible (its body is not visible here).  Otherwise
   coding->charsets is searched in order for the first charset whose
   ENCODE_CHAR () result is not MCHAR_INVALID_CODE, and the code is
   written most significant byte first in charset->dimension bytes.  A
   character that no charset can encode goes to unsupported_char; in
   lenient mode it is replaced by the text produced by
   encode_unsupporeted_char ().

   On exit converter->nchars and converter->nbytes are advanced by the
   characters consumed and bytes produced.  Returns -1 iff the result
   is MCONVERSION_RESULT_INVALID_CHAR, and 0 otherwise.  */
596 encode_coding_charset (MText *mt, int from, int to,
597 unsigned char *destination, int dst_bytes,
598 MConverter *converter)
600 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
601 MCodingSystem *coding = internal->coding;
602 unsigned char *src, *src_end;
603 unsigned char *dst = destination;
604 unsigned char *dst_end = dst + dst_bytes;
606 int ncharsets = coding->ncharsets;
607 MCharset **charsets = coding->charsets;
608 int ascii_compatible = coding->ascii_compatible;
609 enum MTextFormat format = mt->format;
611 SET_SRC (mt, format, from, to);
616 ONE_MORE_CHAR (c, bytes, format);
618 if (c < 0x80 && ascii_compatible)
626 MCharset *charset = NULL;
/* Try the charsets in list order until one can encode C.  */
631 charset = charsets[i];
632 code = ENCODE_CHAR (charset, c);
633 if (code != MCHAR_INVALID_CODE)
635 if (++i == ncharsets)
636 goto unsupported_char;
639 CHECK_DST (charset->dimension);
640 if (charset->dimension == 1)
644 else if (charset->dimension == 2)
647 *dst++ = code & 0xFF;
649 else if (charset->dimension == 3)
652 *dst++ = (code >> 8) & 0xFF;
653 *dst++ = code & 0xFF;
658 *dst++ = (code >> 16) & 0xFF;
659 *dst++ = (code >> 8) & 0xFF;
660 *dst++ = code & 0xFF;
671 if (! converter->lenient)
673 len = encode_unsupporeted_char (c, dst, dst_end, mt, from + nchars);
675 goto insufficient_destination;
681 /* We reach here because of an unsupported char. */
682 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
685 insufficient_destination:
686 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
689 converter->nchars += nchars;
690 converter->nbytes += dst - destination;
691 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
695 /* Stuff for coding-systems of type MCODING_TYPE_UTF (8). */
/** Return the charset of the character whose multibyte form starts at
    P: mcharset__unicode, mcharset__m17n, or mcharset__binary.

    The head byte (P)[0] announces the length of the sequence.  Each
    following byte is tested with CHAR_HEAD_P (); as soon as a byte
    that should be a trailing byte is itself a head byte, the sequence
    is malformed and the result is mcharset__binary.  A well-formed
    sequence of one to three bytes is mcharset__unicode, one of five or
    six bytes is mcharset__m17n, and a head byte announcing more than
    six bytes is mcharset__binary.

    For a four-byte sequence the result depends on the code point.
    The three payload bits of (P)[0] and the two upper payload bits of
    (P)[1] together are the character code shifted right by 16, so a
    value up to 0x10 (a character up to U+10FFFF) is mcharset__unicode
    and anything larger is mcharset__m17n.  The two parts occupy
    disjoint bit positions and must be combined with `|'; combining
    them with `&' always yields 0 and classifies every four-byte
    sequence as Unicode.

    P is evaluated several times, so it must not have side effects.
    Bytes after the head byte are read only while the preceding ones
    are trailing bytes, so a NUL-terminated buffer is never overrun
    (assuming CHAR_HEAD_P () treats NUL as a head byte).  */
#define UTF8_CHARSET(p)						\
  (! ((p)[0] & 0x80) ? (mcharset__unicode)			\
   : CHAR_HEAD_P ((p) + 1) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x20) ? (mcharset__unicode)			\
   : CHAR_HEAD_P ((p) + 2) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x10) ? (mcharset__unicode)			\
   : CHAR_HEAD_P ((p) + 3) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x08) ? ((((((p)[0] & 0x07) << 2)		\
			    | (((p)[1] & 0x30) >> 4)) <= 0x10)	\
			  ? (mcharset__unicode)			\
			  : (mcharset__m17n))			\
   : CHAR_HEAD_P ((p) + 4) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x04) ? (mcharset__m17n)			\
   : CHAR_HEAD_P ((p) + 5) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x02) ? (mcharset__m17n)			\
   : (mcharset__binary))
/* Decoder for UTF-8: decode SRC_BYTES bytes at SOURCE into characters
   appended to MT.

   The head byte determines the sequence length (2 to 6 bytes in the
   visible branches) and contributes its payload bits; each following
   byte must match 10xxxxxx and contributes six more bits.  FULL is
   set when the converter is lenient or the first charset of the
   coding system is mcharset__m17n; presumably it widens the accepted
   range beyond the Unicode check visible below (below 0xD800, or
   0xE000 up to but excluding 0x110000).  In lenient mode an invalid
   sequence is rewound and attributed to mcharset__binary.  Runs of
   characters are handed to MT with TAKEIN_CHARS whenever the charset
   changes.

   The return value is that of finish_decoding ().  */
716 decode_coding_utf_8 (unsigned char *source, int src_bytes, MText *mt,
717 MConverter *converter)
719 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
720 MCodingSystem *coding = internal->coding;
721 unsigned char *src = internal->carryover;
722 unsigned char *src_stop = src + internal->carryover_bytes;
723 unsigned char *src_end = source + src_bytes;
724 unsigned char *src_base;
725 unsigned char *dst = mt->data + mt->nbytes;
726 unsigned char *dst_end = mt->data + mt->allocated;
729 int at_most = converter->at_most > 0 ? converter->at_most : -1;
731 int full = converter->lenient || (coding->charsets[0] == mcharset__m17n);
732 MCharset *charset = NULL;
737 MCharset *this_charset = NULL;
739 ONE_MORE_BASE_BYTE (c);
/* Classify the head byte by its leading one bits and keep only its
   payload bits in C.  */
743 else if (!(c & 0x40))
745 else if (!(c & 0x20))
746 bytes = 2, c &= 0x1F;
747 else if (!(c & 0x10))
748 bytes = 3, c &= 0x0F;
749 else if (!(c & 0x08))
750 bytes = 4, c &= 0x07;
751 else if (!(c & 0x04))
752 bytes = 5, c &= 0x03;
753 else if (!(c & 0x02))
754 bytes = 6, c &= 0x01;
/* Every following byte must be a trailing byte (10xxxxxx).  */
761 if ((c1 & 0xC0) != 0x80)
763 c = (c << 6) | (c1 & 0x3F);
767 || c < 0xD800 || (c >= 0xE000 && c < 0x110000))
771 if (! converter->lenient)
773 REWIND_SRC_TO_BASE ();
775 this_charset = mcharset__binary;
778 if (this_charset != charset)
780 TAKEIN_CHARS (mt, nchars - last_nchars,
781 dst - (mt->data + mt->nbytes), charset);
782 charset = this_charset;
783 last_nchars = nchars;
787 /* We reach here because of an invalid byte. */
791 TAKEIN_CHARS (mt, nchars - last_nchars,
792 dst - (mt->data + mt->nbytes), charset);
793 return finish_decoding (mt, converter, nchars,
794 source, src_end, src_base, error);
/* Encoder for UTF-8: encode the characters FROM to TO of MT into at
   most DST_BYTES bytes at DESTINATION.

   Fast path: when MT is stored in a format up to MTEXT_FORMAT_UTF_8
   and the converter is lenient or the first charset of the coding
   system is mcharset__m17n, the stored bytes are copied with
   memcpy ().  If DESTINATION is too small, the copy is cut back to a
   character boundary and the result is set to
   MCONVERSION_RESULT_INSUFFICIENT_DST.

   Otherwise characters are handled one at a time; the visible test
   singles out surrogate code points (0xD800 to 0xDFFF) and codes from
   0x110000 upwards, and other characters are written with
   CHAR_STRING ().

   On exit converter->nchars and converter->nbytes are advanced.
   Returns -1 iff the result is MCONVERSION_RESULT_INVALID_CHAR, and 0
   otherwise.  */
798 encode_coding_utf_8 (MText *mt, int from, int to,
799 unsigned char *destination, int dst_bytes,
800 MConverter *converter)
802 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
803 MCodingSystem *coding = internal->coding;
804 unsigned char *src, *src_end;
805 unsigned char *dst = destination;
806 unsigned char *dst_end = dst + dst_bytes;
808 enum MTextFormat format = mt->format;
810 SET_SRC (mt, format, from, to);
812 if (format <= MTEXT_FORMAT_UTF_8
813 && (converter->lenient
814 || coding->charsets[0] == mcharset__m17n))
816 if (dst_bytes < src_end - src)
/* Not everything fits: find the last character boundary that is not
   beyond the available room and stop the copy there.  */
818 int byte_pos = (src + dst_bytes) - mt->data;
820 to = POS_BYTE_TO_CHAR (mt, byte_pos);
821 byte_pos = POS_CHAR_TO_BYTE (mt, to);
822 src_end = mt->data + byte_pos;
823 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
825 memcpy (destination, src, src_end - src);
827 dst += src_end - src;
835 ONE_MORE_CHAR (c, bytes, format);
837 if ((c >= 0xD800 && c < 0xE000) || c >= 0x110000)
840 dst += CHAR_STRING (c, dst);
844 /* We reach here because of an unsupported char. */
845 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
848 insufficient_destination:
849 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
852 converter->nchars += nchars;
853 converter->nbytes += dst - destination;
854 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
858 /* Stuff for coding-systems of type MCODING_TYPE_UTF (16 & 32). */
879 enum utf_endian endian;
/* Prepare a UTF-type coding system for conversion.

   The definition-time parameters are read from coding->extra_info.  A
   code unit of 8 bits marks the coding system as ASCII compatible.
   For 16 and 32 bits the bom and endian values are validated (bom
   must be 0..2, endian 0..1) and an error is raised with
   MERROR (MERROR_CODING, -1) otherwise.  A zero-initialized
   MCodingInfoUTF is then allocated and stored in coding->extra_spec;
   the statements that fill it in are not among the visible lines.  */
883 setup_coding_utf (MCodingSystem *coding)
885 MCodingInfoUTF *info = (MCodingInfoUTF *) (coding->extra_info);
886 MCodingInfoUTF *spec;
888 if (info->code_unit_bits == 8)
889 coding->ascii_compatible = 1;
890 else if (info->code_unit_bits == 16
891 || info->code_unit_bits == 32)
893 if (info->bom < 0 || info->bom > 2
894 || info->endian < 0 || info->endian > 1)
895 MERROR (MERROR_CODING, -1);
900 MSTRUCT_CALLOC (spec, MERROR_CODING);
902 coding->extra_spec = (void *) (spec);
/* Resetter for UTF-type coding systems.  It runs setup_coding_utf ()
   under a condition whose first half is not visible here (presumably
   only while the coding system has not been set up), then
   reinitializes the per-converter state kept in converter->status: no
   pending surrogate, and bom / endian copied from the coding system's
   extra_spec.  */
907 reset_coding_utf (MConverter *converter)
909 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
910 MCodingSystem *coding = internal->coding;
911 struct utf_status *status = (struct utf_status *) &(converter->status);
914 && setup_coding_utf (coding) < 0)
918 status->surrogate = 0;
919 status->bom = ((MCodingInfoUTF *) (coding->extra_spec))->bom;
920 status->endian = ((MCodingInfoUTF *) (coding->extra_spec))->endian;
/* Decoder for UTF-16: decode SRC_BYTES bytes at SOURCE into characters
   appended to MT.

   While status->bom is not UTF_BOM_NO, the first two bytes are
   examined as a byte order mark and select the endianness.  When they
   are not a mark and the BOM is optional (UTF_BOM_MAYBE) or the
   converter is lenient, big endian is assumed and the bytes are
   rewound so that they are decoded as data.  status->bom is then set
   to UTF_BOM_NO so that later calls skip this step.

   Each code unit is assembled from two bytes according to
   status->endian.  A unit outside 0xD800..0xDFFF is a character by
   itself; otherwise a second unit is read, which must lie in
   0xDC00..0xDFFF, and the pair is combined into a character from
   0x10000 upwards.  In lenient mode an invalid unit is rewound and
   attributed to mcharset__binary.

   The return value is that of finish_decoding ().  */
925 decode_coding_utf_16 (unsigned char *source, int src_bytes, MText *mt,
926 MConverter *converter)
928 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
929 unsigned char *src = internal->carryover;
930 unsigned char *src_stop = src + internal->carryover_bytes;
931 unsigned char *src_end = source + src_bytes;
932 unsigned char *src_base;
933 unsigned char *dst = mt->data + mt->nbytes;
934 unsigned char *dst_end = mt->data + mt->allocated;
937 int at_most = converter->at_most > 0 ? converter->at_most : -1;
938 struct utf_status *status = (struct utf_status *) &(converter->status);
939 unsigned char b1, b2;
940 MCharset *charset = NULL;
943 if (status->bom != UTF_BOM_NO)
947 ONE_MORE_BASE_BYTE (b1);
951 status->endian = UTF_BIG_ENDIAN;
/* 0xFFFE is the byte order mark read with the opposite byte order.  */
952 else if (c == 0xFFFE)
953 status->endian = UTF_LITTLE_ENDIAN;
954 else if (status->bom == UTF_BOM_MAYBE
955 || converter->lenient)
957 status->endian = UTF_BIG_ENDIAN;
958 REWIND_SRC_TO_BASE ();
965 status->bom = UTF_BOM_NO;
971 MCharset *this_charset = NULL;
973 ONE_MORE_BASE_BYTE (b1);
975 if (status->endian == UTF_BIG_ENDIAN)
976 c = ((b1 << 8) | b2);
978 c = ((b2 << 8) | b1);
979 if (c < 0xD800 || c >= 0xE000)
985 if (status->endian == UTF_BIG_ENDIAN)
986 c1 = ((b1 << 8) | b2);
988 c1 = ((b2 << 8) | b1);
/* The second unit of a pair must be a low surrogate.  */
989 if (c1 < 0xDC00 || c1 >= 0xE000)
991 c = 0x10000 + ((c - 0xD800) << 10) + (c1 - 0xDC00);
996 if (! converter->lenient)
998 REWIND_SRC_TO_BASE ();
1001 if (status->endian == UTF_BIG_ENDIAN)
1002 c = ((b1 << 8) | b2);
1004 c = ((b2 << 8) | b1);
1005 this_charset = mcharset__binary;
1008 if (this_charset != charset)
1010 TAKEIN_CHARS (mt, nchars - last_nchars,
1011 dst - (mt->data + mt->nbytes), charset);
1012 charset = this_charset;
1013 last_nchars = nchars;
1017 /* We reach here because of an invalid byte. */
1021 TAKEIN_CHARS (mt, nchars - last_nchars,
1022 dst - (mt->data + mt->nbytes), charset);
1023 return finish_decoding (mt, converter, nchars,
1024 source, src_end, src_base, error);
/* Decoder for UTF-32: decode SRC_BYTES bytes at SOURCE into characters
   appended to MT.

   While status->bom is not UTF_BOM_NO, the first four bytes are read
   big-endian and compared with the byte order mark: 0x0000FEFF
   selects big endian and 0xFFFE0000 little endian.  Anything else is
   accepted as data (big endian assumed, source rewound) when the BOM
   is optional or the converter is lenient.  status->bom is then set
   to UTF_BOM_NO.

   Each character is assembled from four bytes according to
   status->endian and is accepted when it is below 0xD800 or in
   0xE000..0x10FFFF.  In lenient mode an invalid unit is rewound and
   attributed to mcharset__binary.

   The return value is that of finish_decoding ().  */
1029 decode_coding_utf_32 (unsigned char *source, int src_bytes, MText *mt,
1030 MConverter *converter)
1032 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1033 unsigned char *src = internal->carryover;
1034 unsigned char *src_stop = src + internal->carryover_bytes;
1035 unsigned char *src_end = source + src_bytes;
1036 unsigned char *src_base;
1037 unsigned char *dst = mt->data + mt->nbytes;
1038 unsigned char *dst_end = mt->data + mt->allocated;
1040 int last_nchars = 0;
1041 int at_most = converter->at_most > 0 ? converter->at_most : -1;
1042 struct utf_status *status = (struct utf_status *) &(converter->status);
1043 unsigned char b1, b2, b3, b4;
1044 MCharset *charset = NULL;
1047 if (status->bom != UTF_BOM_NO)
1051 ONE_MORE_BASE_BYTE (b1);
1055 c = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
1056 if (c == 0x0000FEFF)
1057 status->endian = UTF_BIG_ENDIAN;
1058 else if (c == 0xFFFE0000)
1059 status->endian = UTF_LITTLE_ENDIAN;
1060 else if (status->bom == UTF_BOM_MAYBE
1061 || converter->lenient)
1063 status->endian = UTF_BIG_ENDIAN;
1064 REWIND_SRC_TO_BASE ();
1071 status->bom = UTF_BOM_NO;
1077 MCharset *this_charset = NULL;
1079 ONE_MORE_BASE_BYTE (b1);
1083 if (status->endian == UTF_BIG_ENDIAN)
1084 c = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
1086 c = (b4 << 24) | (b3 << 16) | (b2 << 8) | b1;
1087 if (c < 0xD800 || (c >= 0xE000 && c < 0x110000))
1090 if (! converter->lenient)
1092 REWIND_SRC_TO_BASE ();
1094 this_charset = mcharset__binary;
1097 if (this_charset != charset)
1099 TAKEIN_CHARS (mt, nchars - last_nchars,
1100 dst - (mt->data + mt->nbytes), charset);
1101 charset = this_charset;
1102 last_nchars = nchars;
1106 /* We reach here because of an invalid byte. */
1110 TAKEIN_CHARS (mt, nchars - last_nchars,
1111 dst - (mt->data + mt->nbytes), charset);
1112 return finish_decoding (mt, converter, nchars,
1113 source, src_end, src_base, error);
/* Encoder for UTF-16: encode the characters FROM to TO of MT into at
   most DST_BYTES bytes at DESTINATION.

   While status->bom is not UTF_BOM_NO a byte order mark is written
   first (FE FF or FF FE) and status->bom is reset so that it is
   emitted only once per converter.  Characters below 0xD800 or in
   0xE000..0xFFFF are written as one 16-bit unit, characters in
   0x10000..0x10FFFF as a surrogate pair, each unit in the byte order
   given by status->endian.  Any other character is unsupported; in
   lenient mode its placeholder text from encode_unsupporeted_char ()
   is written with every ASCII byte widened to one 16-bit unit.

   On exit converter->nchars and converter->nbytes are advanced.
   Returns -1 iff the result is MCONVERSION_RESULT_INVALID_CHAR, and 0
   otherwise.  */
1118 encode_coding_utf_16 (MText *mt, int from, int to,
1119 unsigned char *destination, int dst_bytes,
1120 MConverter *converter)
1122 unsigned char *src, *src_end;
1123 unsigned char *dst = destination;
1124 unsigned char *dst_end = dst + dst_bytes;
1126 struct utf_status *status = (struct utf_status *) &(converter->status);
1127 int big_endian = status->endian == UTF_BIG_ENDIAN;
1128 enum MTextFormat format = mt->format;
1130 SET_SRC (mt, format, from, to);
1132 if (status->bom != UTF_BOM_NO)
1136 *dst++ = 0xFE, *dst++ = 0xFF;
1138 *dst++ = 0xFF, *dst++ = 0xFE;
1139 status->bom = UTF_BOM_NO;
1146 ONE_MORE_CHAR (c, bytes, format);
1148 if (c < 0xD800 || (c >= 0xE000 && c < 0x10000))
1152 *dst++ = c >> 8, *dst++ = c & 0xFF;
1154 *dst++ = c & 0xFF, *dst++ = c >> 8;
1156 else if (c >= 0x10000 && c < 0x110000)
/* Split into a high (0xD800 based) and a low (0xDC00 based)
   surrogate.  */
1162 c1 = (c >> 10) + 0xD800;
1163 c2 = (c & 0x3FF) + 0xDC00;
1165 *dst++ = c1 >> 8, *dst++ = c1 & 0xFF,
1166 *dst++ = c2 >> 8, *dst++ = c2 & 0xFF;
1168 *dst++ = c1 & 0xFF, *dst++ = c1 >> 8,
1169 *dst++ = c2 & 0xFF, *dst++ = c2 >> 8;
1173 unsigned char buf[11];
1176 if (! converter->lenient)
/* NOTE(review): the limit passed here only guarantees that LEN bytes
   fit into the destination, while the loops below write 2 * LEN
   bytes.  Confirm that a check for the doubled size exists in the
   lines not visible here.  */
1178 len = encode_unsupporeted_char (c, buf, buf + (dst_end - dst),
1181 goto insufficient_destination;
1183 for (i = 0; i < len; i++)
1184 *dst++ = 0, *dst++ = buf[i];
1186 for (i = 0; i < len; i++)
1187 *dst++ = buf[i], *dst++ = 0;
1192 /* We reach here because of an unsupported char. */
1193 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
1196 insufficient_destination:
1197 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
1200 converter->nchars += nchars;
1201 converter->nbytes += dst - destination;
1202 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
/* Encoder for UTF-32: encode the characters FROM to TO of MT into at
   most DST_BYTES bytes at DESTINATION.

   While status->bom is not UTF_BOM_NO a four-byte byte order mark is
   written first and status->bom is reset.  Characters below 0xD800 or
   in 0xE000..0x10FFFF are written as four bytes in the byte order
   given by status->endian.  Any other character is unsupported; in
   lenient mode its placeholder text from encode_unsupporeted_char ()
   is written instead.

   On exit converter->nchars and converter->nbytes are advanced.
   Returns -1 iff the result is MCONVERSION_RESULT_INVALID_CHAR, and 0
   otherwise.  */
1206 encode_coding_utf_32 (MText *mt, int from, int to,
1207 unsigned char *destination, int dst_bytes,
1208 MConverter *converter)
1210 unsigned char *src, *src_end;
1211 unsigned char *dst = destination;
1212 unsigned char *dst_end = dst + dst_bytes;
1214 struct utf_status *status = (struct utf_status *) &(converter->status);
1215 int big_endian = status->endian == UTF_BIG_ENDIAN;
1216 enum MTextFormat format = mt->format;
1218 SET_SRC (mt, format, from, to);
1220 if (status->bom != UTF_BOM_NO)
1224 *dst++ = 0x00, *dst++ = 0x00, *dst++ = 0xFE, *dst++ = 0xFF;
1226 *dst++ = 0xFF, *dst++ = 0xFE, *dst++ = 0x00, *dst++ = 0x00;
1227 status->bom = UTF_BOM_NO;
1234 ONE_MORE_CHAR (c, bytes, format);
1236 if (c < 0xD800 || (c >= 0xE000 && c < 0x110000))
1240 *dst++ = 0x00, *dst++ = c >> 16,
1241 *dst++ = (c >> 8) & 0xFF, *dst++ = c & 0xFF;
1243 *dst++ = c & 0xFF, *dst++ = (c >> 8) & 0xFF,
1244 *dst++ = c >> 16, *dst++ = 0x00;
1248 unsigned char buf[11];
1251 if (! converter->lenient)
1253 len = encode_unsupporeted_char (c, buf, buf + (dst_end - dst),
1256 goto insufficient_destination;
/* NOTE(review): the loops below emit two bytes per placeholder
   character, as the UTF-16 encoder does, whereas a UTF-32 code unit
   is four bytes.  Also only LEN bytes of room were checked.  Confirm
   against the lines not visible here.  */
1258 for (i = 0; i < len; i++)
1259 *dst++ = 0, *dst++ = buf[i];
1261 for (i = 0; i < len; i++)
1262 *dst++ = buf[i], *dst++ = 0;
1267 /* We reach here because of an unsupported char. */
1268 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
1271 insufficient_destination:
1272 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
1275 converter->nchars += nchars;
1276 converter->nbytes += dst - destination;
1277 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
1281 /* Stuff for coding-systems of type MCODING_TYPE_ISO_2022. */
/* Byte values of the control codes that the ISO-2022 converters in
   this file refer to by name.  */
1283 #define ISO_CODE_STX 0x02 /* start text */
1284 #define ISO_CODE_SO 0x0E /* shift-out */
1285 #define ISO_CODE_SI 0x0F /* shift-in */
1286 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
1287 #define ISO_CODE_ESC 0x1B /* escape */
1288 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
1289 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
1291 /** Structure pointed by MCodingSystem.extra_spec.  It is allocated
    and filled in by setup_coding_iso_2022 (), which also sets the
    members flags, designations, n_designations and use_esc (their
    declarations are not among the lines visible here).  */
1293 struct iso_2022_spec
1297 /** Initial graphic registers (0..3) invoked to each graphic
1298 plane left and right. */
1299 int initial_invocation[2];
1301 /** Initially designated charsets for each graphic register. */
1302 MCharset *initial_designation[4];
/* Per-converter ISO-2022 state.  reset_coding_iso_2022 () and
   decode_coding_iso_2022 () access it by casting the address of
   converter->status, and the former initializes the invocation and
   designation members from the coding system's struct iso_2022_spec
   and clears single_shifting.  */
1310 struct iso_2022_status
/* Charset currently designated to each of the four graphic
   registers.  */
1313 MCharset *designation[4];
1314 unsigned single_shifting : 1;
1317 unsigned utf8_shifting : 1;
/* State for non-standard charsets; NOTE(review): presumably used for
   the compound-text extended segments handled by
   find_ctext_non_standard_charset () -- confirm.  */
1318 MCharset *non_standard_charset;
1319 int non_standard_charset_bytes;
1320 int non_standard_encoding;
/* Classification of every byte value for the ISO-2022 converters, and
   the 256-entry table indexed by byte value that holds it.  */
1323 enum iso_2022_code_class {
1324 ISO_control_0, /* Control codes in the range
1325 0x00..0x1F and 0x7F, except for the
1326 following 4 codes. */
1327 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
1328 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
1329 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
1330 ISO_escape, /* ISO_CODE_ESC (0x1B) */
1331 ISO_control_1, /* Control codes in the range
1332 0x80..0x9F, except for the
1333 following 3 codes. */
1334 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
1335 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
1336 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
1337 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
1338 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
1339 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
1340 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
1341 } iso_2022_code_class[256];
/* All flag bits that select a designation policy.
   setup_coding_iso_2022 () masks the coding system's flags with this
   value to find out whether, and which, policy is in effect.  */
1344 #define MCODING_ISO_DESIGNATION_MASK \
1345 (MCODING_ISO_DESIGNATION_G0 \
1346 | MCODING_ISO_DESIGNATION_G1 \
1347 | MCODING_ISO_DESIGNATION_CTEXT \
1348 | MCODING_ISO_DESIGNATION_CTEXT_EXT)
/* Prepare an ISO-2022 type coding system for conversion.

   A struct iso_2022_spec is allocated, filled from the definition
   time parameters in coding->extra_info and stored in
   coding->extra_spec:

   - flags and initial_invocation are copied;
   - when a designation policy is selected, a designations[] array is
     allocated (one entry per charset, plus one per entry of
     mcharset__iso_2022_table under MCODING_ISO_FULL_SUPPORT) and
     initialized to -1; full support without a policy is an error;
   - each charset's requested register is validated, recorded in
     initial_designation[] (at most one charset per register) and in
     designations[];
   - coding->ascii_compatible is set when mcharset__ascii is among the
     charsets, and cleared again when G0 or compound-text designation
     or locking shift is in use;
   - use_esc records whether escape sequences can occur.

   Errors are raised with MERROR (MERROR_CODING, -1).  */
1351 setup_coding_iso_2022 (MCodingSystem *coding)
1353 MCodingInfoISO2022 *info = (MCodingInfoISO2022 *) (coding->extra_info);
1354 int ncharsets = coding->ncharsets;
1355 struct iso_2022_spec *spec;
1356 int designation_policy = info->flags & MCODING_ISO_DESIGNATION_MASK;
1359 coding->ascii_compatible = 0;
1361 MSTRUCT_CALLOC (spec, MERROR_CODING);
1363 spec->flags = info->flags;
1364 spec->initial_invocation[0] = info->initial_invocation[0];
1365 spec->initial_invocation[1] = info->initial_invocation[1];
1366 for (i = 0; i < 4; i++)
1367 spec->initial_designation[i] = NULL;
1368 if (designation_policy)
1370 spec->n_designations = ncharsets;
1371 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1372 spec->n_designations += mcharset__iso_2022_table.used;
1373 MTABLE_CALLOC (spec->designations, spec->n_designations, MERROR_CODING);
1374 for (i = 0; i < spec->n_designations; i++)
1375 spec->designations[i] = -1;
/* No designation policy: full support cannot work without one.  */
1379 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1380 MERROR (MERROR_CODING, -1);
1381 spec->designations = NULL;
1384 for (i = 0; i < ncharsets; i++)
1386 int reg = info->designations[i];
1389 && coding->charsets[i]->final_byte > 0
1390 && (reg < -4 || reg > 3))
1391 MERROR (MERROR_CODING, -1);
/* A register may have only one initially designated charset.  */
1394 if (spec->initial_designation[reg])
1395 MERROR (MERROR_CODING, -1);
1396 spec->initial_designation[reg] = coding->charsets[i];
1400 if (! designation_policy
1401 && ! (spec->flags & MCODING_ISO_EUC_TW_SHIFT))
1402 MERROR (MERROR_CODING, -1);
1406 if (designation_policy)
1407 spec->designations[i] = reg;
1408 if (coding->charsets[i] == mcharset__ascii)
1409 coding->ascii_compatible = 1;
1412 if (coding->ascii_compatible
1413 && (spec->flags & (MCODING_ISO_DESIGNATION_G0
1414 | MCODING_ISO_DESIGNATION_CTEXT
1415 | MCODING_ISO_DESIGNATION_CTEXT_EXT
1416 | MCODING_ISO_LOCKING_SHIFT)))
1417 coding->ascii_compatible = 0;
/* With full support, every charset of the ISO-2022 table gets a
   register: for the compound-text policies 1 when its first
   code_range entry is 32 or its second is 255, otherwise 1 exactly
   when the policy is G1 designation, and 0 in all other cases.  */
1419 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1420 for (i = 0; i < mcharset__iso_2022_table.used; i++)
1422 MCharset *charset = mcharset__iso_2022_table.charsets[i];
1424 spec->designations[ncharsets + i]
1425 = ((designation_policy == MCODING_ISO_DESIGNATION_CTEXT
1426 || designation_policy == MCODING_ISO_DESIGNATION_CTEXT_EXT)
1427 ? (charset->code_range[0] == 32
1428 || charset->code_range[1] == 255)
1429 : designation_policy == MCODING_ISO_DESIGNATION_G1);
1432 spec->use_esc = ((spec->flags & MCODING_ISO_DESIGNATION_MASK)
1433 || ((spec->flags & MCODING_ISO_LOCKING_SHIFT)
1434 && (spec->initial_designation[2]
1435 || spec->initial_designation[3]))
1436 || (! (spec->flags & MCODING_ISO_EIGHT_BIT)
1437 && (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1438 || (spec->flags & MCODING_ISO_ISO6429));
1440 coding->extra_spec = (void *) spec;
/* Resetter for ISO-2022 type coding systems.  It runs
   setup_coding_iso_2022 () under a condition whose first half is not
   visible here (presumably only while the coding system has not been
   set up), then puts the per-converter state in converter->status
   back to the initial state described by the coding system's spec:
   initial invocations, initial designations, and no single shift in
   progress.  */
1446 reset_coding_iso_2022 (MConverter *converter)
1448 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1449 MCodingSystem *coding = internal->coding;
1450 struct iso_2022_status *status
1451 = (struct iso_2022_status *) &(converter->status);
1452 struct iso_2022_spec *spec;
1456 && setup_coding_iso_2022 (coding) < 0)
1460 spec = (struct iso_2022_spec *) coding->extra_spec;
1461 status->invocation[0] = spec->initial_invocation[0];
1462 status->invocation[1] = spec->initial_invocation[1];
1463 for (i = 0; i < 4; i++)
1464 status->designation[i] = spec->initial_designation[i];
1465 status->single_shifting = 0;
/* Handle a designation found while decoding: designate to graphic
   register REG the charset identified by dimension DIM, number of
   characters CHARS, final byte FINAL and revision REV.

   FINAL must be in '0'..127, otherwise control goes to invalid_byte.
   In one branch the charset is looked up with MCHARSET_ISO_2022 ()
   and, unless the coding system has MCODING_ISO_FULL_SUPPORT, must be
   one of coding->charsets.  In the other branch
   mcharset__iso_2022_table is searched for an entry matching
   revision, dimension, final byte and code range (a CHARS of 96 also
   matches a range ending at 255).  A failed lookup goes to
   invalid_byte; on success status->designation[REG] is updated.

   The macro uses the locals spec, coding and status of the expanding
   function, which must also define the label invalid_byte.  */
1472 #define ISO2022_DECODE_DESIGNATION(reg, dim, chars, final, rev) \
1474 MCharset *charset; \
1476 if ((final) < '0' || (final) >= 128) \
1477 goto invalid_byte; \
1480 charset = MCHARSET_ISO_2022 ((dim), (chars), (final)); \
1481 if (! (spec->flags & MCODING_ISO_FULL_SUPPORT)) \
1485 for (i = 0; i < coding->ncharsets; i++) \
1486 if (charset == coding->charsets[i]) \
1488 if (i == coding->ncharsets) \
1489 goto invalid_byte; \
1496 for (i = 0; i < mcharset__iso_2022_table.used; i++) \
1498 charset = mcharset__iso_2022_table.charsets[i]; \
1499 if (charset->revision == (rev) \
1500 && charset->dimension == (dim) \
1501 && charset->final_byte == (final) \
1502 && (charset->code_range[1] == (chars) \
1503 || ((chars) == 96 && charset->code_range[1] == 255))) \
1506 if (i == mcharset__iso_2022_table.used) \
1507 goto invalid_byte; \
1509 status->designation[reg] = charset; \
/* Map a charset name taken from a compound-text extended segment to a
   charset object.  The names visible here are "koi8-r" (charset
   "koi8-r") and "big5-0" (charset "big5"); the result for any other
   name is decided on lines missing from this listing.  */
1514 find_ctext_non_standard_charset (char *charset_name)
1518 if (! strcmp (charset_name, "koi8-r"))
1519 charset = MCHARSET (msymbol ("koi8-r"));
1520 else if (! strcmp (charset_name, "big5-0"))
1521 charset = MCHARSET (msymbol ("big5"));
/* Decode the SRC_BYTES bytes at SOURCE as ISO-2022 text and append
   the result to MT, using and updating the designation/invocation
   state kept in CONVERTER->status.  Bytes are classified through
   iso_2022_code_class; graphic bytes are decoded with the charset
   currently invoked to the matching plane, while shift functions and
   escape sequences update the state.  Runs of characters decoded with
   the same charset are committed with TAKEIN_CHARS.  The return value
   is whatever finish_decoding returns.  */
1528 decode_coding_iso_2022 (unsigned char *source, int src_bytes, MText *mt,
1529 MConverter *converter)
1531 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1532 MCodingSystem *coding = internal->coding;
1533 unsigned char *src = internal->carryover;
1534 unsigned char *src_stop = src + internal->carryover_bytes;
1535 unsigned char *src_end = source + src_bytes;
1536 unsigned char *src_base;
1537 unsigned char *dst = mt->data + mt->nbytes;
1538 unsigned char *dst_end = mt->data + mt->allocated;
1540 int last_nchars = 0;
1541 int at_most = converter->at_most > 0 ? converter->at_most : -1;
1542 struct iso_2022_spec *spec = (struct iso_2022_spec *) coding->extra_spec;
1543 struct iso_2022_status *status
1544 = (struct iso_2022_status *) &(converter->status);
1545 MCharset *charset0, *charset1, *charset;
1547 MCharset *cns_charsets[15];
/* CHARSET0/CHARSET1 cache the charsets currently invoked to the two
   planes; a negative invocation means nothing is invoked there.  */
1549 charset0 = (status->invocation[0] >= 0
1550 ? status->designation[status->invocation[0]] : NULL);
1551 charset1 = (status->invocation[1] >= 0
1552 ? status->designation[status->invocation[1]] : NULL);
1553 charset = mcharset__ascii;
1555 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
/* Build the table used by the EUC-TW style shift: two-dimensional
   charsets whose upper code range is 126 are indexed by final byte
   ('G'..'M' go to slots 0..6); slot 14 is filled on a branch whose
   condition is missing from this listing.  */
1559 memset (cns_charsets, 0, sizeof (cns_charsets));
1560 for (i = 0; i < coding->ncharsets; i++)
1561 if (coding->charsets[i]->dimension == 2
1562 && coding->charsets[i]->code_range[1] == 126)
1564 int final = coding->charsets[i]->final_byte;
1566 if (final >= 'G' && final <= 'M')
1567 cns_charsets[final - 'G'] = coding->charsets[i];
1569 cns_charsets[14] = coding->charsets[i];
1575 MCharset *this_charset = NULL;
1578 ONE_MORE_BASE_BYTE (c1);
1580 if (status->utf8_shifting)
1583 int bytes = CHAR_BYTES_BY_HEAD (c1);
1587 for (i = 1; i < bytes; i++)
1592 this_charset = UTF8_CHARSET (buf);
1593 c1 = STRING_CHAR_UTF8 (buf);
1597 if (status->non_standard_encoding > 0)
1601 this_charset = status->non_standard_charset;
1602 for (i = 1; i < status->non_standard_charset_bytes; i++)
1605 c1 = (c1 << 8) | c2;
1607 c1 = DECODE_CHAR (this_charset, c1);
1611 switch (iso_2022_code_class[c1])
1613 case ISO_graphic_plane_0:
1614 this_charset = charset0;
1617 case ISO_0x20_or_0x7F:
1619 || (charset0->code_range[0] != 32
1620 && charset0->code_range[1] != 255))
1621 /* This is SPACE or DEL. */
1622 this_charset = mcharset__ascii;
1624 /* This is a graphic character of plane 0. */
1625 this_charset = charset0;
1628 case ISO_graphic_plane_1:
1631 this_charset = charset1;
1634 case ISO_0xA0_or_0xFF:
1636 || charset1->code_range[0] == 33
1637 || ! (spec->flags & MCODING_ISO_EIGHT_BIT))
1639 /* This is a graphic character of plane 1. */
1642 this_charset = charset1;
1646 this_charset = mcharset__ascii;
1653 if ((spec->flags & MCODING_ISO_LOCKING_SHIFT)
1654 && status->designation[1])
1656 status->invocation[0] = 1;
1657 charset0 = status->designation[1];
1660 this_charset = mcharset__ascii;
1664 if (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1666 status->invocation[0] = 0;
1667 charset0 = status->designation[0];
1670 this_charset = mcharset__ascii;
1673 case ISO_single_shift_2_7:
1674 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT_7))
1676 this_charset = mcharset__ascii;
1680 goto label_escape_sequence;
1682 case ISO_single_shift_2:
1683 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
1686 if (c1 < 0xA1 || (c1 > 0xA7 && c1 < 0xAF) || c1 > 0xAF
1687 || ! cns_charsets[c1 - 0xA1])
1689 status->designation[2] = cns_charsets[c1 - 0xA1];
1691 else if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1693 /* SS2 is handled as an escape sequence of ESC 'N' */
1695 goto label_escape_sequence;
1697 case ISO_single_shift_3:
1698 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1700 /* SS3 is handled as an escape sequence of ESC 'O' */
1702 goto label_escape_sequence;
1704 case ISO_control_sequence_introducer:
1705 /* CSI is handled as an escape sequence of ESC '[' ... */
1707 goto label_escape_sequence;
1710 if (! spec->use_esc)
1712 this_charset = mcharset__ascii;
1716 label_escape_sequence:
1717 /* Escape sequences handled here are invocation,
1718 designation, and direction specification. */
1721 case '&': /* revision of following character set */
1722 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1723 goto unused_escape_sequence;
1725 if (c1 < '@' || c1 > '~')
1728 if (c1 != ISO_CODE_ESC)
1731 goto label_escape_sequence;
1733 case '$': /* designation of 2-byte character set */
1734 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1735 goto unused_escape_sequence;
1737 if (c1 >= '@' && c1 <= 'B')
1738 { /* designation of JISX0208.1978, GB2312.1980, or
1740 ISO2022_DECODE_DESIGNATION (0, 2, 94, c1, -1);
1742 else if (c1 >= 0x28 && c1 <= 0x2B)
1743 { /* designation of (dimension 2, chars 94) character set */
1745 ISO2022_DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2, -1);
1747 else if (c1 >= 0x2C && c1 <= 0x2F)
1748 { /* designation of (dimension 2, chars 96) character set */
1750 ISO2022_DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2, -1);
1754 /* We must update these variables now. */
1755 charset0 = status->designation[status->invocation[0]];
1756 charset1 = status->designation[status->invocation[1]];
1759 case 'n': /* invocation of locking-shift-2 */
1760 if (! (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1761 || ! status->designation[2])
1763 status->invocation[0] = 2;
1764 charset0 = status->designation[2];
1767 case 'o': /* invocation of locking-shift-3 */
1768 if (! (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1769 || ! status->designation[3])
1771 status->invocation[0] = 3;
1772 charset0 = status->designation[3];
1775 case 'N': /* invocation of single-shift-2 */
1776 if (! ((spec->flags & MCODING_ISO_SINGLE_SHIFT)
1777 || (spec->flags & MCODING_ISO_EUC_TW_SHIFT))
1778 || ! status->designation[2])
1780 this_charset = status->designation[2];
1782 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1786 case 'O': /* invocation of single-shift-3 */
1787 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT)
1788 || ! status->designation[3])
1790 this_charset = status->designation[3];
1792 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1796 case '[': /* specification of direction */
1797 if (! (spec->flags & MCODING_ISO_ISO6429))
1799 /* For the moment, nested direction is not supported.
1800 So, (coding->mode & CODING_MODE_DIRECTION) zero means
1801 left-to-right, and nonzero means right-to-left. */
1805 case ']': /* end of the current direction */
1806 case '0': /* end of the current direction */
1810 case '1': /* start of left-to-right direction */
1817 case '2': /* start of right-to-left direction */
1831 char charset_name[16];
/* The flag test must be parenthesized: `!' binds tighter than `&',
   so `! spec->flags & FLAG' would negate the whole flag word first
   and then mask the resulting 0 or 1 with FLAG.  */
1835 if (! (spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT))
1837 /* Compound-text uses these escape sequences:
1839 ESC % G -- utf-8 bytes -- ESC % @
1840 ESC % / 1 M L -- charset name -- STX -- bytes --
1841 ESC % / 2 M L -- charset name -- STX -- bytes --
1842 ESC % / 3 M L -- charset name -- STX -- bytes --
1843 ESC % / 4 M L -- charset name -- STX -- bytes --
1845 It also uses this sequence but that is not yet
1848 ESC % / 0 M L -- charset name -- STX -- bytes -- */
1853 status->utf8_shifting = 1;
1858 if (! status->utf8_shifting)
1860 status->utf8_shifting = 0;
1866 if (c1 < '1' || c1 > '4')
1868 status->non_standard_charset_bytes = c1 - '0';
1871 if (c1 < 128 || c2 < 128)
/* M and L each carry seven bits of the segment length, with the top
   bit set on both bytes.  */
1873 bytes = (c1 - 128) * 128 + (c2 - 128);
1874 for (i = 0; i < 16; i++)
1877 if (c1 == ISO_CODE_STX)
1879 charset_name[i] = TOLOWER (c1);
1883 charset_name[i++] = '\0';
1884 this_charset = find_ctext_non_standard_charset (charset_name);
1887 status->non_standard_charset = this_charset;
1888 status->non_standard_encoding = bytes - i;
1893 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1894 goto unused_escape_sequence;
1895 if (c1 >= 0x28 && c1 <= 0x2B)
1896 { /* designation of (dimension 1, chars 94) charset */
1898 ISO2022_DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2, -1);
1900 else if (c1 >= 0x2C && c1 <= 0x2F)
1901 { /* designation of (dimension 1, chars 96) charset */
1903 ISO2022_DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2, -1);
1907 /* We must update these variables now. */
1908 charset0 = status->designation[status->invocation[0]];
1909 charset1 = status->designation[status->invocation[1]];
1912 unused_escape_sequence:
1913 UNGET_ONE_BYTE (c1);
1915 this_charset = mcharset__ascii;
1919 if (this_charset->dimension == 1)
1921 if (this_charset->code_range[1] <= 128)
1924 else if (this_charset->dimension == 2)
1927 c1 = ((c1 & 0x7F) << 8) | (c2 & 0x7F);
1929 else /* i.e. (dimension == 3) */
1933 c1 = ((c1 & 0x7F) << 16) | ((c2 & 0x7F) << 8) | (c3 & 0x7F);
1935 c1 = DECODE_CHAR (this_charset, c1);
1939 if (! converter->lenient)
1941 REWIND_SRC_TO_BASE ();
1943 this_charset = mcharset__binary;
/* When the charset changes (ASCII excepted), commit the characters
   decoded so far under the previous charset.  */
1946 if (this_charset != mcharset__ascii
1947 && this_charset != charset)
1949 TAKEIN_CHARS (mt, nchars - last_nchars,
1950 dst - (mt->data + mt->nbytes), charset);
1951 charset = this_charset;
1952 last_nchars = nchars;
1955 if (status->non_standard_encoding > 0)
1956 status->non_standard_encoding -= status->non_standard_charset_bytes;
1958 /* We reach here because of an invalid byte. */
1964 TAKEIN_CHARS (mt, nchars - last_nchars,
1965 dst - (mt->data + mt->nbytes), charset);
1966 return finish_decoding (mt, converter, nchars,
1967 source, src_end, src_base, error);
/* Note for readers: the expansion writes through the enclosing
   function's `dst', bails out to the label `memory_shortage' when
   fewer than four bytes remain before `dst_end', and chooses the
   94-set or 96-set intermediate byte from CHARSET->code_range.  */
1971 /* Produce codes (escape sequence) for designating CHARSET to graphic
1972 register REG at DST, and increment DST. If CHARSET->final-char is
1973 '@', 'A', or 'B' and SHORT_FORM is nonzero, produce designation
1974 sequence of short-form. Update STATUS->designation. */
1976 #define ISO2022_ENCODE_DESIGNATION(reg, charset, spec, status) \
1978 char *intermediate_char_94 = "()*+"; \
1979 char *intermediate_char_96 = ",-./"; \
1981 if (dst + 4 > dst_end) \
1982 goto memory_shortage; \
1983 *dst++ = ISO_CODE_ESC; \
1984 if (charset->dimension == 1) \
1986 if (charset->code_range[0] != 32 \
1987 && charset->code_range[1] != 255) \
1988 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1990 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1995 if (charset->code_range[0] != 32 \
1996 && charset->code_range[1] != 255) \
1998 if (spec->flags & MCODING_ISO_LONG_FORM \
2000 || charset->final_byte < '@' || charset->final_byte > 'B') \
2001 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
2004 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
2006 *dst++ = charset->final_byte; \
2008 status->designation[reg] = charset; \
/* Both single-shift macros need two free bytes at `dst' (otherwise
   they jump to `memory_shortage').  Without MCODING_ISO_EIGHT_BIT
   they emit ESC 'N' / ESC 'O'; the lines emitting ISO_CODE_SS2 /
   ISO_CODE_SS3 are presumably the matching `else' branch, which is
   not visible in this listing.  Both set status->single_shifting.  */
2012 /* The following two macros produce codes (control character or escape
2013 sequence) for ISO-2022 single-shift functions (single-shift-2 and
2016 #define ISO2022_ENCODE_SINGLE_SHIFT_2(spec, status) \
2018 if (dst + 2 > dst_end) \
2019 goto memory_shortage; \
2020 if (! (spec->flags & MCODING_ISO_EIGHT_BIT)) \
2021 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
2023 *dst++ = ISO_CODE_SS2; \
2024 status->single_shifting = 1; \
2028 #define ISO2022_ENCODE_SINGLE_SHIFT_3(spec, status) \
2030 if (dst + 2 > dst_end) \
2031 goto memory_shortage; \
2032 if (! (spec->flags & MCODING_ISO_EIGHT_BIT)) \
2033 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
2035 *dst++ = ISO_CODE_SS3; \
2036 status->single_shifting = 1; \
/* Each locking-shift macro checks the space left before `dst_end'
   (one byte for SI/SO, two for ESC 'n' / ESC 'o'), jumps to
   `memory_shortage' if it is short, writes the code at `dst', and
   records the newly invoked register (0..3) in status->invocation[0].  */
2040 /* The following four macros produce codes (control character or
2041 escape sequence) for ISO-2022 locking-shift functions (shift-in,
2042 shift-out, locking-shift-2, and locking-shift-3). */
2044 #define ISO2022_ENCODE_SHIFT_IN(status) \
2046 if (dst + 1 > dst_end) \
2047 goto memory_shortage; \
2048 *dst++ = ISO_CODE_SI; \
2049 status->invocation[0] = 0; \
2053 #define ISO2022_ENCODE_SHIFT_OUT(status) \
2055 if (dst + 1 > dst_end) \
2056 goto memory_shortage; \
2057 *dst++ = ISO_CODE_SO; \
2058 status->invocation[0] = 1; \
2062 #define ISO2022_ENCODE_LOCKING_SHIFT_2(status) \
2064 if (dst + 2 > dst_end) \
2065 goto memory_shortage; \
2066 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
2067 status->invocation[0] = 2; \
2071 #define ISO2022_ENCODE_LOCKING_SHIFT_3(status) \
2073 if (dst + 2 > dst_end) \
2074 goto memory_shortage; \
2075 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
2076 status->invocation[0] = 3; \
/* Emit the escape sequences that start and end a UTF-8 section and
   set or clear status->utf8_shifting accordingly.  The START macro
   reserves room for the 3-byte sequence plus LEN following bytes via
   CHECK_DST.  Only the ESC byte of each sequence is visible in this
   listing; the remaining bytes are presumably "% G" for start and
   "% @" for end -- TODO confirm against the full file.  */
2079 #define ISO2022_ENCODE_UTF8_SHIFT_START(len) \
2081 CHECK_DST (3 + len); \
2082 *dst++ = ISO_CODE_ESC; \
2085 status->utf8_shifting = 1; \
2089 #define ISO2022_ENCODE_UTF8_SHIFT_END() \
2092 *dst++ = ISO_CODE_ESC; \
2095 status->utf8_shifting = 0; \
/* Start a compound-text extended segment for the charset named NAME
   (LEN bytes long).  Room is reserved for the 6-byte header, the
   name, the STX terminator and one character.  The header start is
   remembered in `non_standard_begin' so that its two length bytes,
   written as zero here, can be filled in later;
   `non_standard_bytes' starts at LEN + 1 (name plus STX).  Uses the
   enclosing function's `dst', `non_standard_begin',
   `non_standard_bytes' and `non_standard_charset_bytes'.  */
2099 #define ISO2022_ENCODE_NON_STANDARD(name, len) \
2101 CHECK_DST (6 + len + 1 + non_standard_charset_bytes); \
2102 non_standard_begin = dst; \
2103 *dst++ = ISO_CODE_ESC; \
2106 *dst++ = '0' + non_standard_charset_bytes; \
2107 *dst++ = 0, *dst++ = 0; /* filled later */ \
2108 memcpy (dst, name, len); \
2110 *dst++ = ISO_CODE_STX; \
2111 non_standard_bytes = len + 1; \
/* Inverse of the charset-name lookup used when decoding: from
   CHARSET's symbol name, find the name to write in a compound-text
   extended segment and store the bytes-per-character in *BYTES.
   "big5" becomes "big5-0" with 2 bytes per character.  The value
   stored for "koi8-r" and the result for other charsets are on
   lines missing from this listing.  */
2116 find_ctext_non_standard_name (MCharset *charset, int *bytes)
2118 char *name = msymbol_name (charset->name);
2120 if (! strcmp (name, "koi8-r"))
2122 else if (! strcmp (name, "big5"))
2123 name = "big5-0", *bytes = 2;
/* Summary of the visible steps: (1) search the four entries of
   status->designation for CHARSET; (2) if it is not designated, find
   its index in coding->charsets, or in mcharset__iso_2022_table
   offset by coding->ncharsets, map that index through
   spec->designations to a register, and emit the designation;
   (3) if that register is invoked to neither plane, emit SI, SO, or
   a single/locking shift 2/3 depending on the register and on
   MCODING_ISO_SINGLE_SHIFT.  */
2129 /* Designate CHARSET to a graphic register specified in
2130 SPEC->designation. If the register is not yet invoked to graphic
2131 left not right, invoke it to graphic left. DSTP points to a
2132 variable containing a memory address where the output must go.
2133 DST_END is the limit of that memory.
2135 Return 0 if it succeeds. Return -1 otherwise, which means that the
2136 memory area is too short. By side effect, update the variable that
2140 iso_2022_designate_invoke_charset (MCodingSystem *coding,
2142 struct iso_2022_spec *spec,
2143 struct iso_2022_status *status,
2144 unsigned char **dstp,
2145 unsigned char *dst_end)
2148 unsigned char *dst = *dstp;
2150 for (i = 0; i < 4; i++)
2151 if (charset == status->designation[i])
2156 /* CHARSET is not yet designated to any graphic registers. */
2157 for (i = 0; i < coding->ncharsets; i++)
2158 if (charset == coding->charsets[i])
2160 if (i == coding->ncharsets)
2162 for (i = 0; i < mcharset__iso_2022_table.used; i++)
2163 if (charset == mcharset__iso_2022_table.charsets[i])
2165 i += coding->ncharsets;
2167 i = spec->designations[i];
2168 ISO2022_ENCODE_DESIGNATION (i, charset, spec, status);
2171 if (status->invocation[0] != i
2172 && status->invocation[1] != i)
2174 /* Graphic register I is not yet invoked. */
2177 case 0: /* graphic register 0 */
2178 ISO2022_ENCODE_SHIFT_IN (status);
2181 case 1: /* graphic register 1 */
2182 ISO2022_ENCODE_SHIFT_OUT (status);
2185 case 2: /* graphic register 2 */
2186 if (spec->flags & MCODING_ISO_SINGLE_SHIFT)
2187 ISO2022_ENCODE_SINGLE_SHIFT_2 (spec, status);
2189 ISO2022_ENCODE_LOCKING_SHIFT_2 (status);
2192 case 3: /* graphic register 3 */
2193 if (spec->flags & MCODING_ISO_SINGLE_SHIFT)
2194 ISO2022_ENCODE_SINGLE_SHIFT_3 (spec, status);
2196 ISO2022_ENCODE_LOCKING_SHIFT_3 (status);
/* NOTE(review): the code reads the initial state from SPEC
   (initial_invocation, initial_designation) and the current state
   from STATUS, i.e. the opposite of the order of the word
   "respectively" in the comment below.  Only the invocation at
   index 0 is restored, and only when the initial value is
   nonnegative; a designation is re-emitted only for registers whose
   initial designation is non-NULL and differs from the current one.  */
2209 /* Reset the invocation/designation status to the initial one. SPEC
2210 and STATUS contain information about the current and initial
2211 invocation /designation status respectively. DSTP points to a
2212 variable containing a memory address where the output must go.
2213 DST_END is the limit of that memory.
2215 Return 0 if it succeeds. Return -1 otherwise, which means that the
2216 memory area is too short. By side effect, update the variable that
2220 iso_2022_reset_invocation_designation (struct iso_2022_spec *spec,
2221 struct iso_2022_status *status,
2222 unsigned char **dstp,
2223 unsigned char *dst_end)
2225 unsigned char *dst = *dstp;
2228 /* Reset the invocation status of GL. We have not yet supported GR
2230 if (status->invocation[0] != spec->initial_invocation[0]
2231 && spec->initial_invocation[0] >= 0)
2233 if (spec->initial_invocation[0] == 0)
2234 ISO2022_ENCODE_SHIFT_IN (status);
2235 else if (spec->initial_invocation[0] == 1)
2236 ISO2022_ENCODE_SHIFT_OUT (status);
2237 else if (spec->initial_invocation[0] == 2)
2238 ISO2022_ENCODE_LOCKING_SHIFT_2 (status);
2239 else /* i.e. spec->initial_invocation[0] == 3 */
2240 ISO2022_ENCODE_LOCKING_SHIFT_3 (status);
2243 /* Reset the designation status of G0..G3. */
2244 for (i = 0; i < 4; i++)
2245 if (status->designation[i] != spec->initial_designation[i]
2246 && spec->initial_designation[i])
2248 MCharset *charset = spec->initial_designation[i];
2250 ISO2022_ENCODE_DESIGNATION (i, charset, spec, status);
/* Encode the characters of MT from position FROM up to TO as ISO-2022
   into the DST_BYTES bytes at DESTINATION, using and updating the
   designation/invocation state kept in CONVERTER->status.  For each
   character a charset is chosen (the "primary" charset recorded in
   the text's Mcharset property is tried first, then the coding
   system's own charsets, then, with MCODING_ISO_FULL_SUPPORT, the
   whole ISO-2022 charset table).  Designation and shift codes are
   emitted as needed; with MCODING_ISO_DESIGNATION_CTEXT_EXT the
   compound-text extended segment or UTF-8 section is used as a
   fallback.  Adds the consumed characters to converter->nchars and
   the produced bytes to converter->nbytes.  Returns -1 when
   converter->result is MCONVERSION_RESULT_INVALID_CHAR, 0 otherwise.  */
2263 encode_coding_iso_2022 (MText *mt, int from, int to,
2264 unsigned char *destination, int dst_bytes,
2265 MConverter *converter)
2267 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2268 MCodingSystem *coding = internal->coding;
2269 unsigned char *src, *src_end;
2270 unsigned char *dst = destination;
2271 unsigned char *dst_end = dst + dst_bytes;
2273 unsigned char *dst_base;
2274 struct iso_2022_spec *spec = (struct iso_2022_spec *) coding->extra_spec;
2275 int full_support = spec->flags & MCODING_ISO_FULL_SUPPORT;
2276 struct iso_2022_status *status
2277 = (struct iso_2022_status *) &(converter->status);
2278 MCharset *primary, *charset0, *charset1;
2279 int next_primary_change;
2280 int ncharsets = coding->ncharsets;
2281 MCharset **charsets = coding->charsets;
2282 MCharset *cns_charsets[15];
2283 int ascii_compatible = coding->ascii_compatible;
2284 MCharset *non_standard_charset = NULL;
2285 int non_standard_charset_bytes = 0;
2286 int non_standard_bytes = 0;
2287 unsigned char *non_standard_begin = NULL;
2288 enum MTextFormat format = mt->format;
2290 SET_SRC (mt, format, from, to);
2292 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
/* Index the two-dimensional charsets by final byte for the EUC-TW
   style shift ('G'..'M' go to slots 0..6; slot 14 is filled on a
   branch whose condition is missing from this listing).  */
2296 memset (cns_charsets, 0, sizeof (cns_charsets));
2297 for (i = 0; i < ncharsets; i++)
2298 if (charsets[i]->dimension == 2)
2300 int final = charsets[i]->final_byte;
2302 if (final >= 'G' && final <= 'M')
2303 cns_charsets[final - 'G'] = charsets[i];
2305 cns_charsets[14] = charsets[i];
2309 next_primary_change = from;
/* CHARSET0/CHARSET1 cache the charsets currently invoked to the two
   planes; they are refreshed after every designation or reset.  */
2311 charset0 = status->designation[status->invocation[0]];
2312 charset1 = (status->invocation[1] < 0 ? NULL
2313 : status->designation[status->invocation[1]]);
2320 ONE_MORE_CHAR (c, bytes, format);
2322 if (c < 128 && ascii_compatible)
2324 if (status->utf8_shifting)
2325 ISO2022_ENCODE_UTF8_SHIFT_END ();
2329 else if (c <= 32 || c == 127)
2331 if (status->utf8_shifting)
2332 ISO2022_ENCODE_UTF8_SHIFT_END ();
2333 if (spec->flags & MCODING_ISO_RESET_AT_CNTL
2334 || (c == '\n' && spec->flags & MCODING_ISO_RESET_AT_EOL))
2336 if (iso_2022_reset_invocation_designation (spec, status,
2338 goto insufficient_destination;
2339 charset0 = status->designation[status->invocation[0]];
2340 charset1 = (status->invocation[1] < 0 ? NULL
2341 : status->designation[status->invocation[1]]);
2348 unsigned code = MCHAR_INVALID_CODE;
2349 MCharset *charset = NULL;
2351 int pos = from + nchars;
/* Re-read the Mcharset text property only when POS has reached the
   end of the range over which the previous value applied.  */
2353 if (pos >= next_primary_change)
2355 MSymbol primary_charset
2356 = (MSymbol) mtext_get_prop (mt, pos, Mcharset);
2357 primary = MCHARSET (primary_charset);
2358 if (primary && primary != mcharset__binary)
2360 if (primary->final_byte <= 0)
2362 else if (! full_support)
2366 for (i = 0; i < ncharsets; i++)
2367 if (primary == charsets[i])
2374 mtext_prop_range (mt, Mcharset, pos,
2375 NULL, &next_primary_change, 0);
2378 if (primary && primary != mcharset__binary)
2380 code = ENCODE_CHAR (primary, c);
2381 if (code != MCHAR_INVALID_CODE)
2386 if (c <= 32 || c == 127)
2389 charset = mcharset__ascii;
2395 for (i = 0; i < ncharsets; i++)
2397 charset = charsets[i];
2398 code = ENCODE_CHAR (charset, c);
2399 if (code != MCHAR_INVALID_CODE)
2404 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
2406 for (i = 0; i < mcharset__iso_2022_table.used; i++)
2408 charset = mcharset__iso_2022_table.charsets[i];
2409 code = ENCODE_CHAR (charset, c);
2410 if (code != MCHAR_INVALID_CODE)
2413 if (i == mcharset__iso_2022_table.used)
2415 if (spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT)
2416 goto unsupported_char;
2417 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2422 goto unsupported_char;
2428 && (charset->final_byte >= 0
2429 || spec->flags & MCODING_ISO_EUC_TW_SHIFT))
2431 if (code >= 0x80 && code < 0xA0)
2432 goto unsupported_char;
2434 if (status->utf8_shifting)
2435 ISO2022_ENCODE_UTF8_SHIFT_END ();
2436 if (charset == charset0)
2438 else if (charset == charset1)
2442 unsigned char *p = NULL;
2444 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
2448 if (cns_charsets[0] == charset)
2454 for (i = 1; i < 15; i++)
2455 if (cns_charsets[i] == charset)
2458 *dst++ = ISO_CODE_SS2;
2461 status->single_shifting = 1;
2466 if (iso_2022_designate_invoke_charset
2467 (coding, charset, spec, status, &dst, dst_end) < 0)
2468 goto insufficient_destination;
2469 charset0 = status->designation[status->invocation[0]];
2470 charset1 = (status->invocation[1] < 0 ? NULL
2471 : status->designation[status->invocation[1]]);
2473 if (status->single_shifting)
2475 = (spec->flags & MCODING_ISO_EIGHT_BIT) ? 0x80 : 0;
2476 else if (charset == charset0)
/* Emit the code point one byte per dimension, most significant byte
   first, OR-ing GR_MASK into every byte.  */
2481 if (charset->dimension == 1)
2484 *dst++ = code | gr_mask;
2486 else if (charset->dimension == 2)
2489 *dst++ = (code >> 8) | gr_mask;
2490 *dst++ = (code & 0xFF) | gr_mask;
2495 *dst++ = (code >> 16) | gr_mask;
2496 *dst++ = ((code >> 8) & 0xFF) | gr_mask;
2497 *dst++ = (code & 0xFF) | gr_mask;
2499 status->single_shifting = 0;
2501 else if (charset && spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT)
2503 if (charset != non_standard_charset)
2505 char *name = (find_ctext_non_standard_name
2506 (charset, &non_standard_charset_bytes));
2510 int len = strlen (name);
2512 ISO2022_ENCODE_NON_STANDARD (name, len);
2513 non_standard_charset = charset;
2516 non_standard_charset = NULL;
2519 if (non_standard_charset)
2521 if (dst + non_standard_charset_bytes > dst_end)
2522 goto insufficient_destination;
/* Grow the running segment length and rewrite the two length bytes
   of the segment header (seven bits each, top bit set).  */
2523 non_standard_bytes += non_standard_charset_bytes;
2524 non_standard_begin[4] = (non_standard_bytes / 128) | 0x80;
2525 non_standard_begin[5] = (non_standard_bytes % 128) | 0x80;
2526 if (non_standard_charset_bytes == 1)
2528 else if (non_standard_charset_bytes == 2)
2529 *dst++ = code >> 8, *dst++ = code & 0xFF;
2530 else if (non_standard_charset_bytes == 3)
2531 *dst++ = code >> 16, *dst++ = (code >> 8) & 0xFF,
2532 *dst++ = code & 0xFF;
2533 else /* i.e. non_standard_charset_bytes == 4 */
2534 *dst++ = code >> 24, *dst++ = (code >> 16) & 0xFF,
2535 *dst++ = (code >> 8) & 0xFF, *dst++ = code & 0xFF;
2539 int len = CHAR_BYTES (c);
2542 goto unsupported_char;
2543 if (! status->utf8_shifting)
2544 ISO2022_ENCODE_UTF8_SHIFT_START (len);
2547 CHAR_STRING (c, dst);
2551 goto unsupported_char;
2561 if (iso_2022_designate_invoke_charset (coding, mcharset__ascii,
2564 goto insufficient_destination;
2565 if (! converter->lenient)
2567 len = encode_unsupporeted_char (c, dst, dst_end, mt, from + nchars);
2569 goto insufficient_destination;
2575 /* We reach here because of an unsupported char. */
2576 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2579 insufficient_destination:
2581 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
/* After a successful final block, close any open UTF-8 section and,
   with MCODING_ISO_RESET_AT_EOL, restore the initial state.  */
2584 if (converter->result == MCONVERSION_RESULT_SUCCESS
2585 && converter->last_block)
2587 if (status->utf8_shifting)
2589 ISO2022_ENCODE_UTF8_SHIFT_END ();
2592 if (spec->flags & MCODING_ISO_RESET_AT_EOL
2593 && charset0 != spec->initial_designation[0])
2595 if (iso_2022_reset_invocation_designation (spec, status,
2597 goto insufficient_destination;
2600 converter->nchars += nchars;
2601 converter->nbytes += dst - destination;
2602 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
2606 /* Stuff for coding-systems of type MCODING_TYPE_MISC. */
2608 /* For SJIS handling... */
/* Conversions between a Shift_JIS byte pair and a two-byte JIS code
   packed as (first byte << 8) | second byte.  SJIS_TO_JIS takes the
   Shift_JIS lead byte S1 and trail byte S2; JIS_TO_SJIS takes the two
   JIS bytes C1 and C2.  Each macro selects one of two formulas with a
   condition that is on a line missing from this listing.
   NOTE(review): the arguments are evaluated several times and are not
   parenthesized in the expansion, so callers should pass plain
   variables.  */
2610 #define SJIS_TO_JIS(s1, s2) \
2612 ? (((s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0)) << 8) \
2614 : (((s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1)) << 8) \
2615 | (s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F))))
2617 #define JIS_TO_SJIS(c1, c2) \
2619 ? (((c1 / 2 + ((c1 < 0x5F) ? 0x71 : 0xB1)) << 8) \
2620 | (c2 + ((c2 >= 0x60) ? 0x20 : 0x1F))) \
2621 : (((c1 / 2 + ((c1 < 0x5F) ? 0x70 : 0xB0)) << 8) \
/* Prepare the SJIS coding system of CONVERTER for use.  On the first
   call (coding->ready is zero) the charsets "jisx0208.1983" and
   "jisx0201-kana" are looked up and installed as coding->charsets[1]
   and coding->charsets[2], giving three charsets in total (slot 0 is
   set elsewhere).  What happens when a charset is missing, and the
   return value, are on lines missing from this listing.  */
2626 reset_coding_sjis (MConverter *converter)
2628 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2629 MCodingSystem *coding = internal->coding;
2631 if (! coding->ready)
2633 MSymbol kanji_sym = msymbol ("jisx0208.1983");
2634 MCharset *kanji = MCHARSET (kanji_sym);
2635 MSymbol kana_sym = msymbol ("jisx0201-kana");
2636 MCharset *kana = MCHARSET (kana_sym);
/* Test the looked-up charsets, not the symbols: it is the charset
   pointers that are stored below and dereferenced by the SJIS
   decoder and encoder, and a symbol can presumably exist without a
   charset being defined for it.  */
2638 if (! kanji || ! kana)
2640 coding->ncharsets = 3;
2641 coding->charsets[1] = kanji;
2642 coding->charsets[2] = kana;
/* Decode the SRC_BYTES bytes at SOURCE as Shift_JIS and append the
   result to MT.  Lead bytes 0x81..0x9F and 0xE0..0xEF start a
   two-byte kanji code, which is converted with SJIS_TO_JIS and
   decoded with coding->charsets[1]; bytes 0xA1..0xDF are decoded
   with the kana charset coding->charsets[2]; the handling of other
   bytes is partly on lines missing from this listing.  Runs of
   characters of the same charset are committed with TAKEIN_CHARS.
   The return value is whatever finish_decoding returns.  */
2649 decode_coding_sjis (unsigned char *source, int src_bytes, MText *mt,
2650 MConverter *converter)
2652 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2653 MCodingSystem *coding = internal->coding;
2654 unsigned char *src = internal->carryover;
2655 unsigned char *src_stop = src + internal->carryover_bytes;
2656 unsigned char *src_end = source + src_bytes;
2657 unsigned char *src_base;
2658 unsigned char *dst = mt->data + mt->nbytes;
2659 unsigned char *dst_end = mt->data + mt->allocated - MAX_UTF8_CHAR_BYTES;
2661 int last_nchars = 0;
2662 int at_most = converter->at_most > 0 ? converter->at_most : -1;
2664 MCharset *charset_roman = coding->charsets[0];
2665 MCharset *charset_kanji = coding->charsets[1];
2666 MCharset *charset_kana = coding->charsets[2];
2667 MCharset *charset = mcharset__ascii;
2672 MCharset *this_charset;
2675 ONE_MORE_BASE_BYTE (c1);
2680 this_charset = ((c1 <= 0x20 || c1 == 0x7F)
2684 else if ((c1 >= 0x81 && c1 <= 0x9F) || (c1 >= 0xE0 && c1 <= 0xEF))
/* The trail-byte bound is written in hex like its neighbours; the
   earlier decimal `80' accepted the same set of bytes only because
   the two ranges overlap.  */
2687 if ((c2 >= 0x40 && c2 <= 0x7F) || (c2 >= 0x80 && c2 <= 0xFC))
2689 this_charset = charset_kanji;
2690 c1 = SJIS_TO_JIS (c1, c2);
2695 else if (c1 >= 0xA1 && c1 <= 0xDF)
2697 this_charset = charset_kana;
2703 c = DECODE_CHAR (this_charset, c1);
2708 if (! converter->lenient)
2710 REWIND_SRC_TO_BASE ();
2712 this_charset = mcharset__binary;
/* When the charset changes (ASCII excepted), commit the characters
   decoded so far under the previous charset.  */
2715 if (this_charset != mcharset__ascii
2716 && this_charset != charset)
2718 TAKEIN_CHARS (mt, nchars - last_nchars,
2719 dst - (mt->data + mt->nbytes), charset);
2720 charset = this_charset;
2721 last_nchars = nchars;
2725 /* We reach here because of an invalid byte. */
2729 TAKEIN_CHARS (mt, nchars - last_nchars,
2730 dst - (mt->data + mt->nbytes), charset);
2731 return finish_decoding (mt, converter, nchars,
2732 source, src_end, src_base, error);
/* Encode the characters of MT from FROM up to TO as Shift_JIS into
   the DST_BYTES bytes at DESTINATION.  Each character above the
   control range is tried against coding->charsets[0] (roman), then
   charsets[1] (kanji, converted with JIS_TO_SJIS), then charsets[2]
   (kana, emitted with the top bit set).  A character that none of
   them can encode goes through encode_unsupporeted_char or ends the
   conversion, depending on converter->lenient.  Adds the consumed
   characters to converter->nchars and the produced bytes to
   converter->nbytes.  Returns -1 when converter->result is
   MCONVERSION_RESULT_INVALID_CHAR, 0 otherwise.  */
2736 encode_coding_sjis (MText *mt, int from, int to,
2737 unsigned char *destination, int dst_bytes,
2738 MConverter *converter)
2740 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2741 MCodingSystem *coding = internal->coding;
2742 unsigned char *src, *src_end;
2743 unsigned char *dst = destination;
2744 unsigned char *dst_end = dst + dst_bytes;
2746 MCharset *charset_roman = coding->charsets[0];
2747 MCharset *charset_kanji = coding->charsets[1];
2748 MCharset *charset_kana = coding->charsets[2];
2749 enum MTextFormat format = mt->format;
2751 SET_SRC (mt, format, from, to);
2758 ONE_MORE_CHAR (c, bytes, format);
2760 if (c <= 0x20 || c == 0x7F)
2767 if ((code = ENCODE_CHAR (charset_roman, c)) != MCHAR_INVALID_CODE)
2772 else if ((code = ENCODE_CHAR (charset_kanji, c))
2773 != MCHAR_INVALID_CODE)
/* Split the two-byte JIS code into its bytes before converting.  */
2775 int c1 = code >> 8, c2 = code & 0xFF;
2776 code = JIS_TO_SJIS (c1, c2);
2779 *dst++ = code & 0xFF;
2781 else if ((code = ENCODE_CHAR (charset_kana, c))
2782 != MCHAR_INVALID_CODE)
2785 *dst++ = code | 0x80;
2789 if (! converter->lenient)
2791 len = encode_unsupporeted_char (c, dst, dst_end,
2794 goto insufficient_destination;
2801 /* We reach here because of an unsupported char. */
2802 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2805 insufficient_destination:
2806 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
2809 converter->nchars += nchars;
2810 converter->nbytes += dst - destination;
2811 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
/* Return the coding system stored as the Mcoding property of the
   symbol NAME.  A definition recorded for NAME in
   coding_definition_list can be used to define the coding system via
   mconv_define_coding, after which the property is read again; the
   conditions guarding that fallback are on lines missing from this
   listing.  The temporary plist made by mplist__from_plist is
   unreferenced afterwards.  */
2815 static MCodingSystem *
2816 find_coding (MSymbol name)
2818 MCodingSystem *coding = (MCodingSystem *) msymbol_get (name, Mcoding);
2822 MPlist *param = mplist_get (coding_definition_list, name);
2826 param = mplist__from_plist (param);
2827 mconv_define_coding (MSYMBOL_NAME (name), param, NULL, NULL, NULL, NULL);
2828 coding = (MCodingSystem *) msymbol_get (name, Mcoding);
2829 M17N_OBJECT_UNREF (param);
/* Three BINDING_* codes (none, buffer, stream) and a work-area size
   of 0x10000.  Their users are not visible in this part of the file;
   presumably they describe what a converter is bound to and the size
   of its scratch buffer -- TODO confirm.  */
2834 #define BINDING_NONE 0
2835 #define BINDING_BUFFER 1
2836 #define BINDING_STREAM 2
2838 #define CONVERT_WORKSIZE 0x10000
/* Initialize the coding module: create the coding list and the
   definition list, fill the byte-class table iso_2022_code_class,
   intern the symbols used as parameter keys and flag names, and
   define the predefined coding systems (us-ascii with the alias
   ANSI_X3.4-1968, iso-8859-1, utf-8-full, utf-8, the utf-16 and
   utf-32 families, and sjis).  The temporary parameter plists are
   unreferenced at the end.  */
2844 mcoding__init (void)
2847 MPlist *param, *charsets, *pl;
2849 MLIST_INIT1 (&coding_list, codings, 128);
2850 coding_definition_list = mplist ();
2852 /* ISO-2022 specific initialize routine. */
/* The range loops set the default class of every byte; the two
   boundary bytes of each graphic plane and the individual control
   functions are then overridden one by one.  */
2853 for (i = 0; i < 0x20; i++)
2854 iso_2022_code_class[i] = ISO_control_0;
2855 for (i = 0x21; i < 0x7F; i++)
2856 iso_2022_code_class[i] = ISO_graphic_plane_0;
2857 for (i = 0x80; i < 0xA0; i++)
2858 iso_2022_code_class[i] = ISO_control_1;
2859 for (i = 0xA1; i < 0xFF; i++)
2860 iso_2022_code_class[i] = ISO_graphic_plane_1;
2861 iso_2022_code_class[0x20] = iso_2022_code_class[0x7F] = ISO_0x20_or_0x7F;
2862 iso_2022_code_class[0xA0] = iso_2022_code_class[0xFF] = ISO_0xA0_or_0xFF;
2863 iso_2022_code_class[0x0E] = ISO_shift_out;
2864 iso_2022_code_class[0x0F] = ISO_shift_in;
2865 iso_2022_code_class[0x19] = ISO_single_shift_2_7;
2866 iso_2022_code_class[0x1B] = ISO_escape;
2867 iso_2022_code_class[0x8E] = ISO_single_shift_2;
2868 iso_2022_code_class[0x8F] = ISO_single_shift_3;
2869 iso_2022_code_class[0x9B] = ISO_control_sequence_introducer;
2871 Mcoding = msymbol ("coding");
2873 Mutf = msymbol ("utf");
2874 Miso_2022 = msymbol ("iso-2022");
2876 Mreset_at_eol = msymbol ("reset-at-eol");
2877 Mreset_at_cntl = msymbol ("reset-at-cntl");
2878 Meight_bit = msymbol ("eight-bit");
2879 Mlong_form = msymbol ("long-form");
2880 Mdesignation_g0 = msymbol ("designation-g0");
2881 Mdesignation_g1 = msymbol ("designation-g1");
2882 Mdesignation_ctext = msymbol ("designation-ctext");
2883 Mdesignation_ctext_ext = msymbol ("designation-ctext-ext");
2884 Mlocking_shift = msymbol ("locking-shift");
2885 Msingle_shift = msymbol ("single-shift");
2886 Msingle_shift_7 = msymbol ("single-shift-7");
2887 Meuc_tw_shift = msymbol ("euc-tw-shift");
2888 Miso_6429 = msymbol ("iso-6429");
2889 Mrevision_number = msymbol ("revision-number");
2890 Mfull_support = msymbol ("full-support");
2891 Mmaybe = msymbol ("maybe");
2893 Mtype = msymbol ("type");
2894 Mcharsets = msymbol_as_managing_key ("charsets");
2895 Mflags = msymbol_as_managing_key ("flags");
2896 Mdesignation = msymbol_as_managing_key ("designation");
2897 Minvocation = msymbol_as_managing_key ("invocation");
2898 Mcode_unit = msymbol ("code-unit");
2899 Mbom = msymbol ("bom");
2900 Mlittle_endian = msymbol ("little-endian");
2903 charsets = mplist ();
2905 /* Setup predefined codings. */
/* PARAM and CHARSETS are reused for every definition below; each
   step changes only the keys that differ from the previous coding.  */
2906 mplist_set (charsets, Msymbol, Mcharset_ascii);
2907 pl = mplist_add (pl, Mtype, Mcharset);
2908 pl = mplist_add (pl, Mcharsets, charsets);
2909 Mcoding_us_ascii = mconv_define_coding ("us-ascii", param,
2910 NULL, NULL, NULL, NULL);
/* Register the alias under both its literal and canonicalized
   symbol.  */
2913 MSymbol alias = msymbol ("ANSI_X3.4-1968");
2914 MCodingSystem *coding
2915 = (MCodingSystem *) msymbol_get (Mcoding_us_ascii, Mcoding);
2917 msymbol_put (alias, Mcoding, coding);
2918 alias = msymbol__canonicalize (alias);
2919 msymbol_put (alias, Mcoding, coding);
2922 mplist_set (charsets, Msymbol, Mcharset_iso_8859_1);
2923 Mcoding_iso_8859_1 = mconv_define_coding ("iso-8859-1", param,
2924 NULL, NULL, NULL, NULL);
2926 mplist_set (charsets, Msymbol, Mcharset_m17n);
2927 mplist_put (param, Mtype, Mutf);
2928 mplist_put (param, Mcode_unit, (void *) 8);
2929 Mcoding_utf_8_full = mconv_define_coding ("utf-8-full", param,
2930 NULL, NULL, NULL, NULL);
2932 mplist_set (charsets, Msymbol, Mcharset_unicode);
2933 Mcoding_utf_8 = mconv_define_coding ("utf-8", param,
2934 NULL, NULL, NULL, NULL);
2936 mplist_put (param, Mcode_unit, (void *) 16);
2937 mplist_put (param, Mbom, Mmaybe);
2938 #ifndef WORDS_BIGENDIAN
2939 mplist_put (param, Mlittle_endian, Mt);
2941 Mcoding_utf_16 = mconv_define_coding ("utf-16", param,
2942 NULL, NULL, NULL, NULL);
2944 mplist_put (param, Mcode_unit, (void *) 32);
2945 Mcoding_utf_32 = mconv_define_coding ("utf-32", param,
2946 NULL, NULL, NULL, NULL);
2948 mplist_put (param, Mcode_unit, (void *) 16);
2949 mplist_put (param, Mbom, Mnil);
2950 mplist_put (param, Mlittle_endian, Mnil);
2951 Mcoding_utf_16be = mconv_define_coding ("utf-16be", param,
2952 NULL, NULL, NULL, NULL);
2954 mplist_put (param, Mcode_unit, (void *) 32);
2955 Mcoding_utf_32be = mconv_define_coding ("utf-32be", param,
2956 NULL, NULL, NULL, NULL);
2958 mplist_put (param, Mcode_unit, (void *) 16);
2959 mplist_put (param, Mlittle_endian, Mt);
2960 Mcoding_utf_16le = mconv_define_coding ("utf-16le", param,
2961 NULL, NULL, NULL, NULL);
2963 mplist_put (param, Mcode_unit, (void *) 32);
2964 Mcoding_utf_32le = mconv_define_coding ("utf-32le", param,
2965 NULL, NULL, NULL, NULL);
2967 mplist_put (param, Mtype, Mnil);
2968 mplist_set (charsets, Msymbol, Mcharset_ascii);
2969 Mcoding_sjis = mconv_define_coding ("sjis", param,
2972 encode_coding_sjis, NULL);
2974 M17N_OBJECT_UNREF (charsets);
2975 M17N_OBJECT_UNREF (param);
/* Finalizer of the coding module.  Walks every coding system recorded
   in coding_list and frees its extra_info and extra_spec data, releases
   coding_list itself via MLIST_FREE1, then drops the reference to each
   value stored in coding_definition_list and to that list.  */
2981 mcoding__fini (void)
2986 for (i = 0; i < coding_list.used; i++)
2988 MCodingSystem *coding = coding_list.codings[i];
2990 if (coding->extra_info)
2991 free (coding->extra_info);
2992 if (coding->extra_spec)
/* For an ISO-2022 coding system the spec is a struct iso_2022_spec
   whose `designations' member is freed separately before the spec.  */
2994 if (coding->type == Miso_2022)
2995 free (((struct iso_2022_spec *) coding->extra_spec)->designations);
2996 free (coding->extra_spec);
3000 MLIST_FREE1 (&coding_list, codings);
3001 MPLIST_DO (plist, coding_definition_list)
3002 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
3003 M17N_OBJECT_UNREF (coding_definition_list);
/* Define a coding system of type Mcharset whose name is the name of
   the symbol SYM and whose charset list contains SYM only.  The default
   resetter, decoder and encoder are requested by passing NULL.  */
3007 mconv__define_coding_from_charset (MSymbol sym)
3009 MPlist *param = mplist (), *charsets = mplist ();
3011 mplist_set (charsets, Msymbol, sym);
3012 mplist_add (param, Mtype, Mcharset);
3013 mplist_add (param, Mcharsets, charsets);
/* The symbol returned by mconv_define_coding () is not used here.  */
3014 mconv_define_coding (msymbol_name (sym), param, NULL, NULL, NULL, NULL);
/* The temporary parameter lists are released once the call is done.  */
3015 M17N_OBJECT_UNREF (charsets);
3016 M17N_OBJECT_UNREF (param);
/* Store a definition for a charset-based coding system under key SYM in
   coding_definition_list, unless that list already has an entry for SYM.
   Only the definition plist is recorded; mconv_define_coding () is not
   called from here.  */
3020 mconv__register_charset_coding (MSymbol sym)
3022 if (! mplist_find_by_key (coding_definition_list, sym))
3024 MPlist *param = mplist (), *charsets = mplist ();
3026 mplist_set (charsets, Msymbol, sym);
/* The definition is a flat list of elements: the symbols Mtype,
   Mcharset and Mcharsets, followed by the plist holding SYM.  */
3027 mplist_add (param, Msymbol, Mtype);
3028 mplist_add (param, Msymbol, Mcharset);
3029 mplist_add (param, Msymbol, Mcharsets);
3030 mplist_add (param, Mplist, charsets);
3031 mplist_put (coding_definition_list, sym, param);
3032 M17N_OBJECT_UNREF (charsets);
/* Load the "coding-list" database and append each coding system
   definition found in it to coding_definition_list.  */
3038 mcoding__load_from_database ()
3040 MDatabase *mdb = mdatabase_find (msymbol ("coding-list"), Mnil, Mnil, Mnil);
3041 MPlist *def_list, *plist;
3042 MPlist *definitions = coding_definition_list;
3043 int mdebug_mask = MDEBUG_CODING;
3047 MDEBUG_PUSH_TIME ();
3048 def_list = (MPlist *) mdatabase_load (mdb);
3049 MDEBUG_PRINT_TIME ("CODING", (stderr, " to load the data."));
3054 MDEBUG_PUSH_TIME ();
3055 MPLIST_DO (plist, def_list)
/* Each element of the loaded list must itself be a plist whose first
   element is a symbol; that symbol becomes the key and the rest of the
   plist becomes the stored definition.
   NOTE(review): these checks report MERROR_CHARSET, whereas the other
   errors visible in this file report MERROR_CODING -- confirm that the
   error class is intended.  */
3060 if (! MPLIST_PLIST_P (plist))
3061 MERROR (MERROR_CHARSET, -1);
3062 pl = MPLIST_PLIST (plist);
3063 if (! MPLIST_SYMBOL_P (pl))
3064 MERROR (MERROR_CHARSET, -1);
3065 name = MPLIST_SYMBOL (pl);
3066 pl = MPLIST_NEXT (pl);
3067 definitions = mplist_add (definitions, name, pl);
/* The stored tail gets its own reference; def_list is unreferenced
   below.  */
3068 M17N_OBJECT_REF (pl);
3071 M17N_OBJECT_UNREF (def_list);
3072 MDEBUG_PRINT_TIME ("CODING", (stderr, " to parse the loaded data."));
3078 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
3082 /*** @addtogroup m17nConv */
3086 /***en @name Variables: Symbols representing a coding system */
3087 /***oldja @name ÊÑ¿ô: ÄêµÁºÑ¤ß¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë¤¿¤á¤Î¥·¥ó¥Ü¥ë */
3092 @brief Symbol for the coding system US-ASCII
3094 The symbol #Mcoding_us_ascii has name <tt>"us-ascii"</tt> and
3095 represents a coding system for the CES US-ASCII. */
3098 @brief MIME charset "US-ASCII" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3100 ¥·¥ó¥Ü¥ë @c Mcoding_us_ascii ¤Ï <tt>"us-ascii"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3101 MIME charset <tt>"US-ASCII"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë¤¿¤á
3104 MSymbol Mcoding_us_ascii;
3108 @brief Symbol for the coding system ISO-8859-1
3110 The symbol #Mcoding_iso_8859_1 has name <tt>"iso-8859-1"</tt> and
3111 represents a coding system for the CES ISO-8859-1. */
3114 @brief MIME charset "ISO-8859-1" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3116 ¥·¥ó¥Ü¥ë @c Mcoding_iso_8859_1 ¤Ï <tt>"iso-8859-1"</tt> ¤È¤¤¤¦Ì¾Á°
3117 ¤ò»ý¤Á¡¢MIME charset <tt>"ISO-8859-1"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»Ø
3118 Äꤹ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£ */
3120 MSymbol Mcoding_iso_8859_1;
3124 @brief Symbol for the coding system UTF-8
3126 The symbol #Mcoding_utf_8 has name <tt>"utf-8"</tt> and represents
3127 a coding system for the CES UTF-8. */
3130 @brief RFC 2279 ¤Î "UTF-8" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë¡ÊUnicode ÍÑ¡Ë
3132 ¥·¥ó¥Ü¥ë @c Mcoding_utf_8 ¤Ï <tt>"utf-8"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3133 RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-8"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë
3134 ¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï Unicode ¤ÎÁ´¤Æ¤Îʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹¤ë¡£
3137 MSymbol Mcoding_utf_8;
3144 The symbol #Mcoding_utf_8_full has name <tt>"utf-8-full"</tt> and
3145 represents a coding system that is an extension of UTF-8. This
3146 coding system uses the same encoding algorithm as UTF-8 but is not
3147 limited to the Unicode characters. It can encode all characters
3148 supported by the m17n library. */
3151 @brief RFC 2279 ¤Î "UTF-8" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë¡ÊÁ´Ê¸»úÍÑ¡Ë
3153 ¥·¥ó¥Ü¥ë @c Mcoding_utf_8_full ¤Ï <tt>"utf-8-full"</tt> ¤È¤¤¤¦Ì¾Á°
3154 ¤ò»ý¤Á¡¢RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-8"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò
3155 »ØÄꤹ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï m17n ¥é¥¤¥Ö¥é¥ê¤¬°·¤¦Á´¤Æ¤Î
3156 ʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹¤ë¡£ */
3158 MSymbol Mcoding_utf_8_full;
3164 The symbol #Mcoding_utf_16 has name <tt>"utf-16"</tt> and
3165 represents a coding system for the CES UTF-16 (RFC 2781). */
3167 @brief RFC 2781 ¤Î "UTF-16" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3169 ¥·¥ó¥Ü¥ë @c Mcoding_utf_16 ¤Ï <tt>"utf-16"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3170 RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-16"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ
3171 ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï Unicode ¤ÎÁ´¤Æ¤Îʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹
3174 MSymbol Mcoding_utf_16;
3180 The symbol #Mcoding_utf_16be has name <tt>"utf-16be"</tt> and
3181 represents a coding system for the CES UTF-16BE (RFC 2781). */
3183 MSymbol Mcoding_utf_16be;
3189 The symbol #Mcoding_utf_16le has name <tt>"utf-16le"</tt> and
3190 represents a coding system for the CES UTF-16LE (RFC 2781). */
3192 MSymbol Mcoding_utf_16le;
3198 The symbol #Mcoding_utf_32 has name <tt>"utf-32"</tt> and
3199 represents a coding system for the CES UTF-32 (defined in the Unicode Standard). */
3201 MSymbol Mcoding_utf_32;
3207 The symbol #Mcoding_utf_32be has name <tt>"utf-32be"</tt> and
3208 represents a coding system for the CES UTF-32BE (defined in the Unicode Standard). */
3210 MSymbol Mcoding_utf_32be;
3216 The symbol #Mcoding_utf_32le has name <tt>"utf-32le"</tt> and
3217 represents a coding system for the CES UTF-32LE (defined in the Unicode Standard). */
3218 MSymbol Mcoding_utf_32le;
3224 The symbol #Mcoding_sjis has name <tt>"sjis"</tt> and represents a coding
3225 system for the CES Shift-JIS. */
3227 MSymbol Mcoding_sjis;
3232 @name Variables: Parameter keys for mconv_define_coding (). */
3237 Parameter key for mconv_define_coding () (which see). */
3243 MSymbol Mdesignation;
3244 MSymbol Minvocation;
3247 MSymbol Mlittle_endian;
3252 @name Variables: Symbols representing coding system type. */
3257 Symbol that can be a value of the #Mtype parameter of a coding
3258 system used in an argument to the mconv_define_coding () function
3269 @name Variables: Symbols appearing in the value of #Mflags parameter. */
3274 Symbol that can be a value of the #Mflags parameter of a coding
3275 system used in an argument to the mconv_define_coding () function
3277 MSymbol Mreset_at_eol;
3279 MSymbol Mreset_at_cntl;
3282 MSymbol Mdesignation_g0;
3283 MSymbol Mdesignation_g1;
3284 MSymbol Mdesignation_ctext;
3285 MSymbol Mdesignation_ctext_ext;
3286 MSymbol Mlocking_shift;
3287 MSymbol Msingle_shift;
3288 MSymbol Msingle_shift_7;
3289 MSymbol Meuc_tw_shift;
3291 MSymbol Mrevision_number;
3292 MSymbol Mfull_support;
3297 @name Variables: etc
3299 Remaining variables. */
3300 /***oldja @name ÊÑ¿ô: ¤½¤Î¾ */
3304 @brief Symbol whose name is "maybe".
3306 The variable #Mmaybe is a symbol of name <tt>"maybe"</tt>. It is
3307 used as a value of the #Mbom parameter of the function
3308 mconv_define_coding () (which see). */
3314 @brief The symbol @c Mcoding
3316 Any decoded M-text has a text property whose key is the predefined
3317 symbol @c Mcoding. The name of @c Mcoding is
3318 <tt>"coding"</tt>. */
3321 @brief ¥·¥ó¥Ü¥ë @c Mcoding
3323 ¥Ç¥³¡¼¥É¤µ¤ì¤¿ M-text ¤Ï¡¢¥¡¼¤¬ @c Mcoding ¤Ç¤¢¤ë¤è¤¦¤Ê¥Æ¥¥¹¥È¥×
3324 ¥í¥Ñ¥Æ¥£¤ò»ý¤Ä¡£¥·¥ó¥Ü¥ë @c Mcoding ¤Ï <tt>"coding"</tt> ¤È¤¤¤¦Ì¾
3325 Á°¤Ç¤¢¤é¤«¤¸¤áÄêµÁ¤µ¤ì¤Æ¤¤¤ë¡£ */
3333 @brief Define a coding system
3335 The mconv_define_coding () function defines a new coding system
3336 and makes it accessible via a symbol whose name is $NAME. $PLIST
3337 specifies parameters of the coding system as below:
3341 <li> Key is @c Mtype, value is a symbol
3343 The value specifies the type of the coding system. It must be
3344 #Mcharset, #Mutf, #Miso_2022, or #Mnil.
3346 If the type is #Mcharset, $EXTRA_INFO is ignored.
3348 If the type is #Miso_2022, $EXTRA_INFO must be a pointer to
3349 #MCodingInfoISO2022.
3351 If the type is #Mutf, $EXTRA_INFO must be a pointer to
3354 If the type is #Mnil, the argument $RESETTER, $DECODER, and
3355 $ENCODER must be supplied. $EXTRA_INFO is ignored. Otherwise,
3356 they can be @c NULL and the m17n library provides proper defaults.
3358 <li> Key is #Mcharsets, value is a plist
3360 The value specifies a list of charsets supported by the coding
3361 system. The keys of the plist must be #Msymbol, and the values
3362 must be symbols representing charsets.
3364 <li> Key is #Mflags, value is a plist
3366 If the type is #Miso_2022, the value specifies flags to control
3367 the ISO 2022 interpreter. The keys of the plist must be @c
3368 Msymbol, and values must be one of the following.
3374 If this flag exists, designation and invocation status is reset to
3375 the initial state at the end of line.
3377 <li> #Mreset_at_cntl
3379 If this flag exists, designation and invocation status is reset to
3380 the initial state at a control character.
3384 If this flag exists, the graphic plane right is used.
3388 If this flag exists, the over-long escape sequences (ESC '$' '('
3389 <final_byte>) are used for designating the charsets JISX0208.1978,
3390 GB2312, and JISX0208.
3392 <li> #Mdesignation_g0
3394 If this flag and #Mfull_support exists, designates charsets not
3395 listed in the charset list to the graphic register G0.
3397 <li> #Mdesignation_g1
3399 If this flag and #Mfull_support exists, designates charsets not
3400 listed in the charset list to the graphic register G1.
3402 <li> #Mdesignation_ctext
3404 If this flag and #Mfull_support exists, designates charsets not
3405 listed in the charset list to a graphic register G0 or G1 based on
3406 the criteria of the Compound Text.
3408 <li> #Mdesignation_ctext_ext
3410 If this flag and #Mfull_support exists, designates charsets not
3411 listed in the charset list to a graphic register G0 or G1, or use
3412 extended segment for such charsets based on the criteria of the
3415 <li> #Mlocking_shift
3417 If this flag exists, use locking shift.
3421 If this flag exists, use single shift.
3423 <li> #Msingle_shift_7
3425 If this flag exists, use 7-bit single shift code (0x19).
3427 <li> #Meuc_tw_shift
3429 If this flag exists, use a special shifting according to EUC-TW.
3433 This flag is currently ignored.
3435 <li> #Mrevision_number
3437 If this flag exists, use a revision number escape sequence to
3438 designate a charset that has a revision number.
3442 If this flag exists, support all charsets registered in the
3443 International Registry.
3447 <li> Key is #Mdesignation, value is a plist
3449 If the type is #Miso_2022, the value specifies how to designate
3450 each supported charset. The keys of the plist must be @c
3451 Minteger, and the values must be numbers indicating a graphic
3452 register. The Nth element value is for the Nth charset of the
3453 charset list. The value 0..3 means that it is assumed that a
3454 charset is already designated to the graphic register 0..3. The
3455 negative value G (-4..-1) means that a charset is not designated
3456 to any register at first, and if necessary, is designated to the
3457 (G+4) graphic register.
3459 <li> Key is #Minvocation, value is a plist
3461 If the type is #Miso_2022, the value specifies how to invoke
3462 each graphic register. The plist length must be one or two. The
3463 keys of the plist must be #Minteger, and the values must be
3464 numbers indicating a graphic register. The value of the first
3465 element specifies which graphic register is invoked to the
3466 graphic plane left. If the length is one, no graphic register is
3467 invoked to the graphic plane right. Otherwise, the value of the
3468 second element specifies which graphic register is invoked to
3469 the graphic plane right.
3471 <li> Key is #Mcode_unit, value is an integer
3473 If the type is #Mutf, the value specifies the bit length of a
3474 code-unit. It must be 8, 16, or 32.
3476 <li> Key is #Mbom, value is a symbol
3478 If the type is #Mutf and the code-unit bit length is 16 or 32,
3479 it specifies whether or not to use BOM (Byte Order Mark). If the
3480 value is #Mnil (default), BOM is not used, else if the value is
3481 #Mmaybe, the existence of BOM is detected at decoding time, else
3484 <li> Key is #Mlittle_endian, value is a symbol
3486 If the type is #Mutf and the code-unit bit length is 16 or 32,
3487 it specifies whether or not the encoding is little endian. If the
3488 value is #Mnil (default), it is big endian, else it is little
3493 $RESETTER is a pointer to a function that resets a converter for
3494 the coding system to the initial status. The pointed function is
3495 called with one argument, a pointer to a converter object.
3497 $DECODER is a pointer to a function that decodes a byte sequence
3498 according to the coding system. The pointed function is called
3499 with four arguments:
3501 @li A pointer to the byte sequence to decode.
3502 @li The number of bytes to decode.
3503 @li A pointer to an M-text to which the decoded characters are appended.
3504 @li A pointer to a converter object.
3506 $DECODER must return 0 if it succeeds. Otherwise it must return -1.
3508 $ENCODER is a pointer to a function that encodes an M-text
3509 according to the coding system. The pointed function is called
3512 @li A pointer to the M-text to encode.
3513 @li The starting position of the encoding.
3514 @li The ending position of the encoding.
3515 @li A pointer to a memory area where the produced bytes are stored.
3516 @li The size of the memory area.
3517 @li A pointer to a converter object.
3519 $ENCODER must return 0 if it succeeds. Otherwise it must return -1.
3521 $EXTRA_INFO is a pointer to a data structure that contains extra
3522 information about the coding system. The type of the data
3523 structure depends on the value of the #Mtype parameter.
3527 If the operation was successful, mconv_define_coding () returns a
3528 symbol whose name is $NAME. If an error is detected, it returns
3529 #Mnil and assigns an error code to the external variable @c
3533 @brief ¥³¡¼¥É·Ï¤ÎÄêµÁ
3535 ´Ø¿ô mconv_define_coding () ¤Ï¡¢¿·¤·¤¤¥³¡¼¥É·Ï¤òÄêµÁ¤·¡¢¤½¤ì¤ò
3536 $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·¥ó¥Ü¥ë·Ðͳ¤Ç¥¢¥¯¥»¥¹¤Ç¤¤ë¤è¤¦¤Ë¤¹¤ë¡£
3538 $TYPE ¤Ï Îóµó·¿ #MCodingType ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢¥³¡¼¥É·Ï¤Î¹½Â¤¤ò
3541 $CHARSET_NAMES ¤Ï¥µ¥Ý¡¼¥È¤¹¤ëʸ»ú¥»¥Ã¥È¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤ÎÇÛÎó¤Ç¤¢¤ê¡¢
3542 $NCHARSETS ¤Ï¤½¤ÎÍ×ÁÇ¿ô¤Ç¤¢¤ë¡£
3544 $TYPE ¤¬ #MCODING_TYPE_MISC ¤Ç¤¢¤ë¾ì¹ç¤Ë¤Ï¡¢$RESETTER, $DECODER,
3545 $ENCODER ¤òÍ¿¤¨¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£¤½¤ì°Ê³°¤Î¾ì¹ç¤Ë¤Ï¤³¤ì¤é¤Ï @c
3546 NULL ¤Ç¹½¤ï¤Ê¤¤¡£¤½¤ÎºÝ¤Ë¤Ï m17n ¥é¥¤¥Ö¥é¥ê¤¬Å¬Àڤʥǥե©¥ë¥ÈÃͤò
3549 $RESETTER ¤Ï¤³¤Î¥³¡¼¥É·ÏÍѤΥ³¥ó¥Ð¡¼¥¿¤ò½é´ü¾õÂ֤˥ꥻ¥Ã¥È¤¹¤ë´Ø¿ô
3550 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿¤È
3553 $DECODER ¤Ï¥Ð¥¤¥ÈÎó¤ò¤³¤Î¥³¡¼¥É·Ï¤Ë½¾¤Ã¤Æ¥Ç¥³¡¼¥É¤¹¤ë´Ø¿ô¤Ø¤Î¥Ý¥¤
3554 ¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï°Ê²¼¤Î4°ú¿ô¤ò¤È¤ë¡£
3556 @li ¥Ð¥¤¥ÈÎó¤Ø¤Î¥Ý¥¤¥ó¥¿
3557 @li ¥Ç¥³¡¼¥É¤¹¤Ù¤¥Ð¥¤¥È¿ô
3558 @li ¥Ç¥³¡¼¥É·ë²Ì¤Îʸ»ú¤òÉղ乤ë M-text ¤Ø¤Î¥Ý¥¤¥ó¥¿
3559 @li ¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿
3561 $DECODER ¤ÏÀ®¸ù¤·¤¿¤È¤¤Ë¤Ï0¤ò¡¢¼ºÇÔ¤·¤¿¤È¤¤Ë¤Ï-1¤òÊÖ¤µ¤Ê¤¯¤Æ¤Ï¤Ê
3564 $ENCODER ¤Ï M-text ¤ò¤³¤Î¥³¡¼¥É·Ï¤Ë½¾¤Ã¤Æ¥¨¥ó¥³¡¼¥É¤¹
3565 ¤ë´Ø¿ô¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï°Ê²¼¤Î6°ú¿ô¤ò¤È¤ë¡£
3567 @li M-text ¤Ø¤Î¥Ý¥¤¥ó¥¿
3568 @li M-text ¤Î¥¨¥ó¥³¡¼¥É³«»Ï°ÌÃÖ
3569 @li M-text ¤Î¥¨¥ó¥³¡¼¥É½ªÎ»°ÌÃÖ
3570 @li À¸À®¤·¤¿¥Ð¥¤¥È¤òÊÝ»ý¤¹¤ë¥á¥â¥êÎΰè¤Ø¤Î¥Ý¥¤¥ó¥¿
3571 @li ¥á¥â¥êÎΰè¤Î¥µ¥¤¥º
3572 @li ¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿
3574 $ENCODER ¤ÏÀ®¸ù¤·¤¿¤È¤¤Ë¤Ï0¤ò¡¢¼ºÇÔ¤·¤¿¤È¤¤Ë¤Ï-1¤òÊÖ¤µ¤Ê¤¯¤Æ¤Ï¤Ê
3577 $EXTRA_INFO ¤Ï¥³¡¼¥Ç¥£¥°¥·¥¹¥Æ¥à¤Ë´Ø¤¹¤ëÄɲþðÊó¤ò´Þ¤à¥Ç¡¼¥¿¹½Â¤¤Ø
3578 ¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î¥Ç¡¼¥¿¹½Â¤¤Î¥¿¥¤¥×¤Ï $TYPE ¤Ë°Í¸¤¹¤ë¡£
3580 $TYPE ¤¬ #MCODING_TYPE_ISO_2022 ¤Ç¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï @c
3581 MCodingInfoISO2022 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
3583 $TYPE ¤¬ #MCODING_TYPE_UTF ¤Ç¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï @c
3584 MCodingInfoUTF ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
3586 $TYPE ¤¬ #MCODING_TYPE_CHARSET, #MCODING_TYPE_MISC ¤Î¤É¤ì¤«¤Ç
3587 ¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï̵»ë¤µ¤ì¤ë¡£
3591 ½èÍý¤ËÀ®¸ù¤¹¤ì¤Ð mconv_define_coding () ¤Ï $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·
3592 ¥ó¥Ü¥ë¤òÊÖ¤¹¡£¤³¤Î¥·¥ó¥Ü¥ë¤Ï¡¢¥¡¼¤¬ $Mcoding ¤Ç¡¢ºî¤é¤ì¤¿¥³¡¼¥É·Ï
3593 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤òÃͤȤ¹¤ë¥·¥ó¥Ü¥ë¥×¥í¥Ñ¥Æ¥£¤ò»ý¤Ä¡£ ¥¨¥é¡¼¤¬¸¡½Ð¤µ¤ì
3594 ¤¿¾ì¹ç¤Ï Mnil ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3602 mconv_define_coding (char *name, MPlist *plist,
3603 int (*resetter) (MConverter *),
3604 int (*decoder) (unsigned char *, int, MText *,
3606 int (*encoder) (MText *, int, int,
3607 unsigned char *, int,
3611 MSymbol sym = msymbol (name);
3613 MCodingSystem *coding;
3616 MSTRUCT_MALLOC (coding, MERROR_CODING);
3618 if ((coding->type = (MSymbol) mplist_get (plist, Mtype)) == Mnil)
3619 coding->type = Mcharset;
3620 pl = (MPlist *) mplist_get (plist, Mcharsets);
3622 MERROR (MERROR_CODING, Mnil);
3623 coding->ncharsets = mplist_length (pl);
3624 if (coding->ncharsets > NUM_SUPPORTED_CHARSETS)
3625 coding->ncharsets = NUM_SUPPORTED_CHARSETS;
3626 for (i = 0; i < coding->ncharsets; i++, pl = MPLIST_NEXT (pl))
3628 MSymbol charset_name;
3630 if (MPLIST_KEY (pl) != Msymbol)
3631 MERROR (MERROR_CODING, Mnil);
3632 charset_name = MPLIST_SYMBOL (pl);
3633 if (! (coding->charsets[i] = MCHARSET (charset_name)))
3634 MERROR (MERROR_CODING, Mnil);
3637 coding->resetter = resetter;
3638 coding->decoder = decoder;
3639 coding->encoder = encoder;
3640 coding->ascii_compatible = 0;
3641 coding->extra_info = extra_info;
3642 coding->extra_spec = NULL;
3645 if (coding->type == Mcharset)
3647 if (! coding->resetter)
3648 coding->resetter = reset_coding_charset;
3649 if (! coding->decoder)
3650 coding->decoder = decode_coding_charset;
3651 if (! coding->encoder)
3652 coding->encoder = encode_coding_charset;
3654 else if (coding->type == Mutf)
3656 MCodingInfoUTF *info = malloc (sizeof (MCodingInfoUTF));
3659 if (! coding->resetter)
3660 coding->resetter = reset_coding_utf;
3662 info->code_unit_bits = (int) mplist_get (plist, Mcode_unit);
3663 if (info->code_unit_bits == 8)
3665 if (! coding->decoder)
3666 coding->decoder = decode_coding_utf_8;
3667 if (! coding->encoder)
3668 coding->encoder = encode_coding_utf_8;
3670 else if (info->code_unit_bits == 16)
3672 if (! coding->decoder)
3673 coding->decoder = decode_coding_utf_16;
3674 if (! coding->encoder)
3675 coding->encoder = encode_coding_utf_16;
3677 else if (info->code_unit_bits == 32)
3679 if (! coding->decoder)
3680 coding->decoder = decode_coding_utf_32;
3681 if (! coding->encoder)
3682 coding->encoder = encode_coding_utf_32;
3685 MERROR (MERROR_CODING, Mnil);
3686 val = (MSymbol) mplist_get (plist, Mbom);
3689 else if (val == Mmaybe)
3694 info->endian = (mplist_get (plist, Mlittle_endian) ? 1 : 0);
3695 coding->extra_info = info;
3697 else if (coding->type == Miso_2022)
3699 MCodingInfoISO2022 *info = malloc (sizeof (MCodingInfoISO2022));
3701 if (! coding->resetter)
3702 coding->resetter = reset_coding_iso_2022;
3703 if (! coding->decoder)
3704 coding->decoder = decode_coding_iso_2022;
3705 if (! coding->encoder)
3706 coding->encoder = encode_coding_iso_2022;
3708 info->initial_invocation[0] = 0;
3709 info->initial_invocation[1] = -1;
3710 pl = (MPlist *) mplist_get (plist, Minvocation);
3713 if (MPLIST_KEY (pl) != Minteger)
3714 MERROR (MERROR_CODING, Mnil);
3715 info->initial_invocation[0] = MPLIST_INTEGER (pl);
3716 if (! MPLIST_TAIL_P (pl))
3718 pl = MPLIST_NEXT (pl);
3719 if (MPLIST_KEY (pl) != Minteger)
3720 MERROR (MERROR_CODING, Mnil);
3721 info->initial_invocation[1] = MPLIST_INTEGER (pl);
3724 memset (info->designations, 0, sizeof (info->designations));
3725 for (i = 0, pl = (MPlist *) mplist_get (plist, Mdesignation);
3726 i < 32 && pl && MPLIST_KEY (pl) == Minteger;
3727 i++, pl = MPLIST_NEXT (pl))
3728 info->designations[i] = MPLIST_INTEGER (pl);
3731 MPLIST_DO (pl, (MPlist *) mplist_get (plist, Mflags))
3735 if (MPLIST_KEY (pl) != Msymbol)
3736 MERROR (MERROR_CODING, Mnil);
3737 val = MPLIST_SYMBOL (pl);
3738 if (val == Mreset_at_eol)
3739 info->flags |= MCODING_ISO_RESET_AT_EOL;
3740 else if (val == Mreset_at_cntl)
3741 info->flags |= MCODING_ISO_RESET_AT_CNTL;
3742 else if (val == Meight_bit)
3743 info->flags |= MCODING_ISO_EIGHT_BIT;
3744 else if (val == Mlong_form)
3745 info->flags |= MCODING_ISO_LOCKING_SHIFT;
3746 else if (val == Mdesignation_g0)
3747 info->flags |= MCODING_ISO_DESIGNATION_G0;
3748 else if (val == Mdesignation_g1)
3749 info->flags |= MCODING_ISO_DESIGNATION_G1;
3750 else if (val == Mdesignation_ctext)
3751 info->flags |= MCODING_ISO_DESIGNATION_CTEXT;
3752 else if (val == Mdesignation_ctext_ext)
3753 info->flags |= MCODING_ISO_DESIGNATION_CTEXT_EXT;
3754 else if (val == Mlocking_shift)
3755 info->flags |= MCODING_ISO_LOCKING_SHIFT;
3756 else if (val == Msingle_shift)
3757 info->flags |= MCODING_ISO_SINGLE_SHIFT;
3758 else if (val == Msingle_shift_7)
3759 info->flags |= MCODING_ISO_SINGLE_SHIFT_7;
3760 else if (val == Meuc_tw_shift)
3761 info->flags |= MCODING_ISO_EUC_TW_SHIFT;
3762 else if (val == Miso_6429)
3763 info->flags |= MCODING_ISO_ISO6429;
3764 else if (val == Mrevision_number)
3765 info->flags |= MCODING_ISO_REVISION_NUMBER;
3766 else if (val == Mfull_support)
3767 info->flags |= MCODING_ISO_FULL_SUPPORT;
3770 coding->extra_info = info;
3774 if (! coding->decoder || ! coding->encoder)
3775 MERROR (MERROR_CODING, Mnil);
3776 if (! coding->resetter)
3780 msymbol_put (sym, Mcoding, coding);
3781 msymbol_put (msymbol__canonicalize (sym), Mcoding, coding);
3782 plist = (MPlist *) mplist_get (plist, Maliases);
3785 MPLIST_DO (pl, plist)
3789 if (MPLIST_KEY (pl) != Msymbol)
3791 alias = MPLIST_SYMBOL (pl);
3792 msymbol_put (alias, Mcoding, coding);
3793 msymbol_put (msymbol__canonicalize (alias), Mcoding, coding);
3797 MLIST_APPEND1 (&coding_list, codings, coding, MERROR_CODING);
3805 @brief Resolve coding system name.
3807 The mconv_resolve_coding () function returns $SYMBOL if it
3808 represents a coding system. Otherwise, it canonicalizes $SYMBOL as
3809 a coding system name, and if the canonicalized name represents a
3810 coding system, returns it. Otherwise, it returns #Mnil. */
3814 mconv_resolve_coding (MSymbol symbol)
/* First lookup: SYMBOL exactly as given.  */
3816 MCodingSystem *coding = find_coding (symbol);
/* Lookup with the canonicalized form of SYMBOL.  */
3820 symbol = msymbol__canonicalize (symbol);
3821 coding = find_coding (symbol);
/* The result is the name recorded in the coding system that was found,
   or Mnil when there is none.  */
3823 return (coding ? coding->name : Mnil);
3830 @brief List symbols representing a coding system.
3832 The mconv_list_codings () function makes an array of symbols
3833 representing a coding system, stores the pointer to the array in a
3834 place pointed to by $SYMBOLS, and returns the length of the array. */
3837 mconv_list_codings (MSymbol **symbols)
/* Upper bound of the result size: every defined coding system plus
   every entry of coding_definition_list.  */
3839 int i = coding_list.used + mplist_length (coding_definition_list);
3843 MTABLE_MALLOC ((*symbols), i, MERROR_CODING);
/* The keys of coding_definition_list are stored first ...  */
3845 MPLIST_DO (plist, coding_definition_list)
3846 (*symbols)[i++] = MPLIST_KEY (plist);
/* ... followed by the names of the defined coding systems, skipping
   those that already appear as a key in coding_definition_list.  */
3847 for (j = 0; j < coding_list.used; j++)
3848 if (! mplist_find_by_key (coding_definition_list,
3849 coding_list.codings[j]->name))
3850 (*symbols)[i++] = coding_list.codings[j]->name;
3857 @brief Create a code converter bound to a buffer.
3859 The mconv_buffer_converter () function creates a pointer to a code
3860 converter for coding system $CODING. The code converter is bound
3861 to buffer area of $N bytes pointed to by $BUF. Subsequent
3862 decodings and encodings are done to/from this buffer area.
3864 $CODING can be #Mnil. In this case, a coding system associated
3865 with the current locale (LC_CTYPE) is used.
3868 If the operation was successful, mconv_buffer_converter () returns
3869 the created code converter. Otherwise it returns @c NULL and
3870 assigns an error code to the external variable #merror_code. */
3873 @brief ¥Ð¥Ã¥Õ¥¡¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤òºî¤ë
3875 ´Ø¿ô mconv_buffer_converter () ¤Ï¡¢¥³¡¼¥É·Ï $CODING ÍѤΥ³¡¼¥É¥³¥ó
3876 ¥Ð¡¼¥¿¤òºî¤ë¡£¤³¤Î¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ï¡¢$BUF ¤Ç¼¨¤µ¤ì¤ëÂ礤µ $N ¥Ð
3877 ¥¤¥È¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó
3878 ¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¡£
3880 $CODING ¤Ï #Mnil ¤Ç¤¢¤Ã¤Æ¤â¤è¤¤¡£¤³¤Î¾ì¹ç¤Ï¸½ºß¤Î¥í¥±¡¼¥ë
3881 (LC_CTYPE) ¤Ë´ØÏ¢ÉÕ¤±¤é¤ì¤¿¥³¡¼¥É·Ï¤¬»È¤ï¤ì¤ë¡£
3884 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð mconv_buffer_converter () ¤Ï ºî¤é¤ì¤¿¥³¡¼¥É¥³
3885 ¥ó¥Ð¡¼¥¿¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code
3886 ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3888 @latexonly \IPAlabel{mconverter} @endlatexonly */
3892 @c MERROR_SYMBOL, @c MERROR_CODING
3895 mconv_stream_converter () */
3898 mconv_buffer_converter (MSymbol name, unsigned char *buf, int n)
3900 MCodingSystem *coding;
3901 MConverter *converter;
3902 MConverterStatus *internal;
/* Coding system name taken from the Mcoding property of the LC_CTYPE
   locale.  */
3905 name = mlocale_get_prop (mlocale__ctype, Mcoding);
3906 coding = find_coding (name);
3908 MERROR (MERROR_CODING, NULL);
/* The converter and its internal status record are allocated
   separately and linked through internal_info.  */
3909 MSTRUCT_CALLOC (converter, MERROR_CODING);
3910 MSTRUCT_CALLOC (internal, MERROR_CODING);
3911 converter->internal_info = internal;
3912 internal->coding = coding;
/* Run the coding system's resetter, if it has one, on the new
   converter; a negative result is treated as an error.  */
3913 if (coding->resetter
3914 && (*coding->resetter) (converter) < 0)
3918 MERROR (MERROR_CODING, NULL);
/* Two work M-texts: one for unread characters and one scratch text that
   is enlarged to MAX_UTF8_CHAR_BYTES.  */
3921 internal->unread = mtext ();
3922 internal->work_mt = mtext ();
3923 mtext__enlarge (internal->work_mt, MAX_UTF8_CHAR_BYTES);
/* Bind the converter to the caller's buffer of N bytes.  */
3924 internal->buf = buf;
3926 internal->bufsize = n;
3927 internal->binding = BINDING_BUFFER;
3935 @brief Create a code converter bound to a stream.
3937 The mconv_stream_converter () function creates a pointer to a code
3938 converter for coding system $CODING. The code converter is bound
3939 to stream $FP. Subsequent decodings and encodings are done
3940 to/from this stream.
3942 $CODING can be #Mnil. In this case, a coding system associated
3943 with the current locale (LC_CTYPE) is used.
3945 @return If the operation was successful, mconv_stream_converter ()
3946 returns the created code converter. Otherwise it returns @c NULL
3947 and assigns an error code to the external variable @c
3951 @brief ¥¹¥È¥ê¡¼¥à¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤òºî¤ë
3953 ´Ø¿ô mconv_stream_converter () ¤Ï¡¢¥³¡¼¥É·Ï $CODING ÍѤΥ³¡¼¥É¥³¥ó
3954 ¥Ð¡¼¥¿¤òºî¤ë¡£¤³¤Î¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ï¡¢¥¹¥È¥ê¡¼¥à $FP ¤Ë·ë¤ÓÉÕ¤±¤é
3955 ¤ì¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¥¹¥È¥ê¡¼¥à¤ËÂФ·¤Æ
3958 $CODING ¤Ï #Mnil ¤Ç¤¢¤Ã¤Æ¤â¤è¤¤¡£¤³¤Î¾ì¹ç¤Ï¸½ºß¤Î¥í¥±¡¼¥ë
3959 (LC_CTYPE) ¤Ë´ØÏ¢ÉÕ¤±¤é¤ì¤¿¥³¡¼¥É·Ï¤¬»È¤ï¤ì¤ë¡£
3962 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_stream_converter () ¤Ïºî¤é¤ì¤¿¥³¡¼¥É¥³
3963 ¥ó¥Ð¡¼¥¿¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code
3964 ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3966 @latexonly \IPAlabel{mconverter} @endlatexonly */
3970 @c MERROR_SYMBOL, @c MERROR_CODING
3973 mconv_buffer_converter () */
3976 mconv_stream_converter (MSymbol name, FILE *fp)
3978 MCodingSystem *coding;
3979 MConverter *converter;
3980 MConverterStatus *internal;
/* Coding system name taken from the Mcoding property of the LC_CTYPE
   locale.  */
3983 name = mlocale_get_prop (mlocale__ctype, Mcoding);
3984 coding = find_coding (name);
3986 MERROR (MERROR_CODING, NULL);
/* The converter and its internal status record are allocated
   separately and linked through internal_info.  */
3987 MSTRUCT_CALLOC (converter, MERROR_CODING);
3988 MSTRUCT_CALLOC (internal, MERROR_CODING);
3989 converter->internal_info = internal;
3990 internal->coding = coding;
/* Run the coding system's resetter, if it has one, on the new
   converter; a negative result is treated as an error.  */
3991 if (coding->resetter
3992 && (*coding->resetter) (converter) < 0)
3996 MERROR (MERROR_CODING, NULL);
/* Probe the stream with a no-op fseek; the `seekable' member is set to
   0 or 1 depending on the outcome.  */
3999 if (fseek (fp, 0, SEEK_CUR) < 0)
4007 internal->seekable = 0;
4010 internal->seekable = 1;
/* Two work M-texts: one for unread characters and one scratch text that
   is enlarged to MAX_UTF8_CHAR_BYTES.  */
4011 internal->unread = mtext ();
4012 internal->work_mt = mtext ();
4013 mtext__enlarge (internal->work_mt, MAX_UTF8_CHAR_BYTES);
4015 internal->binding = BINDING_STREAM;
4023 @brief Reset a code converter.
4025 The mconv_reset_converter () function resets code converter
4026 $CONVERTER to the initial state.
4029 If $CONVERTER->coding has its own resetter function,
4030 mconv_reset_converter () returns the result of that function
4031 applied to $CONVERTER. Otherwise it returns 0. */
4034 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò¥ê¥»¥Ã¥È¤¹¤ë
4036 ´Ø¿ô mconv_reset_converter () ¤Ï¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤ò½é´ü
4040 ¤â¤· $CONVERTER->coding ¤Ë¥ê¥»¥Ã¥ÈÍѤδؿô¤¬ÄêµÁ¤µ¤ì¤Æ¤¤¤ë¤Ê¤é¤Ð¡¢
4041 mconv_reset_converter () ¤Ï¤½¤Î´Ø¿ô¤Ë $CONVERTER ¤òŬÍѤ·¤¿·ë²Ì¤ò
4042 ÊÖ¤·¡¢¤½¤¦¤Ç¤Ê¤±¤ì¤Ð0¤òÊÖ¤¹¡£ */
4045 mconv_reset_converter (MConverter *converter)
4047 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Clear the conversion counters and result code, and forget any
   carried-over bytes and unread characters.  */
4049 converter->nchars = converter->nbytes = 0;
4050 converter->result = MCONVERSION_RESULT_SUCCESS;
4051 internal->carryover_bytes = 0;
4052 mtext_reset (internal->unread);
/* Delegate to the coding system's own resetter when it has one and
   return that function's result.  */
4053 if (internal->coding->resetter)
4054 return (*internal->coding->resetter) (converter);
4061 @brief Free a code converter.
4063 The mconv_free_converter () function frees the code converter
4067 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò²òÊü¤¹¤ë
4069 ´Ø¿ô mconv_free_converter () ¤Ï¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤ò²òÊü
4073 mconv_free_converter (MConverter *converter)
4075 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Drop the references to the two work M-texts held by the converter.  */
4077 M17N_OBJECT_UNREF (internal->work_mt);
4078 M17N_OBJECT_UNREF (internal->unread);
4086 @brief Bind a buffer to a code converter.
4088 The mconv_rebind_buffer () function binds buffer area of $N bytes
4089 pointed to by $BUF to code converter $CONVERTER. Subsequent
4090 decodings and encodings are done to/from this newly bound buffer
4094 This function always returns $CONVERTER. */
4097 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ë¥Ð¥Ã¥Õ¥¡Îΰè¤ò·ë¤ÓÉÕ¤±¤ë
4099 ´Ø¿ô mconv_rebind_buffer () ¤Ï¡¢$BUF ¤Ë¤è¤Ã¤Æ»Ø¤µ¤ì¤¿Â礤µ $N ¥Ð
4100 ¥¤¥È¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤ò¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤Ë·ë¤ÓÉÕ¤±¤ë¡£¤³¤ì
4101 °Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¿·¤¿¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥Ð¥Ã¥Õ¥¡
4102 Îΰè¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¤è¤¦¤Ë¤Ê¤ë¡£
4105 ¤³¤Î´Ø¿ô¤Ï¾ï¤Ë $CONVERTER ¤òÊÖ¤¹¡£
4107 @latexonly \IPAlabel{mconv_rebind_buffer} @endlatexonly */
4111 mconv_rebind_stream () */
4114 mconv_rebind_buffer (MConverter *converter, unsigned char *buf, int n)
4116 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Point the converter at the new buffer of N bytes and mark it as
   buffer-bound.  */
4118 internal->buf = buf;
4120 internal->bufsize = n;
4121 internal->binding = BINDING_BUFFER;
4128 @brief Bind a stream to a code converter.
4130 The mconv_rebind_stream () function binds stream $FP to code
4131 converter $CONVERTER. Following decodings and encodings are done
4132 to/from this newly bound stream.
4135 This function always returns $CONVERTER. */
4138 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ë¥¹¥È¥ê¡¼¥à¤ò·ë¤ÓÉÕ¤±¤ë
4140 ´Ø¿ô mconv_rebind_stream () ¤Ï¡¢¥¹¥È¥ê¡¼¥à $FP ¤ò¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿
4141 $CONVERTER ¤Ë·ë¤ÓÉÕ¤±¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢
4142 ¤³¤Î¿·¤¿¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥¹¥È¥ê¡¼¥à¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¤è¤¦¤Ë¤Ê¤ë¡£
4145 ¤³¤Î´Ø¿ô¤Ï¾ï¤Ë $CONVERTER ¤òÊÖ¤¹¡£
4147 @latexonly \IPAlabel{mconv_rebind_stream} @endlatexonly */
4151 mconv_rebind_buffer () */
4154 mconv_rebind_stream (MConverter *converter, FILE *fp)
4156 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Probe the stream with a no-op fseek; the `seekable' member is set to
   0 or 1 depending on the outcome.  */
4158 if (fseek (fp, 0, SEEK_CUR) < 0)
4162 internal->seekable = 0;
4165 internal->seekable = 1;
/* Mark the converter as stream-bound.  */
4167 internal->binding = BINDING_STREAM;
4174 @brief Decode a byte sequence into an M-text.
4176 The mconv_decode () function decodes a byte sequence and appends
4177 the result at the end of M-text $MT. The source byte sequence is
4178 taken from the currently bound buffer area or stream.
4181 If the operation was successful, mconv_decode () returns updated
4182 $MT. Otherwise it returns @c NULL and assigns an error code to
4183 the external variable #merror_code. */
4186 @brief バイト列を M-text にデコードする
4188 関数 mconv_decode () は、バイト列をデコードしてその結果を M-text
4189 $MT の末尾に追加する。デコード元のバイト列は、現在結び付けられている
4190 バッファ領域あるいはストリームから取られる。
4193 もし処理が成功すれば、mconv_decode () は更新された $MT を返す。そ
4194 うでなければ @c NULL を返し、外部変数 #merror_code にエラーコードを
4199 @c MERROR_IO, @c MERROR_CODING
4202 mconv_rebind_buffer (), mconv_rebind_stream (),
4203 mconv_encode (), mconv_encode_range (),
4204 mconv_decode_buffer (), mconv_decode_stream () */
/* Decode bytes from the source currently bound to CONVERTER and append
   the decoded characters to MT.  Characters previously pushed back with
   mconv_ungetc () are delivered before any new bytes are decoded.
   NOTE(review): this excerpt skips a number of source lines (gaps in the
   embedded numbering), so some declarations, braces and branch headers
   are not visible; the comments below describe only what is shown.  */
4207 mconv_decode (MConverter *converter, MText *mt)
4209 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Normalise the caller's limit: any non-positive value becomes -1, which
   the tests below treat as "no limit".  */
4210 int at_most = converter->at_most > 0 ? converter->at_most : -1;
/* Presumably returns NULL when MT is read-only (macro defined
   elsewhere).  */
4213 M_CHECK_READONLY (mt, NULL);
/* Grow MT by room for one UTF-8 character.  NOTE(review): any condition
   guarding this call is on a line not shown.  */
4216 mtext__enlarge (mt, MAX_UTF8_CHAR_BYTES);
/* Start with clean counters and a success status.  */
4218 converter->nchars = converter->nbytes = 0;
4219 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Number of pushed-back characters waiting in `unread'.  */
4221 n = mtext_nchars (internal->unread);
/* Clamp the number of characters to replay to the caller's limit.
   NOTE(review): `limit' is declared and initialised on lines not
   shown.  */
4227 if (at_most > 0 && at_most < limit)
/* Replay from the end of `unread' backwards, so the character pushed
   back last is appended first; each one is counted in nchars.  */
4230 for (i = 0, n -= 1; i < limit; i++, converter->nchars++, n--)
4231 mtext_cat_char (mt, mtext_ref_char (internal->unread, n));
/* Remove the replayed tail from `unread'.  */
4232 mtext_del (internal->unread, n + 1, internal->unread->nchars);
/* The limit was satisfied by pushed-back characters alone.
   NOTE(review): the statement this test guards is not shown.  */
4235 if (at_most == limit)
/* Otherwise leave the decoder only the remaining quota.  */
4237 converter->at_most -= converter->nchars;
/* Buffer source: decode the not-yet-consumed part of the buffer, then
   advance the consumed offset by the byte count the decoder reports.  */
4241 if (internal->binding == BINDING_BUFFER)
4243 (*internal->coding->decoder) (internal->buf + internal->used,
4244 internal->bufsize - internal->used,
4246 internal->used += converter->nbytes;
/* Stream source: read into a scratch buffer and decode chunk by
   chunk.  */
4248 else if (internal->binding == BINDING_STREAM)
4250 unsigned char work[CONVERT_WORKSIZE];
/* Saved so the caller's last_block setting can be put back later.  */
4251 int last_block = converter->last_block;
/* Bulk reads are chosen only when no character limit is set and the
   stream is seekable.  */
4252 int use_fread = at_most < 0 && internal->seekable;
4254 converter->last_block = 0;
4257 int nbytes, prev_nbytes;
4259 if (feof (internal->fp))
4262 nbytes = fread (work, sizeof (unsigned char), CONVERT_WORKSIZE,
4266 int c = getc (internal->fp);
4269 work[0] = c, nbytes = 1;
/* A stream error is reported through the converter's result field.  */
4274 if (ferror (internal->fp))
4276 converter->result = MCONVERSION_RESULT_IO_ERROR;
4281 converter->last_block = last_block;
/* Remember nbytes so the bytes consumed by this decoder call alone can
   be measured afterwards.  */
4282 prev_nbytes = converter->nbytes;
4283 (*internal->coding->decoder) (work, nbytes, mt, converter);
/* The decoder consumed less than was read: hand the surplus back to the
   stream.  The seek offset is the (negative) count of unconsumed bytes;
   presumably fseek serves the bulk-read path and ungetc the
   byte-at-a-time path (the selecting lines are not shown).  */
4284 if (converter->nbytes - prev_nbytes < nbytes)
4287 fseek (internal->fp, converter->nbytes - prev_nbytes - nbytes,
4290 ungetc (work[0], internal->fp);
4294 || (converter->at_most > 0
4295 && converter->nchars == converter->at_most))
/* Put the caller's last_block setting back.  */
4298 converter->last_block = last_block;
4300 else /* internal->binding == BINDING_NONE */
4301 MERROR (MERROR_CODING, NULL);
/* Write the normalised limit back; it may have been reduced while
   replaying pushed-back characters.  */
4303 converter->at_most = at_most;
/* Both SUCCESS and INSUFFICIENT_SRC select the first operand of this
   conditional.  NOTE(review): the two result operands are on a line not
   shown.  */
4304 return ((converter->result == MCONVERSION_RESULT_SUCCESS
4305 || converter->result == MCONVERSION_RESULT_INSUFFICIENT_SRC)
4312 @brief Decode a buffer area based on a coding system.
4314 The mconv_decode_buffer () function decodes $N bytes of buffer
4315 area pointed to by $BUF based on the coding system $NAME. A
4316 temporary code converter for decoding is automatically created
4320 If the operation was successful, mconv_decode_buffer () returns
4321 the resulting M-text. Otherwise it returns NULL and assigns an
4322 error code to the external variable #merror_code. */
4325 @brief コード系に基づいてバッファ領域をデコードする
4327 関数 mconv_decode_buffer () は、$BUF によって指された $N バイトの
4328 バッファ領域を、コード系 $NAME に基づいてデコードする。デコードに
4329 必要なコードコンバータの作成と解放は自動的に行なわれる。
4332 もし処理が成功すれば、mconv_decode_buffer () は得られた M-text を
4333 返す。そうでなければ @c NULL を返し、外部変数 #merror_code にエラー
4334 コードを設定する。 */
4338 @c MERROR_IO, @c MERROR_CODING
4341 mconv_decode (), mconv_decode_stream () */
/* One-shot decode of the N bytes at BUF with coding system NAME.
   NOTE(review): this excerpt skips some source lines (gaps in the
   embedded numbering); the declaration and creation of `mt' and the
   return statements are not visible.  */
4344 mconv_decode_buffer (MSymbol name, unsigned char *buf, int n)
/* Temporary converter bound to the caller's buffer.  */
4346 MConverter *converter = mconv_buffer_converter (name, buf, n);
/* mconv_decode () returned NULL: presumably drop the reference to the
   result M-text.  */
4352 if (! mconv_decode (converter, mt))
4354 M17N_OBJECT_UNREF (mt);
/* Release the temporary converter.  */
4357 mconv_free_converter (converter);
4364 @brief Decode a stream input based on a coding system.
4366 The mconv_decode_stream () function decodes the entire byte
4367 sequence read in from stream $FP based on the coding system $NAME.
4368 A code converter for decoding is automatically created and freed.
4371 If the operation was successful, mconv_decode_stream () returns
4372 the resulting M-text. Otherwise it returns NULL and assigns an
4373 error code to the external variable #merror_code. */
4376 @brief コード系に基づいてストリーム入力をデコードする
4378 関数 mconv_decode_stream () は、ストリーム $FP から読み込まれるバ
4379 イト列全体を、コード系 $NAME に基づいてデコードする。デコードに必
4380 要なコードコンバータの作成と解放は自動的に行なわれる。
4383 もし処理が成功すれば、mconv_decode_stream () は得られた M-text を返
4384 す。そうでなければ @c NULL を返し、外部変数 #merror_code にエラーコー
4389 @c MERROR_IO, @c MERROR_CODING
4392 mconv_decode (), mconv_decode_buffer () */
/* One-shot decode of the bytes read from stream FP with coding system
   NAME.
   NOTE(review): this excerpt skips some source lines (gaps in the
   embedded numbering); the declaration and creation of `mt' and the
   return statements are not visible.  */
4395 mconv_decode_stream (MSymbol name, FILE *fp)
/* Temporary converter bound to the caller's stream.  */
4397 MConverter *converter = mconv_stream_converter (name, fp);
/* mconv_decode () returned NULL: presumably drop the reference to the
   result M-text.  */
4403 if (! mconv_decode (converter, mt))
4405 M17N_OBJECT_UNREF (mt);
/* Release the temporary converter.  */
4408 mconv_free_converter (converter);
4414 /***en @brief Encode an M-text into a byte sequence.
4416 The mconv_encode () function encodes M-text $MT and writes the
4417 resulting byte sequence into the buffer area or the stream that is
4418 currently bound to code converter $CONVERTER.
4421 If the operation was successful, mconv_encode () returns the
4422 number of written bytes. Otherwise it returns -1 and assigns an
4423 error code to the external variable #merror_code. */
4426 @brief M-text をバイト列にエンコードする
4428 関数 mconv_encode () は、M-text $MT をエンコードして、コードコンバー
4429 タ $CONVERTER に現在結び付けられているバッファ領域あるいはストリー
4433 もし処理が成功すれば、mconv_encode () は書き込まれたバイト数を返す。
4434 そうでなければ -1 を返し、外部変数 #merror_code にエラーコードを設定
4439 @c MERROR_IO, @c MERROR_CODING
4442 mconv_rebind_buffer (), mconv_rebind_stream(),
4443 mconv_decode (), mconv_encode_range () */
/* Encode all of MT through CONVERTER.  Thin wrapper: the work is done by
   mconv_encode_range () over the range 0 .. mtext_nchars (MT), and its
   return value is passed straight back.  */
4446 mconv_encode (MConverter *converter, MText *mt)
4448 return mconv_encode_range (converter, mt, 0, mtext_nchars (mt));
4454 @brief Encode a part of an M-text
4456 The mconv_encode_range () function encodes the text between $FROM
4457 (inclusive) and $TO (exclusive) in M-text $MT and writes the
4458 resulting byte sequence into the buffer area or the stream that is
4459 currently bound to code converter $CONVERTER.
4462 If the operation was successful, mconv_encode_range () returns the
4463 number of written bytes. Otherwise it returns -1 and assigns an
4464 error code to the external variable #merror_code. */
4467 @brief M-text の一部をバイト列にエンコードする
4469 関数 mconv_encode_range () は、M-text $MT の $FROM （含む）から
4470 $TO （含まない）までの範囲のテキストをエンコードして、コードコンバー
4471 タ $CONVERTER に現在結び付けられているバッファ領域あるいはストリー
4475 もし処理が成功すれば、mconv_encode_range () は書き込まれたバイト数
4476 を返す。そうでなければ -1 を返し、外部変数 #merror_code にエラーコー
4481 @c MERROR_RANGE, @c MERROR_IO, @c MERROR_CODING
4484 mconv_rebind_buffer (), mconv_rebind_stream(),
4485 mconv_decode (), mconv_encode () */
/* Encode the characters of MT between FROM (inclusive) and TO
   (exclusive) and write the bytes to the buffer or stream bound to
   CONVERTER.  Returns converter->nbytes when the final result is SUCCESS
   or INSUFFICIENT_DST, and -1 otherwise.
   NOTE(review): this excerpt skips a number of source lines (gaps in the
   embedded numbering), so some declarations, braces and loop headers are
   not visible; the comments below describe only what is shown.  */
4488 mconv_encode_range (MConverter *converter, MText *mt, int from, int to)
4490 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Validate both positions against MT; presumably returns -1 on a bad
   position (macro defined elsewhere).  */
4492 M_CHECK_POS_X (mt, from, -1);
4493 M_CHECK_POS_X (mt, to, -1);
/* A positive at_most shortens the range to at most that many
   characters.  */
4497 if (converter->at_most > 0 && from + converter->at_most < to)
4498 to = from + converter->at_most;
/* Start with clean counters and a success status.  */
4500 converter->nchars = converter->nbytes = 0;
4501 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Attach the coding system's name to the range as an Mcoding text
   property.  */
4503 mtext_put_prop (mt, from, to, Mcoding, internal->coding->name);
/* Buffer target: encode directly into the unused tail of the bound
   buffer, then advance the used offset by the bytes produced.  */
4504 if (internal->binding == BINDING_BUFFER)
4506 (*internal->coding->encoder) (mt, from, to,
4507 internal->buf + internal->used,
4508 internal->bufsize - internal->used,
4510 internal->used += converter->nbytes;
/* Stream target: encode into a fixed-size scratch buffer and flush it to
   the stream.  NOTE(review): the enclosing loop header and the
   declarations of `written' and `this_nbytes' are on lines not
   shown.  */
4512 else if (internal->binding == BINDING_STREAM)
4514 unsigned char work[CONVERT_WORKSIZE];
4519 int prev_nbytes = converter->nbytes;
4522 (*internal->coding->encoder) (mt, from, to, work,
4523 CONVERT_WORKSIZE, converter);
/* Bytes added to nbytes by this encoder call alone.  */
4524 this_nbytes = converter->nbytes - prev_nbytes;
/* Keep calling fwrite until the whole chunk is written; a single call
   may write fewer items than requested.  */
4525 while (written < this_nbytes)
4527 int wrtn = fwrite (work + written, sizeof (unsigned char),
4528 this_nbytes - written, internal->fp);
4530 if (ferror (internal->fp))
/* The write loop stopped before the chunk was fully written: report an
   I/O error through the result field.  */
4534 if (written < this_nbytes)
4536 converter->result = MCONVERSION_RESULT_IO_ERROR;
/* Move FROM forward by the character count the encoder reported.  */
4539 from += converter->nchars;
4542 else /* fail safe */
4543 MERROR (MERROR_CODING, -1);
/* SUCCESS and INSUFFICIENT_DST both report the bytes written; any other
   result yields -1.  */
4545 return ((converter->result == MCONVERSION_RESULT_SUCCESS
4546 || converter->result == MCONVERSION_RESULT_INSUFFICIENT_DST)
4547 ? converter->nbytes : -1);
4553 @brief Encode an M-text into a buffer area.
4555 The mconv_encode_buffer () function encodes M-text $MT based on
4556 coding system $NAME and writes the resulting byte sequence into the
4557 buffer area pointed to by $BUF. At most $N bytes are written. A
4558 temporary code converter for encoding is automatically created
4562 If the operation was successful, mconv_encode_buffer () returns
4563 the number of written bytes. Otherwise it returns -1 and assigns
4564 an error code to the external variable #merror_code. */
4567 @brief M-text をエンコードしてバッファ領域に書き込む
4569 関数 mconv_encode_buffer () はM-text $MT をコード系 $NAME に基づい
4570 てエンコードし、得られたバイト列を $BUF の指すバッファ領域に書き込
4571 む。$N は書き込む最大バイト数である。エンコードに必要なコードコン
4572 バータの作成と解放は自動的に行なわれる。
4575 もし処理が成功すれば、mconv_encode_buffer () は書き込まれたバイト
4576 数を返す。そうでなければ-1を返し、外部変数 #merror_code にエラーコー
4581 @c MERROR_IO, @c MERROR_CODING
4584 mconv_encode (), mconv_encode_stream () */
/* One-shot encode of MT with coding system NAME into the N-byte buffer
   at BUF.
   NOTE(review): this excerpt skips some source lines (gaps in the
   embedded numbering); the declaration of `ret', any check on the
   created converter and the return statement are not visible.  */
4587 mconv_encode_buffer (MSymbol name, MText *mt, unsigned char *buf, int n)
/* Temporary converter bound to the caller's buffer.  */
4589 MConverter *converter = mconv_buffer_converter (name, buf, n);
/* Keep the encode result so the converter can be released first.  */
4594 ret = mconv_encode (converter, mt);
4595 mconv_free_converter (converter);
4602 @brief Encode an M-text to write to a stream.
4604 The mconv_encode_stream () function encodes M-text $MT based on
4605 coding system $NAME and writes the resulting byte sequence to
4606 stream $FP. A temporary code converter for encoding is
4607 automatically created and freed.
4610 If the operation was successful, mconv_encode_stream () returns
4611 the number of written bytes. Otherwise it returns -1 and assigns
4612 an error code to the external variable #merror_code. */
4615 @brief M-text をエンコードしてストリームに書き込む
4617 関数 mconv_encode_stream () はM-text $MT をコード系 $NAME に基づい
4618 てエンコードし、得られたバイト列をストリーム $FP に書き出す。エン
4619 コードに必要なコードコンバータの作成と解放は自動的に行なわれる。
4622 もし処理が成功すれば、mconv_encode_stream () は書き込まれたバイト数
4623 を返す。そうでなければ-1を返し、外部変数 #merror_code にエラーコード
4628 @c MERROR_IO, @c MERROR_CODING
4631 mconv_encode (), mconv_encode_buffer (), mconv_encode_file () */
/* One-shot encode of MT with coding system NAME, written to stream FP.
   NOTE(review): this excerpt skips some source lines (gaps in the
   embedded numbering); the declaration of `ret', any check on the
   created converter and the return statement are not visible.  */
4634 mconv_encode_stream (MSymbol name, MText *mt, FILE *fp)
/* Temporary converter bound to the caller's stream.  */
4636 MConverter *converter = mconv_stream_converter (name, fp);
/* Keep the encode result so the converter can be released first.  */
4641 ret = mconv_encode (converter, mt);
4642 mconv_free_converter (converter);
4649 @brief Read a character via a code converter.
4651 The mconv_getc () function reads one character from the buffer
4652 area or the stream that is currently bound to code converter
4653 $CONVERTER. The decoder of $CONVERTER is used to decode the byte
4654 sequence. The internal status of $CONVERTER is updated
4658 If the operation was successful, mconv_getc () returns the
4659 character read in. If the input source reaches EOF, it returns @c
4660 EOF without changing the external variable #merror_code. If an
4661 error is detected, it returns @c EOF and assigns an error code to
4665 @brief コードコンバータ経由で1文字読む
4667 関数 mconv_getc () は、コードコンバータ $CONVERTER に現在結び付け
4668 られているバッファ領域あるいはストリームから1文字を読み込む。バイ
4669 ト列のデコードには $CONVERTER のデコーダが用いられる。$CONVERTER
4670 の内部状態は必要に応じて更新される。
4673 処理が成功すれば、mconv_getc () は読み込まれた文字を返す。入力源が
4674 EOF に達した場合は、外部変数 #merror_code を変えずに @c EOF を返す。
4675 エラーが検出された場合は @c EOF を返し、#merror_code にエラーコード
4683 mconv_ungetc (), mconv_putc (), mconv_gets () */
/* Read one character through CONVERTER by running mconv_decode () with
   the character limit temporarily set to one.
   NOTE(review): this excerpt skips some source lines (gaps in the
   embedded numbering); only the visible statements are described.  */
4686 mconv_getc (MConverter *converter)
4688 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Save the caller's limit so it can be put back after the decode.  */
4689 int at_most = converter->at_most;
/* Decode into the converter's scratch M-text, reset beforehand.  */
4691 mtext_reset (internal->work_mt);
4692 converter->at_most = 1;
4693 mconv_decode (converter, internal->work_mt);
4694 converter->at_most = at_most;
/* When exactly one character was decoded, extract it from the scratch
   text's data.  NOTE(review): the alternative operand of this
   conditional is on a line not shown.  */
4695 return (converter->nchars == 1
4696 ? STRING_CHAR (internal->work_mt->data)
4703 @brief Push a character back to a code converter.
4705 The mconv_ungetc () function pushes character $C back to code
4706 converter $CONVERTER. Any number of characters can be pushed
4707 back. The most recently pushed-back character is read first by the
4708 subsequent mconv_getc () call. The characters pushed back are
4709 registered only in $CONVERTER; they are not written to the input
4710 source. The internal status of $CONVERTER is updated
4714 If the operation was successful, mconv_ungetc () returns $C.
4715 Otherwise it returns @c EOF and assigns an error code to the
4716 external variable #merror_code. */
4719 @brief コードコンバータに1文字戻す
4721 関数 mconv_ungetc () は、コードコンバータ $CONVERTER に文字 $C を
4722 押し戻す。押し戻せる文字数に制限はない。この後に mconv_getc () を
4723 呼び出すと、最後に戻された文字が最初に読まれる。押し戻された文字は
4724 $CONVERTER の内部に蓄えられるだけであり、実際に入力源に書き込まれ
4725 るわけではない。$CONVERTER の内部状態は必要に応じて更新される。
4728 処理が成功すれば、mconv_ungetc () は $C を返す。そうでなければ @c
4729 EOF を返し、外部変数 #merror_code にエラーコードを設定する。 */
4733 @c MERROR_CODING, @c MERROR_CHAR
4736 mconv_getc (), mconv_putc (), mconv_gets () */
/* Push character C back onto CONVERTER so that a later read returns it.
   NOTE(review): this excerpt skips some source lines (gaps in the
   embedded numbering); the return statement is not visible.  */
4739 mconv_ungetc (MConverter *converter, int c)
4741 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Presumably returns EOF when C is not a valid character code (macro
   defined elsewhere).  */
4743 M_CHECK_CHAR (c, EOF);
4745 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Append to the `unread' M-text; mconv_decode () replays that text from
   its end, so the character pushed back last is read first.  */
4746 mtext_cat_char (internal->unread, c);
4753 @brief Write a character via a code converter.
4755 The mconv_putc () function writes character $C to the buffer area
4756 or the stream that is currently bound to code converter
4757 $CONVERTER. The encoder of $CONVERTER is used to encode the
4758 character. The number of bytes actually written is set to the @c
4759 nbytes member of $CONVERTER. The internal status of $CONVERTER
4760 is updated appropriately.
4763 If the operation was successful, mconv_putc () returns $C.
4764 If an error is detected, it returns @c EOF and assigns
4765 an error code to the external variable #merror_code. */
4768 @brief コードコンバータ経由で1文字書く
4770 関数 mconv_putc () は、コードコンバータ $CONVERTER に現在結び付け
4771 られているバッファ領域あるいはストリームに文字 $C を書き出す。文字
4772 のエンコードには $CONVERTER のエンコーダが用いられる。実際に書き出
4773 されたバイト数は、$CONVERTER の メンバー @c nbytes にセットされる。
4774 $CONVERTER の内部状態は必要に応じて更新される。
4777 処理が成功すれば、mconv_putc () は $C を返す。エラーが検出された場合
4778 は @c EOF を返し、外部変数 #merror_code にエラーコードを設定する。 */
4782 @c MERROR_CODING, @c MERROR_IO, @c MERROR_CHAR
4785 mconv_getc (), mconv_ungetc (), mconv_gets () */
/* Write character C through CONVERTER by encoding a one-character
   scratch M-text.
   NOTE(review): this excerpt skips some source lines (gaps in the
   embedded numbering); the statements after the final test are not
   visible.  */
4788 mconv_putc (MConverter *converter, int c)
4790 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Presumably returns EOF when C is not a valid character code (macro
   defined elsewhere).  */
4792 M_CHECK_CHAR (c, EOF);
/* Build a scratch M-text holding only C, then encode its single
   character (range 0 .. 1).  */
4793 mtext_reset (internal->work_mt);
4794 mtext_cat_char (internal->work_mt, c);
/* A negative return from mconv_encode_range () means the encode
   failed.  */
4795 if (mconv_encode_range (converter, internal->work_mt, 0, 1) < 0)
4803 @brief Read a line using a code converter.
4805 The mconv_gets () function reads one line from the buffer area or
4806 the stream that is currently bound to code converter $CONVERTER.
4807 The decoder of $CONVERTER is used for decoding. The decoded
4808 character sequence is appended at the end of M-text $MT. The
4809 final newline character in the original byte sequence is not
4810 appended. The internal status of $CONVERTER is updated
4814 If the operation was successful, mconv_gets () returns the
4815 modified $MT. If it encounters EOF without reading a single
4816 character, it returns $MT without changing it. If an error is
4817 detected, it returns @c NULL and assigns an error code to @c
4821 @brief コードコンバータを使って1行読む
4823 関数 mconv_gets () は、コードコンバータ $CONVERTER に現在結び付け
4824 られているバッファ領域あるいはストリームから1行を読み込む。バイト
4825 列のデコードには $CONVERTER のデコーダが用いられる。デコードされた
4826 文字列は M-text $MT の末尾に追加される。元のバイト列の終端改行文字
4827 は追加されない。$CONVERTER の内部状態は必要に応じて更新される。
4830 処理が成功すれば、mconv_gets () は変更された $MT を返す。もし1文字
4831 も読まずに EOF に当たった場合は、$MT を変更せずにそのまま返す。エ
4832 ラーが検出された場合は @c NULL を返し、#merror_code にエラーコードを
4840 mconv_getc (), mconv_ungetc (), mconv_putc () */
4843 mconv_gets (MConverter *converter, MText *mt)
4847 M_CHECK_READONLY (mt, NULL);
4850 c = mconv_getc (converter);
4851 if (c == EOF || c == '\n')
4853 mtext_cat_char (mt, c);
4855 if (c == EOF && converter->result != MCONVERSION_RESULT_SUCCESS)
4856 /* mconv_getc () sets merror_code */