1 /* coding.c -- code conversion module.
2 Copyright (C) 2003, 2004
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
6 This file is part of the m17n library.
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25 @brief Coding system objects and API for them.
27 The m17n library represents a character encoding scheme (CES) of
28 coded character sets (CCS) as an object called @e coding @e
29 system. Application programs can add original coding systems.
31 To @e encode means converting code-points to character codes and
32 to @e decode means converting character codes back to code-points.
34 Application programs can decode a byte sequence with a specified
35 coding system into an M-text, and inversely, can encode an M-text
36 into a byte sequence. */
40 @brief コード系オブジェクトとそれに関する API
42 m17n ライブラリは、符号化文字集合 (coded character sets; CCS) の文
43 字符号化方式 (character encoding scheme; CES) を @e コード系 と呼
44 ぶオブジェクトで表現する。m17n ライブラリがサポートするCES は、
45 UTF-8, UTF-16, ISO-2022, DIRECT-CHARSET, その他、に大別される。ア
46 プリケーションプログラムが独自にコード系を追加することも可能である。
48 コードポイントから文字コードへの変換を @e エンコード と呼び、文字
49 コードからコードポイントへの変換を @e デコード と呼ぶ。
51 アプリケーションプログラムは、指定されたコード系でバイト列をデコー
52 ドすることで M-text を得ることができる。また逆に、指定されたコード
53 系で M-text をエンコードすることでバイト列を得ることができる。 */
57 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
58 /*** @addtogroup m17nInternal
66 #include <sys/types.h>
71 #include "m17n-misc.h"
74 #include "character.h"
/* Number of slots in the `charsets' array embedded in each coding
   system object; setup_coding_charset also uses it to size its
   temporary copy of that array.  */
81 #define NUM_SUPPORTED_CHARSETS 32
83 /** Structure for coding system object. */
87 /** Name of the coding system. */
90 /** Type of the coding system. */
93 /* Number of supported charsets. */
96 /** Array of supported charsets. */
97 MCharset *charsets[NUM_SUPPORTED_CHARSETS];
99 /** If non-NULL, function to call at the time of creating and
100 resetting a converter. */
101 int (*resetter) (MConverter *converter);
103 int (*decoder) (unsigned char *str, int str_bytes, MText *mt,
104 MConverter *converter);
106 int (*encoder) (MText *mt, int from, int to,
107 unsigned char *str, int str_bytes,
108 MConverter *converter);
110 /** If non-zero, the coding system decodes/encodes ASCII characters as
112 int ascii_compatible;
114 /** Pointer to extra information given when the coding system is
115 defined. The meaning depends on <type>. */
118 /** Pointer to information referred on conversion. The meaning
119 depends on <type>. The value NULL means that the coding system
129 MCodingSystem **codings;
132 static struct MCodingList coding_list;
134 static MPlist *coding_definition_list;
138 Pointer to a structure of a coding system. */
140 コード系を表わすデータ構造へのポインタ */
141 MCodingSystem *coding;
144 Buffer for carryover bytes generated while decoding. */
146 デコード中のキャリィオーバーバイト用バッファ */
147 unsigned char carryover[256];
150 Number of carryover bytes. */
152 キャリィオーバーバイト数 */
156 Beginning of the byte sequence bound to this converter. */
158 このコンバータに結び付けられたバイト列の先頭位置 */
168 Number of bytes already consumed in buf. */
170 buf 内ですでに消費されたバイト数 */
174 Stream bound to this converter. */
176 このコンバータに結び付けられたストリーム */
180 Which of above two is in use. */
182 上記2者のいずれが使われているか */
202 /* Local macros and functions. */
204 /** At first, set SRC_BASE to SRC. Then check if we have already
205 produced AT_MOST chars. If so, set SRC_END to SRC, and jump to
206 source_end. Otherwise, get one more byte C from SRC. In that
207 case, if SRC == SRC_END, jump to the label source_end. */
209 #define ONE_MORE_BASE_BYTE(c) \
212 if (nchars == at_most) \
217 if (src == src_stop) \
219 if (src == src_end) \
221 src_base = src = source; \
222 if (src == src_end) \
224 src_stop = src_end; \
230 /** Get one more byte C from SRC. If SRC == SRC_END, jump to the
233 #define ONE_MORE_BYTE(c) \
235 if (src == src_stop) \
237 if (src == src_end) \
240 if (src == src_end) \
242 src_stop = src_end; \
248 #define REWIND_SRC_TO_BASE() \
250 if (src_base < source || src_base >= src_end) \
251 src_stop = internal->carryover + internal->carryover_bytes; \
256 /** Push back byte C to SRC. */
258 #define UNGET_ONE_BYTE(c) \
264 internal->carryover[0] = c; \
265 internal->carryover_bytes = 1; \
266 src = internal->carryover; \
267 src_stop = src + 1; \
272 /** Store multibyte representation of character C at DST and increment
273 DST to the next of the produced bytes. DST must be a pointer to
274 data area of M-text MT. If the produced bytes are going to exceed
275 DST_END, enlarge the data area of MT. */
277 #define EMIT_CHAR(c) \
279 int bytes = CHAR_BYTES (c); \
282 if (dst + bytes + 1 > dst_end) \
284 len = dst - mt->data; \
285 bytes = mt->allocated + bytes + (src_stop - src); \
286 mtext__enlarge (mt, bytes); \
287 dst = mt->data + len; \
288 dst_end = mt->data + mt->allocated; \
290 dst += CHAR_STRING (c, dst); \
295 /* Check if there is enough room to produce LEN bytes at DST. If not,
296 go to the label insufficient_destination. */
298 #define CHECK_DST(len) \
300 if (dst + (len) > dst_end) \
301 goto insufficient_destination; \
305 /** Take NUM_CHARS characters (NUM_BYTES bytes) already stored at
306 (MT->data + MT->nbytes) into MT, and put charset property on
307 them with CHARSET->name. */
309 #define TAKEIN_CHARS(mt, num_chars, num_bytes, charset) \
311 int chars = (num_chars); \
315 mtext__takein ((mt), chars, (num_bytes)); \
317 mtext_put_prop ((mt), (mt)->nchars - chars, (mt)->nchars, \
318 Mcharset, (void *) ((charset)->name)); \
323 #define SET_SRC(mt, format, from, to) \
325 if (format <= MTEXT_FORMAT_UTF_8) \
327 src = mt->data + POS_CHAR_TO_BYTE (mt, from); \
328 src_end = mt->data + POS_CHAR_TO_BYTE (mt, to); \
330 else if (format <= MTEXT_FORMAT_UTF_16BE) \
333 = mt->data + (sizeof (short)) * POS_CHAR_TO_BYTE (mt, from); \
335 = mt->data + (sizeof (short)) * POS_CHAR_TO_BYTE (mt, to); \
339 src = mt->data + (sizeof (int)) * from; \
340 src_end = mt->data + (sizeof (int)) * to; \
345 #define ONE_MORE_CHAR(c, bytes, format) \
347 if (src == src_end) \
349 if (format <= MTEXT_FORMAT_UTF_8) \
350 c = STRING_CHAR_AND_BYTES (src, bytes); \
351 else if (format <= MTEXT_FORMAT_UTF_16BE) \
353 c = mtext_ref_char (mt, from++); \
354 bytes = (sizeof (short)) * CHAR_UNITS_UTF16 (c); \
358 c = ((unsigned *) (mt->data))[from++]; \
359 bytes = sizeof (int); \
365 encode_unsupporeted_char (int c, unsigned char *dst, unsigned char *dst_end,
371 len = c < 0x10000 ? 8 : 10;
372 if (dst + len > dst_end)
375 format = (c < 0xD800 ? "<U+%04X>"
376 : c < 0xE000 ? "<M+%04X>"
377 : c < 0x10000 ? "<U+%04X>"
378 : c < 0x110000 ? "<U+%06X>"
380 sprintf ((char *) dst, format, c);
386 /** Finish decoding of bytes at SOURCE (ending at SRC_END) into NCHARS
387 characters by CONVERTER into M-text MT. SRC is a pointer to the
388 not-yet processed bytes. ERROR is 1 iff an invalid byte was
392 finish_decoding (MText *mt, MConverter *converter, int nchars,
393 unsigned char *source, unsigned char *src_end,
397 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
400 internal->carryover_bytes = 0;
402 || (converter->last_block
403 && ! converter->lenient))
404 converter->result = MCONVERSION_RESULT_INVALID_BYTE;
405 else if (! converter->last_block)
407 unsigned char *dst = internal->carryover;
409 if (src < source || src > src_end)
411 dst += internal->carryover_bytes;
414 while (src < src_end)
416 internal->carryover_bytes = dst - internal->carryover;
417 converter->result = MCONVERSION_RESULT_INSUFFICIENT_SRC;
421 unsigned char *dst = mt->data + mt->nbytes;
422 unsigned char *dst_end = mt->data + mt->allocated;
423 unsigned char *src_stop = src_end;
425 int last_nchars = nchars;
427 if (src < source || src > src_end)
428 src_stop = internal->carryover + internal->carryover_bytes;
431 if (converter->at_most && nchars == converter->at_most)
445 TAKEIN_CHARS (mt, nchars - last_nchars, dst - (mt->data + mt->nbytes),
447 internal->carryover_bytes = 0;
450 converter->nchars += nchars;
451 converter->nbytes += ((src < source || src > src_end) ? 0 : src - source);
452 return (converter->result == MCONVERSION_RESULT_INVALID_BYTE ? -1 : 0);
457 /* Stuff for coding-systems of type MCODING_TYPE_CHARSET. */
460 setup_coding_charset (MCodingSystem *coding)
462 int ncharsets = coding->ncharsets;
463 unsigned *code_charset_table;
467 /* At first, reorder charset list by dimensions (a charset of
468 smaller dimension comes first). As the number of charsets is
469 usually very small (at most 32), we do a simple sort. */
474 MTABLE_ALLOCA (charsets, NUM_SUPPORTED_CHARSETS, MERROR_CODING);
475 memcpy (charsets, coding->charsets,
476 sizeof (MCharset *) * NUM_SUPPORTED_CHARSETS);
477 for (i = 0; i < 4; i++)
478 for (j = 0; j < ncharsets; j++)
479 if (charsets[j]->dimension == i)
480 coding->charsets[idx++] = charsets[j];
483 MTABLE_CALLOC (code_charset_table, 256, MERROR_CODING);
486 int dim = coding->charsets[ncharsets]->dimension;
487 int from = coding->charsets[ncharsets]->code_range[(dim - 1) * 4];
488 int to = coding->charsets[ncharsets]->code_range[(dim - 1) * 4 + 1];
490 if (coding->charsets[ncharsets]->ascii_compatible)
491 coding->ascii_compatible = 1;
493 code_charset_table[from++] |= 1 << ncharsets;
496 coding->extra_spec = (void *) code_charset_table;
501 reset_coding_charset (MConverter *converter)
503 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
504 MCodingSystem *coding = internal->coding;
507 && setup_coding_charset (coding) < 0)
514 decode_coding_charset (unsigned char *source, int src_bytes, MText *mt,
515 MConverter *converter)
517 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
518 MCodingSystem *coding = internal->coding;
519 unsigned char *src = internal->carryover;
520 unsigned char *src_stop = src + internal->carryover_bytes;
521 unsigned char *src_end = source + src_bytes;
522 unsigned char *src_base;
523 unsigned char *dst = mt->data + mt->nbytes;
524 unsigned char *dst_end = mt->data + mt->allocated;
527 int at_most = converter->at_most > 0 ? converter->at_most : -1;
529 unsigned *code_charset_table = (unsigned *) coding->extra_spec;
530 MCharset **charsets = coding->charsets;
531 MCharset *charset = mcharset__ascii;
536 MCharset *this_charset = NULL;
540 ONE_MORE_BASE_BYTE (c);
541 mask = code_charset_table[c];
551 while (! (mask & 1)) mask >>= 1, idx++;
552 this_charset = charsets[idx];
553 dim = this_charset->dimension;
557 code = (code << 8) | c;
560 c = DECODE_CHAR (this_charset, code);
567 if (! converter->lenient)
569 REWIND_SRC_TO_BASE ();
571 this_charset = mcharset__binary;
574 if (this_charset != mcharset__ascii
575 && this_charset != charset)
577 TAKEIN_CHARS (mt, nchars - last_nchars,
578 dst - (mt->data + mt->nbytes), charset);
579 charset = this_charset;
580 last_nchars = nchars;
584 /* We reach here because of an invalid byte. */
588 TAKEIN_CHARS (mt, nchars - last_nchars,
589 dst - (mt->data + mt->nbytes), charset);
590 return finish_decoding (mt, converter, nchars,
591 source, src_end, src_base, error);
595 encode_coding_charset (MText *mt, int from, int to,
596 unsigned char *destination, int dst_bytes,
597 MConverter *converter)
599 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
600 MCodingSystem *coding = internal->coding;
601 unsigned char *src, *src_end;
602 unsigned char *dst = destination;
603 unsigned char *dst_end = dst + dst_bytes;
605 int ncharsets = coding->ncharsets;
606 MCharset **charsets = coding->charsets;
607 int ascii_compatible = coding->ascii_compatible;
608 enum MTextFormat format = mt->format;
610 SET_SRC (mt, format, from, to);
615 ONE_MORE_CHAR (c, bytes, format);
617 if (c < 0x80 && ascii_compatible)
625 MCharset *charset = NULL;
630 charset = charsets[i];
631 code = ENCODE_CHAR (charset, c);
632 if (code != MCHAR_INVALID_CODE)
634 if (++i == ncharsets)
635 goto unsupported_char;
638 CHECK_DST (charset->dimension);
639 if (charset->dimension == 1)
643 else if (charset->dimension == 2)
646 *dst++ = code & 0xFF;
648 else if (charset->dimension == 3)
651 *dst++ = (code >> 8) & 0xFF;
652 *dst++ = code & 0xFF;
657 *dst++ = (code >> 16) & 0xFF;
658 *dst++ = (code >> 8) & 0xFF;
659 *dst++ = code & 0xFF;
670 if (! converter->lenient)
672 len = encode_unsupporeted_char (c, dst, dst_end, mt, from + nchars);
674 goto insufficient_destination;
680 /* We reach here because of an unsupported char. */
681 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
684 insufficient_destination:
685 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
688 converter->nchars += nchars;
689 converter->nbytes += dst - destination;
690 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
694 /* Stuff for coding-systems of type MCODING_TYPE_UTF (8). */
/** Return the charset that the UTF-8 style byte sequence starting at P
    should be attributed to.

    The head byte P[0] selects the sequence length by its leading bits:

    - high bit clear (one byte): mcharset__unicode.
    - two or three bytes: mcharset__unicode.
    - four bytes: the top five bits of the code point (three from P[0],
      two from P[1]) are reassembled; a value <= 0x10 means the code
      point is below 0x110000, giving mcharset__unicode, otherwise
      mcharset__m17n.
    - five or six bytes: mcharset__m17n.
    - anything else: mcharset__binary.

    Before each further byte is accepted as part of the sequence it is
    tested with CHAR_HEAD_P; if that byte is itself a head byte the
    sequence is broken and the result is mcharset__binary.
    (CHAR_HEAD_P is defined elsewhere; presumably it is true for any
    byte that is not a 10xxxxxx trailing byte -- confirm.)

    The two bit groups in the four-byte case occupy disjoint positions,
    so they must be merged with `|'.  Merging them with `&' always
    yields 0 and would classify every four-byte sequence as Unicode.

    P is evaluated several times, so it must have no side effects.  When
    the high bit of P[0] is set, at least P[1] is read.  */

#define UTF8_CHARSET(p)						\
  (! ((p)[0] & 0x80) ? (mcharset__unicode)			\
   : CHAR_HEAD_P ((p) + 1) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x20) ? (mcharset__unicode)			\
   : CHAR_HEAD_P ((p) + 2) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x10) ? (mcharset__unicode)			\
   : CHAR_HEAD_P ((p) + 3) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x08) ? ((((((p)[0] & 0x07) << 2)		\
			    | (((p)[1] & 0x30) >> 4)) <= 0x10)	\
			  ? (mcharset__unicode)			\
			  : (mcharset__m17n))			\
   : CHAR_HEAD_P ((p) + 4) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x04) ? (mcharset__m17n)			\
   : CHAR_HEAD_P ((p) + 5) ? (mcharset__binary)			\
   : ! ((p)[0] & 0x02) ? (mcharset__m17n)			\
   : (mcharset__binary))
715 decode_coding_utf_8 (unsigned char *source, int src_bytes, MText *mt,
716 MConverter *converter)
718 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
719 MCodingSystem *coding = internal->coding;
720 unsigned char *src = internal->carryover;
721 unsigned char *src_stop = src + internal->carryover_bytes;
722 unsigned char *src_end = source + src_bytes;
723 unsigned char *src_base;
724 unsigned char *dst = mt->data + mt->nbytes;
725 unsigned char *dst_end = mt->data + mt->allocated;
728 int at_most = converter->at_most > 0 ? converter->at_most : -1;
730 int full = converter->lenient || (coding->charsets[0] == mcharset__m17n);
731 MCharset *charset = NULL;
736 MCharset *this_charset = NULL;
738 ONE_MORE_BASE_BYTE (c);
742 else if (!(c & 0x40))
744 else if (!(c & 0x20))
745 bytes = 2, c &= 0x1F;
746 else if (!(c & 0x10))
747 bytes = 3, c &= 0x0F;
748 else if (!(c & 0x08))
749 bytes = 4, c &= 0x07;
750 else if (!(c & 0x04))
751 bytes = 5, c &= 0x03;
752 else if (!(c & 0x02))
753 bytes = 6, c &= 0x01;
760 if ((c1 & 0xC0) != 0x80)
762 c = (c << 6) | (c1 & 0x3F);
766 || c < 0xD800 || (c >= 0xE000 && c < 0x110000))
770 if (! converter->lenient)
772 REWIND_SRC_TO_BASE ();
774 this_charset = mcharset__binary;
777 if (this_charset != charset)
779 TAKEIN_CHARS (mt, nchars - last_nchars,
780 dst - (mt->data + mt->nbytes), charset);
781 charset = this_charset;
782 last_nchars = nchars;
786 /* We reach here because of an invalid byte. */
790 TAKEIN_CHARS (mt, nchars - last_nchars,
791 dst - (mt->data + mt->nbytes), charset);
792 return finish_decoding (mt, converter, nchars,
793 source, src_end, src_base, error);
797 encode_coding_utf_8 (MText *mt, int from, int to,
798 unsigned char *destination, int dst_bytes,
799 MConverter *converter)
801 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
802 MCodingSystem *coding = internal->coding;
803 unsigned char *src, *src_end;
804 unsigned char *dst = destination;
805 unsigned char *dst_end = dst + dst_bytes;
807 enum MTextFormat format = mt->format;
809 SET_SRC (mt, format, from, to);
811 if (format <= MTEXT_FORMAT_UTF_8
812 && (converter->lenient
813 || coding->charsets[0] == mcharset__m17n))
815 if (dst_bytes < src_end - src)
817 int byte_pos = (src + dst_bytes) - mt->data;
819 to = POS_BYTE_TO_CHAR (mt, byte_pos);
820 byte_pos = POS_CHAR_TO_BYTE (mt, to);
821 src_end = mt->data + byte_pos;
822 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
824 memcpy (destination, src, src_end - src);
826 dst += src_end - src;
834 ONE_MORE_CHAR (c, bytes, format);
836 if ((c >= 0xD800 && c < 0xE000) || c >= 0x110000)
839 dst += CHAR_STRING (c, dst);
843 /* We reach here because of an unsupported char. */
844 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
847 insufficient_destination:
848 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
851 converter->nchars += nchars;
852 converter->nbytes += dst - destination;
853 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
857 /* Stuff for coding-systems of type MCODING_TYPE_UTF (16 & 32). */
878 enum utf_endian endian;
882 setup_coding_utf (MCodingSystem *coding)
884 MCodingInfoUTF *info = (MCodingInfoUTF *) (coding->extra_info);
885 MCodingInfoUTF *spec;
887 if (info->code_unit_bits == 8)
888 coding->ascii_compatible = 1;
889 else if (info->code_unit_bits == 16
890 || info->code_unit_bits == 32)
892 if (info->bom < 0 || info->bom > 2
893 || info->endian < 0 || info->endian > 1)
894 MERROR (MERROR_CODING, -1);
899 MSTRUCT_CALLOC (spec, MERROR_CODING);
901 coding->extra_spec = (void *) (spec);
906 reset_coding_utf (MConverter *converter)
908 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
909 MCodingSystem *coding = internal->coding;
910 struct utf_status *status = (struct utf_status *) &(converter->status);
913 && setup_coding_utf (coding) < 0)
917 status->surrogate = 0;
918 status->bom = ((MCodingInfoUTF *) (coding->extra_spec))->bom;
919 status->endian = ((MCodingInfoUTF *) (coding->extra_spec))->endian;
924 decode_coding_utf_16 (unsigned char *source, int src_bytes, MText *mt,
925 MConverter *converter)
927 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
928 unsigned char *src = internal->carryover;
929 unsigned char *src_stop = src + internal->carryover_bytes;
930 unsigned char *src_end = source + src_bytes;
931 unsigned char *src_base;
932 unsigned char *dst = mt->data + mt->nbytes;
933 unsigned char *dst_end = mt->data + mt->allocated;
936 int at_most = converter->at_most > 0 ? converter->at_most : -1;
937 struct utf_status *status = (struct utf_status *) &(converter->status);
938 unsigned char b1, b2;
939 MCharset *charset = NULL;
942 if (status->bom != UTF_BOM_NO)
946 ONE_MORE_BASE_BYTE (b1);
950 status->endian = UTF_BIG_ENDIAN;
951 else if (c == 0xFFFE)
952 status->endian = UTF_LITTLE_ENDIAN;
953 else if (status->bom == UTF_BOM_MAYBE
954 || converter->lenient)
956 status->endian = UTF_BIG_ENDIAN;
957 REWIND_SRC_TO_BASE ();
964 status->bom = UTF_BOM_NO;
970 MCharset *this_charset = NULL;
972 ONE_MORE_BASE_BYTE (b1);
974 if (status->endian == UTF_BIG_ENDIAN)
975 c = ((b1 << 8) | b2);
977 c = ((b2 << 8) | b1);
978 if (c < 0xD800 || c >= 0xE000)
984 if (status->endian == UTF_BIG_ENDIAN)
985 c1 = ((b1 << 8) | b2);
987 c1 = ((b2 << 8) | b1);
988 if (c1 < 0xDC00 || c1 >= 0xE000)
990 c = 0x10000 + ((c - 0xD800) << 10) + (c1 - 0xDC00);
995 if (! converter->lenient)
997 REWIND_SRC_TO_BASE ();
1000 if (status->endian == UTF_BIG_ENDIAN)
1001 c = ((b1 << 8) | b2);
1003 c = ((b2 << 8) | b1);
1004 this_charset = mcharset__binary;
1007 if (this_charset != charset)
1009 TAKEIN_CHARS (mt, nchars - last_nchars,
1010 dst - (mt->data + mt->nbytes), charset);
1011 charset = this_charset;
1012 last_nchars = nchars;
1016 /* We reach here because of an invalid byte. */
1020 TAKEIN_CHARS (mt, nchars - last_nchars,
1021 dst - (mt->data + mt->nbytes), charset);
1022 return finish_decoding (mt, converter, nchars,
1023 source, src_end, src_base, error);
1028 decode_coding_utf_32 (unsigned char *source, int src_bytes, MText *mt,
1029 MConverter *converter)
1031 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1032 unsigned char *src = internal->carryover;
1033 unsigned char *src_stop = src + internal->carryover_bytes;
1034 unsigned char *src_end = source + src_bytes;
1035 unsigned char *src_base;
1036 unsigned char *dst = mt->data + mt->nbytes;
1037 unsigned char *dst_end = mt->data + mt->allocated;
1039 int last_nchars = 0;
1040 int at_most = converter->at_most > 0 ? converter->at_most : -1;
1041 struct utf_status *status = (struct utf_status *) &(converter->status);
1042 unsigned char b1, b2, b3, b4;
1043 MCharset *charset = NULL;
1046 if (status->bom != UTF_BOM_NO)
1050 ONE_MORE_BASE_BYTE (b1);
1054 c = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
1055 if (c == 0x0000FEFF)
1056 status->endian = UTF_BIG_ENDIAN;
1057 else if (c == 0xFFFE0000)
1058 status->endian = UTF_LITTLE_ENDIAN;
1059 else if (status->bom == UTF_BOM_MAYBE
1060 || converter->lenient)
1062 status->endian = UTF_BIG_ENDIAN;
1063 REWIND_SRC_TO_BASE ();
1070 status->bom = UTF_BOM_NO;
1076 MCharset *this_charset = NULL;
1078 ONE_MORE_BASE_BYTE (b1);
1082 if (status->endian == UTF_BIG_ENDIAN)
1083 c = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
1085 c = (b4 << 24) | (b3 << 16) | (b2 << 8) | b1;
1086 if (c < 0xD800 || (c >= 0xE000 && c < 0x110000))
1089 if (! converter->lenient)
1091 REWIND_SRC_TO_BASE ();
1093 this_charset = mcharset__binary;
1096 if (this_charset != charset)
1098 TAKEIN_CHARS (mt, nchars - last_nchars,
1099 dst - (mt->data + mt->nbytes), charset);
1100 charset = this_charset;
1101 last_nchars = nchars;
1105 /* We reach here because of an invalid byte. */
1109 TAKEIN_CHARS (mt, nchars - last_nchars,
1110 dst - (mt->data + mt->nbytes), charset);
1111 return finish_decoding (mt, converter, nchars,
1112 source, src_end, src_base, error);
1117 encode_coding_utf_16 (MText *mt, int from, int to,
1118 unsigned char *destination, int dst_bytes,
1119 MConverter *converter)
1121 unsigned char *src, *src_end;
1122 unsigned char *dst = destination;
1123 unsigned char *dst_end = dst + dst_bytes;
1125 struct utf_status *status = (struct utf_status *) &(converter->status);
1126 int big_endian = status->endian == UTF_BIG_ENDIAN;
1127 enum MTextFormat format = mt->format;
1129 SET_SRC (mt, format, from, to);
1131 if (status->bom != UTF_BOM_NO)
1135 *dst++ = 0xFE, *dst++ = 0xFF;
1137 *dst++ = 0xFF, *dst++ = 0xFE;
1138 status->bom = UTF_BOM_NO;
1145 ONE_MORE_CHAR (c, bytes, format);
1147 if (c < 0xD800 || (c >= 0xE000 && c < 0x10000))
1151 *dst++ = c >> 8, *dst++ = c & 0xFF;
1153 *dst++ = c & 0xFF, *dst++ = c >> 8;
1155 else if (c >= 0x10000 && c < 0x110000)
1161 c1 = (c >> 10) + 0xD800;
1162 c2 = (c & 0x3FF) + 0xDC00;
1164 *dst++ = c1 >> 8, *dst++ = c1 & 0xFF,
1165 *dst++ = c2 >> 8, *dst++ = c2 & 0xFF;
1167 *dst++ = c1 & 0xFF, *dst++ = c1 >> 8,
1168 *dst++ = c2 & 0xFF, *dst++ = c2 >> 8;
1172 unsigned char buf[11];
1175 if (! converter->lenient)
1177 len = encode_unsupporeted_char (c, buf, buf + (dst_end - dst),
1180 goto insufficient_destination;
1182 for (i = 0; i < len; i++)
1183 *dst++ = 0, *dst++ = buf[i];
1185 for (i = 0; i < len; i++)
1186 *dst++ = buf[i], *dst++ = 0;
1191 /* We reach here because of an unsupported char. */
1192 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
1195 insufficient_destination:
1196 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
1199 converter->nchars += nchars;
1200 converter->nbytes += dst - destination;
1201 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
1205 encode_coding_utf_32 (MText *mt, int from, int to,
1206 unsigned char *destination, int dst_bytes,
1207 MConverter *converter)
1209 unsigned char *src, *src_end;
1210 unsigned char *dst = destination;
1211 unsigned char *dst_end = dst + dst_bytes;
1213 struct utf_status *status = (struct utf_status *) &(converter->status);
1214 int big_endian = status->endian == UTF_BIG_ENDIAN;
1215 enum MTextFormat format = mt->format;
1217 SET_SRC (mt, format, from, to);
1219 if (status->bom != UTF_BOM_NO)
1223 *dst++ = 0x00, *dst++ = 0x00, *dst++ = 0xFE, *dst++ = 0xFF;
1225 *dst++ = 0xFF, *dst++ = 0xFE, *dst++ = 0x00, *dst++ = 0x00;
1226 status->bom = UTF_BOM_NO;
1233 ONE_MORE_CHAR (c, bytes, format);
1235 if (c < 0xD800 || (c >= 0xE000 && c < 0x110000))
1239 *dst++ = 0x00, *dst++ = c >> 16,
1240 *dst++ = (c >> 8) & 0xFF, *dst++ = c & 0xFF;
1242 *dst++ = c & 0xFF, *dst++ = (c >> 8) & 0xFF,
1243 *dst++ = c >> 16, *dst++ = 0x00;
1247 unsigned char buf[11];
1250 if (! converter->lenient)
1252 len = encode_unsupporeted_char (c, buf, buf + (dst_end - dst),
1255 goto insufficient_destination;
1257 for (i = 0; i < len; i++)
1258 *dst++ = 0, *dst++ = buf[i];
1260 for (i = 0; i < len; i++)
1261 *dst++ = buf[i], *dst++ = 0;
1266 /* We reach here because of an unsupported char. */
1267 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
1270 insufficient_destination:
1271 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
1274 converter->nchars += nchars;
1275 converter->nbytes += dst - destination;
1276 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
1280 /* Stuff for coding-systems of type MCODING_TYPE_ISO_2022. */
/* Byte values of the control codes that the ISO 2022 coding-system
   code refers to by name.  Codes below 0x80 are 7-bit controls; the
   two at 0x8E/0x8F are the 8-bit forms of the single shifts.  */
1282 #define ISO_CODE_STX 0x02 /* start text */
1283 #define ISO_CODE_SO 0x0E /* shift-out */
1284 #define ISO_CODE_SI 0x0F /* shift-in */
1285 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
1286 #define ISO_CODE_ESC 0x1B /* escape */
1287 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
1288 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
1290 /** Structure pointed by MCodingSystem.extra_spec. */
1292 struct iso_2022_spec
1296 /** Initial graphic registers (0..3) invoked to each graphic
1297 plane left and right. */
1298 int initial_invocation[2];
1300 /** Initially designated charsets for each graphic register. */
1301 MCharset *initial_designation[4];
1309 struct iso_2022_status
1312 MCharset *designation[4];
1313 unsigned single_shifting : 1;
1316 unsigned utf8_shifting : 1;
1317 MCharset *non_standard_charset;
1318 int non_standard_charset_bytes;
1319 int non_standard_encoding;
/* Classes into which a byte value is sorted for ISO 2022 processing.
   The declarator at the end also defines the object
   iso_2022_code_class[256], an array of these classes; it has 256
   entries, presumably one per possible byte value (the code that fills
   it is not visible here -- confirm).  */
1322 enum iso_2022_code_class {
1323 ISO_control_0, /* Control codes in the range
1324 0x00..0x1F and 0x7F, except for the
1325 following 4 codes. */
1326 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
1327 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
1328 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
1329 ISO_escape, /* ISO_CODE_ESC (0x1B) */
1330 ISO_control_1, /* Control codes in the range
1331 0x80..0x9F, except for the
1332 following 3 codes. */
1333 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
1334 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
1335 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
1336 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
1337 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
1338 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
1339 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
1340 } iso_2022_code_class[256];
/* Union of the four designation-policy flag bits.
   setup_coding_iso_2022 ANDs the coding system's flags with this mask
   to extract the designation policy; a zero result means no
   designation policy was requested.  */
1343 #define MCODING_ISO_DESIGNATION_MASK \
1344 (MCODING_ISO_DESIGNATION_G0 \
1345 | MCODING_ISO_DESIGNATION_G1 \
1346 | MCODING_ISO_DESIGNATION_CTEXT \
1347 | MCODING_ISO_DESIGNATION_CTEXT_EXT)
1350 setup_coding_iso_2022 (MCodingSystem *coding)
1352 MCodingInfoISO2022 *info = (MCodingInfoISO2022 *) (coding->extra_info);
1353 int ncharsets = coding->ncharsets;
1354 struct iso_2022_spec *spec;
1355 int designation_policy = info->flags & MCODING_ISO_DESIGNATION_MASK;
1358 coding->ascii_compatible = 0;
1360 MSTRUCT_CALLOC (spec, MERROR_CODING);
1362 spec->flags = info->flags;
1363 spec->initial_invocation[0] = info->initial_invocation[0];
1364 spec->initial_invocation[1] = info->initial_invocation[1];
1365 for (i = 0; i < 4; i++)
1366 spec->initial_designation[i] = NULL;
1367 if (designation_policy)
1369 spec->n_designations = ncharsets;
1370 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1371 spec->n_designations += mcharset__iso_2022_table.used;
1372 MTABLE_CALLOC (spec->designations, spec->n_designations, MERROR_CODING);
1373 for (i = 0; i < spec->n_designations; i++)
1374 spec->designations[i] = -1;
1378 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1379 MERROR (MERROR_CODING, -1);
1380 spec->designations = NULL;
1383 for (i = 0; i < ncharsets; i++)
1385 int reg = info->designations[i];
1388 && coding->charsets[i]->final_byte > 0
1389 && (reg < -4 || reg > 3))
1390 MERROR (MERROR_CODING, -1);
1393 if (spec->initial_designation[reg])
1394 MERROR (MERROR_CODING, -1);
1395 spec->initial_designation[reg] = coding->charsets[i];
1399 if (! designation_policy
1400 && ! (spec->flags & MCODING_ISO_EUC_TW_SHIFT))
1401 MERROR (MERROR_CODING, -1);
1405 if (designation_policy)
1406 spec->designations[i] = reg;
1407 if (coding->charsets[i] == mcharset__ascii)
1408 coding->ascii_compatible = 1;
1411 if (coding->ascii_compatible
1412 && (spec->flags & (MCODING_ISO_DESIGNATION_G0
1413 | MCODING_ISO_DESIGNATION_CTEXT
1414 | MCODING_ISO_DESIGNATION_CTEXT_EXT
1415 | MCODING_ISO_LOCKING_SHIFT)))
1416 coding->ascii_compatible = 0;
1418 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
1419 for (i = 0; i < mcharset__iso_2022_table.used; i++)
1421 MCharset *charset = mcharset__iso_2022_table.charsets[i];
1423 spec->designations[ncharsets + i]
1424 = ((designation_policy == MCODING_ISO_DESIGNATION_CTEXT
1425 || designation_policy == MCODING_ISO_DESIGNATION_CTEXT_EXT)
1426 ? (charset->code_range[0] == 32
1427 || charset->code_range[1] == 255)
1428 : designation_policy == MCODING_ISO_DESIGNATION_G1);
1431 spec->use_esc = ((spec->flags & MCODING_ISO_DESIGNATION_MASK)
1432 || ((spec->flags & MCODING_ISO_LOCKING_SHIFT)
1433 && (spec->initial_designation[2]
1434 || spec->initial_designation[3]))
1435 || (! (spec->flags & MCODING_ISO_EIGHT_BIT)
1436 && (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1437 || (spec->flags & MCODING_ISO_ISO6429));
1439 coding->extra_spec = (void *) spec;
1445 reset_coding_iso_2022 (MConverter *converter)
1447 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1448 MCodingSystem *coding = internal->coding;
1449 struct iso_2022_status *status
1450 = (struct iso_2022_status *) &(converter->status);
1451 struct iso_2022_spec *spec;
1455 && setup_coding_iso_2022 (coding) < 0)
1459 spec = (struct iso_2022_spec *) coding->extra_spec;
1460 status->invocation[0] = spec->initial_invocation[0];
1461 status->invocation[1] = spec->initial_invocation[1];
1462 for (i = 0; i < 4; i++)
1463 status->designation[i] = spec->initial_designation[i];
1464 status->single_shifting = 0;
1471 #define ISO2022_DECODE_DESIGNATION(reg, dim, chars, final, rev) \
1473 MCharset *charset; \
1475 if ((final) < '0' || (final) >= 128) \
1476 goto invalid_byte; \
1479 charset = MCHARSET_ISO_2022 ((dim), (chars), (final)); \
1480 if (! (spec->flags & MCODING_ISO_FULL_SUPPORT)) \
1484 for (i = 0; i < coding->ncharsets; i++) \
1485 if (charset == coding->charsets[i]) \
1487 if (i == coding->ncharsets) \
1488 goto invalid_byte; \
1495 for (i = 0; i < mcharset__iso_2022_table.used; i++) \
1497 charset = mcharset__iso_2022_table.charsets[i]; \
1498 if (charset->revision == (rev) \
1499 && charset->dimension == (dim) \
1500 && charset->final_byte == (final) \
1501 && (charset->code_range[1] == (chars) \
1502 || ((chars) == 96 && charset->code_range[1] == 255))) \
1505 if (i == mcharset__iso_2022_table.used) \
1506 goto invalid_byte; \
1508 status->designation[reg] = charset; \
/* Map CHARSET_NAME, a charset name read from a compound-text extended
   segment, to a charset: "koi8-r" selects the charset named "koi8-r"
   and "big5-0" selects the charset named "big5".
   NOTE(review): the handling of any other name and the return
   statement are on lines missing from this listing.  */
1513 find_ctext_non_standard_charset (char *charset_name)
1517 if (! strcmp (charset_name, "koi8-r"))
1518 charset = MCHARSET (msymbol ("koi8-r"));
1519 else if (! strcmp (charset_name, "big5-0"))
1520 charset = MCHARSET (msymbol ("big5"));
/* Decode SRC_BYTES bytes at SOURCE as ISO-2022 and append the decoded
   characters to MT, keeping the designation/invocation state in
   CONVERTER->status.  Bytes carried over from an earlier call
   (internal->carryover) are read first.  Each run of characters that
   belong to one charset is handed to TAKEIN_CHARS together with that
   charset, and the value returned is that of finish_decoding ().
   Invalid input is either rewound (REWIND_SRC_TO_BASE) and treated as
   mcharset__binary, or reported, depending on CONVERTER->lenient.
   NOTE(review): many short lines (braces, `break', `goto', `else',
   local declarations) are missing from this listing; the comments
   below describe only what the visible lines establish.  */
1527 decode_coding_iso_2022 (unsigned char *source, int src_bytes, MText *mt,
1528 MConverter *converter)
1530 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
1531 MCodingSystem *coding = internal->coding;
1532 unsigned char *src = internal->carryover;
1533 unsigned char *src_stop = src + internal->carryover_bytes;
1534 unsigned char *src_end = source + src_bytes;
1535 unsigned char *src_base;
1536 unsigned char *dst = mt->data + mt->nbytes;
1537 unsigned char *dst_end = mt->data + mt->allocated;
1539 int last_nchars = 0;
1540 int at_most = converter->at_most > 0 ? converter->at_most : -1;
1541 struct iso_2022_spec *spec = (struct iso_2022_spec *) coding->extra_spec;
1542 struct iso_2022_status *status
1543 = (struct iso_2022_status *) &(converter->status);
1544 MCharset *charset0, *charset1, *charset;
1546 MCharset *cns_charsets[15];
/* Charsets selected by invocation[0] and invocation[1]; NULL when the
   corresponding invocation is negative.  */
1548 charset0 = (status->invocation[0] >= 0
1549 ? status->designation[status->invocation[0]] : NULL);
1550 charset1 = (status->invocation[1] >= 0
1551 ? status->designation[status->invocation[1]] : NULL);
1552 charset = mcharset__ascii;
/* For EUC-TW style shifting, index the two-byte charsets of the coding
   system whose code_range[1] is 126 by final byte ('G'..'M' map to
   slots 0..6); slot 14 is also filled, under a condition that is not
   visible here.  */
1554 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
1558 memset (cns_charsets, 0, sizeof (cns_charsets));
1559 for (i = 0; i < coding->ncharsets; i++)
1560 if (coding->charsets[i]->dimension == 2
1561 && coding->charsets[i]->code_range[1] == 126)
1563 int final = coding->charsets[i]->final_byte;
1565 if (final >= 'G' && final <= 'M')
1566 cns_charsets[final - 'G'] = coding->charsets[i];
1568 cns_charsets[14] = coding->charsets[i];
1574 MCharset *this_charset = NULL;
1577 ONE_MORE_BASE_BYTE (c1);
/* Inside an `ESC % G' segment the bytes are plain UTF-8.  */
1579 if (status->utf8_shifting)
1582 int bytes = CHAR_BYTES_BY_HEAD (c1);
1586 for (i = 1; i < bytes; i++)
1591 this_charset = UTF8_CHARSET (buf);
1592 c1 = STRING_CHAR_UTF8 (buf);
/* Inside an extended segment, each character occupies
   non_standard_charset_bytes bytes of the non-standard charset.  */
1596 if (status->non_standard_encoding > 0)
1600 this_charset = status->non_standard_charset;
1601 for (i = 1; i < status->non_standard_charset_bytes; i++)
1604 c1 = (c1 << 8) | c2;
1606 c1 = DECODE_CHAR (this_charset, c1);
1610 switch (iso_2022_code_class[c1])
1612 case ISO_graphic_plane_0:
1613 this_charset = charset0;
1616 case ISO_0x20_or_0x7F:
1618 || (charset0->code_range[0] != 32
1619 && charset0->code_range[1] != 255))
1620 /* This is SPACE or DEL. */
1621 this_charset = mcharset__ascii;
1623 /* This is a graphic character of plane 0. */
1624 this_charset = charset0;
1627 case ISO_graphic_plane_1:
1630 this_charset = charset1;
1633 case ISO_0xA0_or_0xFF:
1635 || charset1->code_range[0] == 33
1636 || ! (spec->flags & MCODING_ISO_EIGHT_BIT))
1638 /* This is a graphic character of plane 1. */
1641 this_charset = charset1;
1645 this_charset = mcharset__ascii;
1652 if ((spec->flags & MCODING_ISO_LOCKING_SHIFT)
1653 && status->designation[1])
1655 status->invocation[0] = 1;
1656 charset0 = status->designation[1];
1659 this_charset = mcharset__ascii;
1663 if (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1665 status->invocation[0] = 0;
1666 charset0 = status->designation[0];
1669 this_charset = mcharset__ascii;
1672 case ISO_single_shift_2_7:
1673 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT_7))
1675 this_charset = mcharset__ascii;
1679 goto label_escape_sequence;
1681 case ISO_single_shift_2:
1682 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
1685 if (c1 < 0xA1 || (c1 > 0xA7 && c1 < 0xAF) || c1 > 0xAF
1686 || ! cns_charsets[c1 - 0xA1])
1688 status->designation[2] = cns_charsets[c1 - 0xA1];
1690 else if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1692 /* SS2 is handled as an escape sequence of ESC 'N' */
1694 goto label_escape_sequence;
1696 case ISO_single_shift_3:
1697 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT))
1699 /* SS3 is handled as an escape sequence of ESC 'O' */
1701 goto label_escape_sequence;
1703 case ISO_control_sequence_introducer:
1704 /* CSI is handled as an escape sequence of ESC '[' ... */
1706 goto label_escape_sequence;
1709 if (! spec->use_esc)
1711 this_charset = mcharset__ascii;
1715 label_escape_sequence:
1716 /* Escape sequences handled here are invocation,
1717 designation, and direction specification. */
1720 case '&': /* revision of following character set */
1721 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1722 goto unused_escape_sequence;
1724 if (c1 < '@' || c1 > '~')
1727 if (c1 != ISO_CODE_ESC)
1730 goto label_escape_sequence;
1732 case '$': /* designation of 2-byte character set */
1733 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1734 goto unused_escape_sequence;
1736 if (c1 >= '@' && c1 <= 'B')
1737 { /* designation of JISX0208.1978, GB2312.1980, or
a similar charset whose final byte '@'..'B' directly follows '$'.
NOTE(review): the tail of this comment was missing, leaving it
unterminated; it is closed here.  */
1739 ISO2022_DECODE_DESIGNATION (0, 2, 94, c1, -1);
1741 else if (c1 >= 0x28 && c1 <= 0x2B)
1742 { /* designation of (dimension 2, chars 94) character set */
1744 ISO2022_DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2, -1);
1746 else if (c1 >= 0x2C && c1 <= 0x2F)
1747 { /* designation of (dimension 2, chars 96) character set */
1749 ISO2022_DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2, -1);
1753 /* We must update these variables now. */
1754 charset0 = status->designation[status->invocation[0]];
1755 charset1 = status->designation[status->invocation[1]];
1758 case 'n': /* invocation of locking-shift-2 */
1759 if (! (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1760 || ! status->designation[2])
1762 status->invocation[0] = 2;
1763 charset0 = status->designation[2];
1766 case 'o': /* invocation of locking-shift-3 */
1767 if (! (spec->flags & MCODING_ISO_LOCKING_SHIFT)
1768 || ! status->designation[3])
1770 status->invocation[0] = 3;
1771 charset0 = status->designation[3];
1774 case 'N': /* invocation of single-shift-2 */
1775 if (! ((spec->flags & MCODING_ISO_SINGLE_SHIFT)
1776 || (spec->flags & MCODING_ISO_EUC_TW_SHIFT))
1777 || ! status->designation[2])
1779 this_charset = status->designation[2];
1781 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1785 case 'O': /* invocation of single-shift-3 */
1786 if (! (spec->flags & MCODING_ISO_SINGLE_SHIFT)
1787 || ! status->designation[3])
1789 this_charset = status->designation[3];
1791 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1795 case '[': /* specification of direction */
1796 if (! (spec->flags & MCODING_ISO_ISO6429))
1798 /* For the moment, nested direction is not supported.
1799 So, (coding->mode & CODING_MODE_DIRECTION) zero means
1800 left-to-right, and nonzero means right-to-left. */
1804 case ']': /* end of the current direction */
1805 case '0': /* end of the current direction */
1809 case '1': /* start of left-to-right direction */
1816 case '2': /* start of right-to-left direction */
1830 char charset_name[16];
/* The mask must be applied before the negation.  The former
   `! spec->flags & FLAG' negated the whole flag word first, so this
   test did not depend on the CTEXT_EXT flag as intended.  */
1834 if (! (spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT))
1836 /* Compound-text uses these escape sequences:
1838 ESC % G -- utf-8 bytes -- ESC % @
1839 ESC % / 1 M L -- charset name -- STX -- bytes --
1840 ESC % / 2 M L -- charset name -- STX -- bytes --
1841 ESC % / 3 M L -- charset name -- STX -- bytes --
1842 ESC % / 4 M L -- charset name -- STX -- bytes --
1844 It also uses this sequence but that is not yet
1847 ESC % / 0 M L -- charset name -- STX -- bytes -- */
1852 status->utf8_shifting = 1;
1857 if (! status->utf8_shifting)
1859 status->utf8_shifting = 0;
1865 if (c1 < '1' || c1 > '4')
1867 status->non_standard_charset_bytes = c1 - '0';
1870 if (c1 < 128 || c2 < 128)
/* M and L each carry seven bits of the segment length.  */
1872 bytes = (c1 - 128) * 128 + (c2 - 128);
1873 for (i = 0; i < 16; i++)
1876 if (c1 == ISO_CODE_STX)
1878 charset_name[i] = TOLOWER (c1);
1882 charset_name[i++] = '\0';
1883 this_charset = find_ctext_non_standard_charset (charset_name);
1886 status->non_standard_charset = this_charset;
/* Bytes of the segment left after the charset name and STX.  */
1887 status->non_standard_encoding = bytes - i;
1892 if (! (spec->flags & MCODING_ISO_DESIGNATION_MASK))
1893 goto unused_escape_sequence;
1894 if (c1 >= 0x28 && c1 <= 0x2B)
1895 { /* designation of (dimension 1, chars 94) charset */
1897 ISO2022_DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2, -1);
1899 else if (c1 >= 0x2C && c1 <= 0x2F)
1900 { /* designation of (dimension 1, chars 96) charset */
1902 ISO2022_DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2, -1);
1906 /* We must update these variables now. */
1907 charset0 = status->designation[status->invocation[0]];
1908 charset1 = status->designation[status->invocation[1]];
1911 unused_escape_sequence:
1912 UNGET_ONE_BYTE (c1);
1914 this_charset = mcharset__ascii;
1918 if (this_charset->dimension == 1)
1920 if (this_charset->code_range[1] <= 128)
1923 else if (this_charset->dimension == 2)
1926 c1 = ((c1 & 0x7F) << 8) | (c2 & 0x7F);
1928 else /* i.e. (dimension == 3) */
1932 c1 = ((c1 & 0x7F) << 16) | ((c2 & 0x7F) << 8) | (c3 & 0x7F);
1934 c1 = DECODE_CHAR (this_charset, c1);
1938 if (! converter->lenient)
1940 REWIND_SRC_TO_BASE ();
1942 this_charset = mcharset__binary;
/* A change of charset (other than to ASCII) ends the current run, which
   is handed over together with the charset it belongs to.  */
1945 if (this_charset != mcharset__ascii
1946 && this_charset != charset)
1948 TAKEIN_CHARS (mt, nchars - last_nchars,
1949 dst - (mt->data + mt->nbytes), charset);
1950 charset = this_charset;
1951 last_nchars = nchars;
1954 if (status->non_standard_encoding > 0)
1955 status->non_standard_encoding -= status->non_standard_charset_bytes;
1957 /* We reach here because of an invalid byte. */
1963 TAKEIN_CHARS (mt, nchars - last_nchars,
1964 dst - (mt->data + mt->nbytes), charset);
1965 return finish_decoding (mt, converter, nchars,
1966 source, src_end, src_base, error);
1970 /* Produce codes (escape sequence) for designating CHARSET to graphic
1971 register REG at DST, and increment DST. For a charset of dimension
1972 other than 1 whose final byte is '@', 'A', or 'B', the intermediate
1973 byte is left out (short form) unless MCODING_ISO_LONG_FORM is set in
SPEC->flags. Update STATUS->designation. Jumps to the label
`memory_shortage' when fewer than 4 bytes are left before DST_END.
A charset counts as a 94-character set when code_range[0] is not 32
and code_range[1] is not 255; otherwise the 96-character intermediate
byte is used.
NOTE(review): one line of the short-form condition is missing from
this listing, so a further condition may apply. */
1975 #define ISO2022_ENCODE_DESIGNATION(reg, charset, spec, status) \
1977 char *intermediate_char_94 = "()*+"; \
1978 char *intermediate_char_96 = ",-./"; \
1980 if (dst + 4 > dst_end) \
1981 goto memory_shortage; \
1982 *dst++ = ISO_CODE_ESC; \
1983 if (charset->dimension == 1) \
1985 if (charset->code_range[0] != 32 \
1986 && charset->code_range[1] != 255) \
1987 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1989 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1994 if (charset->code_range[0] != 32 \
1995 && charset->code_range[1] != 255) \
1997 if (spec->flags & MCODING_ISO_LONG_FORM \
1999 || charset->final_byte < '@' || charset->final_byte > 'B') \
2000 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
2003 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
2005 *dst++ = charset->final_byte; \
2007 status->designation[reg] = charset; \
2011 /* The following two macros produce codes (control character or escape
2012 sequence) for ISO-2022 single-shift functions (single-shift-2 and
single-shift-3). */
/* ISO2022_ENCODE_SINGLE_SHIFT_2 and ISO2022_ENCODE_SINGLE_SHIFT_3 write
   a single-shift code at DST and set STATUS->single_shifting to 1.
   Without MCODING_ISO_EIGHT_BIT in SPEC->flags the two-byte form
   ESC 'N' (resp. ESC 'O') is written; the one-byte ISO_CODE_SS2
   (resp. ISO_CODE_SS3) is presumably the `else' case, whose keyword is
   on a line missing from this listing.  Both jump to the label
   `memory_shortage' when fewer than 2 bytes are left before DST_END.
   They use `dst' and `dst_end' from the enclosing scope.  */
2015 #define ISO2022_ENCODE_SINGLE_SHIFT_2(spec, status) \
2017 if (dst + 2 > dst_end) \
2018 goto memory_shortage; \
2019 if (! (spec->flags & MCODING_ISO_EIGHT_BIT)) \
2020 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
2022 *dst++ = ISO_CODE_SS2; \
2023 status->single_shifting = 1; \
2027 #define ISO2022_ENCODE_SINGLE_SHIFT_3(spec, status) \
2029 if (dst + 2 > dst_end) \
2030 goto memory_shortage; \
2031 if (! (spec->flags & MCODING_ISO_EIGHT_BIT)) \
2032 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
2034 *dst++ = ISO_CODE_SS3; \
2035 status->single_shifting = 1; \
2039 /* The following four macros produce codes (control character or
2040 escape sequence) for ISO-2022 locking-shift functions (shift-in,
2041 shift-out, locking-shift-2, and locking-shift-3). Each macro writes
its code at DST (ISO_CODE_SI, ISO_CODE_SO, ESC 'n', or ESC 'o'),
records the newly invoked graphic register (0, 1, 2, or 3) in
STATUS->invocation[0], and jumps to the label `memory_shortage' when
the code does not fit before DST_END. They use `dst' and `dst_end'
from the enclosing scope. */
2043 #define ISO2022_ENCODE_SHIFT_IN(status) \
2045 if (dst + 1 > dst_end) \
2046 goto memory_shortage; \
2047 *dst++ = ISO_CODE_SI; \
2048 status->invocation[0] = 0; \
2052 #define ISO2022_ENCODE_SHIFT_OUT(status) \
2054 if (dst + 1 > dst_end) \
2055 goto memory_shortage; \
2056 *dst++ = ISO_CODE_SO; \
2057 status->invocation[0] = 1; \
2061 #define ISO2022_ENCODE_LOCKING_SHIFT_2(status) \
2063 if (dst + 2 > dst_end) \
2064 goto memory_shortage; \
2065 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
2066 status->invocation[0] = 2; \
2070 #define ISO2022_ENCODE_LOCKING_SHIFT_3(status) \
2072 if (dst + 2 > dst_end) \
2073 goto memory_shortage; \
2074 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
2075 status->invocation[0] = 3; \
/* ISO2022_ENCODE_UTF8_SHIFT_START (LEN) opens a UTF-8 segment: it
   checks with CHECK_DST that 3 + LEN bytes fit, writes an escape
   sequence beginning with ISO_CODE_ESC, and sets
   `status->utf8_shifting' to 1.  ISO2022_ENCODE_UTF8_SHIFT_END ()
   writes an escape sequence beginning with ISO_CODE_ESC and clears
   the flag.  Both use `dst' and `status' from the enclosing scope.
   NOTE(review): the lines writing the rest of each escape sequence
   are missing from this listing; presumably they are `% G' and `% @'
   as in the compound-text sequences the decoder accepts -- confirm.  */
2078 #define ISO2022_ENCODE_UTF8_SHIFT_START(len) \
2080 CHECK_DST (3 + len); \
2081 *dst++ = ISO_CODE_ESC; \
2084 status->utf8_shifting = 1; \
2088 #define ISO2022_ENCODE_UTF8_SHIFT_END() \
2091 *dst++ = ISO_CODE_ESC; \
2094 status->utf8_shifting = 0; \
/* Write the header of an extended segment for a non-standard charset
   called NAME (LEN bytes long).  After CHECK_DST has verified room
   for 6 + LEN + 1 + non_standard_charset_bytes bytes, the start of
   the header is remembered in `non_standard_begin' and the following
   is written: ISO_CODE_ESC, bytes from lines not visible here, the
   digit '0' + non_standard_charset_bytes, two zero bytes that are
   filled in later with the segment length, NAME, and ISO_CODE_STX.
   `non_standard_bytes' starts at LEN + 1, i.e. the name plus STX.
   Uses `dst', `non_standard_begin', `non_standard_bytes' and
   `non_standard_charset_bytes' from the enclosing scope.  */
2098 #define ISO2022_ENCODE_NON_STANDARD(name, len) \
2100 CHECK_DST (6 + len + 1 + non_standard_charset_bytes); \
2101 non_standard_begin = dst; \
2102 *dst++ = ISO_CODE_ESC; \
2105 *dst++ = '0' + non_standard_charset_bytes; \
2106 *dst++ = 0, *dst++ = 0; /* filled later */ \
2107 memcpy (dst, name, len); \
2109 *dst++ = ISO_CODE_STX; \
2110 non_standard_bytes = len + 1; \
/* Find the name under which CHARSET is written in a compound-text
   extended segment, and store the number of bytes per character in
   *BYTES.  The charset named "big5" is written as "big5-0" with two
   bytes per character; "koi8-r" is also recognized.
   NOTE(review): the statement executed for "koi8-r", the handling of
   other charsets, and the return statement are on lines missing from
   this listing.  */
2115 find_ctext_non_standard_name (MCharset *charset, int *bytes)
2117 char *name = msymbol_name (charset->name);
2119 if (! strcmp (name, "koi8-r"))
2121 else if (! strcmp (name, "big5"))
2122 name = "big5-0", *bytes = 2;
2128 /* Designate CHARSET to a graphic register specified in
2129 SPEC->designations. If the register is not yet invoked to graphic
2130 left nor right, invoke it to graphic left. DSTP points to a
2131 variable containing a memory address where the output must go.
2132 DST_END is the limit of that memory.
2134 Return 0 if it succeeds. Return -1 otherwise, which means that the
2135 memory area is too short. By side effect, update the variable that
DSTP points to. */
/* Make CHARSET usable for output: designate it to a graphic register
   unless it already is designated, then invoke that register unless
   it already is invoked.  The escape sequences are written through
   the local copy DST of *DSTP.
   NOTE(review): short lines (one parameter line, braces, `break',
   the `switch' head, return statements) are missing from this
   listing.  */
2139 iso_2022_designate_invoke_charset (MCodingSystem *coding,
2141 struct iso_2022_spec *spec,
2142 struct iso_2022_status *status,
2143 unsigned char **dstp,
2144 unsigned char *dst_end)
2147 unsigned char *dst = *dstp;
/* Look for a graphic register that already holds CHARSET.  */
2149 for (i = 0; i < 4; i++)
2150 if (charset == status->designation[i])
2155 /* CHARSET is not yet designated to any graphic registers. */
2156 for (i = 0; i < coding->ncharsets; i++)
2157 if (charset == coding->charsets[i])
/* Not one of CODING->charsets: search the global ISO-2022 table, whose
   entries follow those of CODING->charsets in SPEC->designations.  */
2159 if (i == coding->ncharsets)
2161 for (i = 0; i < mcharset__iso_2022_table.used; i++)
2162 if (charset == mcharset__iso_2022_table.charsets[i])
2164 i += coding->ncharsets;
/* From here on I is the graphic register assigned to CHARSET.  */
2166 i = spec->designations[i];
2167 ISO2022_ENCODE_DESIGNATION (i, charset, spec, status);
2170 if (status->invocation[0] != i
2171 && status->invocation[1] != i)
2173 /* Graphic register I is not yet invoked. */
2176 case 0: /* graphic register 0 */
2177 ISO2022_ENCODE_SHIFT_IN (status);
2180 case 1: /* graphic register 1 */
2181 ISO2022_ENCODE_SHIFT_OUT (status);
/* Registers 2 and 3 use a single shift when the coding system has
   MCODING_ISO_SINGLE_SHIFT; the locking shift is presumably the
   `else' case (the keyword is on a missing line).  */
2184 case 2: /* graphic register 2 */
2185 if (spec->flags & MCODING_ISO_SINGLE_SHIFT)
2186 ISO2022_ENCODE_SINGLE_SHIFT_2 (spec, status);
2188 ISO2022_ENCODE_LOCKING_SHIFT_2 (status);
2191 case 3: /* graphic register 3 */
2192 if (spec->flags & MCODING_ISO_SINGLE_SHIFT)
2193 ISO2022_ENCODE_SINGLE_SHIFT_3 (spec, status);
2195 ISO2022_ENCODE_LOCKING_SHIFT_3 (status);
2208 /* Reset the invocation/designation status to the initial one. SPEC
2209 and STATUS contain information about the initial and current
2210 invocation/designation status respectively. DSTP points to a
2211 variable containing a memory address where the output must go.
2212 DST_END is the limit of that memory.
2214 Return 0 if it succeeds. Return -1 otherwise, which means that the
2215 memory area is too short. By side effect, update the variable that
DSTP points to. */
/* Write the codes that bring STATUS (the current state) back to the
   initial invocation and designations recorded in SPEC.  Invocation
   0 is restored only when it differs from the initial one and the
   initial one is not negative; a register is re-designated only when
   it differs from its initial designation and that initial
   designation is non-NULL.  Output goes through the local copy DST of
   *DSTP.
   NOTE(review): short lines (return type, braces, the declaration of
   `i', return statements) are missing from this listing.  */
2219 iso_2022_reset_invocation_designation (struct iso_2022_spec *spec,
2220 struct iso_2022_status *status,
2221 unsigned char **dstp,
2222 unsigned char *dst_end)
2224 unsigned char *dst = *dstp;
2227 /* Reset the invocation status of GL. We have not yet supported GR
invocation.  NOTE(review): the tail of this comment was missing, which
left it unterminated and commented out the code below up to the next
comment end; it is closed here.  */
2229 if (status->invocation[0] != spec->initial_invocation[0]
2230 && spec->initial_invocation[0] >= 0)
2232 if (spec->initial_invocation[0] == 0)
2233 ISO2022_ENCODE_SHIFT_IN (status);
2234 else if (spec->initial_invocation[0] == 1)
2235 ISO2022_ENCODE_SHIFT_OUT (status);
2236 else if (spec->initial_invocation[0] == 2)
2237 ISO2022_ENCODE_LOCKING_SHIFT_2 (status);
2238 else /* i.e. spec->initial_invocation[0] == 3 */
2239 ISO2022_ENCODE_LOCKING_SHIFT_3 (status);
2242 /* Reset the designation status of G0..G3. */
2243 for (i = 0; i < 4; i++)
2244 if (status->designation[i] != spec->initial_designation[i]
2245 && spec->initial_designation[i])
2247 MCharset *charset = spec->initial_designation[i];
2249 ISO2022_ENCODE_DESIGNATION (i, charset, spec, status);
/* Encode the characters of MT between FROM and TO as ISO-2022 into the
   DST_BYTES bytes at DESTINATION, keeping the designation/invocation
   state in CONVERTER->status.  For each character a charset is chosen
   (the charset given by the Mcharset text property when usable, else
   one of CODING->charsets, else one from mcharset__iso_2022_table
   when MCODING_ISO_FULL_SUPPORT is set) and designation and
   invocation codes are written as needed before the character code.
   With MCODING_ISO_DESIGNATION_CTEXT_EXT, other characters are
   written in compound-text extended segments or as UTF-8.
   CONVERTER->nchars and CONVERTER->nbytes are advanced by the amounts
   processed and produced.  Returns -1 when CONVERTER->result is
   MCONVERSION_RESULT_INVALID_CHAR, and 0 otherwise.
   NOTE(review): many short lines (braces, `break', `goto', `else',
   local declarations) are missing from this listing; the comments
   below describe only what the visible lines establish.  */
2262 encode_coding_iso_2022 (MText *mt, int from, int to,
2263 unsigned char *destination, int dst_bytes,
2264 MConverter *converter)
2266 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2267 MCodingSystem *coding = internal->coding;
2268 unsigned char *src, *src_end;
2269 unsigned char *dst = destination;
2270 unsigned char *dst_end = dst + dst_bytes;
2272 unsigned char *dst_base;
2273 struct iso_2022_spec *spec = (struct iso_2022_spec *) coding->extra_spec;
2274 int full_support = spec->flags & MCODING_ISO_FULL_SUPPORT;
2275 struct iso_2022_status *status
2276 = (struct iso_2022_status *) &(converter->status);
2277 MCharset *primary, *charset0, *charset1;
2278 int next_primary_change;
2279 int ncharsets = coding->ncharsets;
2280 MCharset **charsets = coding->charsets;
2281 MCharset *cns_charsets[15];
2282 int ascii_compatible = coding->ascii_compatible;
2283 MCharset *non_standard_charset = NULL;
2284 int non_standard_charset_bytes = 0;
2285 int non_standard_bytes = 0;
2286 unsigned char *non_standard_begin = NULL;
2287 enum MTextFormat format = mt->format;
2289 SET_SRC (mt, format, from, to);
/* For EUC-TW style shifting, index the two-byte charsets of the coding
   system by final byte ('G'..'M' map to slots 0..6); slot 14 is also
   filled, under a condition that is not visible here.  */
2291 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
2295 memset (cns_charsets, 0, sizeof (cns_charsets));
2296 for (i = 0; i < ncharsets; i++)
2297 if (charsets[i]->dimension == 2)
2299 int final = charsets[i]->final_byte;
2301 if (final >= 'G' && final <= 'M')
2302 cns_charsets[final - 'G'] = charsets[i];
2304 cns_charsets[14] = charsets[i];
2308 next_primary_change = from;
/* Charsets selected by invocation[0] and invocation[1]; charset1 is
   NULL when invocation[1] is negative.  */
2310 charset0 = status->designation[status->invocation[0]];
2311 charset1 = (status->invocation[1] < 0 ? NULL
2312 : status->designation[status->invocation[1]]);
2319 ONE_MORE_CHAR (c, bytes, format);
2321 if (c < 128 && ascii_compatible)
2323 if (status->utf8_shifting)
2324 ISO2022_ENCODE_UTF8_SHIFT_END ();
2328 else if (c <= 32 || c == 127)
2330 if (status->utf8_shifting)
2331 ISO2022_ENCODE_UTF8_SHIFT_END ();
2332 if (spec->flags & MCODING_ISO_RESET_AT_CNTL
2333 || (c == '\n' && spec->flags & MCODING_ISO_RESET_AT_EOL))
2335 if (iso_2022_reset_invocation_designation (spec, status,
2337 goto insufficient_destination;
2338 charset0 = status->designation[status->invocation[0]];
2339 charset1 = (status->invocation[1] < 0 ? NULL
2340 : status->designation[status->invocation[1]]);
2347 unsigned code = MCHAR_INVALID_CODE;
2348 MCharset *charset = NULL;
2350 int pos = from + nchars;
/* The charset named by the Mcharset text property is looked up again
   only when POS reaches the end of the property range found last.  */
2352 if (pos >= next_primary_change)
2354 MSymbol primary_charset
2355 = (MSymbol) mtext_get_prop (mt, pos, Mcharset);
2356 primary = MCHARSET (primary_charset);
2357 if (primary && primary != mcharset__binary)
2359 if (primary->final_byte <= 0)
2361 else if (! full_support)
2365 for (i = 0; i < ncharsets; i++)
2366 if (primary == charsets[i])
2373 mtext_prop_range (mt, Mcharset, pos,
2374 NULL, &next_primary_change, 0);
2377 if (primary && primary != mcharset__binary)
2379 code = ENCODE_CHAR (primary, c);
2380 if (code != MCHAR_INVALID_CODE)
2385 if (c <= 32 || c == 127)
2388 charset = mcharset__ascii;
2394 for (i = 0; i < ncharsets; i++)
2396 charset = charsets[i];
2397 code = ENCODE_CHAR (charset, c);
2398 if (code != MCHAR_INVALID_CODE)
2403 if (spec->flags & MCODING_ISO_FULL_SUPPORT)
2405 for (i = 0; i < mcharset__iso_2022_table.used; i++)
2407 charset = mcharset__iso_2022_table.charsets[i];
2408 code = ENCODE_CHAR (charset, c);
2409 if (code != MCHAR_INVALID_CODE)
2412 if (i == mcharset__iso_2022_table.used)
2414 if (spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT)
2415 goto unsupported_char;
2416 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2421 goto unsupported_char;
2427 && (charset->final_byte >= 0
2428 || spec->flags & MCODING_ISO_EUC_TW_SHIFT))
2430 if (code >= 0x80 && code < 0xA0)
2431 goto unsupported_char;
2433 if (status->utf8_shifting)
2434 ISO2022_ENCODE_UTF8_SHIFT_END ();
2435 if (charset == charset0)
2437 else if (charset == charset1)
2441 unsigned char *p = NULL;
2443 if (spec->flags & MCODING_ISO_EUC_TW_SHIFT)
2447 if (cns_charsets[0] == charset)
2453 for (i = 1; i < 15; i++)
2454 if (cns_charsets[i] == charset)
2457 *dst++ = ISO_CODE_SS2;
2460 status->single_shifting = 1;
2465 if (iso_2022_designate_invoke_charset
2466 (coding, charset, spec, status, &dst, dst_end) < 0)
2467 goto insufficient_destination;
2468 charset0 = status->designation[status->invocation[0]];
2469 charset1 = (status->invocation[1] < 0 ? NULL
2470 : status->designation[status->invocation[1]]);
2472 if (status->single_shifting)
2474 = (spec->flags & MCODING_ISO_EIGHT_BIT) ? 0x80 : 0;
2475 else if (charset == charset0)
/* The code is written most significant byte first, each byte OR-ed
   with gr_mask.  */
2480 if (charset->dimension == 1)
2483 *dst++ = code | gr_mask;
2485 else if (charset->dimension == 2)
2488 *dst++ = (code >> 8) | gr_mask;
2489 *dst++ = (code & 0xFF) | gr_mask;
2494 *dst++ = (code >> 16) | gr_mask;
2495 *dst++ = ((code >> 8) & 0xFF) | gr_mask;
2496 *dst++ = (code & 0xFF) | gr_mask;
2498 status->single_shifting = 0;
2500 else if (charset && spec->flags & MCODING_ISO_DESIGNATION_CTEXT_EXT)
2502 if (charset != non_standard_charset)
2504 char *name = (find_ctext_non_standard_name
2505 (charset, &non_standard_charset_bytes));
2509 int len = strlen (name);
2511 ISO2022_ENCODE_NON_STANDARD (name, len);
2512 non_standard_charset = charset;
2515 non_standard_charset = NULL;
2518 if (non_standard_charset)
2520 if (dst + non_standard_charset_bytes > dst_end)
2521 goto insufficient_destination;
/* Keep the two length bytes of the segment header (offsets 4 and 5
   from its start) up to date as characters are added.  */
2522 non_standard_bytes += non_standard_charset_bytes;
2523 non_standard_begin[4] = (non_standard_bytes / 128) | 0x80;
2524 non_standard_begin[5] = (non_standard_bytes % 128) | 0x80;
2525 if (non_standard_charset_bytes == 1)
2527 else if (non_standard_charset_bytes == 2)
2528 *dst++ = code >> 8, *dst++ = code & 0xFF;
2529 else if (non_standard_charset_bytes == 3)
2530 *dst++ = code >> 16, *dst++ = (code >> 8) & 0xFF,
2531 *dst++ = code & 0xFF;
2532 else /* i.e. non_standard_charset_bytes == 4 */
2533 *dst++ = code >> 24, *dst++ = (code >> 16) & 0xFF,
2534 *dst++ = (code >> 8) & 0xFF, *dst++ = code & 0xFF;
2538 int len = CHAR_BYTES (c);
2541 goto unsupported_char;
2542 if (! status->utf8_shifting)
2543 ISO2022_ENCODE_UTF8_SHIFT_START (len);
2546 CHAR_STRING (c, dst);
2550 goto unsupported_char;
2560 if (iso_2022_designate_invoke_charset (coding, mcharset__ascii,
2563 goto insufficient_destination;
2564 if (! converter->lenient)
2566 len = encode_unsupporeted_char (c, dst, dst_end, mt, from + nchars);
2568 goto insufficient_destination;
2574 /* We reach here because of an unsupported char. */
2575 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2578 insufficient_destination:
2580 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
/* At the end of the last block of a successful conversion, close an
   open UTF-8 segment and, with MCODING_ISO_RESET_AT_EOL, restore the
   initial state when charset0 differs from the initial G0
   designation.  */
2583 if (converter->result == MCONVERSION_RESULT_SUCCESS
2584 && converter->last_block)
2586 if (status->utf8_shifting)
2588 ISO2022_ENCODE_UTF8_SHIFT_END ();
2591 if (spec->flags & MCODING_ISO_RESET_AT_EOL
2592 && charset0 != spec->initial_designation[0])
2594 if (iso_2022_reset_invocation_designation (spec, status,
2596 goto insufficient_destination;
2599 converter->nchars += nchars;
2600 converter->nbytes += dst - destination;
2601 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
2605 /* Stuff for coding-systems of type MCODING_TYPE_MISC. */
2607 /* For SJIS handling... */
/* SJIS_TO_JIS (S1, S2) turns a Shift_JIS lead byte S1 and trail byte S2
   into a two-byte code with the first byte in bits 8 and up and the
   second byte in the low 8 bits.  JIS_TO_SJIS (C1, C2) performs the
   opposite conversion from the two bytes C1 and C2 of such a code.
   Both macros evaluate their arguments several times and do not
   parenthesize them, so the arguments should be simple variables.
   NOTE(review): the line holding the condition of each `?:' and one
   result line per macro are missing from this listing.  */
2609 #define SJIS_TO_JIS(s1, s2) \
2611 ? (((s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0)) << 8) \
2613 : (((s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1)) << 8) \
2614 | (s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F))))
2616 #define JIS_TO_SJIS(c1, c2) \
2618 ? (((c1 / 2 + ((c1 < 0x5F) ? 0x71 : 0xB1)) << 8) \
2619 | (c2 + ((c2 >= 0x60) ? 0x20 : 0x1F))) \
2620 : (((c1 / 2 + ((c1 < 0x5F) ? 0x70 : 0xB0)) << 8) \
/* Prepare the SJIS coding system of CONVERTER for use.  When
   CODING->ready is zero, the charsets named "jisx0208.1983" and
   "jisx0201-kana" are looked up and installed as CODING->charsets[1]
   and CODING->charsets[2], and CODING->ncharsets is set to 3.
   The test guards the charset pointers, because those are what get
   stored and later dereferenced; MCHARSET () results are NULL-checked
   elsewhere in this file (see encode_coding_iso_2022).  The former
   test looked at the two symbols instead, which says nothing about
   whether the charsets exist.
   NOTE(review): short lines (return type, braces, return statements,
   and presumably the update of CODING->ready) are missing from this
   listing.  */
2625 reset_coding_sjis (MConverter *converter)
2627 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2628 MCodingSystem *coding = internal->coding;
2630 if (! coding->ready)
2632 MSymbol kanji_sym = msymbol ("jisx0208.1983");
2633 MCharset *kanji = MCHARSET (kanji_sym);
2634 MSymbol kana_sym = msymbol ("jisx0201-kana");
2635 MCharset *kana = MCHARSET (kana_sym);
2637 if (! kanji || ! kana)
2639 coding->ncharsets = 3;
2640 coding->charsets[1] = kanji;
2641 coding->charsets[2] = kana;
/* Decode SRC_BYTES bytes at SOURCE as Shift_JIS and append the decoded
   characters to MT.  Bytes carried over from an earlier call
   (internal->carryover) are read first.  CODING->charsets[0], [1] and
   [2] serve as the roman, kanji and kana charsets.  A lead byte in
   0x81..0x9F or 0xE0..0xEF starts a two-byte kanji code, and a byte
   in 0xA1..0xDF is a kana.  Each run of characters that belong to one
   charset is handed to TAKEIN_CHARS together with that charset, and
   the value returned is that of finish_decoding ().
   NOTE(review): many short lines (braces, `else', `goto', local
   declarations) are missing from this listing.  */
2648 decode_coding_sjis (unsigned char *source, int src_bytes, MText *mt,
2649 MConverter *converter)
2651 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2652 MCodingSystem *coding = internal->coding;
2653 unsigned char *src = internal->carryover;
2654 unsigned char *src_stop = src + internal->carryover_bytes;
2655 unsigned char *src_end = source + src_bytes;
2656 unsigned char *src_base;
2657 unsigned char *dst = mt->data + mt->nbytes;
2658 unsigned char *dst_end = mt->data + mt->allocated - MAX_UTF8_CHAR_BYTES;
2660 int last_nchars = 0;
2661 int at_most = converter->at_most > 0 ? converter->at_most : -1;
2663 MCharset *charset_roman = coding->charsets[0];
2664 MCharset *charset_kanji = coding->charsets[1];
2665 MCharset *charset_kana = coding->charsets[2];
2666 MCharset *charset = mcharset__ascii;
2671 MCharset *this_charset;
2674 ONE_MORE_BASE_BYTE (c1);
2679 this_charset = ((c1 <= 0x20 || c1 == 0x7F)
2683 else if ((c1 >= 0x81 && c1 <= 0x9F) || (c1 >= 0xE0 && c1 <= 0xEF))
/* Valid Shift_JIS trail bytes are 0x40..0x7E and 0x80..0xFC.  The
   former test read `c2 >= 80' (decimal) and allowed 0x7F, which
   SJIS_TO_JIS maps to the same code as 0x7E.  */
2686 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0x80 && c2 <= 0xFC))
2688 this_charset = charset_kanji;
2689 c1 = SJIS_TO_JIS (c1, c2);
2694 else if (c1 >= 0xA1 && c1 <= 0xDF)
2696 this_charset = charset_kana;
2702 c = DECODE_CHAR (this_charset, c1);
2707 if (! converter->lenient)
2709 REWIND_SRC_TO_BASE ();
2711 this_charset = mcharset__binary;
/* A change of charset (other than to ASCII) ends the current run, which
   is handed over together with the charset it belongs to.  */
2714 if (this_charset != mcharset__ascii
2715 && this_charset != charset)
2717 TAKEIN_CHARS (mt, nchars - last_nchars,
2718 dst - (mt->data + mt->nbytes), charset);
2719 charset = this_charset;
2720 last_nchars = nchars;
2724 /* We reach here because of an invalid byte. */
2728 TAKEIN_CHARS (mt, nchars - last_nchars,
2729 dst - (mt->data + mt->nbytes), charset);
2730 return finish_decoding (mt, converter, nchars,
2731 source, src_end, src_base, error);
/* Encode the characters of MT between FROM and TO as Shift_JIS into
   the DST_BYTES bytes at DESTINATION.  For characters other than
   those at or below 0x20 and 0x7F, the roman, kanji and kana charsets
   (CODING->charsets[0], [1] and [2]) are tried in that order.  A
   kanji code is converted with JIS_TO_SJIS, and a kana code is
   written with bit 0x80 set.  A character that none of them can
   encode is passed to encode_unsupporeted_char () or reported as
   MCONVERSION_RESULT_INVALID_CHAR, depending on CONVERTER->lenient.
   CONVERTER->nchars and CONVERTER->nbytes are advanced by the amounts
   processed and produced.  Returns -1 when CONVERTER->result is
   MCONVERSION_RESULT_INVALID_CHAR, and 0 otherwise.
   NOTE(review): many short lines (braces, `else', `goto', local
   declarations, some output statements) are missing from this
   listing.  */
2735 encode_coding_sjis (MText *mt, int from, int to,
2736 unsigned char *destination, int dst_bytes,
2737 MConverter *converter)
2739 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
2740 MCodingSystem *coding = internal->coding;
2741 unsigned char *src, *src_end;
2742 unsigned char *dst = destination;
2743 unsigned char *dst_end = dst + dst_bytes;
2745 MCharset *charset_roman = coding->charsets[0];
2746 MCharset *charset_kanji = coding->charsets[1];
2747 MCharset *charset_kana = coding->charsets[2];
2748 enum MTextFormat format = mt->format;
2750 SET_SRC (mt, format, from, to);
2757 ONE_MORE_CHAR (c, bytes, format);
2759 if (c <= 0x20 || c == 0x7F)
2766 if ((code = ENCODE_CHAR (charset_roman, c)) != MCHAR_INVALID_CODE)
2771 else if ((code = ENCODE_CHAR (charset_kanji, c))
2772 != MCHAR_INVALID_CODE)
/* Split the kanji code into its two bytes before conversion.  */
2774 int c1 = code >> 8, c2 = code & 0xFF;
2775 code = JIS_TO_SJIS (c1, c2);
2778 *dst++ = code & 0xFF;
2780 else if ((code = ENCODE_CHAR (charset_kana, c))
2781 != MCHAR_INVALID_CODE)
2784 *dst++ = code | 0x80;
2788 if (! converter->lenient)
2790 len = encode_unsupporeted_char (c, dst, dst_end,
2793 goto insufficient_destination;
2800 /* We reach here because of an unsupported char. */
2801 converter->result = MCONVERSION_RESULT_INVALID_CHAR;
2804 insufficient_destination:
2805 converter->result = MCONVERSION_RESULT_INSUFFICIENT_DST;
2808 converter->nchars += nchars;
2809 converter->nbytes += dst - destination;
2810 return (converter->result == MCONVERSION_RESULT_INVALID_CHAR ? -1 : 0);
/* Find the coding system attached to the symbol NAME under the key
   Mcoding.  The visible lines also look NAME up in
   coding_definition_list, convert the stored definition with
   mplist__from_plist (), define the coding system from it with
   mconv_define_coding (), fetch the coding system again, and release
   the converted list.
   NOTE(review): the conditions guarding that second path and the
   return statement are on lines missing from this listing;
   presumably the definition list is used only when the first lookup
   finds nothing -- confirm.  */
2814 static MCodingSystem *
2815 find_coding (MSymbol name)
2817 MCodingSystem *coding = (MCodingSystem *) msymbol_get (name, Mcoding);
2821 MPlist *param = mplist_get (coding_definition_list, name);
2825 param = mplist__from_plist (param);
2826 mconv_define_coding (MSYMBOL_NAME (name), param, NULL, NULL, NULL, NULL);
2827 coding = (MCodingSystem *) msymbol_get (name, Mcoding);
2828 M17N_OBJECT_UNREF (param);
/* NOTE(review): judging by the names only, these say what a converter
   is bound to (nothing, a buffer, or a stream); their users are
   outside this listing -- confirm.  */
2833 #define BINDING_NONE 0
2834 #define BINDING_BUFFER 1
2835 #define BINDING_STREAM 2
/* NOTE(review): presumably the size of a conversion work area; confirm
   against the users of this macro.  */
2837 #define CONVERT_WORKSIZE 0x10000
/* Initialize the coding module: set up coding_list and
   coding_definition_list, fill the iso_2022_code_class table that
   classifies each byte value, intern the symbols used as coding
   system parameters, and define the predefined coding systems
   (us-ascii with the alias ANSI_X3.4-1968, iso-8859-1, utf-8-full,
   utf-8, utf-16, utf-32, their big- and little-endian variants, and
   sjis).  All definitions reuse the same PARAM and CHARSETS lists,
   which are modified between calls and released at the end.
   NOTE(review): short lines (return type, braces, some assignments
   and the `#endif' matching `#ifndef WORDS_BIGENDIAN') are missing
   from this listing.  */
2843 mcoding__init (void)
2846 MPlist *param, *charsets, *pl;
2848 MLIST_INIT1 (&coding_list, codings, 128);
2849 coding_definition_list = mplist ();
2851 /* ISO-2022 specific initialize routine. */
/* The ranges are filled first; the individual entries assigned after
   them overwrite the range values for the special bytes.  */
2852 for (i = 0; i < 0x20; i++)
2853 iso_2022_code_class[i] = ISO_control_0;
2854 for (i = 0x21; i < 0x7F; i++)
2855 iso_2022_code_class[i] = ISO_graphic_plane_0;
2856 for (i = 0x80; i < 0xA0; i++)
2857 iso_2022_code_class[i] = ISO_control_1;
2858 for (i = 0xA1; i < 0xFF; i++)
2859 iso_2022_code_class[i] = ISO_graphic_plane_1;
2860 iso_2022_code_class[0x20] = iso_2022_code_class[0x7F] = ISO_0x20_or_0x7F;
2861 iso_2022_code_class[0xA0] = iso_2022_code_class[0xFF] = ISO_0xA0_or_0xFF;
2862 iso_2022_code_class[0x0E] = ISO_shift_out;
2863 iso_2022_code_class[0x0F] = ISO_shift_in;
2864 iso_2022_code_class[0x19] = ISO_single_shift_2_7;
2865 iso_2022_code_class[0x1B] = ISO_escape;
2866 iso_2022_code_class[0x8E] = ISO_single_shift_2;
2867 iso_2022_code_class[0x8F] = ISO_single_shift_3;
2868 iso_2022_code_class[0x9B] = ISO_control_sequence_introducer;
2870 Mcoding = msymbol ("coding");
2872 Mutf = msymbol ("utf");
2873 Miso_2022 = msymbol ("iso-2022");
2875 Mreset_at_eol = msymbol ("reset-at-eol");
2876 Mreset_at_cntl = msymbol ("reset-at-cntl");
2877 Meight_bit = msymbol ("eight-bit");
2878 Mlong_form = msymbol ("long-form");
2879 Mdesignation_g0 = msymbol ("designation-g0");
2880 Mdesignation_g1 = msymbol ("designation-g1");
2881 Mdesignation_ctext = msymbol ("designation-ctext");
2882 Mdesignation_ctext_ext = msymbol ("designation-ctext-ext");
2883 Mlocking_shift = msymbol ("locking-shift");
2884 Msingle_shift = msymbol ("single-shift");
2885 Msingle_shift_7 = msymbol ("single-shift-7");
2886 Meuc_tw_shift = msymbol ("euc-tw-shift");
2887 Miso_6429 = msymbol ("iso-6429");
2888 Mrevision_number = msymbol ("revision-number");
2889 Mfull_support = msymbol ("full-support");
2890 Mmaybe = msymbol ("maybe");
2892 Mtype = msymbol ("type");
2893 Mcharsets = msymbol_as_managing_key ("charsets");
2894 Mflags = msymbol_as_managing_key ("flags");
2895 Mdesignation = msymbol_as_managing_key ("designation");
2896 Minvocation = msymbol_as_managing_key ("invocation");
2897 Mcode_unit = msymbol ("code-unit");
2898 Mbom = msymbol ("bom");
2899 Mlittle_endian = msymbol ("little-endian");
2902 charsets = mplist ();
2904 /* Setup predefined codings. */
2905 mplist_set (charsets, Msymbol, Mcharset_ascii);
2906 pl = mplist_add (pl, Mtype, Mcharset);
2907 pl = mplist_add (pl, Mcharsets, charsets);
2908 Mcoding_us_ascii = mconv_define_coding ("us-ascii", param,
2909 NULL, NULL, NULL, NULL);
/* Register the alias, both as given and in canonicalized form, for the
   us-ascii coding system.  */
2912 MSymbol alias = msymbol ("ANSI_X3.4-1968");
2913 MCodingSystem *coding
2914 = (MCodingSystem *) msymbol_get (Mcoding_us_ascii, Mcoding);
2916 msymbol_put (alias, Mcoding, coding);
2917 alias = msymbol__canonicalize (alias);
2918 msymbol_put (alias, Mcoding, coding);
2921 mplist_set (charsets, Msymbol, Mcharset_iso_8859_1);
2922 Mcoding_iso_8859_1 = mconv_define_coding ("iso-8859-1", param,
2923 NULL, NULL, NULL, NULL);
2925 mplist_set (charsets, Msymbol, Mcharset_m17n);
2926 mplist_put (param, Mtype, Mutf);
2927 mplist_put (param, Mcode_unit, (void *) 8);
2928 Mcoding_utf_8_full = mconv_define_coding ("utf-8-full", param,
2929 NULL, NULL, NULL, NULL);
2931 mplist_set (charsets, Msymbol, Mcharset_unicode);
2932 Mcoding_utf_8 = mconv_define_coding ("utf-8", param,
2933 NULL, NULL, NULL, NULL);
2935 mplist_put (param, Mcode_unit, (void *) 16);
2936 mplist_put (param, Mbom, Mmaybe);
/* Plain utf-16 and utf-32 are little-endian unless WORDS_BIGENDIAN is
   defined.  */
2937 #ifndef WORDS_BIGENDIAN
2938 mplist_put (param, Mlittle_endian, Mt);
2940 Mcoding_utf_16 = mconv_define_coding ("utf-16", param,
2941 NULL, NULL, NULL, NULL);
2943 mplist_put (param, Mcode_unit, (void *) 32);
2944 Mcoding_utf_32 = mconv_define_coding ("utf-32", param,
2945 NULL, NULL, NULL, NULL);
2947 mplist_put (param, Mcode_unit, (void *) 16);
2948 mplist_put (param, Mbom, Mnil);
2949 mplist_put (param, Mlittle_endian, Mnil);
2950 Mcoding_utf_16be = mconv_define_coding ("utf-16be", param,
2951 NULL, NULL, NULL, NULL);
2953 mplist_put (param, Mcode_unit, (void *) 32);
2954 Mcoding_utf_32be = mconv_define_coding ("utf-32be", param,
2955 NULL, NULL, NULL, NULL);
2957 mplist_put (param, Mcode_unit, (void *) 16);
2958 mplist_put (param, Mlittle_endian, Mt);
2959 Mcoding_utf_16le = mconv_define_coding ("utf-16le", param,
2960 NULL, NULL, NULL, NULL);
2962 mplist_put (param, Mcode_unit, (void *) 32);
2963 Mcoding_utf_32le = mconv_define_coding ("utf-32le", param,
2964 NULL, NULL, NULL, NULL);
/* sjis has no predefined type (Mnil) and is given its own handlers;
   only the encoder argument is visible here.  */
2966 mplist_put (param, Mtype, Mnil);
2967 mplist_set (charsets, Msymbol, Mcharset_ascii);
2968 Mcoding_sjis = mconv_define_coding ("sjis", param,
2971 encode_coding_sjis, NULL);
2973 M17N_OBJECT_UNREF (charsets);
2974 M17N_OBJECT_UNREF (param);
/* Finalize the coding module.  For each entry of coding_list, the
   extra_info and extra_spec buffers are freed when they are set.
   MLIST_FREE1 is then applied to the codings member of coding_list
   (presumably releasing the array itself), and the reference held on
   every value of coding_definition_list, and on the list itself, is
   dropped.  */
2980 mcoding__fini (void)
2985 for (i = 0; i < coding_list.used; i++)
2987 MCodingSystem *coding = coding_list.codings[i];
2989 if (coding->extra_info)
2990 free (coding->extra_info);
2991 if (coding->extra_spec)
2992 free (coding->extra_spec);
2995 MLIST_FREE1 (&coding_list, codings);
/* Each value of the definition list is itself a managed object.  */
2996 MPLIST_DO (plist, coding_definition_list)
2997 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
2998 M17N_OBJECT_UNREF (coding_definition_list);
/* Define a coding system of type Mcharset whose charset list contains
   only SYM.  The coding system is named after SYM (msymbol_name), and
   every handler argument of mconv_define_coding () is passed as NULL.  */
3002 mconv__define_coding_from_charset (MSymbol sym)
3004 MPlist *param = mplist (), *charsets = mplist ();
3006 mplist_set (charsets, Msymbol, sym);
3007 mplist_add (param, Mtype, Mcharset);
3008 mplist_add (param, Mcharsets, charsets);
3009 mconv_define_coding (msymbol_name (sym), param, NULL, NULL, NULL, NULL);
/* The temporary parameter plists are not used after the call above.  */
3010 M17N_OBJECT_UNREF (charsets);
3011 M17N_OBJECT_UNREF (param);
/* Record a definition for SYM in coding_definition_list, unless the
   list already has an entry whose key is SYM.  No coding system is
   defined here; only the parameter list is stored.  */
3015 mconv__register_charset_coding (MSymbol sym)
3017 if (! mplist_find_by_key (coding_definition_list, sym))
3019 MPlist *param = mplist (), *charsets = mplist ();
3021 mplist_set (charsets, Msymbol, sym);
/* The definition is stored as a flat sequence of typed elements
   (symbol, symbol, symbol, plist) rather than as key/value pairs;
   presumably this mirrors the form of definitions loaded from the
   database -- TODO confirm.  */
3022 mplist_add (param, Msymbol, Mtype);
3023 mplist_add (param, Msymbol, Mcharset);
3024 mplist_add (param, Msymbol, Mcharsets);
3025 mplist_add (param, Mplist, charsets);
3026 mplist_put (coding_definition_list, sym, param);
3027 M17N_OBJECT_UNREF (charsets);
/* Load coding system definitions from the "coding-list" database and
   append them to coding_definition_list.  Each element of the loaded
   list must be a plist whose first element is a symbol (the name);
   the remainder of that plist is stored under the name.  A malformed
   element is reported through MERROR with MERROR_CHARSET and -1.  */
3033 mcoding__load_from_database ()
3035 MDatabase *mdb = mdatabase_find (msymbol ("coding-list"), Mnil, Mnil, Mnil);
3036 MPlist *def_list, *plist;
3037 MPlist *definitions = coding_definition_list;
3038 int mdebug_mask = MDEBUG_CODING;
3042 MDEBUG_PUSH_TIME ();
3043 def_list = (MPlist *) mdatabase_load (mdb);
3044 MDEBUG_PRINT_TIME ("CODING", (stderr, " to load the data."));
3049 MDEBUG_PUSH_TIME ();
3050 MPLIST_DO (plist, def_list)
/* NOTE(review): the error code used below is MERROR_CHARSET although
   this is the coding module; confirm whether MERROR_CODING was
   intended.  */
3055 if (! MPLIST_PLIST_P (plist))
3056 MERROR (MERROR_CHARSET, -1);
3057 pl = MPLIST_PLIST (plist);
3058 if (! MPLIST_SYMBOL_P (pl))
3059 MERROR (MERROR_CHARSET, -1);
3060 name = MPLIST_SYMBOL (pl);
/* Store the tail that follows the name, and take a reference on it
   before def_list is unreferenced below.  */
3061 pl = MPLIST_NEXT (pl);
3062 definitions = mplist_add (definitions, name, pl);
3063 M17N_OBJECT_REF (pl);
3066 M17N_OBJECT_UNREF (def_list);
3067 MDEBUG_PRINT_TIME ("CODING", (stderr, " to parse the loaded data."));
3073 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
3077 /*** @addtogroup m17nConv */
3081 /***en @name Variables: Symbols representing a coding system */
3082 /***ja @name ÊÑ¿ô: ÄêµÁºÑ¤ß¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë¤¿¤á¤Î¥·¥ó¥Ü¥ë */
3087 @brief Symbol for the coding system US-ASCII
3089 The symbol #Mcoding_us_ascii has name <tt>"us-ascii"</tt> and
3090 represents a coding system for the CES US-ASCII. */
3093 @brief MIME charset "US-ASCII" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3095 ¥·¥ó¥Ü¥ë @c Mcoding_us_ascii ¤Ï <tt>"us-ascii"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3096 MIME charset <tt>"US-ASCII"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë¤¿¤á
3099 MSymbol Mcoding_us_ascii;
3103 @brief Symbol for the coding system ISO-8859-1
3105 The symbol #Mcoding_iso_8859_1 has name <tt>"iso-8859-1"</tt> and
3106 represents a coding system for the CES ISO-8859-1. */
3109 @brief MIME charset "ISO-8859-1" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3111 ¥·¥ó¥Ü¥ë @c Mcoding_iso_8859_1 ¤Ï <tt>"iso-8859-1"</tt> ¤È¤¤¤¦Ì¾Á°
3112 ¤ò»ý¤Á¡¢MIME charset <tt>"ISO-8859-1"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»Ø
3113 Äꤹ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£ */
3115 MSymbol Mcoding_iso_8859_1;
3119 @brief Symbol for the coding system UTF-8
3121 The symbol #Mcoding_utf_8 has name <tt>"utf-8"</tt> and represents
3122 a coding system for the CES UTF-8. */
3125 @brief RFC 2279 ¤Î "UTF-8" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë¡ÊUnicode ÍÑ¡Ë
3127 ¥·¥ó¥Ü¥ë @c Mcoding_utf_8 ¤Ï <tt>"utf-8"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3128 RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-8"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ¤ë
3129 ¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï Unicode ¤ÎÁ´¤Æ¤Îʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹¤ë¡£
3132 MSymbol Mcoding_utf_8;
3139 The symbol #Mcoding_utf_8_full has name <tt>"utf-8-full"</tt> and
3140 represents a coding system that is an extension of UTF-8. This
3141 coding system uses the same encoding algorithm as UTF-8 but is not
3142 limited to the Unicode characters. It can encode all characters
3143 supported by the m17n library. */
3146 @brief RFC 2279 ¤Î "UTF-8" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë¡ÊÁ´Ê¸»úÍÑ¡Ë
3148 ¥·¥ó¥Ü¥ë @c Mcoding_utf_8_full ¤Ï <tt>"utf-8-full"</tt> ¤È¤¤¤¦Ì¾Á°
3149 ¤ò»ý¤Á¡¢RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-8"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò
3150 »ØÄꤹ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï m17n ¥é¥¤¥Ö¥é¥ê¤¬°·¤¦Á´¤Æ¤Î
3151 ʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹¤ë¡£ */
3153 MSymbol Mcoding_utf_8_full;
3159 The symbol #Mcoding_utf_16 has name <tt>"utf-16"</tt> and
3160 represents a coding system for the CES UTF-16 (RFC 2781). */
3162 @brief RFC 2781 ¤Î "UTF-16" ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤Î¥·¥ó¥Ü¥ë
3164 ¥·¥ó¥Ü¥ë @c Mcoding_utf_16 ¤Ï <tt>"utf-16"</tt> ¤È¤¤¤¦Ì¾Á°¤ò»ý¤Á¡¢
3165 RFC 2279 ¤ÇÄêµÁ¤µ¤ì¤ë<tt>"UTF-16"</tt> ¤ËÂбþ¤¹¤ë¥³¡¼¥É·Ï¤ò»ØÄꤹ
3166 ¤ë¤¿¤á¤Ë»È¤ï¤ì¤ë¡£¤³¤Î¥³¡¼¥É·Ï¤Ï Unicode ¤ÎÁ´¤Æ¤Îʸ»ú¤ò¥µ¥Ý¡¼¥È¤¹
3169 MSymbol Mcoding_utf_16;
3175 The symbol #Mcoding_utf_16be has name <tt>"utf-16be"</tt> and
3176 represents a coding system for the CES UTF-16BE (RFC 2781). */
3178 MSymbol Mcoding_utf_16be;
3184 The symbol #Mcoding_utf_16le has name <tt>"utf-16le"</tt> and
3185 represents a coding system for the CES UTF-16LE (RFC 2781). */
3187 MSymbol Mcoding_utf_16le;
3193 The symbol #Mcoding_utf_32 has name <tt>"utf-32"</tt> and
3194 represents a coding system for the CES UTF-32 (RFC 2279). */
3196 MSymbol Mcoding_utf_32;
3202 The symbol #Mcoding_utf_32be has name <tt>"utf-32be"</tt> and
3203 represents a coding system for the CES UTF-32BE (RFC 2279). */
3205 MSymbol Mcoding_utf_32be;
3211 The symbol #Mcoding_utf_32le has name <tt>"utf-32le"</tt> and
3212 represents a coding system for the CES UTF-32LE (RFC 2279). */
3213 MSymbol Mcoding_utf_32le;
3219 The symbol #Mcoding_sjis has name <tt>"sjis"</tt> and represents a coding
3220 system for the CES Shift-JIS. */
3222 MSymbol Mcoding_sjis;
3227 @name Variables: Parameter keys for mconv_define_coding (). */
3232 Parameter key for mconv_define_coding () (which see). */
3238 MSymbol Mdesignation;
3239 MSymbol Minvocation;
3242 MSymbol Mlittle_endian;
3247 @name Variables: Symbols representing coding system type. */
3252 Symbol that can be a value of the #Mtype parameter of a coding
3253 system used in an argument to the mconv_define_coding () function
3264 @name Variables: Symbols appearing in the value of the #Mflags parameter. */
3269 Symbol that can be a value of the #Mflags parameter of a coding
3270 system used in an argument to the mconv_define_coding () function
3272 MSymbol Mreset_at_eol;
3274 MSymbol Mreset_at_cntl;
3277 MSymbol Mdesignation_g0;
3278 MSymbol Mdesignation_g1;
3279 MSymbol Mdesignation_ctext;
3280 MSymbol Mdesignation_ctext_ext;
3281 MSymbol Mlocking_shift;
3282 MSymbol Msingle_shift;
3283 MSymbol Msingle_shift_7;
3284 MSymbol Meuc_tw_shift;
3286 MSymbol Mrevision_number;
3287 MSymbol Mfull_support;
3292 @name Variables: etc
3294 Remaining variables. */
3295 /***ja @name ÊÑ¿ô: ¤½¤Î¾ */
3299 @brief Symbol whose name is "maybe".
3301 The variable #Mmaybe is a symbol of name <tt>"maybe"</tt>. It is
3302 used as a value of the #Mbom parameter of the function
3303 mconv_define_coding () (which see). */
3309 @brief The symbol @c Mcoding
3311 Any decoded M-text has a text property whose key is the predefined
3312 symbol @c Mcoding. The name of @c Mcoding is
3313 <tt>"coding"</tt>. */
3316 @brief ¥·¥ó¥Ü¥ë @c Mcoding
3318 ¥Ç¥³¡¼¥É¤µ¤ì¤¿ M-text ¤Ï¡¢¥¡¼¤¬ @c Mcoding ¤Ç¤¢¤ë¤è¤¦¤Ê¥Æ¥¥¹¥È¥×
3319 ¥í¥Ñ¥Æ¥£¤ò»ý¤Ä¡£¥·¥ó¥Ü¥ë @c Mcoding ¤Ï <tt>"coding"</tt> ¤È¤¤¤¦Ì¾
3320 Á°¤Ç¤¢¤é¤«¤¸¤áÄêµÁ¤µ¤ì¤Æ¤¤¤ë¡£ */
3328 @brief Define a coding system
3330 The mconv_define_coding () function defines a new coding system
3331 and makes it accessible via a symbol whose name is $NAME. $PLIST
3332 specifies parameters of the coding system as below:
3336 <li> Key is @c Mtype, value is a symbol
3338 The value specifies the type of the coding system. It must be
3339 #Mcharset, #Mutf, #Miso_2022, or #Mnil.
3341 If the type is #Mcharset, $EXTRA_INFO is ignored.
3343 If the type is #Miso_2022, $EXTRA_INFO must be a pointer to
3344 #MCodingInfoISO2022.
3346 If the type is #Mutf, $EXTRA_INFO must be a pointer to
3349 If the type is #Mnil, the argument $RESETTER, $DECODER, and
3350 $ENCODER must be supplied. $EXTRA_INFO is ignored. Otherwise,
3351 they can be @c NULL and the m17n library provides proper defaults.
3353 <li> Key is #Mcharsets, value is a plist
3355 The value specifies a list of charsets supported by the coding
3356 system. The keys of the plist must be #Msymbol, and the values
3357 must be symbols representing charsets.
3359 <li> Key is #Mflags, value is a plist
3361 If the type is #Miso_2022, the value specifies flags to control
3362 the ISO 2022 interpreter. The keys of the plist must be @c
3363 Msymbol, and the values must be one of the following.
3369 If this flag exists, designation and invocation status is reset to
3370 the initial state at the end of line.
3372 <li> #Mreset_at_cntl
3374 If this flag exists, designation and invocation status is reset to
3375 the initial state at a control character.
3379 If this flag exists, the graphic plane right is used.
3383 If this flag exists, the over-long escape sequences (ESC '$' '('
3384 <final_byte>) are used for designating the charsets JISX0208.1978,
3385 GB2312, and JISX0208.
3387 <li> #Mdesignation_g0
3389 If this flag and #Mfull_support exist, designates charsets not
3390 listed in the charset list to the graphic register G0.
3392 <li> #Mdesignation_g1
3394 If this flag and #Mfull_support exist, designates charsets not
3395 listed in the charset list to the graphic register G1.
3397 <li> #Mdesignation_ctext
3399 If this flag and #Mfull_support exist, designates charsets not
3400 listed in the charset list to a graphic register G0 or G1 based on
3401 the criteria of the Compound Text.
3403 <li> #Mdesignation_ctext_ext
3405 If this flag and #Mfull_support exist, designates charsets not
3406 listed in the charset list to a graphic register G0 or G1, or use
3407 extended segment for such charsets based on the criteria of the
3410 <li> #Mlocking_shift
3412 If this flag exists, use locking shift.
3416 If this flag exists, use single shift.
3418 <li> #Msingle_shift_7
3420 If this flag exists, use 7-bit single shift code (0x19).
3422 <li> #Meuc_tw_shift
3424 If this flag exists, use a special shifting according to EUC-TW.
3428 This flag is currently ignored.
3430 <li> #Mrevision_number
3432 If this flag exists, use a revision number escape sequence to
3433 designate a charset that has a revision number.
3437 If this flag exists, support all charsets registered in the
3438 International Registry.
3442 <li> Key is #Mdesignation, value is a plist
3444 If the type is #Miso_2022, the value specifies how to designate
3445 each supported charset. The keys of the plist must be @c
3446 Minteger, and the values must be numbers indicating a graphic
3447 register. The Nth element value is for the Nth charset of the
3448 charset list. The value 0..3 means that it is assumed that a
3449 charset is already designated to the graphic register 0..3. The
3450 negative value G (-4..-1) means that a charset is not designated
3451 to any register at first, and if necessary, is designated to the
3452 (G+4) graphic register.
3454 <li> Key is #Minvocation, value is a plist
3456 If the type is #Miso_2022, the value specifies how to invoke
3457 each graphic register. The plist length must be one or two. The
3458 keys of the plist must be #Minteger, and the values must be
3459 numbers indicating a graphic register. The value of the first
3460 element specifies which graphic register is invoked to the
3461 graphic plane left. If the length is one, no graphic register is
3462 invoked to the graphic plane right. Otherwise, the value of the
3463 second element specifies which graphic register is invoked to
3464 the graphic plane right.
3466 <li> Key is #Mcode_unit, value is an integer
3468 If the type is #Mutf, the value specifies the bit length of a
3469 code-unit. It must be 8, 16, or 32.
3471 <li> Key is #Mbom, value is a symbol
3473 If the type is #Mutf and the code-unit bit length is 16 or 32,
3474 it specifies whether or not to use BOM (Byte Order Mark). If the
3475 value is #Mnil (default), BOM is not used, else if the value is
3476 #Mmaybe, the existence of BOM is detected at decoding time, else
3479 <li> Key is #Mlittle_endian, value is a symbol
3481 If the type is #Mutf and the code-unit bit length is 16 or 32,
3482 it specifies whether or not the encoding is little endian. If the
3483 value is #Mnil (default), it is big endian, else it is little
3488 $RESETTER is a pointer to a function that resets a converter for
3489 the coding system to the initial status. The pointed function is
3490 called with one argument, a pointer to a converter object.
3492 $DECODER is a pointer to a function that decodes a byte sequence
3493 according to the coding system. The pointed function is called
3494 with four arguments:
3496 @li A pointer to the byte sequence to decode.
3497 @li The number of bytes to decode.
3498 @li A pointer to an M-text to which the decoded characters are appended.
3499 @li A pointer to a converter object.
3501 $DECODER must return 0 if it succeeds. Otherwise it must return -1.
3503 $ENCODER is a pointer to a function that encodes an M-text
3504 according to the coding system. The pointed function is called
3507 @li A pointer to the M-text to encode.
3508 @li The starting position of the encoding.
3509 @li The ending position of the encoding.
3510 @li A pointer to a memory area where the produced bytes are stored.
3511 @li The size of the memory area.
3512 @li A pointer to a converter object.
3514 $ENCODER must return 0 if it succeeds. Otherwise it must return -1.
3516 $EXTRA_INFO is a pointer to a data structure that contains extra
3517 information about the coding system. The type of the data
3518 structure depends on the value of the #Mtype parameter.
3522 If the operation was successful, mconv_define_coding () returns a
3523 symbol whose name is $NAME. If an error is detected, it returns
3524 #Mnil and assigns an error code to the external variable @c
3528 @brief ¥³¡¼¥É·Ï¤ÎÄêµÁ
3530 ´Ø¿ô mconv_define_coding () ¤Ï¡¢¿·¤·¤¤¥³¡¼¥É·Ï¤òÄêµÁ¤·¡¢¤½¤ì¤ò
3531 $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·¥ó¥Ü¥ë·Ðͳ¤Ç¥¢¥¯¥»¥¹¤Ç¤¤ë¤è¤¦¤Ë¤¹¤ë¡£
3533 $TYPE ¤Ï Îóµó·¿ #MCodingType ¤Î¤¤¤º¤ì¤«¤Ç¤¢¤ê¡¢¥³¡¼¥É·Ï¤Î¹½Â¤¤ò
3536 $CHARSET_NAMES ¤Ï¥µ¥Ý¡¼¥È¤¹¤ëʸ»ú¥»¥Ã¥È¤òɽ¤ï¤¹¥·¥ó¥Ü¥ë¤ÎÇÛÎó¤Ç¤¢¤ê¡¢
3537 $NCHARSETS ¤Ï¤½¤ÎÍ×ÁÇ¿ô¤Ç¤¢¤ë¡£
3539 $TYPE ¤¬ #MCODING_TYPE_MISC ¤Ç¤¢¤ë¾ì¹ç¤Ë¤Ï¡¢$RESETTER, $DECODER,
3540 $ENCODER ¤òÍ¿¤¨¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£¤½¤ì°Ê³°¤Î¾ì¹ç¤Ë¤Ï¤³¤ì¤é¤Ï @c
3541 NULL ¤Ç¹½¤ï¤Ê¤¤¡£¤½¤ÎºÝ¤Ë¤Ï m17n ¥é¥¤¥Ö¥é¥ê¤¬Å¬Àڤʥǥե©¥ë¥ÈÃͤò
3544 $RESETTER ¤Ï¤³¤Î¥³¡¼¥É·ÏÍѤΥ³¥ó¥Ð¡¼¥¿¤ò½é´ü¾õÂ֤˥ꥻ¥Ã¥È¤¹¤ë´Ø¿ô
3545 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿¤È
3548 $DECODER ¤Ï¥Ð¥¤¥ÈÎó¤ò¤³¤Î¥³¡¼¥É·Ï¤Ë½¾¤Ã¤Æ¥Ç¥³¡¼¥É¤¹¤ë´Ø¿ô¤Ø¤Î¥Ý¥¤
3549 ¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï°Ê²¼¤Î4°ú¿ô¤ò¤È¤ë¡£
3551 @li ¥Ð¥¤¥ÈÎó¤Ø¤Î¥Ý¥¤¥ó¥¿
3552 @li ¥Ç¥³¡¼¥É¤¹¤Ù¤¥Ð¥¤¥È¿ô
3553 @li ¥Ç¥³¡¼¥É·ë²Ì¤Îʸ»ú¤òÉղ乤ë M-text ¤Ø¤Î¥Ý¥¤¥ó¥¿
3554 @li ¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿
3556 $DECODER ¤ÏÀ®¸ù¤·¤¿¤È¤¤Ë¤Ï0¤ò¡¢¼ºÇÔ¤·¤¿¤È¤¤Ë¤Ï-1¤òÊÖ¤µ¤Ê¤¯¤Æ¤Ï¤Ê
3559 $ENCODER ¤Ï M-text ¤ò¤³¤Î¥³¡¼¥É·Ï¤Ë½¾¤Ã¤Æ¥¨¥ó¥³¡¼¥É¤¹
3560 ¤ë´Ø¿ô¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î´Ø¿ô¤Ï°Ê²¼¤Î6°ú¿ô¤ò¤È¤ë¡£
3562 @li M-text ¤Ø¤Î¥Ý¥¤¥ó¥¿
3563 @li M-text ¤Î¥¨¥ó¥³¡¼¥É³«»Ï°ÌÃÖ
3564 @li M-text ¤Î¥¨¥ó¥³¡¼¥É½ªÎ»°ÌÃÖ
3565 @li À¸À®¤·¤¿¥Ð¥¤¥È¤òÊÝ»ý¤¹¤ë¥á¥â¥êÎΰè¤Ø¤Î¥Ý¥¤¥ó¥¿
3566 @li ¥á¥â¥êÎΰè¤Î¥µ¥¤¥º
3567 @li ¥³¥ó¥Ð¡¼¥¿¥ª¥Ö¥¸¥§¥¯¥È¤Ø¤Î¥Ý¥¤¥ó¥¿
3569 $ENCODER ¤ÏÀ®¸ù¤·¤¿¤È¤¤Ë¤Ï0¤ò¡¢¼ºÇÔ¤·¤¿¤È¤¤Ë¤Ï-1¤òÊÖ¤µ¤Ê¤¯¤Æ¤Ï¤Ê
3572 $EXTRA_INFO ¤Ï¥³¡¼¥Ç¥£¥°¥·¥¹¥Æ¥à¤Ë´Ø¤¹¤ëÄɲþðÊó¤ò´Þ¤à¥Ç¡¼¥¿¹½Â¤¤Ø
3573 ¤Î¥Ý¥¤¥ó¥¿¤Ç¤¢¤ë¡£¤³¤Î¥Ç¡¼¥¿¹½Â¤¤Î¥¿¥¤¥×¤Ï $TYPE ¤Ë°Í¸¤¹¤ë¡£
3575 $TYPE ¤¬ #MCODING_TYPE_ISO_2022 ¤Ç¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï @c
3576 MCodingInfoISO2022 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
3578 $TYPE ¤¬ #MCODING_TYPE_UTF ¤Ç¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï @c
3579 MCodingInfoUTF ¤Ø¤Î¥Ý¥¤¥ó¥¿¤Ç¤Ê¤¯¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£
3581 $TYPE ¤¬ #MCODING_TYPE_CHARSET, #MCODING_TYPE_MISC ¤Î¤É¤ì¤«¤Ç
3582 ¤¢¤ì¤Ð¡¢$EXTRA_INFO ¤Ï̵»ë¤µ¤ì¤ë¡£
3586 ½èÍý¤ËÀ®¸ù¤¹¤ì¤Ð mconv_define_coding () ¤Ï $NAME ¤È¤¤¤¦Ì¾Á°¤Î¥·
3587 ¥ó¥Ü¥ë¤òÊÖ¤¹¡£¤³¤Î¥·¥ó¥Ü¥ë¤Ï¡¢¥¡¼¤¬ $Mcoding ¤Ç¡¢ºî¤é¤ì¤¿¥³¡¼¥É·Ï
3588 ¤Ø¤Î¥Ý¥¤¥ó¥¿¤òÃͤȤ¹¤ë¥·¥ó¥Ü¥ë¥×¥í¥Ñ¥Æ¥£¤ò»ý¤Ä¡£ ¥¨¥é¡¼¤¬¸¡½Ð¤µ¤ì
3589 ¤¿¾ì¹ç¤Ï Mnil ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
/* Create a coding system object from the parameters in PLIST, fill in
   default handlers according to its type, and register it under the
   symbol named NAME, under the canonicalized form of that symbol, and
   under every alias listed with the Maliases key.
   NOTE(review): the MERROR exits taken after `coding' (and later
   `info') has been allocated do not visibly release that memory;
   confirm whether these paths leak.  */
3597 mconv_define_coding (char *name, MPlist *plist,
3598 int (*resetter) (MConverter *),
3599 int (*decoder) (unsigned char *, int, MText *,
3601 int (*encoder) (MText *, int, int,
3602 unsigned char *, int,
3606 MSymbol sym = msymbol (name);
3608 MCodingSystem *coding;
3611 MSTRUCT_MALLOC (coding, MERROR_CODING);
/* A type of Mnil (including an absent Mtype entry, presumably) is
   treated as Mcharset.  */
3613 if ((coding->type = (MSymbol) mplist_get (plist, Mtype)) == Mnil)
3614 coding->type = Mcharset;
3615 pl = (MPlist *) mplist_get (plist, Mcharsets);
3617 MERROR (MERROR_CODING, Mnil);
/* Charsets beyond NUM_SUPPORTED_CHARSETS are silently ignored.  */
3618 coding->ncharsets = mplist_length (pl);
3619 if (coding->ncharsets > NUM_SUPPORTED_CHARSETS)
3620 coding->ncharsets = NUM_SUPPORTED_CHARSETS;
/* Every element of the charset list must be a symbol that MCHARSET
   can resolve.  */
3621 for (i = 0; i < coding->ncharsets; i++, pl = MPLIST_NEXT (pl))
3623 MSymbol charset_name;
3625 if (MPLIST_KEY (pl) != Msymbol)
3626 MERROR (MERROR_CODING, Mnil);
3627 charset_name = MPLIST_SYMBOL (pl);
3628 if (! (coding->charsets[i] = MCHARSET (charset_name)))
3629 MERROR (MERROR_CODING, Mnil);
/* Start from the caller-supplied handlers; the type-specific branches
   below only fill in the ones left NULL.  */
3632 coding->resetter = resetter;
3633 coding->decoder = decoder;
3634 coding->encoder = encoder;
3635 coding->ascii_compatible = 0;
3636 coding->extra_info = extra_info;
3637 coding->extra_spec = NULL;
3640 if (coding->type == Mcharset)
3642 if (! coding->resetter)
3643 coding->resetter = reset_coding_charset;
3644 if (! coding->decoder)
3645 coding->decoder = decode_coding_charset;
3646 if (! coding->encoder)
3647 coding->encoder = encode_coding_charset;
3649 else if (coding->type == Mutf)
/* NOTE(review): the malloc result is not visibly checked before it is
   dereferenced below -- confirm.  */
3651 MCodingInfoUTF *info = malloc (sizeof (MCodingInfoUTF));
3654 if (! coding->resetter)
3655 coding->resetter = reset_coding_utf;
/* The code-unit width is stored in the plist as a pointer-sized value
   and narrowed to int here.  */
3657 info->code_unit_bits = (int) mplist_get (plist, Mcode_unit);
3658 if (info->code_unit_bits == 8)
3660 if (! coding->decoder)
3661 coding->decoder = decode_coding_utf_8;
3662 if (! coding->encoder)
3663 coding->encoder = encode_coding_utf_8;
3665 else if (info->code_unit_bits == 16)
3667 if (! coding->decoder)
3668 coding->decoder = decode_coding_utf_16;
3669 if (! coding->encoder)
3670 coding->encoder = encode_coding_utf_16;
3672 else if (info->code_unit_bits == 32)
3674 if (! coding->decoder)
3675 coding->decoder = decode_coding_utf_32;
3676 if (! coding->encoder)
3677 coding->encoder = encode_coding_utf_32;
3680 MERROR (MERROR_CODING, Mnil);
3681 val = (MSymbol) mplist_get (plist, Mbom);
3684 else if (val == Mmaybe)
/* Any non-NULL Mlittle_endian value selects little endian (1).  */
3689 info->endian = (mplist_get (plist, Mlittle_endian) ? 1 : 0);
/* The extra_info argument stored earlier is replaced by the structure
   built from PLIST.  */
3690 coding->extra_info = info;
3692 else if (coding->type == Miso_2022)
/* NOTE(review): the malloc result is not visibly checked before it is
   dereferenced below -- confirm.  */
3694 MCodingInfoISO2022 *info = malloc (sizeof (MCodingInfoISO2022));
3696 if (! coding->resetter)
3697 coding->resetter = reset_coding_iso_2022;
3698 if (! coding->decoder)
3699 coding->decoder = decode_coding_iso_2022;
3700 if (! coding->encoder)
3701 coding->encoder = encode_coding_iso_2022;
/* Defaults before Minvocation is consulted: 0 for the first slot and
   -1 for the second.  */
3703 info->initial_invocation[0] = 0;
3704 info->initial_invocation[1] = -1;
3705 pl = (MPlist *) mplist_get (plist, Minvocation);
3708 if (MPLIST_KEY (pl) != Minteger)
3709 MERROR (MERROR_CODING, Mnil);
3710 info->initial_invocation[0] = MPLIST_INTEGER (pl);
3711 if (! MPLIST_TAIL_P (pl))
3713 pl = MPLIST_NEXT (pl);
3714 if (MPLIST_KEY (pl) != Minteger)
3715 MERROR (MERROR_CODING, Mnil);
3716 info->initial_invocation[1] = MPLIST_INTEGER (pl);
/* Copy at most 32 integer designations; the remaining slots stay 0.  */
3719 memset (info->designations, 0, sizeof (info->designations));
3720 for (i = 0, pl = (MPlist *) mplist_get (plist, Mdesignation);
3721 i < 32 && pl && MPLIST_KEY (pl) == Minteger;
3722 i++, pl = MPLIST_NEXT (pl))
3723 info->designations[i] = MPLIST_INTEGER (pl);
/* Translate each flag symbol into its bit in info->flags.  Symbols
   matching none of the cases below are ignored.  */
3726 MPLIST_DO (pl, (MPlist *) mplist_get (plist, Mflags))
3730 if (MPLIST_KEY (pl) != Msymbol)
3731 MERROR (MERROR_CODING, Mnil);
3732 val = MPLIST_SYMBOL (pl);
3733 if (val == Mreset_at_eol)
3734 info->flags |= MCODING_ISO_RESET_AT_EOL;
3735 else if (val == Mreset_at_cntl)
3736 info->flags |= MCODING_ISO_RESET_AT_CNTL;
3737 else if (val == Meight_bit)
3738 info->flags |= MCODING_ISO_EIGHT_BIT;
/* NOTE(review): Mlong_form sets MCODING_ISO_LOCKING_SHIFT, the same bit
   that Mlocking_shift sets further down; a dedicated long-form flag
   was probably intended -- confirm against the flag definitions.  */
3739 else if (val == Mlong_form)
3740 info->flags |= MCODING_ISO_LOCKING_SHIFT;
3741 else if (val == Mdesignation_g0)
3742 info->flags |= MCODING_ISO_DESIGNATION_G0;
3743 else if (val == Mdesignation_g1)
3744 info->flags |= MCODING_ISO_DESIGNATION_G1;
3745 else if (val == Mdesignation_ctext)
3746 info->flags |= MCODING_ISO_DESIGNATION_CTEXT;
3747 else if (val == Mdesignation_ctext_ext)
3748 info->flags |= MCODING_ISO_DESIGNATION_CTEXT_EXT;
3749 else if (val == Mlocking_shift)
3750 info->flags |= MCODING_ISO_LOCKING_SHIFT;
3751 else if (val == Msingle_shift)
3752 info->flags |= MCODING_ISO_SINGLE_SHIFT;
3753 else if (val == Msingle_shift_7)
3754 info->flags |= MCODING_ISO_SINGLE_SHIFT_7;
3755 else if (val == Meuc_tw_shift)
3756 info->flags |= MCODING_ISO_EUC_TW_SHIFT;
3757 else if (val == Miso_6429)
3758 info->flags |= MCODING_ISO_ISO6429;
3759 else if (val == Mrevision_number)
3760 info->flags |= MCODING_ISO_REVISION_NUMBER;
3761 else if (val == Mfull_support)
3762 info->flags |= MCODING_ISO_FULL_SUPPORT;
/* As in the Mutf case, the extra_info argument is replaced by the
   structure built from PLIST.  */
3765 coding->extra_info = info;
/* A decoder and an encoder are mandatory here.  */
3769 if (! coding->decoder || ! coding->encoder)
3770 MERROR (MERROR_CODING, Mnil);
3771 if (! coding->resetter)
/* Make the coding system reachable from the symbol and from its
   canonicalized form.  */
3775 msymbol_put (sym, Mcoding, coding);
3776 msymbol_put (msymbol__canonicalize (sym), Mcoding, coding);
/* Aliases get the same treatment as the primary name.  */
3777 plist = (MPlist *) mplist_get (plist, Maliases);
3780 MPLIST_DO (pl, plist)
3784 if (MPLIST_KEY (pl) != Msymbol)
3786 alias = MPLIST_SYMBOL (pl);
3787 msymbol_put (alias, Mcoding, coding);
3788 msymbol_put (msymbol__canonicalize (alias), Mcoding, coding);
/* Record the coding system in the module-wide coding_list.  */
3792 MLIST_APPEND1 (&coding_list, codings, coding, MERROR_CODING);
3800 @brief Resolve coding system name.
3802 The mconv_resolve_coding () function returns $SYMBOL if it
3803 represents a coding system. Otherwise, it canonicalizes $SYMBOL as
3804 a coding system name, and if the canonicalized name represents a
3805 coding system, returns it. Otherwise, it returns #Mnil. */
/* Look up SYMBOL with find_coding (), retrying with the canonicalized
   symbol.  The result is the name stored in the coding system that
   was found (coding->name), or Mnil when the lookup fails.  */
3809 mconv_resolve_coding (MSymbol symbol)
3811 MCodingSystem *coding = find_coding (symbol);
/* Second attempt, using the canonicalized form of the name.  */
3815 symbol = msymbol__canonicalize (symbol);
3816 coding = find_coding (symbol);
3818 return (coding ? coding->name : Mnil);
3825 @brief List symbols representing a coding system.
3827 The mconv_list_codings () function makes an array of symbols
3828 representing a coding system, stores the pointer to the array in a
3829 place pointed to by $SYMBOLS, and returns the length of the array. */
/* Store in *SYMBOLS an array of coding system names: first every key
   of coding_definition_list, then the name of each entry of
   coding_list that has no entry in coding_definition_list.  */
3832 mconv_list_codings (MSymbol **symbols)
/* Upper bound on the number of names: both lists together.  Entries
   present in both lists are stored only once, so fewer slots may end
   up being filled.  */
3834 int i = coding_list.used + mplist_length (coding_definition_list);
3838 MTABLE_MALLOC ((*symbols), i, MERROR_CODING);
3840 MPLIST_DO (plist, coding_definition_list)
3841 (*symbols)[i++] = MPLIST_KEY (plist);
/* Skip names already taken from coding_definition_list.  */
3842 for (j = 0; j < coding_list.used; j++)
3843 if (! mplist_find_by_key (coding_definition_list,
3844 coding_list.codings[j]->name))
3845 (*symbols)[i++] = coding_list.codings[j]->name;
3852 @brief Create a code converter bound to a buffer.
3854 The mconv_buffer_converter () function creates a pointer to a code
3855 converter for coding system $CODING. The code converter is bound
3856 to buffer area of $N bytes pointed to by $BUF. Subsequent
3857 decodings and encodings are done to/from this buffer area.
3859 $CODING can be #Mnil. In this case, a coding system associated
3860 with the current locale (LC_CTYPE) is used.
3863 If the operation was successful, mconv_buffer_converter () returns
3864 the created code converter. Otherwise it returns @c NULL and
3865 assigns an error code to the external variable #merror_code. */
3868 @brief ¥Ð¥Ã¥Õ¥¡¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤òºî¤ë
3870 ´Ø¿ô mconv_buffer_converter () ¤Ï¡¢¥³¡¼¥É·Ï $CODING ÍѤΥ³¡¼¥É¥³¥ó
3871 ¥Ð¡¼¥¿¤òºî¤ë¡£¤³¤Î¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ï¡¢$BUF ¤Ç¼¨¤µ¤ì¤ëÂ礤µ $N ¥Ð
3872 ¥¤¥È¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó
3873 ¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¡£
3875 $CODING ¤Ï #Mnil ¤Ç¤¢¤Ã¤Æ¤â¤è¤¤¡£¤³¤Î¾ì¹ç¤Ï¸½ºß¤Î¥í¥±¡¼¥ë
3876 (LC_CTYPE) ¤Ë´ØÏ¢ÉÕ¤±¤é¤ì¤¿¥³¡¼¥É·Ï¤¬»È¤ï¤ì¤ë¡£
3879 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð mconv_buffer_converter () ¤Ï ºî¤é¤ì¤¿¥³¡¼¥É¥³
3880 ¥ó¥Ð¡¼¥¿¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code
3881 ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3883 @latexonly \IPAlabel{mconverter} @endlatexonly */
3887 @c MERROR_SYMBOL, @c MERROR_CODING
3890 mconv_stream_converter () */
/* Allocate a converter for the coding system NAME and bind it to the
   N-byte buffer BUF.  An unknown coding system or a failing resetter
   is reported through MERROR with MERROR_CODING and NULL.  */
3893 mconv_buffer_converter (MSymbol name, unsigned char *buf, int n)
3895 MCodingSystem *coding;
3896 MConverter *converter;
3897 MConverterStatus *internal;
/* Use the coding system associated with the LC_CTYPE locale.  */
3900 name = mlocale_get_prop (mlocale__ctype, Mcoding);
3901 coding = find_coding (name);
3903 MERROR (MERROR_CODING, NULL);
3904 MSTRUCT_CALLOC (converter, MERROR_CODING);
3905 MSTRUCT_CALLOC (internal, MERROR_CODING);
3906 converter->internal_info = internal;
3907 internal->coding = coding;
/* Run the coding system's resetter, when it has one; a negative
   result is treated as an error.  */
3908 if (coding->resetter
3909 && (*coding->resetter) (converter) < 0)
3913 MERROR (MERROR_CODING, NULL);
/* Working M-texts of the converter; work_mt is enlarged by
   MAX_UTF8_CHAR_BYTES up front.  */
3916 internal->unread = mtext ();
3917 internal->work_mt = mtext ();
3918 mtext__enlarge (internal->work_mt, MAX_UTF8_CHAR_BYTES);
3919 internal->buf = buf;
3921 internal->bufsize = n;
3922 internal->binding = BINDING_BUFFER;
3930 @brief Create a code converter bound to a stream.
3932 The mconv_stream_converter () function creates a pointer to a code
3933 converter for coding system $CODING. The code converter is bound
3934 to stream $FP. Subsequent decodings and encodings are done
3935 to/from this stream.
3937 $CODING can be #Mnil. In this case, a coding system associated
3938 with the current locale (LC_CTYPE) is used.
3940 @return If the operation was successful, mconv_stream_converter ()
3941 returns the created code converter. Otherwise it returns @c NULL
3942 and assigns an error code to the external variable @c
3946 @brief ¥¹¥È¥ê¡¼¥à¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤òºî¤ë
3948 ´Ø¿ô mconv_stream_converter () ¤Ï¡¢¥³¡¼¥É·Ï $CODING ÍѤΥ³¡¼¥É¥³¥ó
3949 ¥Ð¡¼¥¿¤òºî¤ë¡£¤³¤Î¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ï¡¢¥¹¥È¥ê¡¼¥à $FP ¤Ë·ë¤ÓÉÕ¤±¤é
3950 ¤ì¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¥¹¥È¥ê¡¼¥à¤ËÂФ·¤Æ
3953 $CODING ¤Ï #Mnil ¤Ç¤¢¤Ã¤Æ¤â¤è¤¤¡£¤³¤Î¾ì¹ç¤Ï¸½ºß¤Î¥í¥±¡¼¥ë
3954 (LC_CTYPE) ¤Ë´ØÏ¢ÉÕ¤±¤é¤ì¤¿¥³¡¼¥É·Ï¤¬»È¤ï¤ì¤ë¡£
3957 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_stream_converter () ¤Ïºî¤é¤ì¤¿¥³¡¼¥É¥³
3958 ¥ó¥Ð¡¼¥¿¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code
3959 ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£
3961 @latexonly \IPAlabel{mconverter} @endlatexonly */
3965 @c MERROR_SYMBOL, @c MERROR_CODING
3968 mconv_buffer_converter () */
/* Allocate a converter for the coding system NAME and bind it to the
   stream FP.  An unknown coding system or a failing resetter is
   reported through MERROR with MERROR_CODING and NULL.  */
3971 mconv_stream_converter (MSymbol name, FILE *fp)
3973 MCodingSystem *coding;
3974 MConverter *converter;
3975 MConverterStatus *internal;
/* Use the coding system associated with the LC_CTYPE locale.  */
3978 name = mlocale_get_prop (mlocale__ctype, Mcoding);
3979 coding = find_coding (name);
3981 MERROR (MERROR_CODING, NULL);
3982 MSTRUCT_CALLOC (converter, MERROR_CODING);
3983 MSTRUCT_CALLOC (internal, MERROR_CODING);
3984 converter->internal_info = internal;
3985 internal->coding = coding;
/* Run the coding system's resetter, when it has one; a negative
   result is treated as an error.  */
3986 if (coding->resetter
3987 && (*coding->resetter) (converter) < 0)
3991 MERROR (MERROR_CODING, NULL);
/* A zero-offset fseek is used to probe whether FP can seek; the
   outcome is remembered in internal->seekable.  */
3994 if (fseek (fp, 0, SEEK_CUR) < 0)
4002 internal->seekable = 0;
4005 internal->seekable = 1;
/* Working M-texts of the converter; work_mt is enlarged by
   MAX_UTF8_CHAR_BYTES up front.  */
4006 internal->unread = mtext ();
4007 internal->work_mt = mtext ();
4008 mtext__enlarge (internal->work_mt, MAX_UTF8_CHAR_BYTES);
4010 internal->binding = BINDING_STREAM;
4018 @brief Reset a code converter.
4020 The mconv_reset_converter () function resets code converter
4021 $CONVERTER to the initial state.
4024 If $CONVERTER->coding has its own resetter function,
4025 mconv_reset_converter () returns the result of that function
4026 applied to $CONVERTER. Otherwise it returns 0. */
4029 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò¥ê¥»¥Ã¥È¤¹¤ë
4031 ´Ø¿ô mconv_reset_converter () ¤Ï¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤ò½é´ü
4035 ¤â¤· $CONVERTER->coding ¤Ë¥ê¥»¥Ã¥ÈÍѤδؿô¤¬ÄêµÁ¤µ¤ì¤Æ¤¤¤ë¤Ê¤é¤Ð¡¢
4036 mconv_reset_converter () ¤Ï¤½¤Î´Ø¿ô¤Ë $CONVERTER ¤òŬÍѤ·¤¿·ë²Ì¤ò
4037 ÊÖ¤·¡¢¤½¤¦¤Ç¤Ê¤±¤ì¤Ð0¤òÊÖ¤¹¡£ */
/* Put CONVERTER back into its initial state: clear the character and
   byte counters, set the result to MCONVERSION_RESULT_SUCCESS, forget
   carried-over bytes, and reset the unread M-text.  When the coding
   system has a resetter, its return value is returned.  */
4040 mconv_reset_converter (MConverter *converter)
4042 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
4044 converter->nchars = converter->nbytes = 0;
4045 converter->result = MCONVERSION_RESULT_SUCCESS;
4046 internal->carryover_bytes = 0;
4047 mtext_reset (internal->unread);
/* Let the coding system reset its own state last.  */
4048 if (internal->coding->resetter)
4049 return (*internal->coding->resetter) (converter);
4056 @brief Free a code converter.
4058 The mconv_free_converter () function frees the code converter
4062 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò²òÊü¤¹¤ë
4064 ´Ø¿ô mconv_free_converter () ¤Ï¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤ò²òÊü
/* Release the resources of CONVERTER.  The references held on the two
   working M-texts, work_mt and unread, are dropped here.  */
4068 mconv_free_converter (MConverter *converter)
4070 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
4072 M17N_OBJECT_UNREF (internal->work_mt);
4073 M17N_OBJECT_UNREF (internal->unread);
4081 @brief Bind a buffer to a code converter.
4083 The mconv_rebind_buffer () function binds buffer area of $N bytes
4084 pointed to by $BUF to code converter $CONVERTER. Subsequent
4085 decodings and encodings are done to/from this newly bound buffer
4089 This function always returns $CONVERTER. */
4092 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ë¥Ð¥Ã¥Õ¥¡Îΰè¤ò·ë¤ÓÉÕ¤±¤ë
4094 ´Ø¿ô mconv_rebind_buffer () ¤Ï¡¢$BUF ¤Ë¤è¤Ã¤Æ»Ø¤µ¤ì¤¿Â礤µ $N ¥Ð
4095 ¥¤¥È¤Î¥Ð¥Ã¥Õ¥¡Îΰè¤ò¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤Ë·ë¤ÓÉÕ¤±¤ë¡£¤³¤ì
4096 °Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢¤³¤Î¿·¤¿¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥Ð¥Ã¥Õ¥¡
4097 Îΰè¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¤è¤¦¤Ë¤Ê¤ë¡£
4100 ¤³¤Î´Ø¿ô¤Ï¾ï¤Ë $CONVERTER ¤òÊÖ¤¹¡£
4102 @latexonly \IPAlabel{mconv_rebind_buffer} @endlatexonly */
4106 mconv_rebind_stream () */
/* Bind CONVERTER to the N-byte buffer BUF: the internal status records
   the buffer pointer and its size, and the binding kind is set to
   BINDING_BUFFER.  */
4109 mconv_rebind_buffer (MConverter *converter, unsigned char *buf, int n)
4111 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
4113 internal->buf = buf;
4115 internal->bufsize = n;
4116 internal->binding = BINDING_BUFFER;
4123 @brief Bind a stream to a code converter.
4125 The mconv_rebind_stream () function binds stream $FP to code
4126 converter $CONVERTER. Following decodings and encodings are done
4127 to/from this newly bound stream.
4130 This function always returns $CONVERTER. */
4133 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ë¥¹¥È¥ê¡¼¥à¤ò·ë¤ÓÉÕ¤±¤ë
4135 ´Ø¿ô mconv_rebind_stream () ¤Ï¡¢¥¹¥È¥ê¡¼¥à $FP ¤ò¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿
4136 $CONVERTER ¤Ë·ë¤ÓÉÕ¤±¤ë¡£¤³¤ì°Ê¹ß¤Î¥Ç¥³¡¼¥É¤ª¤è¤Ó¥¨¥ó¥³¡¼¥É¤Ï¡¢
4137 ¤³¤Î¿·¤¿¤Ë·ë¤ÓÉÕ¤±¤é¤ì¤¿¥¹¥È¥ê¡¼¥à¤ËÂФ·¤Æ¹Ô¤Ê¤ï¤ì¤ë¤è¤¦¤Ë¤Ê¤ë¡£
4140 ¤³¤Î´Ø¿ô¤Ï¾ï¤Ë $CONVERTER ¤òÊÖ¤¹¡£
4142 @latexonly \IPAlabel{mconv_rebind_stream} @endlatexonly */
4146 mconv_rebind_buffer () */
/* Bind CONVERTER to the stream FP and set the binding kind to
   BINDING_STREAM.  */
4149 mconv_rebind_stream (MConverter *converter, FILE *fp)
4151 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* A zero-offset fseek is used to probe whether FP can seek; the
   outcome is remembered in internal->seekable.  */
4153 if (fseek (fp, 0, SEEK_CUR) < 0)
4157 internal->seekable = 0;
4160 internal->seekable = 1;
4162 internal->binding = BINDING_STREAM;
4169 @brief Decode a byte sequence into an M-text.
4171 The mconv_decode () function decodes a byte sequence and appends
4172 the result at the end of M-text $MT. The source byte sequence is
4173 taken from the currently bound buffer area or stream.
4176 If the operation was successful, mconv_decode () returns updated
4177 $MT. Otherwise it returns @c NULL and assigns an error code to
4178 the external variable #merror_code. */
4181 @brief ¥Ð¥¤¥ÈÎó¤ò M-text ¤Ë¥Ç¥³¡¼¥É¤¹¤ë
4183 ´Ø¿ô mconv_decode () ¤Ï¡¢¥Ð¥¤¥ÈÎó¤ò¥Ç¥³¡¼¥É¤·¤Æ¤½¤Î·ë²Ì¤ò M-text
4184 $MT ¤ÎËöÈø¤ËÄɲ乤롣¥Ç¥³¡¼¥É¸µ¤Î¥Ð¥¤¥ÈÎó¤Ï¡¢¸½ºß·ë¤ÓÉÕ¤±¤é¤ì¤Æ¤¤¤ë
4185 ¥Ð¥Ã¥Õ¥¡Îΰ褢¤ë¤¤¤Ï¥¹¥È¥ê¡¼¥à¤«¤é¼è¤é¤ì¤ë¡£
4188 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_decode () ¤Ï¹¹¿·¤µ¤ì¤¿ $MT ¤òÊÖ¤¹¡£¤½
4189 ¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤ò
4194 @c MERROR_IO, @c MERROR_CODING
4197 mconv_rebind_buffer (), mconv_rebind_stream (),
4198 mconv_encode (), mconv_encode_range (),
4199 mconv_decode_buffer (), mconv_decode_stream () */
/* Decode bytes from the source currently bound to CONVERTER and append
   the decoded characters to MT.  Characters waiting in the unread text
   (internal->unread) are delivered first; further input is then taken
   from the bound buffer or stream according to internal->binding.  */
4202 mconv_decode (MConverter *converter, MText *mt)
4204 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Normalise the character limit: any non-positive at_most becomes -1.
   The stream branch below tests this with `at_most < 0'.  */
4205 int at_most = converter->at_most > 0 ? converter->at_most : -1;
4208 M_CHECK_READONLY (mt, NULL);
4211 mtext__enlarge (mt, MAX_UTF8_CHAR_BYTES);
/* Reset the per-call counters and the result status.  */
4213 converter->nchars = converter->nbytes = 0;
4214 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Number of characters waiting in the unread text.  */
4216 n = mtext_nchars (internal->unread);
4222 if (at_most > 0 && at_most < limit)
/* Walk the unread text from its last character towards its first, so
   the character at the end of that text is appended to MT first;
   nchars is incremented once per character copied.  */
4225 for (i = 0, n -= 1; i < limit; i++, converter->nchars++, n--)
4226 mtext_cat_char (mt, mtext_ref_char (internal->unread, n));
/* Remove the characters just consumed from the tail of the unread
   text.  */
4227 mtext_del (internal->unread, n + 1, internal->unread->nchars);
4230 if (at_most == limit)
/* Reduce the remaining character budget by what has been delivered so
   far.  */
4232 converter->at_most -= converter->nchars;
/* Buffer binding: give the decoder the part of the buffer that has not
   been consumed yet (offset internal->used), then advance that offset
   by the byte count reported in converter->nbytes.  */
4236 if (internal->binding == BINDING_BUFFER)
4238 (*internal->coding->decoder) (internal->buf + internal->used,
4239 internal->bufsize - internal->used,
4241 internal->used += converter->nbytes;
/* Stream binding: input is staged through a local work array.  */
4243 else if (internal->binding == BINDING_STREAM)
4245 unsigned char work[CONVERT_WORKSIZE];
/* Save the caller's last_block flag; it is cleared just below and
   written back later.  */
4246 int last_block = converter->last_block;
/* Block reads with fread are chosen only when no character limit is in
   force and the stream was recorded as seekable.  */
4247 int use_fread = at_most < 0 && internal->seekable;
4249 converter->last_block = 0;
4252 int nbytes, prev_nbytes;
4254 if (feof (internal->fp))
/* Block read of up to CONVERT_WORKSIZE bytes into the work array.  */
4257 nbytes = fread (work, sizeof (unsigned char), CONVERT_WORKSIZE,
/* Single-byte read path.  */
4261 int c = getc (internal->fp);
4264 work[0] = c, nbytes = 1;
/* A stream error is reported through the converter's result field.  */
4269 if (ferror (internal->fp))
4271 converter->result = MCONVERSION_RESULT_IO_ERROR;
4276 converter->last_block = last_block;
/* converter->nbytes is used as a running total here, so its previous
   value is kept to measure how many bytes this decoder call consumed.  */
4277 prev_nbytes = converter->nbytes;
4278 (*internal->coding->decoder) (work, nbytes, mt, converter);
/* The decoder consumed fewer bytes than were read: the surplus is given
   back to the stream, either by fseek with the (negative) difference
   as offset or by ungetc of the single byte that was read.  */
4279 if (converter->nbytes - prev_nbytes < nbytes)
4282 fseek (internal->fp, converter->nbytes - prev_nbytes - nbytes,
4285 ungetc (work[0], internal->fp);
/* ...or the requested number of characters has been decoded.  */
4289 || (converter->at_most > 0
4290 && converter->nchars == converter->at_most))
/* Write back the caller's last_block flag saved above.  */
4293 converter->last_block = last_block;
/* Neither a buffer nor a stream is bound: raise MERROR_CODING.  */
4295 else /* internal->binding == BINDING_NONE */
4296 MERROR (MERROR_CODING, NULL);
/* Store the normalised limit back into the converter.
   NOTE(review): when the caller's at_most was <= 0 this writes -1, not
   the caller's original value -- confirm no caller relies on 0.  */
4298 converter->at_most = at_most;
/* Both SUCCESS and INSUFFICIENT_SRC satisfy this test.  */
4299 return ((converter->result == MCONVERSION_RESULT_SUCCESS
4300 || converter->result == MCONVERSION_RESULT_INSUFFICIENT_SRC)
4307 @brief Decode a buffer area based on a coding system.
4309 The mconv_decode_buffer () function decodes $N bytes of buffer
4310 area pointed to by $BUF based on the coding system $NAME. A
4311 temporary code converter for decoding is automatically created
4315 If the operation was successful, mconv_decode_buffer () returns
4316 the resulting M-text. Otherwise it returns NULL and assigns an
4317 error code to the external variable #merror_code. */
4320 @brief ¥³¡¼¥É·Ï¤Ë´ð¤Å¤¤¤Æ¥Ð¥Ã¥Õ¥¡Îΰè¤ò¥Ç¥³¡¼¥É¤¹¤ë
4322 ´Ø¿ô mconv_decode_buffer () ¤Ï¡¢$BUF ¤Ë¤è¤Ã¤Æ»Ø¤µ¤ì¤¿ $N ¥Ð¥¤¥È¤Î
4323 ¥Ð¥Ã¥Õ¥¡Îΰè¤ò¡¢¥³¡¼¥É·Ï $NAME ¤Ë´ð¤Å¤¤¤Æ¥Ç¥³¡¼¥É¤¹¤ë¡£¥Ç¥³¡¼¥É¤Ë
4324 ɬÍפʥ³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ÎºîÀ®¤È²òÊü¤Ï¼«Æ°Åª¤Ë¹Ô¤Ê¤ï¤ì¤ë¡£
4327 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_decode_buffer () ¤ÏÆÀ¤é¤ì¤¿ M-text ¤ò
4328 ÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼
4329 ¥³¡¼¥É¤òÀßÄꤹ¤ë¡£ */
4333 @c MERROR_IO, @c MERROR_CODING
4336 mconv_decode (), mconv_decode_stream () */
/* One-shot decode of the N bytes at BUF using coding system NAME.  */
4339 mconv_decode_buffer (MSymbol name, unsigned char *buf, int n)
/* Temporary converter bound to the caller's buffer.  */
4341 MConverter *converter = mconv_buffer_converter (name, buf, n);
/* When mconv_decode reports failure, the reference to the result text
   is dropped.  */
4347 if (! mconv_decode (converter, mt))
4349 M17N_OBJECT_UNREF (mt);
/* The temporary converter is freed before returning.  */
4352 mconv_free_converter (converter);
4359 @brief Decode a stream input based on a coding system.
4361 The mconv_decode_stream () function decodes the entire byte
4362 sequence read in from stream $FP based on the coding system $NAME.
4363 A code converter for decoding is automatically created and freed.
4366 If the operation was successful, mconv_decode_stream () returns
4367 the resulting M-text. Otherwise it returns NULL and assigns an
4368 error code to the external variable #merror_code. */
4371 @brief ¥³¡¼¥É·Ï¤Ë´ð¤Å¤¤¤Æ¥¹¥È¥ê¡¼¥àÆþÎϤò¥Ç¥³¡¼¥É¤¹¤ë
4373 ´Ø¿ô mconv_decode_stream () ¤Ï¡¢¥¹¥È¥ê¡¼¥à $FP ¤«¤éÆɤ߹þ¤Þ¤ì¤ë¥Ð
4374 ¥¤¥ÈÎóÁ´ÂΤò¡¢¥³¡¼¥É·Ï $NAME ¤Ë´ð¤Å¤¤¤Æ¥Ç¥³¡¼¥É¤¹¤ë¡£¥Ç¥³¡¼¥É¤Ëɬ
4375 Íפʥ³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ÎºîÀ®¤È²òÊü¤Ï¼«Æ°Åª¤Ë¹Ô¤Ê¤ï¤ì¤ë¡£
4378 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_decode_stream () ¤ÏÆÀ¤é¤ì¤¿ M-text ¤òÊÖ
4379 ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c NULL ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼
4384 @c MERROR_IO, @c MERROR_CODING
4387 mconv_decode (), mconv_decode_buffer () */
/* One-shot decode of the bytes read from stream FP using coding system
   NAME.  */
4390 mconv_decode_stream (MSymbol name, FILE *fp)
/* Temporary converter bound to the caller's stream.  */
4392 MConverter *converter = mconv_stream_converter (name, fp);
/* When mconv_decode reports failure, the reference to the result text
   is dropped.  */
4398 if (! mconv_decode (converter, mt))
4400 M17N_OBJECT_UNREF (mt);
/* The temporary converter is freed before returning.  */
4403 mconv_free_converter (converter);
4409 /***en @brief Encode an M-text into a byte sequence.
4411 The mconv_encode () function encodes M-text $MT and writes the
4412 resulting byte sequence into the buffer area or the stream that is
4413 currently bound to code converter $CONVERTER.
4416 If the operation was successful, mconv_encode () returns the
4417 number of written bytes. Otherwise it returns -1 and assigns an
4418 error code to the external variable #merror_code. */
4421 @brief M-text ¤ò¥Ð¥¤¥ÈÎó¤Ë¥¨¥ó¥³¡¼¥É¤¹¤ë
4423 ´Ø¿ô mconv_encode () ¤Ï¡¢M-text $MT ¤ò¥¨¥ó¥³¡¼¥É¤·¤Æ¡¢¥³¡¼¥É¥³¥ó¥Ð¡¼
4424 ¥¿ $CONVERTER ¤Ë¸½ºß·ë¤ÓÉÕ¤±¤é¤ì¤Æ¤¤¤ë¥Ð¥Ã¥Õ¥¡Îΰ褢¤ë¤¤¤Ï¥¹¥È¥ê¡¼
4428 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_encode () ¤Ï½ñ¤¹þ¤Þ¤ì¤¿¥Ð¥¤¥È¿ô¤òÊÖ¤¹¡£
4429 ¤½¤¦¤Ç¤Ê¤±¤ì¤Ð -1 ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄê
4434 @c MERROR_IO, @c MERROR_CODING
4437 mconv_rebind_buffer (), mconv_rebind_stream(),
4438 mconv_decode (), mconv_encode_range () */
/* Encode the whole of MT through CONVERTER.  */
4441 mconv_encode (MConverter *converter, MText *mt)
/* Delegates to mconv_encode_range over the full range
   [0, mtext_nchars (MT)) and passes its return value through.  */
4443 return mconv_encode_range (converter, mt, 0, mtext_nchars (mt));
4449 @brief Encode a part of an M-text.
4451 The mconv_encode_range () function encodes the text between $FROM
4452 (inclusive) and $TO (exclusive) in M-text $MT and writes the
4453 resulting byte sequence into the buffer area or the stream that is
4454 currently bound to code converter $CONVERTER.
4457 If the operation was successful, mconv_encode_range () returns the
4458 number of written bytes. Otherwise it returns -1 and assigns an
4459 error code to the external variable #merror_code. */
4462 @brief M-text ¤Î°ìÉô¤ò¥Ð¥¤¥ÈÎó¤Ë¥¨¥ó¥³¡¼¥É¤¹¤ë
4464 ´Ø¿ô mconv_encode_range () ¤Ï¡¢M-text $MT ¤Î $FROM ¡Ê´Þ¤à¡Ë¤«¤é
4465 $TO ¡Ê´Þ¤Þ¤Ê¤¤¡Ë¤Þ¤Ç¤ÎÈϰϤΥƥ¥¹¥È¤ò¥¨¥ó¥³¡¼¥É¤·¤Æ¡¢¥³¡¼¥É¥³¥ó¥Ð¡¼
4466 ¥¿ $CONVERTER ¤Ë¸½ºß·ë¤ÓÉÕ¤±¤é¤ì¤Æ¤¤¤ë¥Ð¥Ã¥Õ¥¡Îΰ褢¤ë¤¤¤Ï¥¹¥È¥ê¡¼
4470 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_encode_range () ¤Ï½ñ¤¹þ¤Þ¤ì¤¿¥Ð¥¤¥È¿ô
4471 ¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð -1 ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼
4476 @c MERROR_RANGE, @c MERROR_IO, @c MERROR_CODING
4479 mconv_rebind_buffer (), mconv_rebind_stream(),
4480 mconv_decode (), mconv_encode () */
/* Encode the characters of MT between FROM and TO with CONVERTER's
   encoder and deliver the resulting bytes to the bound buffer or
   stream.  */
4483 mconv_encode_range (MConverter *converter, MText *mt, int from, int to)
4485 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Validate both positions against MT.  NOTE(review): the -1 argument is
   presumably the value returned on failure -- confirm against the
   macro definition.  */
4487 M_CHECK_POS_X (mt, from, -1);
4488 M_CHECK_POS_X (mt, to, -1);
/* Honour a positive at_most by clipping TO so that the range holds no
   more than at_most characters.  */
4492 if (converter->at_most > 0 && from + converter->at_most < to)
4493 to = from + converter->at_most;
/* Reset the per-call counters and the result status.  */
4495 converter->nchars = converter->nbytes = 0;
4496 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Buffer binding: encode into the unused part of the buffer (offset
   internal->used), then advance that offset by converter->nbytes.  */
4498 if (internal->binding == BINDING_BUFFER)
4500 (*internal->coding->encoder) (mt, from, to,
4501 internal->buf + internal->used,
4502 internal->bufsize - internal->used,
4504 internal->used += converter->nbytes;
/* Stream binding: encode into a local work array, then write that array
   to the stream.  */
4506 else if (internal->binding == BINDING_STREAM)
4508 unsigned char work[CONVERT_WORKSIZE];
/* converter->nbytes is used as a running total, so its previous value
   is kept to find out how many bytes this encoder call produced.  */
4513 int prev_nbytes = converter->nbytes;
4516 (*internal->coding->encoder) (mt, from, to, work,
4517 CONVERT_WORKSIZE, converter);
4518 this_nbytes = converter->nbytes - prev_nbytes;
/* fwrite may write fewer items than requested; keep writing from the
   first unwritten byte until everything produced has gone out.  */
4519 while (written < this_nbytes)
4521 int wrtn = fwrite (work + written, sizeof (unsigned char),
4522 this_nbytes - written, internal->fp);
4524 if (ferror (internal->fp))
/* Not everything could be written: report an I/O error.  */
4528 if (written < this_nbytes)
4530 converter->result = MCONVERSION_RESULT_IO_ERROR;
/* Advance FROM by the converter's character count.
   NOTE(review): nbytes is handled above as a running total (delta
   against prev_nbytes) while nchars is added here in full.  If the
   encoder accumulates nchars the same way, a repeated pass would skip
   characters -- confirm against the encoder implementations.  */
4533 from += converter->nchars;
4536 else /* fail safe */
4537 MERROR (MERROR_CODING, -1);
/* SUCCESS and INSUFFICIENT_DST both yield the byte count; any other
   result yields -1.  */
4539 return ((converter->result == MCONVERSION_RESULT_SUCCESS
4540 || converter->result == MCONVERSION_RESULT_INSUFFICIENT_DST)
4541 ? converter->nbytes : -1);
4547 @brief Encode an M-text into a buffer area.
4549 The mconv_encode_buffer () function encodes M-text $MT based on
4550 coding system $NAME and writes the resulting byte sequence into the
4551 buffer area pointed to by $BUF. At most $N bytes are written. A
4552 temporary code converter for encoding is automatically created
4556 If the operation was successful, mconv_encode_buffer () returns
4557 the number of written bytes. Otherwise it returns -1 and assigns
4558 an error code to the external variable #merror_code. */
4561 @brief M-text ¤ò¥¨¥ó¥³¡¼¥É¤·¤Æ¥Ð¥Ã¥Õ¥¡Îΰè¤Ë½ñ¤¹þ¤à
4563 ´Ø¿ô mconv_encode_buffer () ¤ÏM-text $MT ¤ò¥³¡¼¥É·Ï $NAME ¤Ë´ð¤Å¤¤
4564 ¤Æ¥¨¥ó¥³¡¼¥É¤·¡¢ÆÀ¤é¤ì¤¿¥Ð¥¤¥ÈÎó¤ò $BUF ¤Î»Ø¤¹¥Ð¥Ã¥Õ¥¡Îΰè¤Ë½ñ¤¹þ
4565 ¤à¡£$N ¤Ï½ñ¤¹þ¤àºÇÂç¥Ð¥¤¥È¿ô¤Ç¤¢¤ë¡£¥¨¥ó¥³¡¼¥É¤ËɬÍפʥ³¡¼¥É¥³¥ó
4566 ¥Ð¡¼¥¿¤ÎºîÀ®¤È²òÊü¤Ï¼«Æ°Åª¤Ë¹Ô¤Ê¤ï¤ì¤ë¡£
4569 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_encode_buffer () ¤Ï½ñ¤¹þ¤Þ¤ì¤¿¥Ð¥¤¥È
4570 ¿ô¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð-1¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼
4575 @c MERROR_IO, @c MERROR_CODING
4578 mconv_encode (), mconv_encode_stream () */
/* One-shot encode of MT into the N-byte buffer BUF using coding system
   NAME.  */
4581 mconv_encode_buffer (MSymbol name, MText *mt, unsigned char *buf, int n)
/* Temporary converter bound to the caller's buffer.  */
4583 MConverter *converter = mconv_buffer_converter (name, buf, n);
/* Keep mconv_encode's result so the converter can be freed first.  */
4588 ret = mconv_encode (converter, mt);
4589 mconv_free_converter (converter);
4596 @brief Encode an M-text to write to a stream.
4598 The mconv_encode_stream () function encodes M-text $MT based on
4599 coding system $NAME and writes the resulting byte sequence to
4600 stream $FP. A temporary code converter for encoding is
4601 automatically created and freed.
4604 If the operation was successful, mconv_encode_stream () returns
4605 the number of written bytes. Otherwise it returns -1 and assigns
4606 an error code to the external variable #merror_code. */
4609 @brief M-text ¤ò¥¨¥ó¥³¡¼¥É¤·¤Æ¥¹¥È¥ê¡¼¥à¤Ë½ñ¤¹þ¤à
4611 ´Ø¿ô mconv_encode_stream () ¤ÏM-text $MT ¤ò¥³¡¼¥É·Ï $NAME ¤Ë´ð¤Å¤¤
4612 ¤Æ¥¨¥ó¥³¡¼¥É¤·¡¢ÆÀ¤é¤ì¤¿¥Ð¥¤¥ÈÎó¤ò¥¹¥È¥ê¡¼¥à $FP ¤Ë½ñ¤½Ð¤¹¡£¥¨¥ó
4613 ¥³¡¼¥É¤ËɬÍפʥ³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ÎºîÀ®¤È²òÊü¤Ï¼«Æ°Åª¤Ë¹Ô¤Ê¤ï¤ì¤ë¡£
4616 ¤â¤·½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_encode_stream () ¤Ï½ñ¤¹þ¤Þ¤ì¤¿¥Ð¥¤¥È¿ô
4617 ¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð-1¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É
4622 @c MERROR_IO, @c MERROR_CODING
4625 mconv_encode (), mconv_encode_buffer (), mconv_encode_file () */
/* One-shot encode of MT to stream FP using coding system NAME.  */
4628 mconv_encode_stream (MSymbol name, MText *mt, FILE *fp)
/* Temporary converter bound to the caller's stream.  */
4630 MConverter *converter = mconv_stream_converter (name, fp);
/* Keep mconv_encode's result so the converter can be freed first.  */
4635 ret = mconv_encode (converter, mt);
4636 mconv_free_converter (converter);
4643 @brief Read a character via a code converter.
4645 The mconv_getc () function reads one character from the buffer
4646 area or the stream that is currently bound to code converter
4647 $CONVERTER. The decoder of $CONVERTER is used to decode the byte
4648 sequence. The internal status of $CONVERTER is updated
4652 If the operation was successful, mconv_getc () returns the
4653 character read in. If the input source reaches EOF, it returns @c
4654 EOF without changing the external variable #merror_code. If an
4655 error is detected, it returns @c EOF and assigns an error code to
4659 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿·Ðͳ¤Ç1ʸ»úÆɤà
4661 ´Ø¿ô mconv_getc () ¤Ï¡¢¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤Ë¸½ºß·ë¤ÓÉÕ¤±
4662 ¤é¤ì¤Æ¤¤¤ë¥Ð¥Ã¥Õ¥¡Îΰ褢¤ë¤¤¤Ï¥¹¥È¥ê¡¼¥à¤«¤é1ʸ»ú¤òÆɤ߹þ¤à¡£¥Ð¥¤
4663 ¥ÈÎó¤Î¥Ç¥³¡¼¥É¤Ë¤Ï $CONVERTER ¤Î¥Ç¥³¡¼¥À¤¬ÍѤ¤¤é¤ì¤ë¡£$CONVERTER
4664 ¤ÎÆâÉô¾õÂÖ¤ÏɬÍפ˱þ¤¸¤Æ¹¹¿·¤µ¤ì¤ë¡£
4667 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_getc () ¤ÏÆɤ߹þ¤Þ¤ì¤¿Ê¸»ú¤òÊÖ¤¹¡£ÆþÎϸ»¤¬
4668 EOF ¤Ë㤷¤¿¾ì¹ç¤Ï¡¢³°ÉôÊÑ¿ô #merror_code ¤òÊѤ¨¤º¤Ë @c EOF ¤òÊÖ¤¹¡£
4669 ¥¨¥é¡¼¤¬¸¡½Ð¤µ¤ì¤¿¾ì¹ç¤Ï @c EOF ¤òÊÖ¤·¡¢#merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É
4677 mconv_ungetc (), mconv_putc (), mconv_gets () */
/* Read a single character through CONVERTER by running mconv_decode
   with the character limit temporarily forced to 1.  */
4680 mconv_getc (MConverter *converter)
4682 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Remember the caller's limit so it can be put back afterwards.  */
4683 int at_most = converter->at_most;
/* The converter's scratch text receives the decoded character; it is
   emptied first.  */
4685 mtext_reset (internal->work_mt);
4686 converter->at_most = 1;
/* The return value is not inspected here; the outcome is judged from
   converter->nchars below.  */
4687 mconv_decode (converter, internal->work_mt);
4688 converter->at_most = at_most;
/* Exactly one decoded character: extract it from the start of the
   scratch text's data.  */
4689 return (converter->nchars == 1
4690 ? STRING_CHAR (internal->work_mt->data)
4697 @brief Push a character back to a code converter.
4699 The mconv_ungetc () function pushes character $C back to code
4700 converter $CONVERTER. Any number of characters can be pushed
4701 back. The character pushed back last is the first one read by the
4702 subsequent mconv_getc () call. The characters pushed back are
4703 registered only in $CONVERTER; they are not written to the input
4704 source. The internal status of $CONVERTER is updated
4708 If the operation was successful, mconv_ungetc () returns $C.
4709 Otherwise it returns @c EOF and assigns an error code to the
4710 external variable #merror_code. */
4713 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤Ë1ʸ»úÌ᤹
4715 ´Ø¿ô mconv_ungetc () ¤Ï¡¢¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤Ëʸ»ú $C ¤ò
4716 ²¡¤·Ì᤹¡£²¡¤·Ì᤻¤ëʸ»ú¿ô¤ËÀ©¸Â¤Ï¤Ê¤¤¡£¤³¤Î¸å¤Ë mconv_getc () ¤ò
4717 ¸Æ¤Ó½Ð¤¹¤È¡¢ºÇ¸å¤ËÌᤵ¤ì¤¿Ê¸»ú¤¬ºÇ½é¤ËÆɤޤì¤ë¡£²¡¤·Ìᤵ¤ì¤¿Ê¸»ú¤Ï
4718 $CONVERTER ¤ÎÆâÉô¤ËÃߤ¨¤é¤ì¤ë¤À¤±¤Ç¤¢¤ê¡¢¼ÂºÝ¤ËÆþÎϸ»¤Ë½ñ¤¹þ¤Þ¤ì
4719 ¤ë¤ï¤±¤Ç¤Ï¤Ê¤¤¡£$CONVERTER ¤ÎÆâÉô¾õÂÖ¤ÏɬÍפ˱þ¤¸¤Æ¹¹¿·¤µ¤ì¤ë¡£
4722 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_ungetc () ¤Ï $C ¤òÊÖ¤¹¡£¤½¤¦¤Ç¤Ê¤±¤ì¤Ð @c
4723 EOF ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£ */
4727 @c MERROR_CODING, @c MERROR_CHAR
4730 mconv_getc (), mconv_putc (), mconv_gets () */
/* Push character C back onto CONVERTER.  The character is kept in the
   converter's unread text only; the bound source is not touched.  */
4733 mconv_ungetc (MConverter *converter, int c)
4735 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Validate C.  NOTE(review): the EOF argument is presumably the value
   returned when C is not a valid character -- confirm against the
   macro definition.  */
4737 M_CHECK_CHAR (c, EOF);
4739 converter->result = MCONVERSION_RESULT_SUCCESS;
/* Append at the end of the unread text; mconv_decode consumes that text
   from its end, so the character pushed back last is read first.  */
4740 mtext_cat_char (internal->unread, c);
4747 @brief Write a character via a code converter.
4749 The mconv_putc () function writes character $C to the buffer area
4750 or the stream that is currently bound to code converter
4751 $CONVERTER. The encoder of $CONVERTER is used to encode the
4752 character. The number of bytes actually written is stored in the @c
4753 nbytes member of $CONVERTER. The internal status of $CONVERTER
4754 is updated appropriately.
4757 If the operation was successful, mconv_putc () returns $C.
4758 If an error is detected, it returns @c EOF and assigns
4759 an error code to the external variable #merror_code. */
4762 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò·Ðͳ¤Ç1ʸ»ú½ñ¤¯
4764 ´Ø¿ô mconv_putc () ¤Ï¡¢¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤Ë¸½ºß·ë¤ÓÉÕ¤±
4765 ¤é¤ì¤Æ¤¤¤ë¥Ð¥Ã¥Õ¥¡Îΰ褢¤ë¤¤¤Ï¥¹¥È¥ê¡¼¥à¤Ëʸ»ú $C ¤ò½ñ¤½Ð¤¹¡£Ê¸»ú
4766 ¤Î¥¨¥ó¥³¡¼¥É¤Ë¤Ï $CONVERTER ¤Î¥¨¥ó¥³¡¼¥À¤¬ÍѤ¤¤é¤ì¤ë¡£¼ÂºÝ¤Ë½ñ¤½Ð
4767 ¤µ¤ì¤¿¥Ð¥¤¥È¿ô¤Ï¡¢$CONVERTER ¤Î ¥á¥ó¥Ð¡¼ @c nbytes ¤Ë¥»¥Ã¥È¤µ¤ì¤ë¡£
4768 $CONVERTER ¤ÎÆâÉô¾õÂÖ¤ÏɬÍפ˱þ¤¸¤Æ¹¹¿·¤µ¤ì¤ë¡£
4771 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_putc () ¤Ï $C ¤òÊÖ¤¹¡£¥¨¥é¡¼¤¬¸¡½Ð¤µ¤ì¤¿¾ì¹ç
4772 ¤Ï @c EOF ¤òÊÖ¤·¡¢³°ÉôÊÑ¿ô #merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤òÀßÄꤹ¤ë¡£ */
4776 @c MERROR_CODING, @c MERROR_IO, @c MERROR_CHAR
4779 mconv_getc (), mconv_ungetc (), mconv_gets () */
/* Write character C through CONVERTER by encoding a one-character
   scratch text.  */
4782 mconv_putc (MConverter *converter, int c)
4784 MConverterStatus *internal = (MConverterStatus *) converter->internal_info;
/* Validate C.  NOTE(review): the EOF argument is presumably the value
   returned when C is not a valid character -- confirm against the
   macro definition.  */
4786 M_CHECK_CHAR (c, EOF);
/* Empty the converter's scratch text and place C in it as its only
   character.  */
4787 mtext_reset (internal->work_mt);
4788 mtext_cat_char (internal->work_mt, c);
/* Encode exactly that one character (range [0, 1)); a negative return
   from mconv_encode_range signals failure.  */
4789 if (mconv_encode_range (converter, internal->work_mt, 0, 1) < 0)
4797 @brief Read a line using a code converter.
4799 The mconv_gets () function reads one line from the buffer area or
4800 the stream that is currently bound to code converter $CONVERTER.
4801 The decoder of $CONVERTER is used for decoding. The decoded
4802 character sequence is appended at the end of M-text $MT. The
4803 final newline character in the original byte sequence is not
4804 appended. The internal status of $CONVERTER is updated
4808 If the operation was successful, mconv_gets () returns the
4809 modified $MT. If it encounters EOF without reading a single
4810 character, it returns $MT without changing it. If an error is
4811 detected, it returns @c NULL and assigns an error code to @c
4815 @brief ¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿¤ò»È¤Ã¤Æ1¹ÔÆɤà
4817 ´Ø¿ô mconv_gets () ¤Ï¡¢¥³¡¼¥É¥³¥ó¥Ð¡¼¥¿ $CONVERTER ¤Ë¸½ºß·ë¤ÓÉÕ¤±
4818 ¤é¤ì¤Æ¤¤¤ë¥Ð¥Ã¥Õ¥¡Îΰ褢¤ë¤¤¤Ï¥¹¥È¥ê¡¼¥à¤«¤é1¹Ô¤òÆɤ߹þ¤à¡£¥Ð¥¤¥È
4819 Îó¤Î¥Ç¥³¡¼¥É¤Ë¤Ï $CONVERTER ¤Î¥Ç¥³¡¼¥À¤¬ÍѤ¤¤é¤ì¤ë¡£¥Ç¥³¡¼¥É¤µ¤ì¤¿
4820 ʸ»úÎó¤Ï M-text $MT ¤ÎËöÈø¤ËÄɲ䵤ì¤ë¡£¸µ¤Î¥Ð¥¤¥ÈÎó¤Î½ªÃ¼²þ¹Ôʸ»ú
4821 ¤ÏÄɲ䵤ì¤Ê¤¤¡£$CONVERTER ¤ÎÆâÉô¾õÂÖ¤ÏɬÍפ˱þ¤¸¤Æ¹¹¿·¤µ¤ì¤ë¡£
4824 ½èÍý¤¬À®¸ù¤¹¤ì¤Ð¡¢mconv_gets () ¤ÏÊѹ¹¤µ¤ì¤¿ $MT ¤òÊÖ¤¹¡£¤â¤·1ʸ»ú
4825 ¤âÆɤޤº¤Ë EOF ¤ËÅö¤¿¤Ã¤¿¾ì¹ç¤Ï¡¢$MT ¤òÊѹ¹¤»¤º¤Ë¤½¤Î¤Þ¤ÞÊÖ¤¹¡£¥¨
4826 ¥é¡¼¤¬¸¡½Ð¤µ¤ì¤¿¾ì¹ç¤Ï @c NULL ¤òÊÖ¤·¡¢#merror_code ¤Ë¥¨¥é¡¼¥³¡¼¥É¤ò
4834 mconv_getc (), mconv_ungetc (), mconv_putc () */
4837 mconv_gets (MConverter *converter, MText *mt)
4841 M_CHECK_READONLY (mt, NULL);
4844 c = mconv_getc (converter);
4845 if (c == EOF || c == '\n')
4847 mtext_cat_char (mt, c);
4849 if (c == EOF && converter->result != MCONVERSION_RESULT_SUCCESS)
4850 /* mconv_getc () sets merror_code */