1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2011,
7 This file is part of XEmacs.
9 XEmacs is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with XEmacs; see the file COPYING. If not, write to
21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 Boston, MA 02111-1307, USA. */
24 /* Synched up with: Mule 2.3. Not in FSF. */
26 /* Rewritten by Ben Wing <ben@xemacs.org>. */
27 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
41 #include "file-coding.h"
43 Lisp_Object Qcoding_system_error;
45 Lisp_Object Vkeyboard_coding_system;
46 Lisp_Object Vterminal_coding_system;
47 Lisp_Object Vcoding_system_for_read;
48 Lisp_Object Vcoding_system_for_write;
49 Lisp_Object Vfile_name_coding_system;
51 Lisp_Object Vcoded_charset_entity_reference_alist;
53 /* Table of symbols identifying each coding category. */
54 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
58 struct file_coding_dump {
59 /* Coding system currently associated with each coding category. */
60 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
62 /* Table of all coding categories in decreasing order of priority.
63 This describes a permutation of the possible coding categories. */
64 int coding_category_by_priority[CODING_CATEGORY_LAST];
66 #if defined(MULE) && !defined(UTF2000)
67 Lisp_Object ucs_to_mule_table[65536];
71 static const struct lrecord_description fcd_description_1[] = {
72 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
73 #if defined(MULE) && !defined(UTF2000)
74 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
79 static const struct struct_description fcd_description = {
80 sizeof (struct file_coding_dump),
84 Lisp_Object mule_to_ucs_table;
86 Lisp_Object Qcoding_systemp;
88 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
89 /* Qinternal in general.c */
91 Lisp_Object Qmnemonic, Qeol_type;
92 Lisp_Object Qcr, Qcrlf, Qlf;
93 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
94 Lisp_Object Qpost_read_conversion;
95 Lisp_Object Qpre_write_conversion;
98 Lisp_Object Qucs4, Qutf16, Qutf8;
99 Lisp_Object Qbig5, Qshift_jis;
100 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
101 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
102 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
103 Lisp_Object Qno_iso6429;
104 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
105 Lisp_Object Qescape_quoted;
106 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
109 Lisp_Object Qutf_8_mcs;
110 Lisp_Object Qdisable_composition, Qenable_decomposition;
111 Lisp_Object Qccs_priority_list;
112 Lisp_Object Quse_entity_reference;
113 Lisp_Object Qd, Qx, QX;
114 Lisp_Object Vdecomposition_feature_list;
116 Lisp_Object Qencode, Qdecode;
118 Lisp_Object Vcoding_system_hash_table;
120 int enable_multibyte_characters;
123 /* Additional information used by the ISO2022 decoder and detector. */
124 struct iso2022_decoder
126 /* CHARSET holds the character sets currently assigned to the G0
127 through G3 variables. It is initialized from the array
128 INITIAL_CHARSET in CODESYS. */
129 Lisp_Object charset[4];
131 /* Which registers are currently invoked into the left (GL) and
132 right (GR) halves of the 8-bit encoding space? */
133 int register_left, register_right;
135 /* ISO_ESC holds a value indicating part of an escape sequence
136 that has already been seen. */
137 enum iso_esc_flag esc;
139 /* This records the bytes we've seen so far in an escape sequence,
140 in case the sequence is invalid (we spit out the bytes unchanged). */
141 unsigned char esc_bytes[8];
143 /* Index for next byte to store in ISO escape sequence. */
146 #ifdef ENABLE_COMPOSITE_CHARS
147 /* Stuff seen so far when composing a string. */
148 unsigned_char_dynarr *composite_chars;
151 /* If we saw an invalid designation sequence for a particular
152 register, we flag it here and switch to ASCII. The next time we
153 see a valid designation for this register, we turn off the flag
154 and do the designation normally, but pretend the sequence was
155 invalid. The effect of all this is that (most of the time) the
156 escape sequences for both the switch to the unknown charset, and
157 the switch back to the known charset, get inserted literally into
158 the buffer and saved out as such. The hope is that we can
159 preserve the escape sequences so that the resulting written out
160 file makes sense. If we don't do any of this, the designation
161 to the invalid charset will be preserved but that switch back
162 to the known charset will probably get eaten because it was
163 the same charset that was already present in the register. */
164 unsigned char invalid_designated[4];
166 /* We try to do similar things as above for direction-switching
167 sequences. If we encountered a direction switch while an
168 invalid designation was present, or an invalid designation
169 just after a direction switch (i.e. no valid designation
170 encountered yet), we insert the direction-switch escape
171 sequence literally into the output stream, and later on
172 insert the corresponding direction-restoring escape sequence
174 unsigned int switched_dir_and_no_valid_charset_yet :1;
175 unsigned int invalid_switch_dir :1;
177 /* Tells the decoder to output the escape sequence literally
178 even though it was valid. Used in the games we play to
179 avoid lossage when we encounter invalid designations. */
180 unsigned int output_literally :1;
181 /* We encountered a direction switch followed by an invalid
182 designation. We didn't output the direction switch
183 literally because we didn't know about the invalid designation;
184 but we have to do so now. */
185 unsigned int output_direction_sequence :1;
188 EXFUN (Fcopy_coding_system, 2);
190 struct detection_state;
193 text_encode_generic (Lstream *encoding, const Bufbyte *src,
194 unsigned_char_dynarr *dst, Lstream_data_count n);
196 static int detect_coding_sjis (struct detection_state *st,
197 const Extbyte *src, Lstream_data_count n);
198 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, Lstream_data_count n);
200 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
201 unsigned_char_dynarr *dst, unsigned int *flags);
202 void char_finish_shift_jis (struct encoding_stream *str,
203 unsigned_char_dynarr *dst, unsigned int *flags);
205 static int detect_coding_big5 (struct detection_state *st,
206 const Extbyte *src, Lstream_data_count n);
207 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
208 unsigned_char_dynarr *dst, Lstream_data_count n);
209 void char_encode_big5 (struct encoding_stream *str, Emchar c,
210 unsigned_char_dynarr *dst, unsigned int *flags);
211 void char_finish_big5 (struct encoding_stream *str,
212 unsigned_char_dynarr *dst, unsigned int *flags);
214 static int detect_coding_ucs4 (struct detection_state *st,
215 const Extbyte *src, Lstream_data_count n);
216 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
217 unsigned_char_dynarr *dst, Lstream_data_count n);
218 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
219 unsigned_char_dynarr *dst, unsigned int *flags);
220 void char_finish_ucs4 (struct encoding_stream *str,
221 unsigned_char_dynarr *dst, unsigned int *flags);
223 static int detect_coding_utf16 (struct detection_state *st,
224 const Extbyte *src, Lstream_data_count n);
225 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
226 unsigned_char_dynarr *dst, Lstream_data_count n);
227 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
228 unsigned_char_dynarr *dst, unsigned int *flags);
229 void char_finish_utf16 (struct encoding_stream *str,
230 unsigned_char_dynarr *dst, unsigned int *flags);
232 static int detect_coding_utf8 (struct detection_state *st,
233 const Extbyte *src, Lstream_data_count n);
234 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
235 unsigned_char_dynarr *dst, Lstream_data_count n);
236 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
237 unsigned_char_dynarr *dst, unsigned int *flags);
238 void char_finish_utf8 (struct encoding_stream *str,
239 unsigned_char_dynarr *dst, unsigned int *flags);
241 static int postprocess_iso2022_mask (int mask);
242 static void reset_iso2022 (Lisp_Object coding_system,
243 struct iso2022_decoder *iso);
244 static int detect_coding_iso2022 (struct detection_state *st,
245 const Extbyte *src, Lstream_data_count n);
246 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
247 unsigned_char_dynarr *dst, Lstream_data_count n);
248 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
249 unsigned_char_dynarr *dst, unsigned int *flags);
250 void char_finish_iso2022 (struct encoding_stream *str,
251 unsigned_char_dynarr *dst, unsigned int *flags);
253 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
254 unsigned_char_dynarr *dst, Lstream_data_count n);
255 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
256 unsigned_char_dynarr *dst, Lstream_data_count n);
257 static void mule_decode (Lstream *decoding, const Extbyte *src,
258 unsigned_char_dynarr *dst, Lstream_data_count n);
259 static void mule_encode (Lstream *encoding, const Bufbyte *src,
260 unsigned_char_dynarr *dst, Lstream_data_count n);
262 typedef struct codesys_prop codesys_prop;
271 Dynarr_declare (codesys_prop);
272 } codesys_prop_dynarr;
274 static const struct lrecord_description codesys_prop_description_1[] = {
275 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
279 static const struct struct_description codesys_prop_description = {
280 sizeof (codesys_prop),
281 codesys_prop_description_1
284 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
285 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
289 static const struct struct_description codesys_prop_dynarr_description = {
290 sizeof (codesys_prop_dynarr),
291 codesys_prop_dynarr_description_1
294 codesys_prop_dynarr *the_codesys_prop_dynarr;
296 enum codesys_prop_enum
299 CODESYS_PROP_ISO2022,
304 /************************************************************************/
305 /* Coding system functions */
306 /************************************************************************/
308 static Lisp_Object mark_coding_system (Lisp_Object);
309 static void print_coding_system (Lisp_Object, Lisp_Object, int);
310 static void finalize_coding_system (void *header, int for_disksave);
313 static const struct lrecord_description ccs_description_1[] = {
314 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
315 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
319 static const struct struct_description ccs_description = {
320 sizeof (charset_conversion_spec),
324 static const struct lrecord_description ccsd_description_1[] = {
325 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
329 static const struct struct_description ccsd_description = {
330 sizeof (charset_conversion_spec_dynarr),
335 static const struct lrecord_description coding_system_description[] = {
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
342 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
343 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
345 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
346 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
347 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
348 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
349 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
351 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
357 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
358 mark_coding_system, print_coding_system,
359 finalize_coding_system,
360 0, 0, coding_system_description,
364 mark_coding_system (Lisp_Object obj)
366 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
368 mark_object (CODING_SYSTEM_NAME (codesys));
369 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
370 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
371 mark_object (CODING_SYSTEM_EOL_LF (codesys));
372 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
373 mark_object (CODING_SYSTEM_EOL_CR (codesys));
375 switch (CODING_SYSTEM_TYPE (codesys))
379 case CODESYS_ISO2022:
380 for (i = 0; i < 4; i++)
381 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
382 if (codesys->iso2022.input_conv)
384 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
386 struct charset_conversion_spec *ccs =
387 Dynarr_atp (codesys->iso2022.input_conv, i);
388 mark_object (ccs->from_charset);
389 mark_object (ccs->to_charset);
392 if (codesys->iso2022.output_conv)
394 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
396 struct charset_conversion_spec *ccs =
397 Dynarr_atp (codesys->iso2022.output_conv, i);
398 mark_object (ccs->from_charset);
399 mark_object (ccs->to_charset);
406 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
407 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
412 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
413 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
420 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
422 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
424 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
428 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
431 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
433 error ("printing unreadable object #<coding_system 0x%x>",
436 write_c_string ("#<coding_system ", printcharfun);
437 print_internal (c->name, printcharfun, 1);
438 write_c_string (">", printcharfun);
442 finalize_coding_system (void *header, int for_disksave)
444 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
445 /* Since coding systems never go away, this function is not
446 necessary. But it would be necessary if we changed things
447 so that coding systems could go away. */
448 if (!for_disksave) /* see comment in lstream.c */
450 switch (CODING_SYSTEM_TYPE (c))
453 case CODESYS_ISO2022:
454 if (c->iso2022.input_conv)
456 Dynarr_free (c->iso2022.input_conv);
457 c->iso2022.input_conv = 0;
459 if (c->iso2022.output_conv)
461 Dynarr_free (c->iso2022.output_conv);
462 c->iso2022.output_conv = 0;
473 symbol_to_eol_type (Lisp_Object symbol)
475 CHECK_SYMBOL (symbol);
476 if (NILP (symbol)) return EOL_AUTODETECT;
477 if (EQ (symbol, Qlf)) return EOL_LF;
478 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
479 if (EQ (symbol, Qcr)) return EOL_CR;
481 signal_simple_error ("Unrecognized eol type", symbol);
482 return EOL_AUTODETECT; /* not reached */
486 eol_type_to_symbol (eol_type_t type)
491 case EOL_LF: return Qlf;
492 case EOL_CRLF: return Qcrlf;
493 case EOL_CR: return Qcr;
494 case EOL_AUTODETECT: return Qnil;
499 setup_eol_coding_systems (Lisp_Coding_System *codesys)
501 Lisp_Object codesys_obj;
502 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
503 char *codesys_name = (char *) alloca (len + 7);
505 char *codesys_mnemonic=0;
507 Lisp_Object codesys_name_sym, sub_codesys_obj;
511 XSETCODING_SYSTEM (codesys_obj, codesys);
513 memcpy (codesys_name,
514 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
516 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
518 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
519 codesys_mnemonic = (char *) alloca (mlen + 7);
520 memcpy (codesys_mnemonic,
521 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
524 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
525 strcpy (codesys_name + len, "-" op_sys); \
527 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
528 codesys_name_sym = intern (codesys_name); \
529 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
530 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
532 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
533 build_string (codesys_mnemonic); \
534 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
537 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
538 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
539 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
542 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
543 Return t if OBJECT is a coding system.
544 A coding system is an object that defines how text containing multiple
545 character sets is encoded into a stream of (typically 8-bit) bytes.
546 The coding system is used to decode the stream into a series of
547 characters (which may be from multiple charsets) when the text is read
548 from a file or process, and is used to encode the text back into the
549 same format when it is written out to a file or process.
551 For example, many ISO2022-compliant coding systems (such as Compound
552 Text, which is used for inter-client data under the X Window System)
553 use escape sequences to switch between different charsets -- Japanese
554 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
555 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
556 `make-coding-system' for more information.
558 Coding systems are normally identified using a symbol, and the
559 symbol is accepted in place of the actual coding system object whenever
560 a coding system is called for. (This is similar to how faces work.)
564 return CODING_SYSTEMP (object) ? Qt : Qnil;
567 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
568 Retrieve the coding system of the given name.
570 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
571 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
572 If there is no such coding system, nil is returned. Otherwise the
573 associated coding system object is returned.
575 (coding_system_or_name))
577 if (NILP (coding_system_or_name))
578 coding_system_or_name = Qbinary;
579 else if (CODING_SYSTEMP (coding_system_or_name))
580 return coding_system_or_name;
582 CHECK_SYMBOL (coding_system_or_name);
586 coding_system_or_name =
587 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
589 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
590 return coding_system_or_name;
594 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
595 Retrieve the coding system of the given name.
596 Same as `find-coding-system' except that if there is no such
597 coding system, an error is signaled instead of returning nil.
601 Lisp_Object coding_system = Ffind_coding_system (name);
603 if (NILP (coding_system))
604 signal_simple_error ("No such coding system", name);
605 return coding_system;
608 /* We store the coding systems in hash tables with the names as the key and the
609 actual coding system object as the value. Occasionally we need to use them
610 in a list format. These routines provide us with that. */
611 struct coding_system_list_closure
613 Lisp_Object *coding_system_list;
617 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
618 void *coding_system_list_closure)
620 /* This function can GC */
621 struct coding_system_list_closure *cscl =
622 (struct coding_system_list_closure *) coding_system_list_closure;
623 Lisp_Object *coding_system_list = cscl->coding_system_list;
625 *coding_system_list = Fcons (key, *coding_system_list);
629 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
630 Return a list of the names of all defined coding systems.
634 Lisp_Object coding_system_list = Qnil;
636 struct coding_system_list_closure coding_system_list_closure;
638 GCPRO1 (coding_system_list);
639 coding_system_list_closure.coding_system_list = &coding_system_list;
640 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
641 &coding_system_list_closure);
644 return coding_system_list;
647 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
648 Return the name of the given coding system.
652 coding_system = Fget_coding_system (coding_system);
653 return XCODING_SYSTEM_NAME (coding_system);
656 static Lisp_Coding_System *
657 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
659 Lisp_Coding_System *codesys =
660 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
662 zero_lcrecord (codesys);
663 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
664 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
665 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
666 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
667 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
668 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
669 CODING_SYSTEM_TYPE (codesys) = type;
670 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
673 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
675 if (type == CODESYS_ISO2022)
678 for (i = 0; i < 4; i++)
679 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
682 if (type == CODESYS_UTF8)
684 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
686 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
688 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
690 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
693 else if (type == CODESYS_BIG5)
695 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
697 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
698 = Vcharset_chinese_big5;
699 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
701 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
705 else if (type == CODESYS_CCL)
707 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
708 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
711 CODING_SYSTEM_NAME (codesys) = name;
717 /* Given a list of charset conversion specs as specified in a Lisp
718 program, parse it into STORE_HERE. */
721 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
722 Lisp_Object spec_list)
726 EXTERNAL_LIST_LOOP (rest, spec_list)
728 Lisp_Object car = XCAR (rest);
729 Lisp_Object from, to;
730 struct charset_conversion_spec spec;
732 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
733 signal_simple_error ("Invalid charset conversion spec", car);
734 from = Fget_charset (XCAR (car));
735 to = Fget_charset (XCAR (XCDR (car)));
736 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
737 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
738 signal_simple_error_2
739 ("Attempted conversion between different charset types",
741 spec.from_charset = from;
742 spec.to_charset = to;
744 Dynarr_add (store_here, spec);
748 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
749 specs, return the equivalent as the Lisp programmer would see it.
751 If LOAD_HERE is 0, return Qnil. */
754 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
761 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
763 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
764 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
767 return Fnreverse (result);
772 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
773 Register symbol NAME as a coding system.
775 TYPE describes the conversion method used and should be one of
778 Automatic conversion. XEmacs attempts to detect the coding system
781 No conversion. Use this for binary files and such. On output,
782 graphic characters that are not in ASCII or Latin-1 will be
783 replaced by a ?. (For a no-conversion-encoded buffer, these
784 characters will only be present if you explicitly insert them.)
786 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
788 ISO 10646 UCS-4 encoding.
790 ISO 10646 UTF-8 encoding.
792 Any ISO2022-compliant encoding. Among other things, this includes
793 JIS (the Japanese encoding commonly used for e-mail), EUC (the
794 standard Unix encoding for Japanese and other languages), and
795 Compound Text (the encoding used in X11). You can specify more
796 specific information about the conversion with the PROPS argument.
798 Big5 (the encoding commonly used for Taiwanese).
800 The conversion is performed using a user-written pseudo-code
801 program. CCL (Code Conversion Language) is the name of this
804 Write out or read in the raw contents of the memory representing
805 the buffer's text. This is primarily useful for debugging
806 purposes, and is only enabled when XEmacs has been compiled with
807 DEBUG_XEMACS defined (via the --debug configure option).
808 WARNING: Reading in a file using 'internal conversion can result
809 in an internal inconsistency in the memory representing a
810 buffer's text, which will produce unpredictable results and may
811 cause XEmacs to crash. Under normal circumstances you should
812 never use 'internal conversion.
814 DOC-STRING is a string describing the coding system.
816 PROPS is a property list, describing the specific nature of the
817 character set. Recognized properties are:
820 String to be displayed in the modeline when this coding system is
824 End-of-line conversion to be used. It should be one of
827 Automatically detect the end-of-line type (LF, CRLF,
828 or CR). Also generate subsidiary coding systems named
829 `NAME-unix', `NAME-dos', and `NAME-mac', that are
830 identical to this coding system but have an EOL-TYPE
831 value of 'lf, 'crlf, and 'cr, respectively.
833 The end of a line is marked externally using ASCII LF.
834 Since this is also the way that XEmacs represents an
835 end-of-line internally, specifying this option results
836 in no end-of-line conversion. This is the standard
837 format for Unix text files.
839 The end of a line is marked externally using ASCII
840 CRLF. This is the standard format for MS-DOS text
843 The end of a line is marked externally using ASCII CR.
844 This is the standard format for Macintosh text files.
846 Automatically detect the end-of-line type but do not
847 generate subsidiary coding systems. (This value is
848 converted to nil when stored internally, and
849 `coding-system-property' will return nil.)
852 If non-nil, composition for combining characters is disabled.
854 'enable-decomposition
855 If non-nil, decomposition for combining characters is enabled.
857 'use-entity-reference
858 If non-nil, SGML style entity-reference is used for non-system-characters.
860 'post-read-conversion
861 Function called after a file has been read in, to perform the
862 decoding. Called with two arguments, START and END, denoting
863 a region of the current buffer to be decoded.
865 'pre-write-conversion
866 Function called before a file is written out, to perform the
867 encoding. Called with two arguments, START and END, denoting
868 a region of the current buffer to be encoded.
871 The following additional properties are recognized if TYPE is 'iso2022:
877 The character set initially designated to the G0 - G3 registers.
878 The value should be one of
880 -- A charset object (designate that character set)
881 -- nil (do not ever use this register)
882 -- t (no character set is initially designated to
883 the register, but may be later on; this automatically
884 sets the corresponding `force-g*-on-output' property)
890 If non-nil, send an explicit designation sequence on output before
891 using the specified register.
894 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
895 "ESC $ B" on output in place of the full designation sequences
896 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
899 If non-nil, don't designate ASCII to G0 at each end of line on output.
900 Setting this to non-nil also suppresses other state-resetting that
901 normally happens at the end of a line.
904 If non-nil, don't designate ASCII to G0 before control chars on output.
907 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
911 If non-nil, use locking-shift (SO/SI) instead of single-shift
912 or designation by escape sequence.
915 If non-nil, don't use ISO6429's direction specification.
918 If non-nil, literal control characters that are the same as
919 the beginning of a recognized ISO2022 or ISO6429 escape sequence
920 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
921 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
922 so that they can be properly distinguished from an escape sequence.
923 (Note that doing this results in a non-portable encoding.) This
924 encoding flag is used for byte-compiled files. Note that ESC
925 is a good choice for a quoting character because there are no
926 escape sequences whose second byte is a character from the Control-0
927 or Control-1 character sets; this is explicitly disallowed by the
930 'input-charset-conversion
931 A list of conversion specifications, specifying conversion of
932 characters in one charset to another when decoding is performed.
933 Each specification is a list of two elements: the source charset,
934 and the destination charset.
936 'output-charset-conversion
937 A list of conversion specifications, specifying conversion of
938 characters in one charset to another when encoding is performed.
939 The form of each specification is the same as for
940 'input-charset-conversion.
943 The following additional properties are recognized (and required)
947 CCL program used for decoding (converting to internal format).
950 CCL program used for encoding (converting to external format).
952 (name, type, doc_string, props))
954 Lisp_Coding_System *codesys;
955 enum coding_system_type ty;
956 int need_to_setup_eol_systems = 1;
958 /* Convert type to constant */
959 if (NILP (type) || EQ (type, Qundecided))
960 { ty = CODESYS_AUTODETECT; }
962 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
963 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
964 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
965 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
966 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
967 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
968 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
970 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
972 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
975 signal_simple_error ("Invalid coding system type", type);
979 codesys = allocate_coding_system (ty, name);
981 if (NILP (doc_string))
982 doc_string = build_string ("");
984 CHECK_STRING (doc_string);
985 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
988 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
990 if (EQ (key, Qmnemonic))
993 CHECK_STRING (value);
994 CODING_SYSTEM_MNEMONIC (codesys) = value;
997 else if (EQ (key, Qeol_type))
999 need_to_setup_eol_systems = NILP (value);
1002 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1005 else if (EQ (key, Qpost_read_conversion))
1006 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1007 else if (EQ (key, Qpre_write_conversion))
1008 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1010 else if (EQ (key, Qdisable_composition))
1011 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1012 else if (EQ (key, Qenable_decomposition))
1013 CODING_SYSTEM_ENABLE_DECOMPOSITION (codesys) = !NILP (value);
1014 else if (EQ (key, Quse_entity_reference))
1015 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1018 else if (ty == CODESYS_ISO2022)
1020 #define FROB_INITIAL_CHARSET(charset_num) \
1021 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1022 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1024 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1025 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1026 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1027 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1029 #define FROB_FORCE_CHARSET(charset_num) \
1030 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1032 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1033 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1034 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1035 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1037 #define FROB_BOOLEAN_PROPERTY(prop) \
1038 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1040 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1041 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1042 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1043 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1044 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1045 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1046 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1048 else if (EQ (key, Qinput_charset_conversion))
1050 codesys->iso2022.input_conv =
1051 Dynarr_new (charset_conversion_spec);
1052 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1055 else if (EQ (key, Qoutput_charset_conversion))
1057 codesys->iso2022.output_conv =
1058 Dynarr_new (charset_conversion_spec);
1059 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1063 else if (EQ (key, Qccs_priority_list))
1065 codesys->ccs_priority_list = value;
1069 signal_simple_error ("Unrecognized property", key);
1072 else if (ty == CODESYS_UTF8)
1074 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1075 else if (EQ (key, Qcharset_g1))
1076 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1077 else if (EQ (key, Qcharset_g2))
1078 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1080 signal_simple_error ("Unrecognized property", key);
1082 else if (ty == CODESYS_BIG5)
1084 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1085 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1087 signal_simple_error ("Unrecognized property", key);
1090 else if (EQ (type, Qccl))
1093 struct ccl_program test_ccl;
1096 /* Check key first. */
1097 if (EQ (key, Qdecode))
1098 suffix = "-ccl-decode";
1099 else if (EQ (key, Qencode))
1100 suffix = "-ccl-encode";
1102 signal_simple_error ("Unrecognized property", key);
1104 /* If value is vector, register it as a ccl program
1105 associated with an newly created symbol for
1106 backward compatibility. */
1107 if (VECTORP (value))
1109 sym = Fintern (concat2 (Fsymbol_name (name),
1110 build_string (suffix)),
1112 Fregister_ccl_program (sym, value);
1116 CHECK_SYMBOL (value);
1119 /* check if the given ccl programs are valid. */
1120 if (setup_ccl_program (&test_ccl, sym) < 0)
1121 signal_simple_error ("Invalid CCL program", value);
1123 if (EQ (key, Qdecode))
1124 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1125 else if (EQ (key, Qencode))
1126 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1131 signal_simple_error ("Unrecognized property", key);
1135 if (need_to_setup_eol_systems)
1136 setup_eol_coding_systems (codesys);
1139 Lisp_Object codesys_obj;
1140 XSETCODING_SYSTEM (codesys_obj, codesys);
1141 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1146 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1147 Copy OLD-CODING-SYSTEM to NEW-NAME.
1148 If NEW-NAME does not name an existing coding system, a new one will
1151 (old_coding_system, new_name))
1153 Lisp_Object new_coding_system;
1154 old_coding_system = Fget_coding_system (old_coding_system);
1155 new_coding_system = Ffind_coding_system (new_name);
1156 if (NILP (new_coding_system))
1158 XSETCODING_SYSTEM (new_coding_system,
1159 allocate_coding_system
1160 (XCODING_SYSTEM_TYPE (old_coding_system),
1162 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1166 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1167 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1168 memcpy (((char *) to ) + sizeof (to->header),
1169 ((char *) from) + sizeof (from->header),
1170 sizeof (*from) - sizeof (from->header));
1171 to->name = new_name;
1173 return new_coding_system;
1176 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1177 Return t if OBJECT names a coding system, and is not a coding system alias.
1181 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1185 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1186 Return t if OBJECT is a coding system alias.
1187 All coding system aliases are created by `define-coding-system-alias'.
1191 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1195 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1196 Return the coding-system symbol for which symbol ALIAS is an alias.
1200 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1201 if (SYMBOLP (aliasee))
1204 signal_simple_error ("Symbol is not a coding system alias", alias);
1205 return Qnil; /* To keep the compiler happy */
1209 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1211 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1215 /* A maphash function, for removing dangling coding system aliases. */
1217 dangling_coding_system_alias_p (Lisp_Object alias,
1218 Lisp_Object aliasee,
1219 void *dangling_aliases)
1221 if (SYMBOLP (aliasee)
1222 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1224 (*(int *) dangling_aliases)++;
1231 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1232 Define symbol ALIAS as an alias for coding system ALIASEE.
1234 You can use this function to redefine an alias that has already been defined,
1235 but you cannot redefine a name which is the canonical name for a coding system.
1236 \(a canonical name of a coding system is what is returned when you call
1237 `coding-system-name' on a coding system).
1239 ALIASEE itself can be an alias, which allows you to define nested aliases.
1241 You are forbidden, however, from creating alias loops or `dangling' aliases.
1242 These will be detected, and an error will be signaled if you attempt to do so.
1244 If ALIASEE is nil, then ALIAS will simply be undefined.
1246 See also `coding-system-alias-p', `coding-system-aliasee',
1247 and `coding-system-canonical-name-p'.
1251 Lisp_Object real_coding_system, probe;
1253 CHECK_SYMBOL (alias);
1255 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1257 ("Symbol is the canonical name of a coding system and cannot be redefined",
1262 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1263 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1264 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1266 Fremhash (alias, Vcoding_system_hash_table);
1268 /* Undefine subsidiary aliases,
1269 presumably created by a previous call to this function */
1270 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1271 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1272 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1274 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1275 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1276 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1279 /* Undefine dangling coding system aliases. */
1281 int dangling_aliases;
1284 dangling_aliases = 0;
1285 elisp_map_remhash (dangling_coding_system_alias_p,
1286 Vcoding_system_hash_table,
1288 } while (dangling_aliases > 0);
1294 if (CODING_SYSTEMP (aliasee))
1295 aliasee = XCODING_SYSTEM_NAME (aliasee);
1297 /* Checks that aliasee names a coding-system */
1298 real_coding_system = Fget_coding_system (aliasee);
1300 /* Check for coding system alias loops */
1301 if (EQ (alias, aliasee))
1302 alias_loop: signal_simple_error_2
1303 ("Attempt to create a coding system alias loop", alias, aliasee);
1305 for (probe = aliasee;
1307 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1309 if (EQ (probe, alias))
1313 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1315 /* Set up aliases for subsidiaries.
1316 #### There must be a better way to handle subsidiary coding systems. */
1318 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1320 for (i = 0; i < countof (suffixes); i++)
1322 Lisp_Object alias_subsidiary =
1323 append_suffix_to_symbol (alias, suffixes[i]);
1324 Lisp_Object aliasee_subsidiary =
1325 append_suffix_to_symbol (aliasee, suffixes[i]);
1327 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1328 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1331 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1332 but it doesn't look intentional, so I'd rather return something
1333 meaningful or nothing at all. */
1338 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1340 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1341 Lisp_Object new_coding_system;
1343 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1344 return coding_system;
1348 case EOL_AUTODETECT: return coding_system;
1349 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1350 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1351 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1352 default: abort (); return Qnil;
1355 return NILP (new_coding_system) ? coding_system : new_coding_system;
1358 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1359 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1361 (coding_system, eol_type))
1363 coding_system = Fget_coding_system (coding_system);
1365 return subsidiary_coding_system (coding_system,
1366 symbol_to_eol_type (eol_type));
1370 /************************************************************************/
1371 /* Coding system accessors */
1372 /************************************************************************/
1374 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1375 Return the doc string for CODING-SYSTEM.
1379 coding_system = Fget_coding_system (coding_system);
1380 return XCODING_SYSTEM_DOC_STRING (coding_system);
1383 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1384 Return the type of CODING-SYSTEM.
1388 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1391 case CODESYS_AUTODETECT: return Qundecided;
1393 case CODESYS_SHIFT_JIS: return Qshift_jis;
1394 case CODESYS_ISO2022: return Qiso2022;
1395 case CODESYS_BIG5: return Qbig5;
1396 case CODESYS_UCS4: return Qucs4;
1397 case CODESYS_UTF16: return Qutf16;
1398 case CODESYS_UTF8: return Qutf8;
1399 case CODESYS_CCL: return Qccl;
1401 case CODESYS_NO_CONVERSION: return Qno_conversion;
1403 case CODESYS_INTERNAL: return Qinternal;
1410 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1413 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1415 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1418 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1419 Return initial charset of CODING-SYSTEM designated to GNUM.
1422 (coding_system, gnum))
1424 coding_system = Fget_coding_system (coding_system);
1427 return coding_system_charset (coding_system, XINT (gnum));
1431 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1432 Return the PROP property of CODING-SYSTEM.
1434 (coding_system, prop))
1437 enum coding_system_type type;
1439 coding_system = Fget_coding_system (coding_system);
1440 CHECK_SYMBOL (prop);
1441 type = XCODING_SYSTEM_TYPE (coding_system);
1443 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1444 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1447 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1449 case CODESYS_PROP_ALL_OK:
1452 case CODESYS_PROP_ISO2022:
1453 if (type != CODESYS_ISO2022)
1455 ("Property only valid in ISO2022 coding systems",
1459 case CODESYS_PROP_CCL:
1460 if (type != CODESYS_CCL)
1462 ("Property only valid in CCL coding systems",
1472 signal_simple_error ("Unrecognized property", prop);
1474 if (EQ (prop, Qname))
1475 return XCODING_SYSTEM_NAME (coding_system);
1476 else if (EQ (prop, Qtype))
1477 return Fcoding_system_type (coding_system);
1478 else if (EQ (prop, Qdoc_string))
1479 return XCODING_SYSTEM_DOC_STRING (coding_system);
1480 else if (EQ (prop, Qmnemonic))
1481 return XCODING_SYSTEM_MNEMONIC (coding_system);
1482 else if (EQ (prop, Qeol_type))
1483 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1484 else if (EQ (prop, Qeol_lf))
1485 return XCODING_SYSTEM_EOL_LF (coding_system);
1486 else if (EQ (prop, Qeol_crlf))
1487 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1488 else if (EQ (prop, Qeol_cr))
1489 return XCODING_SYSTEM_EOL_CR (coding_system);
1490 else if (EQ (prop, Qpost_read_conversion))
1491 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1492 else if (EQ (prop, Qpre_write_conversion))
1493 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1496 else if (EQ (prop, Qdisable_composition))
1497 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1498 else if (EQ (prop, Qenable_decomposition))
1499 return XCODING_SYSTEM_ENABLE_DECOMPOSITION (coding_system) ? Qt : Qnil;
1500 else if (EQ (prop, Quse_entity_reference))
1501 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1502 else if (EQ (prop, Qccs_priority_list))
1503 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1505 else if (type == CODESYS_ISO2022)
1507 if (EQ (prop, Qcharset_g0))
1508 return coding_system_charset (coding_system, 0);
1509 else if (EQ (prop, Qcharset_g1))
1510 return coding_system_charset (coding_system, 1);
1511 else if (EQ (prop, Qcharset_g2))
1512 return coding_system_charset (coding_system, 2);
1513 else if (EQ (prop, Qcharset_g3))
1514 return coding_system_charset (coding_system, 3);
1516 #define FORCE_CHARSET(charset_num) \
1517 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1518 (coding_system, charset_num) ? Qt : Qnil)
1520 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1521 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1522 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1523 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1525 #define LISP_BOOLEAN(prop) \
1526 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1528 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1529 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1530 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1531 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1532 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1533 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1534 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1536 else if (EQ (prop, Qinput_charset_conversion))
1538 unparse_charset_conversion_specs
1539 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1540 else if (EQ (prop, Qoutput_charset_conversion))
1542 unparse_charset_conversion_specs
1543 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1547 else if (type == CODESYS_CCL)
1549 if (EQ (prop, Qdecode))
1550 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1551 else if (EQ (prop, Qencode))
1552 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1560 return Qnil; /* not reached */
1564 /************************************************************************/
1565 /* Coding category functions */
1566 /************************************************************************/
1569 decode_coding_category (Lisp_Object symbol)
1573 CHECK_SYMBOL (symbol);
1574 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1575 if (EQ (coding_category_symbol[i], symbol))
1578 signal_simple_error ("Unrecognized coding category", symbol);
1579 return 0; /* not reached */
1582 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1583 Return a list of all recognized coding categories.
1588 Lisp_Object list = Qnil;
1590 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1591 list = Fcons (coding_category_symbol[i], list);
1595 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1596 Change the priority order of the coding categories.
1597 LIST should be list of coding categories, in descending order of
1598 priority. Unspecified coding categories will be lower in priority
1599 than all specified ones, in the same relative order they were in
1604 int category_to_priority[CODING_CATEGORY_LAST];
1608 /* First generate a list that maps coding categories to priorities. */
1610 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1611 category_to_priority[i] = -1;
1613 /* Highest priority comes from the specified list. */
1615 EXTERNAL_LIST_LOOP (rest, list)
1617 int cat = decode_coding_category (XCAR (rest));
1619 if (category_to_priority[cat] >= 0)
1620 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1621 category_to_priority[cat] = i++;
1624 /* Now go through the existing categories by priority to retrieve
1625 the categories not yet specified and preserve their priority
1627 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1629 int cat = fcd->coding_category_by_priority[j];
1630 if (category_to_priority[cat] < 0)
1631 category_to_priority[cat] = i++;
1634 /* Now we need to construct the inverse of the mapping we just
1637 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1638 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1640 /* Phew! That was confusing. */
1644 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1645 Return a list of coding categories in descending order of priority.
1650 Lisp_Object list = Qnil;
1652 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1653 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1658 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1659 Change the coding system associated with a coding category.
1661 (coding_category, coding_system))
1663 int cat = decode_coding_category (coding_category);
1665 coding_system = Fget_coding_system (coding_system);
1666 fcd->coding_category_system[cat] = coding_system;
1670 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1671 Return the coding system associated with a coding category.
1675 int cat = decode_coding_category (coding_category);
1676 Lisp_Object sys = fcd->coding_category_system[cat];
1679 return XCODING_SYSTEM_NAME (sys);
1684 /************************************************************************/
1685 /* Detecting the encoding of data */
1686 /************************************************************************/
1688 struct detection_state
1690 eol_type_t eol_type;
1733 struct iso2022_decoder iso;
1735 int high_byte_count;
1736 unsigned int saw_single_shift:1;
1749 acceptable_control_char_p (int c)
1753 /* Allow and ignore control characters that you might
1754 reasonably see in a text file */
1759 case 8: /* backspace */
1760 case 11: /* vertical tab */
1761 case 12: /* form feed */
1762 case 26: /* MS-DOS C-z junk */
1763 case 31: /* '^_' -- for info */
1771 mask_has_at_most_one_bit_p (int mask)
1773 /* Perhaps the only thing useful you learn from intensive Microsoft
1774 technical interviews */
1775 return (mask & (mask - 1)) == 0;
1779 detect_eol_type (struct detection_state *st, const Extbyte *src,
1780 Lstream_data_count n)
1784 unsigned char c = *(unsigned char *)src++;
1787 if (st->eol.just_saw_cr)
1789 else if (st->eol.seen_anything)
1792 else if (st->eol.just_saw_cr)
1795 st->eol.just_saw_cr = 1;
1797 st->eol.just_saw_cr = 0;
1798 st->eol.seen_anything = 1;
1801 return EOL_AUTODETECT;
1804 /* Attempt to determine the encoding and EOL type of the given text.
1805 Before calling this function for the first type, you must initialize
1806 st->eol_type as appropriate and initialize st->mask to ~0.
1808 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1811 st->mask holds the determined coding category mask, or ~0 if only
1812 ASCII has been seen so far.
1816 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1817 is present in st->mask
1818 1 == definitive answers are here for both st->eol_type and st->mask
1822 detect_coding_type (struct detection_state *st, const Extbyte *src,
1823 Lstream_data_count n, int just_do_eol)
1825 if (st->eol_type == EOL_AUTODETECT)
1826 st->eol_type = detect_eol_type (st, src, n);
1829 return st->eol_type != EOL_AUTODETECT;
1831 if (!st->seen_non_ascii)
1833 for (; n; n--, src++)
1835 unsigned char c = *(unsigned char *) src;
1836 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1838 st->seen_non_ascii = 1;
1840 st->shift_jis.mask = ~0;
1843 st->utf16.mask = ~0;
1845 st->iso2022.mask = ~0;
1855 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1856 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1857 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1858 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1859 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1860 st->big5.mask = detect_coding_big5 (st, src, n);
1861 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1862 st->utf8.mask = detect_coding_utf8 (st, src, n);
1863 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1864 st->utf16.mask = detect_coding_utf16 (st, src, n);
1865 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1866 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1869 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1870 | st->utf8.mask | st->ucs4.mask;
1873 int retval = mask_has_at_most_one_bit_p (st->mask);
1874 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1875 return retval && st->eol_type != EOL_AUTODETECT;
1880 coding_system_from_mask (int mask)
1884 /* If the file was entirely or basically ASCII, use the
1885 default value of `buffer-file-coding-system'. */
1886 Lisp_Object retval =
1887 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1890 retval = Ffind_coding_system (retval);
1894 (Qbad_variable, Qwarning,
1895 "Invalid `default-buffer-file-coding-system', set to nil");
1896 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1900 retval = Fget_coding_system (Qraw_text);
1908 mask = postprocess_iso2022_mask (mask);
1910 /* Look through the coding categories by priority and find
1911 the first one that is allowed. */
1912 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1914 cat = fcd->coding_category_by_priority[i];
1915 if ((mask & (1 << cat)) &&
1916 !NILP (fcd->coding_category_system[cat]))
1920 return fcd->coding_category_system[cat];
1922 return Fget_coding_system (Qraw_text);
1926 /* Given a seekable read stream and potential coding system and EOL type
1927 as specified, do any autodetection that is called for. If the
1928 coding system and/or EOL type are not `autodetect', they will be left
1929 alone; but this function will never return an autodetect coding system
1932 This function does not automatically fetch subsidiary coding systems;
1933 that should be unnecessary with the explicit eol-type argument. */
1935 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1936 /* number of leading lines to check for a coding cookie */
1937 #define LINES_TO_CHECK 2
1940 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1941 eol_type_t *eol_type_in_out)
1943 struct detection_state decst;
1945 if (*eol_type_in_out == EOL_AUTODETECT)
1946 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1949 decst.eol_type = *eol_type_in_out;
1952 /* If autodetection is called for, do it now. */
1953 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1954 || *eol_type_in_out == EOL_AUTODETECT)
1957 Lisp_Object coding_system = Qnil;
1959 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1961 int lines_checked = 0;
1963 /* Look for initial "-*-"; mode line prefix */
1965 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1967 && lines_checked < LINES_TO_CHECK;
1969 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1971 Extbyte *local_vars_beg = p + 3;
1972 /* Look for final "-*-"; mode line suffix */
1973 for (p = local_vars_beg,
1974 scan_end = buf + nread - LENGTH ("-*-");
1976 && lines_checked < LINES_TO_CHECK;
1978 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1980 Extbyte *suffix = p;
1981 /* Look for "coding:" */
1982 for (p = local_vars_beg,
1983 scan_end = suffix - LENGTH ("coding:?");
1986 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1987 && (p == local_vars_beg
1988 || (*(p-1) == ' ' ||
1994 p += LENGTH ("coding:");
1995 while (*p == ' ' || *p == '\t') p++;
1997 /* Get coding system name */
1998 save = *suffix; *suffix = '\0';
1999 /* Characters valid in a MIME charset name (rfc 1521),
2000 and in a Lisp symbol name. */
2001 n = strspn ( (char *) p,
2002 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
2003 "abcdefghijklmnopqrstuvwxyz"
2009 save = p[n]; p[n] = '\0';
2011 Ffind_coding_system (intern ((char *) p));
2018 /* #### file must use standard EOLs or we miss 2d line */
2019 /* #### not to mention this is broken for UTF-16 DOS files */
2020 else if (*p == '\n' || *p == '\r')
2023 /* skip past multibyte (DOS) newline */
2024 if (*p == '\r' && *(p+1) == '\n') p++;
2028 /* #### file must use standard EOLs or we miss 2d line */
2029 /* #### not to mention this is broken for UTF-16 DOS files */
2030 else if (*p == '\n' || *p == '\r')
2033 /* skip past multibyte (DOS) newline */
2034 if (*p == '\r' && *(p+1) == '\n') p++;
2037 if (NILP (coding_system))
2040 if (detect_coding_type (&decst, buf, nread,
2041 XCODING_SYSTEM_TYPE (*codesys_in_out)
2042 != CODESYS_AUTODETECT))
2044 nread = Lstream_read (stream, buf, sizeof (buf));
2050 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2051 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2054 if (detect_coding_type (&decst, buf, nread, 1))
2056 nread = Lstream_read (stream, buf, sizeof (buf));
2062 *eol_type_in_out = decst.eol_type;
2063 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2065 if (NILP (coding_system))
2066 *codesys_in_out = coding_system_from_mask (decst.mask);
2068 *codesys_in_out = coding_system;
2072 /* If we absolutely can't determine the EOL type, just assume LF. */
2073 if (*eol_type_in_out == EOL_AUTODETECT)
2074 *eol_type_in_out = EOL_LF;
2076 Lstream_rewind (stream);
2079 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2080 Detect coding system of the text in the region between START and END.
2081 Return a list of possible coding systems ordered by priority.
2082 If only ASCII characters are found, return 'undecided or one of
2083 its subsidiary coding systems according to a detected end-of-line
2084 type. Optional arg BUFFER defaults to the current buffer.
2086 (start, end, buffer))
2088 Lisp_Object val = Qnil;
2089 struct buffer *buf = decode_buffer (buffer, 0);
2091 Lisp_Object instream, lb_instream;
2092 Lstream *istr, *lb_istr;
2093 struct detection_state decst;
2094 struct gcpro gcpro1, gcpro2;
2096 get_buffer_range_char (buf, start, end, &b, &e, 0);
2097 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2098 lb_istr = XLSTREAM (lb_instream);
2099 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2100 istr = XLSTREAM (instream);
2101 GCPRO2 (instream, lb_instream);
2103 decst.eol_type = EOL_AUTODETECT;
2107 Extbyte random_buffer[4096];
2108 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2112 if (detect_coding_type (&decst, random_buffer, nread, 0))
2116 if (decst.mask == ~0)
2117 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2125 decst.mask = postprocess_iso2022_mask (decst.mask);
2127 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2129 int sys = fcd->coding_category_by_priority[i];
2130 if (decst.mask & (1 << sys))
2132 Lisp_Object codesys = fcd->coding_category_system[sys];
2133 if (!NILP (codesys))
2134 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2135 val = Fcons (codesys, val);
2139 Lstream_close (istr);
2141 Lstream_delete (istr);
2142 Lstream_delete (lb_istr);
2147 /************************************************************************/
2148 /* Converting to internal Mule format ("decoding") */
2149 /************************************************************************/
2151 /* A decoding stream is a stream used for decoding text (i.e.
2152 converting from some external format to internal format).
2153 The decoding-stream object keeps track of the actual coding
2154 stream, the stream that is at the other end, and data that
2155 needs to be persistent across the lifetime of the stream. */
2157 /* Handle the EOL stuff related to just-read-in character C.
2158 EOL_TYPE is the EOL type of the coding stream.
2159 FLAGS is the current value of FLAGS in the coding stream, and may
2160 be modified by this macro. (The macro only looks at the
2161 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2162 bytes are to be written. You need to also define a local goto
2163 label "label_continue_loop" that is at the end of the main
2164 character-reading loop.
2166 If C is a CR character, then this macro handles it entirely and
2167 jumps to label_continue_loop. Otherwise, this macro does not add
2168 anything to DST, and continues normally. You should continue
2169 processing C normally after this macro. */
2171 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2175 if (eol_type == EOL_CR) \
2176 Dynarr_add (dst, '\n'); \
2177 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2178 Dynarr_add (dst, c); \
2180 flags |= CODING_STATE_CR; \
2181 goto label_continue_loop; \
2183 else if (flags & CODING_STATE_CR) \
2184 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2186 Dynarr_add (dst, '\r'); \
2187 flags &= ~CODING_STATE_CR; \
2191 /* C should be a binary character in the range 0 - 255; convert
2192 to internal format and add to Dynarr DST. */
2195 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2197 if (BYTE_ASCII_P (c)) \
2198 Dynarr_add (dst, c); \
2201 Dynarr_add (dst, (c >> 6) | 0xc0); \
2202 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2206 static void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2208 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2212 Dynarr_add (dst, c);
2214 else if ( c <= 0x7ff )
2216 Dynarr_add (dst, (c >> 6) | 0xc0);
2217 Dynarr_add (dst, (c & 0x3f) | 0x80);
2219 else if ( c <= 0xffff )
2221 Dynarr_add (dst, (c >> 12) | 0xe0);
2222 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2223 Dynarr_add (dst, (c & 0x3f) | 0x80);
2225 else if ( c <= 0x1fffff )
2227 Dynarr_add (dst, (c >> 18) | 0xf0);
2228 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2229 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2230 Dynarr_add (dst, (c & 0x3f) | 0x80);
2232 else if ( c <= 0x3ffffff )
2234 Dynarr_add (dst, (c >> 24) | 0xf8);
2235 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2236 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2237 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2238 Dynarr_add (dst, (c & 0x3f) | 0x80);
2242 Dynarr_add (dst, (c >> 30) | 0xfc);
2243 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2244 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2245 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2246 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2247 Dynarr_add (dst, (c & 0x3f) | 0x80);
2251 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2253 if (BYTE_ASCII_P (c)) \
2254 Dynarr_add (dst, c); \
2255 else if (BYTE_C1_P (c)) \
2257 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2258 Dynarr_add (dst, c + 0x20); \
2262 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2263 Dynarr_add (dst, c); \
2268 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2272 DECODE_ADD_BINARY_CHAR (ch, dst); \
2277 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2279 if (flags & CODING_STATE_END) \
2281 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2282 if (flags & CODING_STATE_CR) \
2283 Dynarr_add (dst, '\r'); \
2287 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2289 #define ER_BUF_SIZE 24
2291 struct decoding_stream
2293 /* Coding system that governs the conversion. */
2294 Lisp_Coding_System *codesys;
2296 /* Stream that we read the encoded data from or
2297 write the decoded data to. */
2300 /* If we are reading, then we can return only a fixed amount of
2301 data, so if the conversion resulted in too much data, we store it
2302 here for retrieval the next time around. */
2303 unsigned_char_dynarr *runoff;
2305 /* FLAGS holds flags indicating the current state of the decoding.
2306 Some of these flags are dependent on the coding system. */
2309 /* CPOS holds a partially built-up code-point of character. */
2312 /* EOL_TYPE specifies the type of end-of-line conversion that
2313 currently applies. We need to keep this separate from the
2314 EOL type stored in CODESYS because the latter might indicate
2315 automatic EOL-type detection while the former will always
2316 indicate a particular EOL type. */
2317 eol_type_t eol_type;
2319 /* Additional ISO2022 information. We define the structure above
2320 because it's also needed by the detection routines. */
2321 struct iso2022_decoder iso2022;
2323 /* Additional information (the state of the running CCL program)
2324 used by the CCL decoder. */
2325 struct ccl_program ccl;
2327 /* counter for UTF-8 or UCS-4 */
2328 unsigned char counter;
2332 unsigned char er_counter;
2333 unsigned char er_buf[ER_BUF_SIZE];
2335 unsigned combined_char_count;
2336 Emchar combined_chars[16];
2337 Lisp_Object combining_table;
2339 struct detection_state decst;
2342 static Lstream_data_count decoding_reader (Lstream *stream,
2343 unsigned char *data, Lstream_data_count size);
2344 static Lstream_data_count decoding_writer (Lstream *stream,
2345 const unsigned char *data, Lstream_data_count size);
2346 static int decoding_rewinder (Lstream *stream);
2347 static int decoding_seekable_p (Lstream *stream);
2348 static int decoding_flusher (Lstream *stream);
2349 static int decoding_closer (Lstream *stream);
2351 static Lisp_Object decoding_marker (Lisp_Object stream);
2353 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2354 sizeof (struct decoding_stream));
2357 decoding_marker (Lisp_Object stream)
2359 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2360 Lisp_Object str_obj;
2362 /* We do not need to mark the coding systems or charsets stored
2363 within the stream because they are stored in a global list
2364 and automatically marked. */
2366 XSETLSTREAM (str_obj, str);
2367 mark_object (str_obj);
2368 if (str->imp->marker)
2369 return (str->imp->marker) (str_obj);
2374 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2375 so we read data from the other end, decode it, and store it into DATA. */
2377 static Lstream_data_count
2378 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2380 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2381 unsigned char *orig_data = data;
2382 Lstream_data_count read_size;
2383 int error_occurred = 0;
2385 /* We need to interface to mule_decode(), which expects to take some
2386 amount of data and store the result into a Dynarr. We have
2387 mule_decode() store into str->runoff, and take data from there
2390 /* We loop until we have enough data, reading chunks from the other
2391 end and decoding it. */
2394 /* Take data from the runoff if we can. Make sure to take at
2395 most SIZE bytes, and delete the data from the runoff. */
2396 if (Dynarr_length (str->runoff) > 0)
2398 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2399 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2400 Dynarr_delete_many (str->runoff, 0, chunk);
2406 break; /* No more room for data */
2408 if (str->flags & CODING_STATE_END)
2409 /* This means that on the previous iteration, we hit the EOF on
2410 the other end. We loop once more so that mule_decode() can
2411 output any final stuff it may be holding, or any "go back
2412 to a sane state" escape sequences. (This latter makes sense
2413 during encoding.) */
2416 /* Exhausted the runoff, so get some more. DATA has at least
2417 SIZE bytes left of storage in it, so it's OK to read directly
2418 into it. (We'll be overwriting above, after we've decoded it
2419 into the runoff.) */
2420 read_size = Lstream_read (str->other_end, data, size);
2427 /* There might be some more end data produced in the translation.
2428 See the comment above. */
2429 str->flags |= CODING_STATE_END;
2430 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2433 if (data - orig_data == 0)
2434 return error_occurred ? -1 : 0;
2436 return data - orig_data;
2439 static Lstream_data_count
2440 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2442 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2443 Lstream_data_count retval;
2445 /* Decode all our data into the runoff, and then attempt to write
2446 it all out to the other end. Remove whatever chunk we succeeded
2448 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2449 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2450 Dynarr_length (str->runoff));
2452 Dynarr_delete_many (str->runoff, 0, retval);
2453 /* Do NOT return retval. The return value indicates how much
2454 of the incoming data was written, not how many bytes were
2460 reset_decoding_stream (struct decoding_stream *str)
2463 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2465 Lisp_Object coding_system;
2466 XSETCODING_SYSTEM (coding_system, str->codesys);
2467 reset_iso2022 (coding_system, &str->iso2022);
2469 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2471 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2477 str->er_counter = 0;
2478 str->combined_char_count = 0;
2479 str->combining_table = Qnil;
2481 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2482 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2485 str->decst.eol_type = EOL_AUTODETECT;
2486 str->decst.mask = ~0;
2488 str->flags = str->cpos = 0;
2492 decoding_rewinder (Lstream *stream)
2494 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2495 reset_decoding_stream (str);
2496 Dynarr_reset (str->runoff);
2497 return Lstream_rewind (str->other_end);
2501 decoding_seekable_p (Lstream *stream)
2503 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2504 return Lstream_seekable_p (str->other_end);
2508 decoding_flusher (Lstream *stream)
2510 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2511 return Lstream_flush (str->other_end);
2515 decoding_closer (Lstream *stream)
2517 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2518 if (stream->flags & LSTREAM_FL_WRITE)
2520 str->flags |= CODING_STATE_END;
2521 decoding_writer (stream, 0, 0);
2523 Dynarr_free (str->runoff);
2525 #ifdef ENABLE_COMPOSITE_CHARS
2526 if (str->iso2022.composite_chars)
2527 Dynarr_free (str->iso2022.composite_chars);
2530 return Lstream_close (str->other_end);
2534 decoding_stream_coding_system (Lstream *stream)
2536 Lisp_Object coding_system;
2537 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2539 XSETCODING_SYSTEM (coding_system, str->codesys);
2540 return subsidiary_coding_system (coding_system, str->eol_type);
2544 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2546 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2547 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2549 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2550 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2551 reset_decoding_stream (str);
2554 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2555 stream for writing, no automatic code detection will be performed.
2556 The reason for this is that automatic code detection requires a
2557 seekable input. Things will also fail if you open a decoding
2558 stream for reading using a non-fully-specified coding system and
2559 a non-seekable input stream. */
2562 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2565 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2566 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2570 str->other_end = stream;
2571 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2572 str->eol_type = EOL_AUTODETECT;
2573 if (!strcmp (mode, "r")
2574 && Lstream_seekable_p (stream))
2575 /* We can determine the coding system now. */
2576 determine_real_coding_system (stream, &codesys, &str->eol_type);
2577 set_decoding_stream_coding_system (lstr, codesys);
2578 str->decst.eol_type = str->eol_type;
2579 str->decst.mask = ~0;
2580 XSETLSTREAM (obj, lstr);
2585 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2587 return make_decoding_stream_1 (stream, codesys, "r");
2591 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2593 return make_decoding_stream_1 (stream, codesys, "w");
2596 /* Note: the decode_coding_* functions all take the same
2597 arguments as mule_decode(), which is to say some SRC data of
2598 size N, which is to be stored into dynamic array DST.
2599 DECODING is the stream within which the decoding is
2600 taking place, but no data is actually read from or
2601 written to that stream; that is handled in decoding_reader()
2602 or decoding_writer(). This allows the same functions to
2603 be used for both reading and writing. */
2606 mule_decode (Lstream *decoding, const Extbyte *src,
2607 unsigned_char_dynarr *dst, Lstream_data_count n)
2609 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2611 /* If necessary, do encoding-detection now. We do this when
2612 we're a writing stream or a non-seekable reading stream,
2613 meaning that we can't just process the whole input,
2614 rewind, and start over. */
2616 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2617 str->eol_type == EOL_AUTODETECT)
2619 Lisp_Object codesys;
2621 XSETCODING_SYSTEM (codesys, str->codesys);
2622 detect_coding_type (&str->decst, src, n,
2623 CODING_SYSTEM_TYPE (str->codesys) !=
2624 CODESYS_AUTODETECT);
2625 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2626 str->decst.mask != ~0)
2627 /* #### This is cheesy. What we really ought to do is
2628 buffer up a certain amount of data so as to get a
2629 less random result. */
2630 codesys = coding_system_from_mask (str->decst.mask);
2631 str->eol_type = str->decst.eol_type;
2632 if (XCODING_SYSTEM (codesys) != str->codesys)
2634 /* Preserve the CODING_STATE_END flag in case it was set.
2635 If we erase it, bad things might happen. */
2636 int was_end = str->flags & CODING_STATE_END;
2637 set_decoding_stream_coding_system (decoding, codesys);
2639 str->flags |= CODING_STATE_END;
2643 switch (CODING_SYSTEM_TYPE (str->codesys))
2646 case CODESYS_INTERNAL:
2647 Dynarr_add_many (dst, src, n);
2650 case CODESYS_AUTODETECT:
2651 /* If we got this far and still haven't decided on the coding
2652 system, then do no conversion. */
2653 case CODESYS_NO_CONVERSION:
2654 decode_coding_no_conversion (decoding, src, dst, n);
2657 case CODESYS_SHIFT_JIS:
2658 decode_coding_sjis (decoding, src, dst, n);
2661 decode_coding_big5 (decoding, src, dst, n);
2664 decode_coding_ucs4 (decoding, src, dst, n);
2667 decode_coding_utf16 (decoding, src, dst, n);
2670 decode_coding_utf8 (decoding, src, dst, n);
2673 str->ccl.last_block = str->flags & CODING_STATE_END;
2674 /* When applying ccl program to stream, MUST NOT set NULL
2676 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2677 dst, n, 0, CCL_MODE_DECODING);
2679 case CODESYS_ISO2022:
2680 decode_coding_iso2022 (decoding, src, dst, n);
2688 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2689 Decode the text between START and END which is encoded in CODING-SYSTEM.
2690 This is useful if you've read in encoded text from a file without decoding
2691 it (e.g. you read in a JIS-formatted file but used the `binary' or
2692 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2693 Return length of decoded text.
2694 BUFFER defaults to the current buffer if unspecified.
2696 (start, end, coding_system, buffer))
2699 struct buffer *buf = decode_buffer (buffer, 0);
2700 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2701 Lstream *istr, *ostr;
2702 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2704 get_buffer_range_char (buf, start, end, &b, &e, 0);
2706 barf_if_buffer_read_only (buf, b, e);
2708 coding_system = Fget_coding_system (coding_system);
2709 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2710 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2711 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2713 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2714 Fget_coding_system (Qbinary));
2715 istr = XLSTREAM (instream);
2716 ostr = XLSTREAM (outstream);
2717 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2719 /* The chain of streams looks like this:
2721 [BUFFER] <----- send through
2722 ------> [ENCODE AS BINARY]
2723 ------> [DECODE AS SPECIFIED]
2729 char tempbuf[1024]; /* some random amount */
2730 Bufpos newpos, even_newer_pos;
2731 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2732 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2736 newpos = lisp_buffer_stream_startpos (istr);
2737 Lstream_write (ostr, tempbuf, size_in_bytes);
2738 even_newer_pos = lisp_buffer_stream_startpos (istr);
2739 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2742 Lstream_close (istr);
2743 Lstream_close (ostr);
2745 Lstream_delete (istr);
2746 Lstream_delete (ostr);
2747 Lstream_delete (XLSTREAM (de_outstream));
2748 Lstream_delete (XLSTREAM (lb_outstream));
2753 /************************************************************************/
2754 /* Converting to an external encoding ("encoding") */
2755 /************************************************************************/
2757 /* An encoding stream is an output stream. When you create the
2758 stream, you specify the coding system that governs the encoding
2759 and another stream that the resulting encoded data is to be
2760 sent to, and then start sending data to it. */
2762 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2764 struct encoding_stream
2766 /* Coding system that governs the conversion. */
2767 Lisp_Coding_System *codesys;
2769 /* Stream that we read the encoded data from or
2770 write the decoded data to. */
2773 /* If we are reading, then we can return only a fixed amount of
2774 data, so if the conversion resulted in too much data, we store it
2775 here for retrieval the next time around. */
2776 unsigned_char_dynarr *runoff;
2778 /* FLAGS holds flags indicating the current state of the encoding.
2779 Some of these flags are dependent on the coding system. */
2782 /* CH holds a partially built-up character. Since we only deal
2783 with one- and two-byte characters at the moment, we only use
2784 this to store the first byte of a two-byte character. */
2787 /* Additional information used by the ISO2022 encoder. */
2790 /* CHARSET holds the character sets currently assigned to the G0
2791 through G3 registers. It is initialized from the array
2792 INITIAL_CHARSET in CODESYS. */
2793 Lisp_Object charset[4];
2795 /* Which registers are currently invoked into the left (GL) and
2796 right (GR) halves of the 8-bit encoding space? */
2797 int register_left, register_right;
2799 /* Whether we need to explicitly designate the charset in the
2800 G? register before using it. It is initialized from the
2801 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2802 unsigned char force_charset_on_output[4];
2804 /* Other state variables that need to be preserved across
2806 Lisp_Object current_charset;
2808 int current_char_boundary;
2811 void (*encode_char) (struct encoding_stream *str, Emchar c,
2812 unsigned_char_dynarr *dst, unsigned int *flags);
2813 void (*finish) (struct encoding_stream *str,
2814 unsigned_char_dynarr *dst, unsigned int *flags);
2816 /* Additional information (the state of the running CCL program)
2817 used by the CCL encoder. */
2818 struct ccl_program ccl;
2822 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2823 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2824 Lstream_data_count size);
2825 static int encoding_rewinder (Lstream *stream);
2826 static int encoding_seekable_p (Lstream *stream);
2827 static int encoding_flusher (Lstream *stream);
2828 static int encoding_closer (Lstream *stream);
2830 static Lisp_Object encoding_marker (Lisp_Object stream);
2832 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2833 sizeof (struct encoding_stream));
2836 encoding_marker (Lisp_Object stream)
2838 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2839 Lisp_Object str_obj;
2841 /* We do not need to mark the coding systems or charsets stored
2842 within the stream because they are stored in a global list
2843 and automatically marked. */
2845 XSETLSTREAM (str_obj, str);
2846 mark_object (str_obj);
2847 if (str->imp->marker)
2848 return (str->imp->marker) (str_obj);
2853 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2854 so we read data from the other end, encode it, and store it into DATA. */
2856 static Lstream_data_count
2857 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2859 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2860 unsigned char *orig_data = data;
2861 Lstream_data_count read_size;
2862 int error_occurred = 0;
2864 /* We need to interface to mule_encode(), which expects to take some
2865 amount of data and store the result into a Dynarr. We have
2866 mule_encode() store into str->runoff, and take data from there
2869 /* We loop until we have enough data, reading chunks from the other
2870 end and encoding it. */
2873 /* Take data from the runoff if we can. Make sure to take at
2874 most SIZE bytes, and delete the data from the runoff. */
2875 if (Dynarr_length (str->runoff) > 0)
2877 int chunk = min ((int) size, Dynarr_length (str->runoff));
2878 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2879 Dynarr_delete_many (str->runoff, 0, chunk);
2885 break; /* No more room for data */
2887 if (str->flags & CODING_STATE_END)
2888 /* This means that on the previous iteration, we hit the EOF on
2889 the other end. We loop once more so that mule_encode() can
2890 output any final stuff it may be holding, or any "go back
2891 to a sane state" escape sequences. (This latter makes sense
2892 during encoding.) */
2895 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2896 left of storage in it, so it's OK to read directly into it.
2897 (We'll be overwriting above, after we've encoded it into the
2899 read_size = Lstream_read (str->other_end, data, size);
2906 /* There might be some more end data produced in the translation.
2907 See the comment above. */
2908 str->flags |= CODING_STATE_END;
2909 mule_encode (stream, data, str->runoff, read_size);
2912 if (data == orig_data)
2913 return error_occurred ? -1 : 0;
2915 return data - orig_data;
2918 static Lstream_data_count
2919 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2921 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2922 Lstream_data_count retval;
2924 /* Encode all our data into the runoff, and then attempt to write
2925 it all out to the other end. Remove whatever chunk we succeeded
2927 mule_encode (stream, data, str->runoff, size);
2928 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2929 Dynarr_length (str->runoff));
2931 Dynarr_delete_many (str->runoff, 0, retval);
2932 /* Do NOT return retval. The return value indicates how much
2933 of the incoming data was written, not how many bytes were
2939 reset_encoding_stream (struct encoding_stream *str)
2942 switch (CODING_SYSTEM_TYPE (str->codesys))
2944 case CODESYS_ISO2022:
2948 str->encode_char = &char_encode_iso2022;
2949 str->finish = &char_finish_iso2022;
2950 for (i = 0; i < 4; i++)
2952 str->iso2022.charset[i] =
2953 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2954 str->iso2022.force_charset_on_output[i] =
2955 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2957 str->iso2022.register_left = 0;
2958 str->iso2022.register_right = 1;
2959 str->iso2022.current_charset = Qnil;
2960 str->iso2022.current_half = 0;
2964 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2967 str->encode_char = &char_encode_utf8;
2968 str->finish = &char_finish_utf8;
2971 str->encode_char = &char_encode_utf16;
2972 str->finish = &char_finish_utf16;
2975 str->encode_char = &char_encode_ucs4;
2976 str->finish = &char_finish_ucs4;
2978 case CODESYS_SHIFT_JIS:
2979 str->encode_char = &char_encode_shift_jis;
2980 str->finish = &char_finish_shift_jis;
2983 str->encode_char = &char_encode_big5;
2984 str->finish = &char_finish_big5;
2990 str->iso2022.current_char_boundary = 0;
2991 str->flags = str->ch = 0;
2995 encoding_rewinder (Lstream *stream)
2997 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2998 reset_encoding_stream (str);
2999 Dynarr_reset (str->runoff);
3000 return Lstream_rewind (str->other_end);
3004 encoding_seekable_p (Lstream *stream)
3006 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3007 return Lstream_seekable_p (str->other_end);
3011 encoding_flusher (Lstream *stream)
3013 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3014 return Lstream_flush (str->other_end);
3018 encoding_closer (Lstream *stream)
3020 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3021 if (stream->flags & LSTREAM_FL_WRITE)
3023 str->flags |= CODING_STATE_END;
3024 encoding_writer (stream, 0, 0);
3026 Dynarr_free (str->runoff);
3027 return Lstream_close (str->other_end);
3031 encoding_stream_coding_system (Lstream *stream)
3033 Lisp_Object coding_system;
3034 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3036 XSETCODING_SYSTEM (coding_system, str->codesys);
3037 return coding_system;
3041 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3043 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3044 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3046 reset_encoding_stream (str);
3050 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3053 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3054 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3058 str->runoff = Dynarr_new (unsigned_char);
3059 str->other_end = stream;
3060 set_encoding_stream_coding_system (lstr, codesys);
3061 XSETLSTREAM (obj, lstr);
3066 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3068 return make_encoding_stream_1 (stream, codesys, "r");
3072 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3074 return make_encoding_stream_1 (stream, codesys, "w");
3077 /* Convert N bytes of internally-formatted data stored in SRC to an
3078 external format, according to the encoding stream ENCODING.
3079 Store the encoded data into DST. */
3082 mule_encode (Lstream *encoding, const Bufbyte *src,
3083 unsigned_char_dynarr *dst, Lstream_data_count n)
3085 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3087 switch (CODING_SYSTEM_TYPE (str->codesys))
3090 case CODESYS_INTERNAL:
3091 Dynarr_add_many (dst, src, n);
3094 case CODESYS_AUTODETECT:
3095 /* If we got this far and still haven't decided on the coding
3096 system, then do no conversion. */
3097 case CODESYS_NO_CONVERSION:
3098 encode_coding_no_conversion (encoding, src, dst, n);
3102 str->ccl.last_block = str->flags & CODING_STATE_END;
3103 /* When applying ccl program to stream, MUST NOT set NULL
3105 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3106 dst, n, 0, CCL_MODE_ENCODING);
3110 text_encode_generic (encoding, src, dst, n);
3114 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3115 Encode the text between START and END using CODING-SYSTEM.
3116 This will, for example, convert Japanese characters into stuff such as
3117 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3118 text. BUFFER defaults to the current buffer if unspecified.
3120 (start, end, coding_system, buffer))
3123 struct buffer *buf = decode_buffer (buffer, 0);
3124 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3125 Lstream *istr, *ostr;
3126 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3128 get_buffer_range_char (buf, start, end, &b, &e, 0);
3130 barf_if_buffer_read_only (buf, b, e);
3132 coding_system = Fget_coding_system (coding_system);
3133 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3134 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3135 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3136 Fget_coding_system (Qbinary));
3137 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3139 istr = XLSTREAM (instream);
3140 ostr = XLSTREAM (outstream);
3141 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3142 /* The chain of streams looks like this:
3144 [BUFFER] <----- send through
3145 ------> [ENCODE AS SPECIFIED]
3146 ------> [DECODE AS BINARY]
3151 char tempbuf[1024]; /* some random amount */
3152 Bufpos newpos, even_newer_pos;
3153 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3154 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3158 newpos = lisp_buffer_stream_startpos (istr);
3159 Lstream_write (ostr, tempbuf, size_in_bytes);
3160 even_newer_pos = lisp_buffer_stream_startpos (istr);
3161 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3167 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3168 Lstream_close (istr);
3169 Lstream_close (ostr);
3171 Lstream_delete (istr);
3172 Lstream_delete (ostr);
3173 Lstream_delete (XLSTREAM (de_outstream));
3174 Lstream_delete (XLSTREAM (lb_outstream));
3175 return make_int (retlen);
3182 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3183 unsigned_char_dynarr *dst, Lstream_data_count n)
3186 unsigned char char_boundary;
3187 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3188 unsigned int flags = str->flags;
3189 Emchar ch = str->ch;
3191 char_boundary = str->iso2022.current_char_boundary;
3197 if (char_boundary == 0)
3225 (*str->encode_char) (str, c, dst, &flags);
3227 else if (char_boundary == 1)
3229 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3235 ch = (ch << 6) | (c & 0x3f);
3240 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3242 (*str->finish) (str, dst, &flags);
3247 str->iso2022.current_char_boundary = char_boundary;
3252 /************************************************************************/
3253 /* entity reference */
3254 /************************************************************************/
3257 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3259 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3261 if ( str->er_counter > 0)
3263 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3264 str->er_counter = 0;
3268 EXFUN (Fregexp_quote, 1);
3270 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3271 unsigned_char_dynarr* dst);
3273 decode_add_er_char (struct decoding_stream *str, Emchar c,
3274 unsigned_char_dynarr* dst)
3276 if (str->er_counter == 0)
3278 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3281 str->er_buf[0] = '&';
3285 DECODE_ADD_UCS_CHAR (c, dst);
3289 Lisp_Object string = make_string (str->er_buf,
3296 Lisp_Object char_type;
3299 for ( rest = Vcoded_charset_entity_reference_alist;
3300 !NILP (rest); rest = Fcdr (rest) )
3306 char_type = XCDR (ccs);
3311 if (NILP (ccs = Ffind_charset (ccs)))
3320 pat = Fregexp_quote (pat);
3327 pat = concat3 (build_string ("^&"),
3328 pat, build_string ("\\([0-9]+\\)$"));
3331 else if (EQ (ret, Qx))
3333 pat = concat3 (build_string ("^&"),
3334 pat, build_string ("\\([0-9a-f]+\\)$"));
3337 else if (EQ (ret, QX))
3339 pat = concat3 (build_string ("^&"),
3340 pat, build_string ("\\([0-9A-F]+\\)$"));
3346 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3349 = XINT (Fstring_to_number
3350 (Fsubstring (string,
3351 Fmatch_beginning (make_int (1)),
3352 Fmatch_end (make_int (1))),
3356 ? DECODE_CHAR (ccs, code, 0)
3357 : decode_builtin_char (ccs, code);
3360 DECODE_ADD_UCS_CHAR (chr, dst);
3363 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3364 Dynarr_add (dst, ';');
3370 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3371 string, Qnil, Qnil)))
3374 = XUINT (Fstring_to_number
3375 (Fsubstring (string,
3376 Fmatch_beginning (make_int (1)),
3377 Fmatch_end (make_int (1))),
3380 DECODE_ADD_UCS_CHAR (code, dst);
3384 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3385 Dynarr_add (dst, ';');
3388 str->er_counter = 0;
3390 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3392 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3393 str->er_counter = 0;
3394 DECODE_ADD_UCS_CHAR (c, dst);
3397 str->er_buf[str->er_counter++] = c;
3400 void char_encode_as_entity_reference (Emchar ch, char* buf);
3402 char_encode_as_entity_reference (Emchar ch, char* buf)
3404 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3407 Lisp_Object char_type;
3408 int format_columns, idx;
3409 char format[ER_BUF_SIZE];
3411 while (!NILP (rest))
3417 char_type = XCDR (ccs);
3422 if (!NILP (ccs = Ffind_charset (ccs)))
3425 = charset_code_point (ccs, ch,
3427 CHAR_ALL : CHAR_ISOLATED_ONLY );
3429 if ( (code_point >= 0)
3430 && ( NILP (char_type)
3432 || ( charset_code_point (ccs, ch, CHAR_DEFINED_ONLY)
3436 || ( DECODE_CHAR (ccs, code_point, 0) != ch )
3445 if ( STRINGP (ret) &&
3446 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3449 strncpy (&format[1], XSTRING_DATA (ret), idx);
3459 format[idx++] = '%';
3460 format_columns = XINT (ret);
3461 if ( (2 <= format_columns) && (format_columns <= 8)
3462 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3464 format [idx++] = '0';
3465 format [idx++] = '0' + format_columns;
3474 format [idx++] = 'd';
3475 else if (EQ (ret, Qx))
3476 format [idx++] = 'x';
3477 else if (EQ (ret, QX))
3478 format [idx++] = 'X';
3481 format [idx++] = ';';
3484 sprintf (buf, format, code_point);
3491 sprintf (buf, "&MCS-%08X;", ch);
3495 /************************************************************************/
3496 /* character composition */
3497 /************************************************************************/
3498 extern Lisp_Object Qcomposition, Qrep_decomposition;
3501 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3503 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3507 for (i = 0; i < str->combined_char_count; i++)
3508 decode_add_er_char (str, str->combined_chars[i], dst);
3509 str->combined_char_count = 0;
3510 str->combining_table = Qnil;
3513 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3514 unsigned_char_dynarr* dst);
3516 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3517 Emchar character, unsigned_char_dynarr* dst)
3519 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3520 decode_add_er_char (str, character, dst);
3521 else if (!CONSP (str->combining_table))
3524 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3528 decode_add_er_char (str, character, dst);
3531 str->combined_chars[0] = character;
3532 str->combined_char_count = 1;
3533 str->combining_table = ret;
3539 = Fcdr (Fassq (make_char (character), str->combining_table));
3543 Emchar char2 = XCHARVAL (ret);
3544 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3549 decode_add_er_char (str, char2, dst);
3550 str->combined_char_count = 0;
3551 str->combining_table = Qnil;
3555 str->combined_chars[0] = char2;
3556 str->combined_char_count = 1;
3557 str->combining_table = ret2;
3562 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3565 COMPOSE_FLUSH_CHARS (str, dst);
3567 decode_add_er_char (str, character, dst);
3570 str->combined_chars[0] = character;
3571 str->combined_char_count = 1;
3572 str->combining_table = ret;
3577 #else /* not UTF2000 */
3578 #define COMPOSE_FLUSH_CHARS(str, dst)
3579 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3580 #endif /* UTF2000 */
3583 /************************************************************************/
3584 /* Shift-JIS methods */
3585 /************************************************************************/
3587 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3588 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3589 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3590 encoded by "position-code + 0x80". A character of JISX0208
3591 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3592 position-codes are divided and shifted so that it fit in the range
3595 --- CODE RANGE of Shift-JIS ---
3596 (character set) (range)
3598 JISX0201-Kana 0xA0 .. 0xDF
3599 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3600 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3601 -------------------------------
3605 /* Is this the first byte of a Shift-JIS two-byte char? */
3607 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3608 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3610 /* Is this the second byte of a Shift-JIS two-byte char? */
3612 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3613 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3615 #define BYTE_SJIS_KATAKANA_P(c) \
3616 ((c) >= 0xA1 && (c) <= 0xDF)
3619 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3623 unsigned char c = *(unsigned char *)src++;
3624 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3626 if (st->shift_jis.in_second_byte)
3628 st->shift_jis.in_second_byte = 0;
3632 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3633 st->shift_jis.in_second_byte = 1;
3635 return CODING_CATEGORY_SHIFT_JIS_MASK;
3638 /* Convert Shift-JIS data to internal format. */
3641 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3642 unsigned_char_dynarr *dst, Lstream_data_count n)
3644 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3645 unsigned int flags = str->flags;
3646 unsigned int cpos = str->cpos;
3647 eol_type_t eol_type = str->eol_type;
3651 unsigned char c = *(unsigned char *)src++;
3655 /* Previous character was first byte of Shift-JIS Kanji char. */
3656 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3658 unsigned char e1, e2;
3660 DECODE_SJIS (cpos, c, e1, e2);
3662 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3666 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3667 Dynarr_add (dst, e1);
3668 Dynarr_add (dst, e2);
3673 DECODE_ADD_BINARY_CHAR (cpos, dst);
3674 DECODE_ADD_BINARY_CHAR (c, dst);
3680 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3681 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3683 else if (BYTE_SJIS_KATAKANA_P (c))
3686 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3689 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3690 Dynarr_add (dst, c);
3695 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3699 DECODE_ADD_BINARY_CHAR (c, dst);
3701 label_continue_loop:;
3704 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3710 /* Convert internal character representation to Shift_JIS. */
3713 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3714 unsigned_char_dynarr *dst, unsigned int *flags)
3716 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3720 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3721 Dynarr_add (dst, '\r');
3722 if (eol_type != EOL_CR)
3723 Dynarr_add (dst, ch);
3727 unsigned int s1, s2;
3729 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3731 if (code_point >= 0)
3732 Dynarr_add (dst, code_point);
3733 else if ((code_point
3734 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3737 ENCODE_SJIS ((code_point >> 8) | 0x80,
3738 (code_point & 0xFF) | 0x80, s1, s2);
3739 Dynarr_add (dst, s1);
3740 Dynarr_add (dst, s2);
3742 else if ((code_point
3743 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3745 Dynarr_add (dst, code_point | 0x80);
3746 else if ((code_point
3747 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3750 ENCODE_SJIS ((code_point >> 8) | 0x80,
3751 (code_point & 0xFF) | 0x80, s1, s2);
3752 Dynarr_add (dst, s1);
3753 Dynarr_add (dst, s2);
3755 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3757 Dynarr_add (dst, code_point);
3759 Dynarr_add (dst, '?');
3761 Lisp_Object charset;
3762 unsigned int c1, c2;
3764 BREAKUP_CHAR (ch, charset, c1, c2);
3766 if (EQ(charset, Vcharset_katakana_jisx0201))
3768 Dynarr_add (dst, c1 | 0x80);
3772 Dynarr_add (dst, c1);
3774 else if (EQ(charset, Vcharset_japanese_jisx0208))
3776 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3777 Dynarr_add (dst, s1);
3778 Dynarr_add (dst, s2);
3781 Dynarr_add (dst, '?');
3787 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3788 unsigned int *flags)
3792 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3793 Decode a JISX0208 character of Shift-JIS coding-system.
3794 CODE is the character code in Shift-JIS as a cons of type bytes.
3795 Return the corresponding character.
3799 unsigned char c1, c2, s1, s2;
3802 CHECK_INT (XCAR (code));
3803 CHECK_INT (XCDR (code));
3804 s1 = XINT (XCAR (code));
3805 s2 = XINT (XCDR (code));
3806 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3807 BYTE_SJIS_TWO_BYTE_2_P (s2))
3809 DECODE_SJIS (s1, s2, c1, c2);
3810 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3811 c1 & 0x7F, c2 & 0x7F));
3817 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3818 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3819 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3823 Lisp_Object charset;
3826 CHECK_CHAR_COERCE_INT (character);
3827 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3828 if (EQ (charset, Vcharset_japanese_jisx0208))
3830 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3831 return Fcons (make_int (s1), make_int (s2));
3838 /************************************************************************/
3840 /************************************************************************/
3842 /* BIG5 is a coding system encoding two character sets: ASCII and
3843 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3844 character set and is encoded in two-byte.
3846 --- CODE RANGE of BIG5 ---
3847 (character set) (range)
3849 Big5 (1st byte) 0xA1 .. 0xFE
3850 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3851 --------------------------
3853 Since the number of characters in Big5 is larger than maximum
3854 characters in Emacs' charset (96x96), it can't be handled as one
3855 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3856 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3857 contains frequently used characters and the latter contains less
3858 frequently used characters. */
3861 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3862 ((c) >= 0x81 && (c) <= 0xFE)
3864 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3865 ((c) >= 0xA1 && (c) <= 0xFE)
3868 /* Is this the second byte of a Shift-JIS two-byte char? */
3870 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3871 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3873 /* Number of Big5 characters which have the same code in 1st byte. */
3875 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3877 /* Code conversion macros. These are macros because they are used in
3878 inner loops during code conversion.
3880 Note that temporary variables in macros introduce the classic
3881 dynamic-scoping problems with variable names. We use capital-
3882 lettered variables in the assumption that XEmacs does not use
3883 capital letters in variables except in a very formalized way
3886 /* Convert Big5 code (b1, b2) into its internal string representation
3889 /* There is a much simpler way to split the Big5 charset into two.
3890 For the moment I'm going to leave the algorithm as-is because it
3891 claims to separate out the most-used characters into a single
3892 charset, which perhaps will lead to optimizations in various
3895 The way the algorithm works is something like this:
3897 Big5 can be viewed as a 94x157 charset, where the row is
3898 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3899 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3900 the split between low and high column numbers is apparently
3901 meaningless; ascending rows produce less and less frequent chars.
3902 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3903 the first charset, and the upper half (0xC9 .. 0xFE) to the
3904 second. To do the conversion, we convert the character into
3905 a single number where 0 .. 156 is the first row, 157 .. 313
3906 is the second, etc. That way, the characters are ordered by
3907 decreasing frequency. Then we just chop the space in two
3908 and coerce the result into a 94x94 space.
3911 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3913 int B1 = b1, B2 = b2; \
3915 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3919 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3923 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3924 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3926 c1 = I / (0xFF - 0xA1) + 0xA1; \
3927 c2 = I % (0xFF - 0xA1) + 0xA1; \
3930 /* Convert the internal string representation of a Big5 character
3931 (lb, c1, c2) into Big5 code (b1, b2). */
3933 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3935 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3937 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3939 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3941 b1 = I / BIG5_SAME_ROW + 0xA1; \
3942 b2 = I % BIG5_SAME_ROW; \
3943 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3947 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3951 unsigned char c = *(unsigned char *)src++;
3952 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3954 || (c >= 0x80 && c <= 0xA0)
3958 if (st->big5.in_second_byte)
3960 st->big5.in_second_byte = 0;
3961 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3971 st->big5.in_second_byte = 1;
3973 return CODING_CATEGORY_BIG5_MASK;
3976 /* Convert Big5 data to internal format. */
3979 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3980 unsigned_char_dynarr *dst, Lstream_data_count n)
3982 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3983 unsigned int flags = str->flags;
3984 unsigned int cpos = str->cpos;
3985 eol_type_t eol_type = str->eol_type;
3988 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3989 (decoding)->codesys, 1);
3994 unsigned char c = *(unsigned char *)src++;
3997 /* Previous character was first byte of Big5 char. */
3998 if (BYTE_BIG5_TWO_BYTE_2_P (c))
4001 int code_point = (cpos << 8) | c;
4002 Emchar char_id = decode_defined_char (ccs, code_point, 0);
4006 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
4007 DECODE_ADD_UCS_CHAR (char_id, dst);
4009 unsigned char b1, b2, b3;
4010 DECODE_BIG5 (cpos, c, b1, b2, b3);
4011 Dynarr_add (dst, b1);
4012 Dynarr_add (dst, b2);
4013 Dynarr_add (dst, b3);
4018 DECODE_ADD_BINARY_CHAR (cpos, dst);
4019 DECODE_ADD_BINARY_CHAR (c, dst);
4025 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4026 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4028 decode_flush_er_chars (str, dst);
4033 decode_flush_er_chars (str, dst);
4034 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4035 DECODE_ADD_BINARY_CHAR (c, dst);
4039 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4040 decode_add_er_char (str, c, dst);
4043 label_continue_loop:;
4046 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4047 if (flags & CODING_STATE_END)
4049 decode_flush_er_chars (str, dst);
4050 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4051 if (flags & CODING_STATE_CR)
4052 Dynarr_add (dst, '\r');
4059 /* Convert internally-formatted data to Big5. */
4062 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4063 unsigned_char_dynarr *dst, unsigned int *flags)
4065 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4069 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4070 Dynarr_add (dst, '\r');
4071 if (eol_type != EOL_CR)
4072 Dynarr_add (dst, ch);
4079 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4081 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4082 Dynarr_add (dst, code_point);
4083 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4085 Dynarr_add (dst, code_point >> 8);
4086 Dynarr_add (dst, code_point & 0xFF);
4088 else if ((code_point
4089 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4091 Dynarr_add (dst, code_point >> 8);
4092 Dynarr_add (dst, code_point & 0xFF);
4094 else if ((code_point
4095 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4098 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4099 + ((code_point & 0xFF) - 33);
4100 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4101 unsigned char b2 = I % BIG5_SAME_ROW;
4103 b2 += b2 < 0x3F ? 0x40 : 0x62;
4104 Dynarr_add (dst, b1);
4105 Dynarr_add (dst, b2);
4107 else if ((code_point
4108 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4111 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4112 + ((code_point & 0xFF) - 33);
4113 unsigned char b1, b2;
4115 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4116 b1 = I / BIG5_SAME_ROW + 0xA1;
4117 b2 = I % BIG5_SAME_ROW;
4118 b2 += b2 < 0x3F ? 0x40 : 0x62;
4119 Dynarr_add (dst, b1);
4120 Dynarr_add (dst, b2);
4122 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4126 char_encode_as_entity_reference (ch, buf);
4127 Dynarr_add_many (dst, buf, strlen (buf));
4130 Dynarr_add (dst, '?');
4137 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4138 unsigned int *flags)
4143 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4144 Decode a Big5 character CODE of BIG5 coding-system.
4145 CODE is the character code in BIG5, a cons of two integers.
4146 Return the corresponding character.
4150 unsigned char c1, c2, b1, b2;
4153 CHECK_INT (XCAR (code));
4154 CHECK_INT (XCDR (code));
4155 b1 = XINT (XCAR (code));
4156 b2 = XINT (XCDR (code));
4157 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4158 BYTE_BIG5_TWO_BYTE_2_P (b2))
4160 Charset_ID leading_byte;
4161 Lisp_Object charset;
4162 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4163 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4164 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4170 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4171 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4172 Return the corresponding character code in Big5.
4176 Lisp_Object charset;
4179 CHECK_CHAR_COERCE_INT (character);
4180 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4181 if (EQ (charset, Vcharset_chinese_big5_1) ||
4182 EQ (charset, Vcharset_chinese_big5_2))
4184 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4186 return Fcons (make_int (b1), make_int (b2));
4193 /************************************************************************/
4195 /************************************************************************/
4198 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4202 unsigned char c = *(unsigned char *)src++;
4203 switch (st->ucs4.in_byte)
4212 st->ucs4.in_byte = 0;
4218 return CODING_CATEGORY_UCS4_MASK;
4222 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4223 unsigned_char_dynarr *dst, Lstream_data_count n)
4225 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4226 unsigned int flags = str->flags;
4227 unsigned int cpos = str->cpos;
4228 unsigned char counter = str->counter;
4232 unsigned char c = *(unsigned char *)src++;
4240 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4245 cpos = ( cpos << 8 ) | c;
4249 if (counter & CODING_STATE_END)
4250 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4254 str->counter = counter;
4258 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4259 unsigned_char_dynarr *dst, unsigned int *flags)
4261 Dynarr_add (dst, ch >> 24);
4262 Dynarr_add (dst, ch >> 16);
4263 Dynarr_add (dst, ch >> 8);
4264 Dynarr_add (dst, ch );
4268 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4269 unsigned int *flags)
4274 /************************************************************************/
4275 /* UTF-16 methods */
4276 /************************************************************************/
4279 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4281 return CODING_CATEGORY_UTF16_MASK;
4285 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4286 unsigned_char_dynarr *dst, Lstream_data_count n)
4288 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4289 unsigned int flags = str->flags;
4290 unsigned int cpos = str->cpos;
4291 unsigned char counter = str->counter & 3;
4292 unsigned char byte_order = str->counter >> 2;
4293 eol_type_t eol_type = str->eol_type;
4297 unsigned char c = *(unsigned char *)src++;
4303 else if (counter == 1)
4307 if (byte_order == 0)
4308 code = (c << 8) | cpos;
4310 code = (cpos << 8) | c;
4313 code = ((code & 0xFF) << 8) | (code >> 8);
4314 if ( byte_order == 0 )
4319 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4330 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4331 DECODE_ADD_UCS_CHAR (code, dst);
4335 else if (counter == 2)
4337 cpos = (cpos << 8) | c;
4345 ? (c << 8) | (cpos & 0xFF)
4346 : ((cpos & 0xFF) << 8) | c;
4348 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4353 label_continue_loop:;
4355 if (counter & CODING_STATE_END)
4356 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4360 str->counter = (byte_order << 2) | counter;
4364 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4365 unsigned_char_dynarr *dst, unsigned int *flags)
4369 Dynarr_add (dst, ch);
4370 Dynarr_add (dst, ch >> 8);
4374 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4375 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4377 Dynarr_add (dst, y);
4378 Dynarr_add (dst, y >> 8);
4379 Dynarr_add (dst, z);
4380 Dynarr_add (dst, z >> 8);
4385 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4386 unsigned int *flags)
4391 /************************************************************************/
4393 /************************************************************************/
4396 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4400 unsigned char c = *(unsigned char *)src++;
4401 switch (st->utf8.in_byte)
4404 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4407 st->utf8.in_byte = 5;
4409 st->utf8.in_byte = 4;
4411 st->utf8.in_byte = 3;
4413 st->utf8.in_byte = 2;
4415 st->utf8.in_byte = 1;
4420 if ((c & 0xc0) != 0x80)
4426 return CODING_CATEGORY_UTF8_MASK;
4430 decode_output_utf8_partial_char (unsigned char counter,
4432 unsigned_char_dynarr *dst)
4435 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4436 else if (counter == 4)
4438 if (cpos < (1 << 6))
4439 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4442 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4443 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4446 else if (counter == 3)
4448 if (cpos < (1 << 6))
4449 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4450 else if (cpos < (1 << 12))
4452 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4453 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4457 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4458 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4459 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4462 else if (counter == 2)
4464 if (cpos < (1 << 6))
4465 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4466 else if (cpos < (1 << 12))
4468 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4469 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4471 else if (cpos < (1 << 18))
4473 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4474 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4475 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4479 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4480 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4481 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4482 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4487 if (cpos < (1 << 6))
4488 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4489 else if (cpos < (1 << 12))
4491 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4492 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4494 else if (cpos < (1 << 18))
4496 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4497 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4498 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4500 else if (cpos < (1 << 24))
4502 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4503 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4504 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4505 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4509 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4510 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4511 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4512 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4513 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4519 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4520 unsigned_char_dynarr *dst, Lstream_data_count n)
4522 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4523 unsigned int flags = str->flags;
4524 unsigned int cpos = str->cpos;
4525 eol_type_t eol_type = str->eol_type;
4526 unsigned char counter = str->counter;
4528 int bom_flag = str->bom_flag;
4530 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4531 (decoding)->codesys, 0);
4536 unsigned char c = *(unsigned char *)src++;
4541 COMPOSE_FLUSH_CHARS (str, dst);
4542 decode_flush_er_chars (str, dst);
4543 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4545 if ( bom_flag == 0 )
4548 DECODE_ADD_UCS_CHAR (c, dst);
4550 else if ( c < 0xC0 )
4552 if ( bom_flag == 0 )
4555 /* decode_add_er_char (str, c, dst); */
4556 COMPOSE_ADD_CHAR (str, c, dst);
4560 /* decode_flush_er_chars (str, dst); */
4566 else if ( c < 0xF0 )
4571 else if ( c < 0xF8 )
4576 else if ( c < 0xFC )
4588 else if ( (c & 0xC0) == 0x80 )
4590 cpos = ( cpos << 6 ) | ( c & 0x3f );
4595 if ( bom_flag == 0 )
4597 if ( cpos == 0xFEFF )
4608 char_id = decode_defined_char (ccs, cpos, 0);
4615 COMPOSE_ADD_CHAR (str, char_id, dst);
4625 COMPOSE_FLUSH_CHARS (str, dst);
4626 decode_flush_er_chars (str, dst);
4627 decode_output_utf8_partial_char (counter, cpos, dst);
4628 DECODE_ADD_BINARY_CHAR (c, dst);
4632 label_continue_loop:;
4635 if (flags & CODING_STATE_END)
4637 COMPOSE_FLUSH_CHARS (str, dst);
4638 decode_flush_er_chars (str, dst);
4641 decode_output_utf8_partial_char (counter, cpos, dst);
4648 str->counter = counter;
4650 str->bom_flag = bom_flag;
4655 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4656 unsigned_char_dynarr *dst, unsigned int *flags)
4658 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4662 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4663 Dynarr_add (dst, '\r');
4664 if (eol_type != EOL_CR)
4665 Dynarr_add (dst, ch);
4667 else if (ch <= 0x7f)
4669 Dynarr_add (dst, ch);
4674 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4675 int code_point = charset_code_point (ucs_ccs, ch, 0);
4677 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4679 Lisp_Object map, ret;
4681 if ( CODING_SYSTEM_ENABLE_DECOMPOSITION (str->codesys) )
4683 Lisp_Object rest = Vdecomposition_feature_list;
4684 Lisp_Object decomp_f;
4685 Lisp_Object seq = Qnil;
4686 struct gcpro gcpro1;
4688 while ( CONSP (rest) )
4690 decomp_f = XCAR (rest);
4692 seq = Fchar_feature (make_char (ch), decomp_f, Qnil,
4702 Lisp_Object base = Fcar (seq);
4705 if ( CHARP (base) && CONSP (seq) )
4707 Lisp_Object comb = Fcar (seq);
4711 char_encode_utf8 (str, XCHAR (base), dst, flags);
4712 char_encode_utf8 (str, XCHAR (comb), dst, flags);
4719 map = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4721 && INTP (ret = Fchar_feature (make_char (ch),
4724 code_point = XINT (ret);
4725 else if ( !NILP (map =
4726 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4728 && INTP (ret = Fchar_feature (make_char (ch),
4731 code_point = XINT (ret);
4732 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4736 char_encode_as_entity_reference (ch, buf);
4737 Dynarr_add_many (dst, buf, strlen (buf));
4743 if (code_point <= 0x7ff)
4745 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4746 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4748 else if (code_point <= 0xffff)
4750 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4751 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4752 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4754 else if (code_point <= 0x1fffff)
4756 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4757 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4758 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4759 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4761 else if (code_point <= 0x3ffffff)
4763 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4764 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4765 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4766 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4767 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4771 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4772 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4773 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4774 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4775 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4776 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4782 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4783 unsigned int *flags)
4788 /************************************************************************/
4789 /* ISO2022 methods */
4790 /************************************************************************/
4792 /* The following note describes the coding system ISO2022 briefly.
4793 Since the intention of this note is to help understand the
4794 functions in this file, some parts are NOT ACCURATE or OVERLY
4795 SIMPLIFIED. For thorough understanding, please refer to the
4796 original document of ISO2022.
4798 ISO2022 provides many mechanisms to encode several character sets
4799 in 7-bit and 8-bit environments. For 7-bit environments, all text
4800 is encoded using bytes less than 128. This may make the encoded
4801 text a little bit longer, but the text passes more easily through
4802 several gateways, some of which strip off MSB (Most Signigant Bit).
4804 There are two kinds of character sets: control character set and
4805 graphic character set. The former contains control characters such
4806 as `newline' and `escape' to provide control functions (control
4807 functions are also provided by escape sequences). The latter
4808 contains graphic characters such as 'A' and '-'. Emacs recognizes
4809 two control character sets and many graphic character sets.
4811 Graphic character sets are classified into one of the following
4812 four classes, according to the number of bytes (DIMENSION) and
4813 number of characters in one dimension (CHARS) of the set:
4814 - DIMENSION1_CHARS94
4815 - DIMENSION1_CHARS96
4816 - DIMENSION2_CHARS94
4817 - DIMENSION2_CHARS96
4819 In addition, each character set is assigned an identification tag,
4820 unique for each set, called "final character" (denoted as <F>
4821 hereafter). The <F> of each character set is decided by ECMA(*)
4822 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4823 (0x30..0x3F are for private use only).
4825 Note (*): ECMA = European Computer Manufacturers Association
4827 Here are examples of graphic character set [NAME(<F>)]:
4828 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4829 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4830 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4831 o DIMENSION2_CHARS96 -- none for the moment
4833 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4834 C0 [0x00..0x1F] -- control character plane 0
4835 GL [0x20..0x7F] -- graphic character plane 0
4836 C1 [0x80..0x9F] -- control character plane 1
4837 GR [0xA0..0xFF] -- graphic character plane 1
4839 A control character set is directly designated and invoked to C0 or
4840 C1 by an escape sequence. The most common case is that:
4841 - ISO646's control character set is designated/invoked to C0, and
4842 - ISO6429's control character set is designated/invoked to C1,
4843 and usually these designations/invocations are omitted in encoded
4844 text. In a 7-bit environment, only C0 can be used, and a control
4845 character for C1 is encoded by an appropriate escape sequence to
4846 fit into the environment. All control characters for C1 are
4847 defined to have corresponding escape sequences.
4849 A graphic character set is at first designated to one of four
4850 graphic registers (G0 through G3), then these graphic registers are
4851 invoked to GL or GR. These designations and invocations can be
4852 done independently. The most common case is that G0 is invoked to
4853 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4854 these invocations and designations are omitted in encoded text.
4855 In a 7-bit environment, only GL can be used.
4857 When a graphic character set of CHARS94 is invoked to GL, codes
4858 0x20 and 0x7F of the GL area work as control characters SPACE and
4859 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4862 There are two ways of invocation: locking-shift and single-shift.
4863 With locking-shift, the invocation lasts until the next different
4864 invocation, whereas with single-shift, the invocation affects the
4865 following character only and doesn't affect the locking-shift
4866 state. Invocations are done by the following control characters or
4869 ----------------------------------------------------------------------
4870 abbrev function cntrl escape seq description
4871 ----------------------------------------------------------------------
4872 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4873 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4874 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4875 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4876 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4877 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4878 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4879 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4880 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4881 ----------------------------------------------------------------------
4882 (*) These are not used by any known coding system.
4884 Control characters for these functions are defined by macros
4885 ISO_CODE_XXX in `coding.h'.
4887 Designations are done by the following escape sequences:
4888 ----------------------------------------------------------------------
4889 escape sequence description
4890 ----------------------------------------------------------------------
4891 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4892 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4893 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4894 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4895 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4896 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4897 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4898 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4899 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4900 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4901 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4902 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4903 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4904 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4905 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4906 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4907 ----------------------------------------------------------------------
4909 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4910 of dimension 1, chars 94, and final character <F>, etc...
4912 Note (*): Although these designations are not allowed in ISO2022,
4913 Emacs accepts them on decoding, and produces them on encoding
4914 CHARS96 character sets in a coding system which is characterized as
4915 7-bit environment, non-locking-shift, and non-single-shift.
4917 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4918 '(' can be omitted. We refer to this as "short-form" hereafter.
4920 Now you may notice that there are a lot of ways for encoding the
4921 same multilingual text in ISO2022. Actually, there exist many
4922 coding systems such as Compound Text (used in X11's inter client
4923 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4924 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4925 localized platforms), and all of these are variants of ISO2022.
4927 In addition to the above, Emacs handles two more kinds of escape
4928 sequences: ISO6429's direction specification and Emacs' private
4929 sequence for specifying character composition.
4931 ISO6429's direction specification takes the following form:
4932 o CSI ']' -- end of the current direction
4933 o CSI '0' ']' -- end of the current direction
4934 o CSI '1' ']' -- start of left-to-right text
4935 o CSI '2' ']' -- start of right-to-left text
4936 The control character CSI (0x9B: control sequence introducer) is
4937 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4939 Character composition specification takes the following form:
4940 o ESC '0' -- start character composition
4941 o ESC '1' -- end character composition
4942 Since these are not standard escape sequences of any ISO standard,
4943 their use with these meanings is restricted to Emacs only. */
4946 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4950 for (i = 0; i < 4; i++)
4952 if (!NILP (coding_system))
4954 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4956 iso->charset[i] = Qt;
4957 iso->invalid_designated[i] = 0;
4959 iso->esc = ISO_ESC_NOTHING;
4960 iso->esc_bytes_index = 0;
4961 iso->register_left = 0;
4962 iso->register_right = 1;
4963 iso->switched_dir_and_no_valid_charset_yet = 0;
4964 iso->invalid_switch_dir = 0;
4965 iso->output_direction_sequence = 0;
4966 iso->output_literally = 0;
4967 #ifdef ENABLE_COMPOSITE_CHARS
4968 if (iso->composite_chars)
4969 Dynarr_reset (iso->composite_chars);
4974 fit_to_be_escape_quoted (unsigned char c)
4991 /* Parse one byte of an ISO2022 escape sequence.
4992 If the result is an invalid escape sequence, return 0 and
4993 do not change anything in STR. Otherwise, if the result is
4994 an incomplete escape sequence, update ISO2022.ESC and
4995 ISO2022.ESC_BYTES and return -1. Otherwise, update
4996 all the state variables (but not ISO2022.ESC_BYTES) and
4999 If CHECK_INVALID_CHARSETS is non-zero, check for designation
5000 or invocation of an invalid character set and treat that as
5001 an unrecognized escape sequence. */
5004 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
5005 unsigned char c, unsigned int *flags,
5006 int check_invalid_charsets)
5008 /* (1) If we're at the end of a designation sequence, CS is the
5009 charset being designated and REG is the register to designate
5012 (2) If we're at the end of a locking-shift sequence, REG is
5013 the register to invoke and HALF (0 == left, 1 == right) is
5014 the half to invoke it into.
5016 (3) If we're at the end of a single-shift sequence, REG is
5017 the register to invoke. */
5018 Lisp_Object cs = Qnil;
5021 /* NOTE: This code does goto's all over the fucking place.
5022 The reason for this is that we're basically implementing
5023 a state machine here, and hierarchical languages like C
5024 don't really provide a clean way of doing this. */
5026 if (! (*flags & CODING_STATE_ESCAPE))
5027 /* At beginning of escape sequence; we need to reset our
5028 escape-state variables. */
5029 iso->esc = ISO_ESC_NOTHING;
5031 iso->output_literally = 0;
5032 iso->output_direction_sequence = 0;
5036 case ISO_ESC_NOTHING:
5037 iso->esc_bytes_index = 0;
5040 case ISO_CODE_ESC: /* Start escape sequence */
5041 *flags |= CODING_STATE_ESCAPE;
5045 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
5046 *flags |= CODING_STATE_ESCAPE;
5047 iso->esc = ISO_ESC_5_11;
5050 case ISO_CODE_SO: /* locking shift 1 */
5053 case ISO_CODE_SI: /* locking shift 0 */
5057 case ISO_CODE_SS2: /* single shift */
5060 case ISO_CODE_SS3: /* single shift */
5064 default: /* Other control characters */
5071 /**** single shift ****/
5073 case 'N': /* single shift 2 */
5076 case 'O': /* single shift 3 */
5080 /**** locking shift ****/
5082 case '~': /* locking shift 1 right */
5085 case 'n': /* locking shift 2 */
5088 case '}': /* locking shift 2 right */
5091 case 'o': /* locking shift 3 */
5094 case '|': /* locking shift 3 right */
5098 #ifdef ENABLE_COMPOSITE_CHARS
5099 /**** composite ****/
5102 iso->esc = ISO_ESC_START_COMPOSITE;
5103 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5104 CODING_STATE_COMPOSITE;
5108 iso->esc = ISO_ESC_END_COMPOSITE;
5109 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5110 ~CODING_STATE_COMPOSITE;
5112 #endif /* ENABLE_COMPOSITE_CHARS */
5114 /**** directionality ****/
5117 iso->esc = ISO_ESC_5_11;
5120 /**** designation ****/
5122 case '$': /* multibyte charset prefix */
5123 iso->esc = ISO_ESC_2_4;
5127 if (0x28 <= c && c <= 0x2F)
5129 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5133 /* This function is called with CODESYS equal to nil when
5134 doing coding-system detection. */
5136 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5137 && fit_to_be_escape_quoted (c))
5139 iso->esc = ISO_ESC_LITERAL;
5140 *flags &= CODING_STATE_ISO2022_LOCK;
5150 /**** directionality ****/
5152 case ISO_ESC_5_11: /* ISO6429 direction control */
5155 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5156 goto directionality;
5158 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5159 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5160 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5164 case ISO_ESC_5_11_0:
5167 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5168 goto directionality;
5172 case ISO_ESC_5_11_1:
5175 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5176 goto directionality;
5180 case ISO_ESC_5_11_2:
5183 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5184 goto directionality;
5189 iso->esc = ISO_ESC_DIRECTIONALITY;
5190 /* Various junk here to attempt to preserve the direction sequences
5191 literally in the text if they would otherwise be swallowed due
5192 to invalid designations that don't show up as actual charset
5193 changes in the text. */
5194 if (iso->invalid_switch_dir)
5196 /* We already inserted a direction switch literally into the
5197 text. We assume (#### this may not be right) that the
5198 next direction switch is the one going the other way,
5199 and we need to output that literally as well. */
5200 iso->output_literally = 1;
5201 iso->invalid_switch_dir = 0;
5207 /* If we are in the thrall of an invalid designation,
5208 then stick the directionality sequence literally into the
5209 output stream so it ends up in the original text again. */
5210 for (jj = 0; jj < 4; jj++)
5211 if (iso->invalid_designated[jj])
5215 iso->output_literally = 1;
5216 iso->invalid_switch_dir = 1;
5219 /* Indicate that we haven't yet seen a valid designation,
5220 so that if a switch-dir is directly followed by an
5221 invalid designation, both get inserted literally. */
5222 iso->switched_dir_and_no_valid_charset_yet = 1;
5227 /**** designation ****/
5230 if (0x28 <= c && c <= 0x2F)
5232 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5235 if (0x40 <= c && c <= 0x42)
5238 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5239 *flags & CODING_STATE_R2L ?
5240 CHARSET_RIGHT_TO_LEFT :
5241 CHARSET_LEFT_TO_RIGHT);
5252 if (c < '0' || c > '~')
5253 return 0; /* bad final byte */
5255 if (iso->esc >= ISO_ESC_2_8 &&
5256 iso->esc <= ISO_ESC_2_15)
5258 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5259 single = 1; /* single-byte */
5260 reg = (iso->esc - ISO_ESC_2_8) & 3;
5262 else if (iso->esc >= ISO_ESC_2_4_8 &&
5263 iso->esc <= ISO_ESC_2_4_15)
5265 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5266 single = -1; /* multi-byte */
5267 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5271 /* Can this ever be reached? -slb */
5275 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5276 *flags & CODING_STATE_R2L ?
5277 CHARSET_RIGHT_TO_LEFT :
5278 CHARSET_LEFT_TO_RIGHT);
5284 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5288 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5289 /* can't invoke something that ain't there. */
5291 iso->esc = ISO_ESC_SINGLE_SHIFT;
5292 *flags &= CODING_STATE_ISO2022_LOCK;
5294 *flags |= CODING_STATE_SS2;
5296 *flags |= CODING_STATE_SS3;
5300 if (check_invalid_charsets &&
5301 !CHARSETP (iso->charset[reg]))
5302 /* can't invoke something that ain't there. */
5305 iso->register_right = reg;
5307 iso->register_left = reg;
5308 *flags &= CODING_STATE_ISO2022_LOCK;
5309 iso->esc = ISO_ESC_LOCKING_SHIFT;
5313 if (NILP (cs) && check_invalid_charsets)
5315 iso->invalid_designated[reg] = 1;
5316 iso->charset[reg] = Vcharset_ascii;
5317 iso->esc = ISO_ESC_DESIGNATE;
5318 *flags &= CODING_STATE_ISO2022_LOCK;
5319 iso->output_literally = 1;
5320 if (iso->switched_dir_and_no_valid_charset_yet)
5322 /* We encountered a switch-direction followed by an
5323 invalid designation. Ensure that the switch-direction
5324 gets outputted; otherwise it will probably get eaten
5325 when the text is written out again. */
5326 iso->switched_dir_and_no_valid_charset_yet = 0;
5327 iso->output_direction_sequence = 1;
5328 /* And make sure that the switch-dir going the other
5329 way gets outputted, as well. */
5330 iso->invalid_switch_dir = 1;
5334 /* This function is called with CODESYS equal to nil when
5335 doing coding-system detection. */
5336 if (!NILP (codesys))
5338 charset_conversion_spec_dynarr *dyn =
5339 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5345 for (i = 0; i < Dynarr_length (dyn); i++)
5347 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5348 if (EQ (cs, spec->from_charset))
5349 cs = spec->to_charset;
5354 iso->charset[reg] = cs;
5355 iso->esc = ISO_ESC_DESIGNATE;
5356 *flags &= CODING_STATE_ISO2022_LOCK;
5357 if (iso->invalid_designated[reg])
5359 iso->invalid_designated[reg] = 0;
5360 iso->output_literally = 1;
5362 if (iso->switched_dir_and_no_valid_charset_yet)
5363 iso->switched_dir_and_no_valid_charset_yet = 0;
5368 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5372 /* #### There are serious deficiencies in the recognition mechanism
5373 here. This needs to be much smarter if it's going to cut it.
5374 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5375 it should be detected as Latin-1.
5376 All the ISO2022 stuff in this file should be synced up with the
5377 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5378 Perhaps we should wait till R2L works in FSF Emacs? */
5380 if (!st->iso2022.initted)
5382 reset_iso2022 (Qnil, &st->iso2022.iso);
5383 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5384 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5385 CODING_CATEGORY_ISO_8_1_MASK |
5386 CODING_CATEGORY_ISO_8_2_MASK |
5387 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5388 st->iso2022.flags = 0;
5389 st->iso2022.high_byte_count = 0;
5390 st->iso2022.saw_single_shift = 0;
5391 st->iso2022.initted = 1;
5394 mask = st->iso2022.mask;
5398 unsigned char c = *(unsigned char *)src++;
5401 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5402 st->iso2022.high_byte_count++;
5406 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5408 if (st->iso2022.high_byte_count & 1)
5409 /* odd number of high bytes; assume not iso-8-2 */
5410 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5412 st->iso2022.high_byte_count = 0;
5413 st->iso2022.saw_single_shift = 0;
5415 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5417 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5418 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5419 { /* control chars */
5422 /* Allow and ignore control characters that you might
5423 reasonably see in a text file */
5428 case 8: /* backspace */
5429 case 11: /* vertical tab */
5430 case 12: /* form feed */
5431 case 26: /* MS-DOS C-z junk */
5432 case 31: /* '^_' -- for info */
5433 goto label_continue_loop;
5440 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5443 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5444 &st->iso2022.flags, 0))
5446 switch (st->iso2022.iso.esc)
5448 case ISO_ESC_DESIGNATE:
5449 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5450 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5452 case ISO_ESC_LOCKING_SHIFT:
5453 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5454 goto ran_out_of_chars;
5455 case ISO_ESC_SINGLE_SHIFT:
5456 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5457 st->iso2022.saw_single_shift = 1;
5466 goto ran_out_of_chars;
5469 label_continue_loop:;
5478 postprocess_iso2022_mask (int mask)
5480 /* #### kind of cheesy */
5481 /* If seven-bit ISO is allowed, then assume that the encoding is
5482 entirely seven-bit and turn off the eight-bit ones. */
5483 if (mask & CODING_CATEGORY_ISO_7_MASK)
5484 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5485 CODING_CATEGORY_ISO_8_1_MASK |
5486 CODING_CATEGORY_ISO_8_2_MASK);
5490 /* If FLAGS is a null pointer or specifies right-to-left motion,
5491 output a switch-dir-to-left-to-right sequence to DST.
5492 Also update FLAGS if it is not a null pointer.
5493 If INTERNAL_P is set, we are outputting in internal format and
5494 need to handle the CSI differently. */
5497 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5498 unsigned_char_dynarr *dst,
5499 unsigned int *flags,
5502 if (!flags || (*flags & CODING_STATE_R2L))
5504 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5506 Dynarr_add (dst, ISO_CODE_ESC);
5507 Dynarr_add (dst, '[');
5509 else if (internal_p)
5510 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5512 Dynarr_add (dst, ISO_CODE_CSI);
5513 Dynarr_add (dst, '0');
5514 Dynarr_add (dst, ']');
5516 *flags &= ~CODING_STATE_R2L;
5520 /* If FLAGS is a null pointer or specifies a direction different from
5521 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5522 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5523 sequence to DST. Also update FLAGS if it is not a null pointer.
5524 If INTERNAL_P is set, we are outputting in internal format and
5525 need to handle the CSI differently. */
5528 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5529 unsigned_char_dynarr *dst, unsigned int *flags,
5532 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5533 direction == CHARSET_LEFT_TO_RIGHT)
5534 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5535 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5536 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5537 direction == CHARSET_RIGHT_TO_LEFT)
5539 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5541 Dynarr_add (dst, ISO_CODE_ESC);
5542 Dynarr_add (dst, '[');
5544 else if (internal_p)
5545 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5547 Dynarr_add (dst, ISO_CODE_CSI);
5548 Dynarr_add (dst, '2');
5549 Dynarr_add (dst, ']');
5551 *flags |= CODING_STATE_R2L;
5555 /* Convert ISO2022-format data to internal format. */
5558 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5559 unsigned_char_dynarr *dst, Lstream_data_count n)
5561 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5562 unsigned int flags = str->flags;
5563 unsigned int cpos = str->cpos;
5564 unsigned char counter = str->counter;
5565 eol_type_t eol_type = str->eol_type;
5566 #ifdef ENABLE_COMPOSITE_CHARS
5567 unsigned_char_dynarr *real_dst = dst;
5569 Lisp_Object coding_system;
5571 XSETCODING_SYSTEM (coding_system, str->codesys);
5573 #ifdef ENABLE_COMPOSITE_CHARS
5574 if (flags & CODING_STATE_COMPOSITE)
5575 dst = str->iso2022.composite_chars;
5576 #endif /* ENABLE_COMPOSITE_CHARS */
5580 unsigned char c = *(unsigned char *)src++;
5581 if (flags & CODING_STATE_ESCAPE)
5582 { /* Within ESC sequence */
5583 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5588 switch (str->iso2022.esc)
5590 #ifdef ENABLE_COMPOSITE_CHARS
5591 case ISO_ESC_START_COMPOSITE:
5592 if (str->iso2022.composite_chars)
5593 Dynarr_reset (str->iso2022.composite_chars);
5595 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5596 dst = str->iso2022.composite_chars;
5598 case ISO_ESC_END_COMPOSITE:
5600 Bufbyte comstr[MAX_EMCHAR_LEN];
5602 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5603 Dynarr_length (dst));
5605 len = set_charptr_emchar (comstr, emch);
5606 Dynarr_add_many (dst, comstr, len);
5609 #endif /* ENABLE_COMPOSITE_CHARS */
5611 case ISO_ESC_LITERAL:
5612 COMPOSE_FLUSH_CHARS (str, dst);
5613 decode_flush_er_chars (str, dst);
5614 DECODE_ADD_BINARY_CHAR (c, dst);
5618 /* Everything else handled already */
5623 /* Attempted error recovery. */
5624 if (str->iso2022.output_direction_sequence)
5625 ensure_correct_direction (flags & CODING_STATE_R2L ?
5626 CHARSET_RIGHT_TO_LEFT :
5627 CHARSET_LEFT_TO_RIGHT,
5628 str->codesys, dst, 0, 1);
5629 /* More error recovery. */
5630 if (!retval || str->iso2022.output_literally)
5632 /* Output the (possibly invalid) sequence */
5634 COMPOSE_FLUSH_CHARS (str, dst);
5635 decode_flush_er_chars (str, dst);
5636 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5637 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5638 flags &= CODING_STATE_ISO2022_LOCK;
5640 n++, src--;/* Repeat the loop with the same character. */
5643 /* No sense in reprocessing the final byte of the
5644 escape sequence; it could mess things up anyway.
5646 COMPOSE_FLUSH_CHARS (str, dst);
5647 decode_flush_er_chars (str, dst);
5648 DECODE_ADD_BINARY_CHAR (c, dst);
5654 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5655 { /* Control characters */
5657 /***** Error-handling *****/
5659 /* If we were in the middle of a character, dump out the
5660 partial character. */
5663 COMPOSE_FLUSH_CHARS (str, dst);
5664 decode_flush_er_chars (str, dst);
5668 DECODE_ADD_BINARY_CHAR
5669 ((unsigned char)(cpos >> (counter * 8)), dst);
5674 /* If we just saw a single-shift character, dump it out.
5675 This may dump out the wrong sort of single-shift character,
5676 but least it will give an indication that something went
5678 if (flags & CODING_STATE_SS2)
5680 COMPOSE_FLUSH_CHARS (str, dst);
5681 decode_flush_er_chars (str, dst);
5682 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5683 flags &= ~CODING_STATE_SS2;
5685 if (flags & CODING_STATE_SS3)
5687 COMPOSE_FLUSH_CHARS (str, dst);
5688 decode_flush_er_chars (str, dst);
5689 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5690 flags &= ~CODING_STATE_SS3;
5693 /***** Now handle the control characters. *****/
5699 COMPOSE_FLUSH_CHARS (str, dst);
5700 decode_flush_er_chars (str, dst);
5701 if (eol_type == EOL_CR)
5702 Dynarr_add (dst, '\n');
5703 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5704 Dynarr_add (dst, c);
5706 flags |= CODING_STATE_CR;
5707 goto label_continue_loop;
5709 else if (flags & CODING_STATE_CR)
5710 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5712 Dynarr_add (dst, '\r');
5713 flags &= ~CODING_STATE_CR;
5716 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5719 flags &= CODING_STATE_ISO2022_LOCK;
5721 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5723 COMPOSE_FLUSH_CHARS (str, dst);
5724 decode_flush_er_chars (str, dst);
5725 DECODE_ADD_BINARY_CHAR (c, dst);
5729 { /* Graphic characters */
5730 Lisp_Object charset;
5739 COMPOSE_FLUSH_CHARS (str, dst);
5740 decode_flush_er_chars (str, dst);
5741 if (eol_type == EOL_CR)
5742 Dynarr_add (dst, '\n');
5743 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5744 Dynarr_add (dst, c);
5746 flags |= CODING_STATE_CR;
5747 goto label_continue_loop;
5749 else if (flags & CODING_STATE_CR)
5750 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5752 Dynarr_add (dst, '\r');
5753 flags &= ~CODING_STATE_CR;
5756 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5759 /* Now determine the charset. */
5760 reg = ((flags & CODING_STATE_SS2) ? 2
5761 : (flags & CODING_STATE_SS3) ? 3
5762 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5763 : str->iso2022.register_left);
5764 charset = str->iso2022.charset[reg];
5766 /* Error checking: */
5767 if (! CHARSETP (charset)
5768 || str->iso2022.invalid_designated[reg]
5769 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5770 && XCHARSET_CHARS (charset) == 94))
5771 /* Mrmph. We are trying to invoke a register that has no
5772 or an invalid charset in it, or trying to add a character
5773 outside the range of the charset. Insert that char literally
5774 to preserve it for the output. */
5776 COMPOSE_FLUSH_CHARS (str, dst);
5777 decode_flush_er_chars (str, dst);
5781 DECODE_ADD_BINARY_CHAR
5782 ((unsigned char)(cpos >> (counter * 8)), dst);
5785 DECODE_ADD_BINARY_CHAR (c, dst);
5790 /* Things are probably hunky-dorey. */
5792 /* Fetch reverse charset, maybe. */
5793 if (((flags & CODING_STATE_R2L) &&
5794 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5796 (!(flags & CODING_STATE_R2L) &&
5797 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5799 Lisp_Object new_charset =
5800 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5801 if (!NILP (new_charset))
5802 charset = new_charset;
5807 if (XCHARSET_DIMENSION (charset) == counter)
5809 COMPOSE_ADD_CHAR (str,
5810 DECODE_CHAR (charset,
5811 ((cpos & 0x7F7F7F) << 8)
5818 cpos = (cpos << 8) | c;
5820 lb = XCHARSET_LEADING_BYTE (charset);
5821 switch (XCHARSET_REP_BYTES (charset))
5824 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5825 Dynarr_add (dst, c & 0x7F);
5828 case 2: /* one-byte official */
5829 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5830 Dynarr_add (dst, lb);
5831 Dynarr_add (dst, c | 0x80);
5834 case 3: /* one-byte private or two-byte official */
5835 if (XCHARSET_PRIVATE_P (charset))
5837 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5838 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5839 Dynarr_add (dst, lb);
5840 Dynarr_add (dst, c | 0x80);
5846 Dynarr_add (dst, lb);
5847 Dynarr_add (dst, ch | 0x80);
5848 Dynarr_add (dst, c | 0x80);
5856 default: /* two-byte private */
5859 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5860 Dynarr_add (dst, lb);
5861 Dynarr_add (dst, ch | 0x80);
5862 Dynarr_add (dst, c | 0x80);
5872 flags &= CODING_STATE_ISO2022_LOCK;
5875 label_continue_loop:;
5878 if (flags & CODING_STATE_END)
5880 COMPOSE_FLUSH_CHARS (str, dst);
5881 decode_flush_er_chars (str, dst);
5882 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5886 str->counter = counter;
5890 /***** ISO2022 encoder *****/
5892 /* Designate CHARSET into register REG. */
5895 iso2022_designate (Lisp_Object charset, unsigned char reg,
5896 struct encoding_stream *str, unsigned_char_dynarr *dst)
5898 static const char inter94[] = "()*+";
5899 static const char inter96[] = ",-./";
5900 unsigned short chars;
5901 unsigned char dimension;
5902 unsigned char final;
5903 Lisp_Object old_charset = str->iso2022.charset[reg];
5905 str->iso2022.charset[reg] = charset;
5906 if (!CHARSETP (charset))
5907 /* charset might be an initial nil or t. */
5909 chars = XCHARSET_CHARS (charset);
5910 dimension = XCHARSET_DIMENSION (charset);
5911 final = XCHARSET_FINAL (charset);
5912 if (!str->iso2022.force_charset_on_output[reg] &&
5913 CHARSETP (old_charset) &&
5914 XCHARSET_CHARS (old_charset) == chars &&
5915 XCHARSET_DIMENSION (old_charset) == dimension &&
5916 XCHARSET_FINAL (old_charset) == final)
5919 str->iso2022.force_charset_on_output[reg] = 0;
5922 charset_conversion_spec_dynarr *dyn =
5923 str->codesys->iso2022.output_conv;
5929 for (i = 0; i < Dynarr_length (dyn); i++)
5931 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5932 if (EQ (charset, spec->from_charset))
5933 charset = spec->to_charset;
5938 Dynarr_add (dst, ISO_CODE_ESC);
5943 Dynarr_add (dst, inter94[reg]);
5946 Dynarr_add (dst, '$');
5948 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5951 Dynarr_add (dst, inter94[reg]);
5956 Dynarr_add (dst, inter96[reg]);
5959 Dynarr_add (dst, '$');
5960 Dynarr_add (dst, inter96[reg]);
5964 Dynarr_add (dst, final);
5968 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5970 if (str->iso2022.register_left != 0)
5972 Dynarr_add (dst, ISO_CODE_SI);
5973 str->iso2022.register_left = 0;
5978 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5980 if (str->iso2022.register_left != 1)
5982 Dynarr_add (dst, ISO_CODE_SO);
5983 str->iso2022.register_left = 1;
5988 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5989 unsigned_char_dynarr *dst, unsigned int *flags)
5991 unsigned char charmask;
5992 Lisp_Coding_System* codesys = str->codesys;
5993 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5995 Lisp_Object charset = str->iso2022.current_charset;
5996 int half = str->iso2022.current_half;
5997 int code_point = -1;
6001 restore_left_to_right_direction (codesys, dst, flags, 0);
6003 /* Make sure G0 contains ASCII */
6004 if ((ch > ' ' && ch < ISO_CODE_DEL)
6005 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
6007 ensure_normal_shift (str, dst);
6008 iso2022_designate (Vcharset_ascii, 0, str, dst);
6011 /* If necessary, restore everything to the default state
6013 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
6015 restore_left_to_right_direction (codesys, dst, flags, 0);
6017 ensure_normal_shift (str, dst);
6019 for (i = 0; i < 4; i++)
6021 Lisp_Object initial_charset =
6022 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6023 iso2022_designate (initial_charset, i, str, dst);
6028 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6029 Dynarr_add (dst, '\r');
6030 if (eol_type != EOL_CR)
6031 Dynarr_add (dst, ch);
6035 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6036 && fit_to_be_escape_quoted (ch))
6037 Dynarr_add (dst, ISO_CODE_ESC);
6038 Dynarr_add (dst, ch);
6041 else if ( (0x80 <= ch) && (ch <= 0x9f) )
6043 charmask = (half == 0 ? 0x00 : 0x80);
6045 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6046 && fit_to_be_escape_quoted (ch))
6047 Dynarr_add (dst, ISO_CODE_ESC);
6048 /* you asked for it ... */
6049 Dynarr_add (dst, ch);
6055 /* Now determine which register to use. */
6057 for (i = 0; i < 4; i++)
6059 if ((CHARSETP (charset = str->iso2022.charset[i])
6060 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6064 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6065 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6073 Lisp_Object original_default_coded_charset_priority_list
6074 = Vdefault_coded_charset_priority_list;
6075 Vdefault_coded_charset_priority_list
6076 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6077 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6079 code_point = ENCODE_CHAR (ch, charset);
6080 if (XCHARSET_FINAL (charset))
6082 Vdefault_coded_charset_priority_list
6083 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6084 Vdefault_coded_charset_priority_list));
6086 Vdefault_coded_charset_priority_list
6087 = original_default_coded_charset_priority_list;
6088 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6090 code_point = ENCODE_CHAR (ch, charset);
6091 if (XCHARSET_FINAL (charset))
6093 Vdefault_coded_charset_priority_list
6094 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6095 Vdefault_coded_charset_priority_list));
6097 code_point = ENCODE_CHAR (ch, charset);
6098 if (!XCHARSET_FINAL (charset))
6100 charset = Vcharset_ascii;
6104 Vdefault_coded_charset_priority_list
6105 = original_default_coded_charset_priority_list;
6107 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6108 codesys, dst, flags, 0);
6112 if (XCHARSET_GRAPHIC (charset) != 0)
6114 if (!NILP (str->iso2022.charset[1]) &&
6115 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6116 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6118 else if (!NILP (str->iso2022.charset[2]))
6120 else if (!NILP (str->iso2022.charset[3]))
6129 iso2022_designate (charset, reg, str, dst);
6131 /* Now invoke that register. */
6135 ensure_normal_shift (str, dst);
6139 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6141 ensure_shift_out (str, dst);
6148 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6150 Dynarr_add (dst, ISO_CODE_ESC);
6151 Dynarr_add (dst, 'N');
6156 Dynarr_add (dst, ISO_CODE_SS2);
6161 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6163 Dynarr_add (dst, ISO_CODE_ESC);
6164 Dynarr_add (dst, 'O');
6169 Dynarr_add (dst, ISO_CODE_SS3);
6177 charmask = (half == 0 ? 0x00 : 0x80);
6179 switch (XCHARSET_DIMENSION (charset))
6182 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6185 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6186 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6189 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6190 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6191 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6194 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6195 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6196 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6197 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6203 str->iso2022.current_charset = charset;
6204 str->iso2022.current_half = half;
6208 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6209 unsigned int *flags)
6211 Lisp_Coding_System* codesys = str->codesys;
6214 restore_left_to_right_direction (codesys, dst, flags, 0);
6215 ensure_normal_shift (str, dst);
6216 for (i = 0; i < 4; i++)
6218 Lisp_Object initial_charset
6219 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6220 iso2022_designate (initial_charset, i, str, dst);
6225 /************************************************************************/
6226 /* No-conversion methods */
6227 /************************************************************************/
6229 /* This is used when reading in "binary" files -- i.e. files that may
6230 contain all 256 possible byte values and that are not to be
6231 interpreted as being in any particular decoding. */
6233 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6234 unsigned_char_dynarr *dst, Lstream_data_count n)
6236 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6237 unsigned int flags = str->flags;
6238 unsigned int cpos = str->cpos;
6239 eol_type_t eol_type = str->eol_type;
6243 unsigned char c = *(unsigned char *)src++;
6245 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6246 DECODE_ADD_BINARY_CHAR (c, dst);
6247 label_continue_loop:;
6250 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6257 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6258 unsigned_char_dynarr *dst, Lstream_data_count n)
6261 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6262 unsigned int flags = str->flags;
6263 unsigned int ch = str->ch;
6264 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6266 unsigned char char_boundary = str->iso2022.current_char_boundary;
6273 if (char_boundary == 0)
6279 else if ( c >= 0xf8 )
6284 else if ( c >= 0xf0 )
6289 else if ( c >= 0xe0 )
6294 else if ( c >= 0xc0 )
6304 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6305 Dynarr_add (dst, '\r');
6306 if (eol_type != EOL_CR)
6307 Dynarr_add (dst, c);
6310 Dynarr_add (dst, c);
6313 else if (char_boundary == 1)
6315 ch = ( ch << 6 ) | ( c & 0x3f );
6316 Dynarr_add (dst, ch & 0xff);
6321 ch = ( ch << 6 ) | ( c & 0x3f );
6324 #else /* not UTF2000 */
6327 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6328 Dynarr_add (dst, '\r');
6329 if (eol_type != EOL_CR)
6330 Dynarr_add (dst, '\n');
6333 else if (BYTE_ASCII_P (c))
6336 Dynarr_add (dst, c);
6338 else if (BUFBYTE_LEADING_BYTE_P (c))
6341 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6342 c == LEADING_BYTE_CONTROL_1)
6345 Dynarr_add (dst, '~'); /* untranslatable character */
6349 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6350 Dynarr_add (dst, c);
6351 else if (ch == LEADING_BYTE_CONTROL_1)
6354 Dynarr_add (dst, c - 0x20);
6356 /* else it should be the second or third byte of an
6357 untranslatable character, so ignore it */
6360 #endif /* not UTF2000 */
6366 str->iso2022.current_char_boundary = char_boundary;
6372 /************************************************************************/
6373 /* Initialization */
6374 /************************************************************************/
6377 syms_of_file_coding (void)
6379 INIT_LRECORD_IMPLEMENTATION (coding_system);
6381 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6383 DEFSUBR (Fcoding_system_p);
6384 DEFSUBR (Ffind_coding_system);
6385 DEFSUBR (Fget_coding_system);
6386 DEFSUBR (Fcoding_system_list);
6387 DEFSUBR (Fcoding_system_name);
6388 DEFSUBR (Fmake_coding_system);
6389 DEFSUBR (Fcopy_coding_system);
6390 DEFSUBR (Fcoding_system_canonical_name_p);
6391 DEFSUBR (Fcoding_system_alias_p);
6392 DEFSUBR (Fcoding_system_aliasee);
6393 DEFSUBR (Fdefine_coding_system_alias);
6394 DEFSUBR (Fsubsidiary_coding_system);
6396 DEFSUBR (Fcoding_system_type);
6397 DEFSUBR (Fcoding_system_doc_string);
6399 DEFSUBR (Fcoding_system_charset);
6401 DEFSUBR (Fcoding_system_property);
6403 DEFSUBR (Fcoding_category_list);
6404 DEFSUBR (Fset_coding_priority_list);
6405 DEFSUBR (Fcoding_priority_list);
6406 DEFSUBR (Fset_coding_category_system);
6407 DEFSUBR (Fcoding_category_system);
6409 DEFSUBR (Fdetect_coding_region);
6410 DEFSUBR (Fdecode_coding_region);
6411 DEFSUBR (Fencode_coding_region);
6413 DEFSUBR (Fdecode_shift_jis_char);
6414 DEFSUBR (Fencode_shift_jis_char);
6415 DEFSUBR (Fdecode_big5_char);
6416 DEFSUBR (Fencode_big5_char);
6418 defsymbol (&Qcoding_systemp, "coding-system-p");
6419 defsymbol (&Qno_conversion, "no-conversion");
6420 defsymbol (&Qraw_text, "raw-text");
6422 defsymbol (&Qbig5, "big5");
6423 defsymbol (&Qshift_jis, "shift-jis");
6424 defsymbol (&Qucs4, "ucs-4");
6425 defsymbol (&Qutf8, "utf-8");
6426 defsymbol (&Qutf16, "utf-16");
6427 defsymbol (&Qccl, "ccl");
6428 defsymbol (&Qiso2022, "iso2022");
6430 defsymbol (&Qmnemonic, "mnemonic");
6431 defsymbol (&Qeol_type, "eol-type");
6432 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6433 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6435 defsymbol (&Qcr, "cr");
6436 defsymbol (&Qlf, "lf");
6437 defsymbol (&Qcrlf, "crlf");
6438 defsymbol (&Qeol_cr, "eol-cr");
6439 defsymbol (&Qeol_lf, "eol-lf");
6440 defsymbol (&Qeol_crlf, "eol-crlf");
6442 defsymbol (&Qcharset_g0, "charset-g0");
6443 defsymbol (&Qcharset_g1, "charset-g1");
6444 defsymbol (&Qcharset_g2, "charset-g2");
6445 defsymbol (&Qcharset_g3, "charset-g3");
6446 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6447 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6448 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6449 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6450 defsymbol (&Qno_iso6429, "no-iso6429");
6451 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6452 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6454 defsymbol (&Qshort, "short");
6455 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6456 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6457 defsymbol (&Qseven, "seven");
6458 defsymbol (&Qlock_shift, "lock-shift");
6459 defsymbol (&Qescape_quoted, "escape-quoted");
6462 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6463 defsymbol (&Qdisable_composition, "disable-composition");
6464 defsymbol (&Qenable_decomposition, "enable-decomposition");
6465 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6466 defsymbol (&Quse_entity_reference, "use-entity-reference");
6467 defsymbol (&Qd, "d");
6468 defsymbol (&Qx, "x");
6469 defsymbol (&QX, "X");
6471 defsymbol (&Qencode, "encode");
6472 defsymbol (&Qdecode, "decode");
6475 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6477 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6479 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6481 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6483 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6485 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6487 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6489 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6491 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6493 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6496 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6501 lstream_type_create_file_coding (void)
6503 LSTREAM_HAS_METHOD (decoding, reader);
6504 LSTREAM_HAS_METHOD (decoding, writer);
6505 LSTREAM_HAS_METHOD (decoding, rewinder);
6506 LSTREAM_HAS_METHOD (decoding, seekable_p);
6507 LSTREAM_HAS_METHOD (decoding, flusher);
6508 LSTREAM_HAS_METHOD (decoding, closer);
6509 LSTREAM_HAS_METHOD (decoding, marker);
6511 LSTREAM_HAS_METHOD (encoding, reader);
6512 LSTREAM_HAS_METHOD (encoding, writer);
6513 LSTREAM_HAS_METHOD (encoding, rewinder);
6514 LSTREAM_HAS_METHOD (encoding, seekable_p);
6515 LSTREAM_HAS_METHOD (encoding, flusher);
6516 LSTREAM_HAS_METHOD (encoding, closer);
6517 LSTREAM_HAS_METHOD (encoding, marker);
6521 vars_of_file_coding (void)
6525 fcd = xnew (struct file_coding_dump);
6526 dump_add_root_struct_ptr (&fcd, &fcd_description);
6528 /* Initialize to something reasonable ... */
6529 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6531 fcd->coding_category_system[i] = Qnil;
6532 fcd->coding_category_by_priority[i] = i;
6535 Fprovide (intern ("file-coding"));
6537 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6538 Coding system used for TTY keyboard input.
6539 Not used under a windowing system.
6541 Vkeyboard_coding_system = Qnil;
6543 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6544 Coding system used for TTY display output.
6545 Not used under a windowing system.
6547 Vterminal_coding_system = Qnil;
6549 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6550 Overriding coding system used when reading from a file or process.
6551 You should bind this variable with `let', but do not set it globally.
6552 If this is non-nil, it specifies the coding system that will be used
6553 to decode input on read operations, such as from a file or process.
6554 It overrides `buffer-file-coding-system-for-read',
6555 `insert-file-contents-pre-hook', etc. Use those variables instead of
6556 this one for permanent changes to the environment. */ );
6557 Vcoding_system_for_read = Qnil;
6559 DEFVAR_LISP ("coding-system-for-write",
6560 &Vcoding_system_for_write /*
6561 Overriding coding system used when writing to a file or process.
6562 You should bind this variable with `let', but do not set it globally.
6563 If this is non-nil, it specifies the coding system that will be used
6564 to encode output for write operations, such as to a file or process.
6565 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6566 Use those variables instead of this one for permanent changes to the
6568 Vcoding_system_for_write = Qnil;
6570 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6571 Coding system used to convert pathnames when accessing files.
6573 Vfile_name_coding_system = Qnil;
6575 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6576 &Vcoded_charset_entity_reference_alist /*
6577 Alist of coded-charset vs corresponding entity-reference.
6578 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6579 CCS is coded-charset.
6580 CODE-COLUMNS is columns of code-point of entity-reference.
6581 CODE-TYPE is format type of code-point of entity-reference.
6582 `d' means decimal value and `x' means hexadecimal value.
6584 Vcoded_charset_entity_reference_alist = Qnil;
6586 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6587 Non-nil means the buffer contents are regarded as multi-byte form
6588 of characters, not a binary code. This affects the display, file I/O,
6589 and behaviors of various editing commands.
6591 Setting this to nil does not do anything.
6593 enable_multibyte_characters = 1;
6596 DEFVAR_LISP ("decomposition-feature-list",
6597 &Vdecomposition_feature_list /*
6598 List of `=decomposition@FOO' feature to encode characters as IVS.
6600 Vdecomposition_feature_list = Qnil;
6605 complex_vars_of_file_coding (void)
6607 staticpro (&Vcoding_system_hash_table);
6608 Vcoding_system_hash_table =
6609 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6611 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6612 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6614 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6616 struct codesys_prop csp; \
6618 csp.prop_type = (Prop_Type); \
6619 Dynarr_add (the_codesys_prop_dynarr, csp); \
6622 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6623 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6624 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6625 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6626 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6627 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6628 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6630 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6631 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6632 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6633 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6634 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6635 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6636 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6637 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6638 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6639 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6640 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6641 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6642 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6643 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6644 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6645 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6646 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6648 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6651 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6652 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6654 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6655 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qenable_decomposition);
6656 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6659 /* Need to create this here or we're really screwed. */
6661 (Qraw_text, Qno_conversion,
6662 build_string ("Raw text, which means it converts only line-break-codes."),
6663 list2 (Qmnemonic, build_string ("Raw")));
6666 (Qbinary, Qno_conversion,
6667 build_string ("Binary, which means it does not convert anything."),
6668 list4 (Qeol_type, Qlf,
6669 Qmnemonic, build_string ("Binary")));
6675 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6676 list2 (Qmnemonic, build_string ("MTF8")));
6679 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6681 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6683 Fdefine_coding_system_alias (Qterminal, Qbinary);
6684 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6686 /* Need this for bootstrapping */
6687 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6688 Fget_coding_system (Qraw_text);
6691 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6692 = Fget_coding_system (Qutf_8_mcs);
6695 #if defined(MULE) && !defined(UTF2000)
6699 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6700 fcd->ucs_to_mule_table[i] = Qnil;
6702 staticpro (&mule_to_ucs_table);
6703 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6704 #endif /* defined(MULE) && !defined(UTF2000) */