1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002,2003,2004,2005,2008 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
40 #include "file-coding.h"
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf16, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Qccs_priority_list;
111 Lisp_Object Quse_entity_reference;
112 Lisp_Object Qd, Qx, QX;
114 Lisp_Object Qencode, Qdecode;
116 Lisp_Object Vcoding_system_hash_table;
118 int enable_multibyte_characters;
121 /* Additional information used by the ISO2022 decoder and detector. */
122 struct iso2022_decoder
124 /* CHARSET holds the character sets currently assigned to the G0
125 through G3 variables. It is initialized from the array
126 INITIAL_CHARSET in CODESYS. */
127 Lisp_Object charset[4];
129 /* Which registers are currently invoked into the left (GL) and
130 right (GR) halves of the 8-bit encoding space? */
131 int register_left, register_right;
133 /* ISO_ESC holds a value indicating part of an escape sequence
134 that has already been seen. */
135 enum iso_esc_flag esc;
137 /* This records the bytes we've seen so far in an escape sequence,
138 in case the sequence is invalid (we spit out the bytes unchanged). */
139 unsigned char esc_bytes[8];
141 /* Index for next byte to store in ISO escape sequence. */
144 #ifdef ENABLE_COMPOSITE_CHARS
145 /* Stuff seen so far when composing a string. */
146 unsigned_char_dynarr *composite_chars;
149 /* If we saw an invalid designation sequence for a particular
150 register, we flag it here and switch to ASCII. The next time we
151 see a valid designation for this register, we turn off the flag
152 and do the designation normally, but pretend the sequence was
153 invalid. The effect of all this is that (most of the time) the
154 escape sequences for both the switch to the unknown charset, and
155 the switch back to the known charset, get inserted literally into
156 the buffer and saved out as such. The hope is that we can
157 preserve the escape sequences so that the resulting written out
158 file makes sense. If we don't do any of this, the designation
159 to the invalid charset will be preserved but that switch back
160 to the known charset will probably get eaten because it was
161 the same charset that was already present in the register. */
162 unsigned char invalid_designated[4];
164 /* We try to do similar things as above for direction-switching
165 sequences. If we encountered a direction switch while an
166 invalid designation was present, or an invalid designation
167 just after a direction switch (i.e. no valid designation
168 encountered yet), we insert the direction-switch escape
169 sequence literally into the output stream, and later on
170 insert the corresponding direction-restoring escape sequence
172 unsigned int switched_dir_and_no_valid_charset_yet :1;
173 unsigned int invalid_switch_dir :1;
175 /* Tells the decoder to output the escape sequence literally
176 even though it was valid. Used in the games we play to
177 avoid lossage when we encounter invalid designations. */
178 unsigned int output_literally :1;
179 /* We encountered a direction switch followed by an invalid
180 designation. We didn't output the direction switch
181 literally because we didn't know about the invalid designation;
182 but we have to do so now. */
183 unsigned int output_direction_sequence :1;
186 EXFUN (Fcopy_coding_system, 2);
188 struct detection_state;
191 text_encode_generic (Lstream *encoding, const Bufbyte *src,
192 unsigned_char_dynarr *dst, Lstream_data_count n);
194 static int detect_coding_sjis (struct detection_state *st,
195 const Extbyte *src, Lstream_data_count n);
196 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
197 unsigned_char_dynarr *dst, Lstream_data_count n);
198 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
199 unsigned_char_dynarr *dst, unsigned int *flags);
200 void char_finish_shift_jis (struct encoding_stream *str,
201 unsigned_char_dynarr *dst, unsigned int *flags);
203 static int detect_coding_big5 (struct detection_state *st,
204 const Extbyte *src, Lstream_data_count n);
205 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
206 unsigned_char_dynarr *dst, Lstream_data_count n);
207 void char_encode_big5 (struct encoding_stream *str, Emchar c,
208 unsigned_char_dynarr *dst, unsigned int *flags);
209 void char_finish_big5 (struct encoding_stream *str,
210 unsigned_char_dynarr *dst, unsigned int *flags);
212 static int detect_coding_ucs4 (struct detection_state *st,
213 const Extbyte *src, Lstream_data_count n);
214 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
215 unsigned_char_dynarr *dst, Lstream_data_count n);
216 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
217 unsigned_char_dynarr *dst, unsigned int *flags);
218 void char_finish_ucs4 (struct encoding_stream *str,
219 unsigned_char_dynarr *dst, unsigned int *flags);
221 static int detect_coding_utf16 (struct detection_state *st,
222 const Extbyte *src, Lstream_data_count n);
223 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
224 unsigned_char_dynarr *dst, Lstream_data_count n);
225 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
226 unsigned_char_dynarr *dst, unsigned int *flags);
227 void char_finish_utf16 (struct encoding_stream *str,
228 unsigned_char_dynarr *dst, unsigned int *flags);
230 static int detect_coding_utf8 (struct detection_state *st,
231 const Extbyte *src, Lstream_data_count n);
232 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
233 unsigned_char_dynarr *dst, Lstream_data_count n);
234 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
235 unsigned_char_dynarr *dst, unsigned int *flags);
236 void char_finish_utf8 (struct encoding_stream *str,
237 unsigned_char_dynarr *dst, unsigned int *flags);
239 static int postprocess_iso2022_mask (int mask);
240 static void reset_iso2022 (Lisp_Object coding_system,
241 struct iso2022_decoder *iso);
242 static int detect_coding_iso2022 (struct detection_state *st,
243 const Extbyte *src, Lstream_data_count n);
244 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
245 unsigned_char_dynarr *dst, Lstream_data_count n);
246 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
247 unsigned_char_dynarr *dst, unsigned int *flags);
248 void char_finish_iso2022 (struct encoding_stream *str,
249 unsigned_char_dynarr *dst, unsigned int *flags);
251 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
252 unsigned_char_dynarr *dst, Lstream_data_count n);
253 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
254 unsigned_char_dynarr *dst, Lstream_data_count n);
255 static void mule_decode (Lstream *decoding, const Extbyte *src,
256 unsigned_char_dynarr *dst, Lstream_data_count n);
257 static void mule_encode (Lstream *encoding, const Bufbyte *src,
258 unsigned_char_dynarr *dst, Lstream_data_count n);
260 typedef struct codesys_prop codesys_prop;
269 Dynarr_declare (codesys_prop);
270 } codesys_prop_dynarr;
272 static const struct lrecord_description codesys_prop_description_1[] = {
273 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
277 static const struct struct_description codesys_prop_description = {
278 sizeof (codesys_prop),
279 codesys_prop_description_1
282 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
283 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
287 static const struct struct_description codesys_prop_dynarr_description = {
288 sizeof (codesys_prop_dynarr),
289 codesys_prop_dynarr_description_1
292 codesys_prop_dynarr *the_codesys_prop_dynarr;
294 enum codesys_prop_enum
297 CODESYS_PROP_ISO2022,
302 /************************************************************************/
303 /* Coding system functions */
304 /************************************************************************/
306 static Lisp_Object mark_coding_system (Lisp_Object);
307 static void print_coding_system (Lisp_Object, Lisp_Object, int);
308 static void finalize_coding_system (void *header, int for_disksave);
311 static const struct lrecord_description ccs_description_1[] = {
312 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
313 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
317 static const struct struct_description ccs_description = {
318 sizeof (charset_conversion_spec),
322 static const struct lrecord_description ccsd_description_1[] = {
323 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
327 static const struct struct_description ccsd_description = {
328 sizeof (charset_conversion_spec_dynarr),
333 static const struct lrecord_description coding_system_description[] = {
334 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
343 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
344 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
345 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
346 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
347 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
349 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
355 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
356 mark_coding_system, print_coding_system,
357 finalize_coding_system,
358 0, 0, coding_system_description,
362 mark_coding_system (Lisp_Object obj)
364 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
366 mark_object (CODING_SYSTEM_NAME (codesys));
367 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
368 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
369 mark_object (CODING_SYSTEM_EOL_LF (codesys));
370 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
371 mark_object (CODING_SYSTEM_EOL_CR (codesys));
373 switch (CODING_SYSTEM_TYPE (codesys))
377 case CODESYS_ISO2022:
378 for (i = 0; i < 4; i++)
379 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
380 if (codesys->iso2022.input_conv)
382 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
384 struct charset_conversion_spec *ccs =
385 Dynarr_atp (codesys->iso2022.input_conv, i);
386 mark_object (ccs->from_charset);
387 mark_object (ccs->to_charset);
390 if (codesys->iso2022.output_conv)
392 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
394 struct charset_conversion_spec *ccs =
395 Dynarr_atp (codesys->iso2022.output_conv, i);
396 mark_object (ccs->from_charset);
397 mark_object (ccs->to_charset);
404 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
405 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
410 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
411 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
418 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
420 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
422 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
426 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
429 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
431 error ("printing unreadable object #<coding_system 0x%x>",
434 write_c_string ("#<coding_system ", printcharfun);
435 print_internal (c->name, printcharfun, 1);
436 write_c_string (">", printcharfun);
440 finalize_coding_system (void *header, int for_disksave)
442 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
443 /* Since coding systems never go away, this function is not
444 necessary. But it would be necessary if we changed things
445 so that coding systems could go away. */
446 if (!for_disksave) /* see comment in lstream.c */
448 switch (CODING_SYSTEM_TYPE (c))
451 case CODESYS_ISO2022:
452 if (c->iso2022.input_conv)
454 Dynarr_free (c->iso2022.input_conv);
455 c->iso2022.input_conv = 0;
457 if (c->iso2022.output_conv)
459 Dynarr_free (c->iso2022.output_conv);
460 c->iso2022.output_conv = 0;
471 symbol_to_eol_type (Lisp_Object symbol)
473 CHECK_SYMBOL (symbol);
474 if (NILP (symbol)) return EOL_AUTODETECT;
475 if (EQ (symbol, Qlf)) return EOL_LF;
476 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
477 if (EQ (symbol, Qcr)) return EOL_CR;
479 signal_simple_error ("Unrecognized eol type", symbol);
480 return EOL_AUTODETECT; /* not reached */
484 eol_type_to_symbol (eol_type_t type)
489 case EOL_LF: return Qlf;
490 case EOL_CRLF: return Qcrlf;
491 case EOL_CR: return Qcr;
492 case EOL_AUTODETECT: return Qnil;
497 setup_eol_coding_systems (Lisp_Coding_System *codesys)
499 Lisp_Object codesys_obj;
500 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
501 char *codesys_name = (char *) alloca (len + 7);
503 char *codesys_mnemonic=0;
505 Lisp_Object codesys_name_sym, sub_codesys_obj;
509 XSETCODING_SYSTEM (codesys_obj, codesys);
511 memcpy (codesys_name,
512 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
514 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
516 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
517 codesys_mnemonic = (char *) alloca (mlen + 7);
518 memcpy (codesys_mnemonic,
519 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
522 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
523 strcpy (codesys_name + len, "-" op_sys); \
525 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
526 codesys_name_sym = intern (codesys_name); \
527 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
528 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
530 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
531 build_string (codesys_mnemonic); \
532 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
535 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
536 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
537 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
540 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
541 Return t if OBJECT is a coding system.
542 A coding system is an object that defines how text containing multiple
543 character sets is encoded into a stream of (typically 8-bit) bytes.
544 The coding system is used to decode the stream into a series of
545 characters (which may be from multiple charsets) when the text is read
546 from a file or process, and is used to encode the text back into the
547 same format when it is written out to a file or process.
549 For example, many ISO2022-compliant coding systems (such as Compound
550 Text, which is used for inter-client data under the X Window System)
551 use escape sequences to switch between different charsets -- Japanese
552 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
553 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
554 `make-coding-system' for more information.
556 Coding systems are normally identified using a symbol, and the
557 symbol is accepted in place of the actual coding system object whenever
558 a coding system is called for. (This is similar to how faces work.)
562 return CODING_SYSTEMP (object) ? Qt : Qnil;
565 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
566 Retrieve the coding system of the given name.
568 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
569 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
570 If there is no such coding system, nil is returned. Otherwise the
571 associated coding system object is returned.
573 (coding_system_or_name))
575 if (NILP (coding_system_or_name))
576 coding_system_or_name = Qbinary;
577 else if (CODING_SYSTEMP (coding_system_or_name))
578 return coding_system_or_name;
580 CHECK_SYMBOL (coding_system_or_name);
584 coding_system_or_name =
585 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
587 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
588 return coding_system_or_name;
592 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
593 Retrieve the coding system of the given name.
594 Same as `find-coding-system' except that if there is no such
595 coding system, an error is signaled instead of returning nil.
599 Lisp_Object coding_system = Ffind_coding_system (name);
601 if (NILP (coding_system))
602 signal_simple_error ("No such coding system", name);
603 return coding_system;
606 /* We store the coding systems in hash tables with the names as the key and the
607 actual coding system object as the value. Occasionally we need to use them
608 in a list format. These routines provide us with that. */
609 struct coding_system_list_closure
611 Lisp_Object *coding_system_list;
615 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
616 void *coding_system_list_closure)
618 /* This function can GC */
619 struct coding_system_list_closure *cscl =
620 (struct coding_system_list_closure *) coding_system_list_closure;
621 Lisp_Object *coding_system_list = cscl->coding_system_list;
623 *coding_system_list = Fcons (key, *coding_system_list);
627 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
628 Return a list of the names of all defined coding systems.
632 Lisp_Object coding_system_list = Qnil;
634 struct coding_system_list_closure coding_system_list_closure;
636 GCPRO1 (coding_system_list);
637 coding_system_list_closure.coding_system_list = &coding_system_list;
638 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
639 &coding_system_list_closure);
642 return coding_system_list;
645 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
646 Return the name of the given coding system.
650 coding_system = Fget_coding_system (coding_system);
651 return XCODING_SYSTEM_NAME (coding_system);
654 static Lisp_Coding_System *
655 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
657 Lisp_Coding_System *codesys =
658 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
660 zero_lcrecord (codesys);
661 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
662 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
663 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
664 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
665 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
666 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
667 CODING_SYSTEM_TYPE (codesys) = type;
668 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
671 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
673 if (type == CODESYS_ISO2022)
676 for (i = 0; i < 4; i++)
677 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
680 if (type == CODESYS_UTF8)
682 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
684 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
686 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
688 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
691 else if (type == CODESYS_BIG5)
693 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
695 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
696 = Vcharset_chinese_big5;
697 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
699 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
703 else if (type == CODESYS_CCL)
705 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
706 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
709 CODING_SYSTEM_NAME (codesys) = name;
715 /* Given a list of charset conversion specs as specified in a Lisp
716 program, parse it into STORE_HERE. */
719 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
720 Lisp_Object spec_list)
724 EXTERNAL_LIST_LOOP (rest, spec_list)
726 Lisp_Object car = XCAR (rest);
727 Lisp_Object from, to;
728 struct charset_conversion_spec spec;
730 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
731 signal_simple_error ("Invalid charset conversion spec", car);
732 from = Fget_charset (XCAR (car));
733 to = Fget_charset (XCAR (XCDR (car)));
734 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
735 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
736 signal_simple_error_2
737 ("Attempted conversion between different charset types",
739 spec.from_charset = from;
740 spec.to_charset = to;
742 Dynarr_add (store_here, spec);
746 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
747 specs, return the equivalent as the Lisp programmer would see it.
749 If LOAD_HERE is 0, return Qnil. */
752 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
759 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
761 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
762 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
765 return Fnreverse (result);
770 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
771 Register symbol NAME as a coding system.
773 TYPE describes the conversion method used and should be one of
776 Automatic conversion. XEmacs attempts to detect the coding system
779 No conversion. Use this for binary files and such. On output,
780 graphic characters that are not in ASCII or Latin-1 will be
781 replaced by a ?. (For a no-conversion-encoded buffer, these
782 characters will only be present if you explicitly insert them.)
784 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
786 ISO 10646 UCS-4 encoding.
788 ISO 10646 UTF-8 encoding.
790 Any ISO2022-compliant encoding. Among other things, this includes
791 JIS (the Japanese encoding commonly used for e-mail), EUC (the
792 standard Unix encoding for Japanese and other languages), and
793 Compound Text (the encoding used in X11). You can specify more
794 specific information about the conversion with the PROPS argument.
796 Big5 (the encoding commonly used for Taiwanese).
798 The conversion is performed using a user-written pseudo-code
799 program. CCL (Code Conversion Language) is the name of this
802 Write out or read in the raw contents of the memory representing
803 the buffer's text. This is primarily useful for debugging
804 purposes, and is only enabled when XEmacs has been compiled with
805 DEBUG_XEMACS defined (via the --debug configure option).
806 WARNING: Reading in a file using 'internal conversion can result
807 in an internal inconsistency in the memory representing a
808 buffer's text, which will produce unpredictable results and may
809 cause XEmacs to crash. Under normal circumstances you should
810 never use 'internal conversion.
812 DOC-STRING is a string describing the coding system.
814 PROPS is a property list, describing the specific nature of the
815 character set. Recognized properties are:
818 String to be displayed in the modeline when this coding system is
822 End-of-line conversion to be used. It should be one of
825 Automatically detect the end-of-line type (LF, CRLF,
826 or CR). Also generate subsidiary coding systems named
827 `NAME-unix', `NAME-dos', and `NAME-mac', that are
828 identical to this coding system but have an EOL-TYPE
829 value of 'lf, 'crlf, and 'cr, respectively.
831 The end of a line is marked externally using ASCII LF.
832 Since this is also the way that XEmacs represents an
833 end-of-line internally, specifying this option results
834 in no end-of-line conversion. This is the standard
835 format for Unix text files.
837 The end of a line is marked externally using ASCII
838 CRLF. This is the standard format for MS-DOS text
841 The end of a line is marked externally using ASCII CR.
842 This is the standard format for Macintosh text files.
844 Automatically detect the end-of-line type but do not
845 generate subsidiary coding systems. (This value is
846 converted to nil when stored internally, and
847 `coding-system-property' will return nil.)
850 If non-nil, composition/decomposition for combining characters
853 'use-entity-reference
854 If non-nil, SGML style entity-reference is used for non-system-characters.
856 'post-read-conversion
857 Function called after a file has been read in, to perform the
858 decoding. Called with two arguments, START and END, denoting
859 a region of the current buffer to be decoded.
861 'pre-write-conversion
862 Function called before a file is written out, to perform the
863 encoding. Called with two arguments, START and END, denoting
864 a region of the current buffer to be encoded.
867 The following additional properties are recognized if TYPE is 'iso2022:
873 The character set initially designated to the G0 - G3 registers.
874 The value should be one of
876 -- A charset object (designate that character set)
877 -- nil (do not ever use this register)
878 -- t (no character set is initially designated to
879 the register, but may be later on; this automatically
880 sets the corresponding `force-g*-on-output' property)
886 If non-nil, send an explicit designation sequence on output before
887 using the specified register.
890 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
891 "ESC $ B" on output in place of the full designation sequences
892 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
895 If non-nil, don't designate ASCII to G0 at each end of line on output.
896 Setting this to non-nil also suppresses other state-resetting that
897 normally happens at the end of a line.
900 If non-nil, don't designate ASCII to G0 before control chars on output.
903 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
907 If non-nil, use locking-shift (SO/SI) instead of single-shift
908 or designation by escape sequence.
911 If non-nil, don't use ISO6429's direction specification.
914 If non-nil, literal control characters that are the same as
915 the beginning of a recognized ISO2022 or ISO6429 escape sequence
916 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
917 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
918 so that they can be properly distinguished from an escape sequence.
919 (Note that doing this results in a non-portable encoding.) This
920 encoding flag is used for byte-compiled files. Note that ESC
921 is a good choice for a quoting character because there are no
922 escape sequences whose second byte is a character from the Control-0
923 or Control-1 character sets; this is explicitly disallowed by the
926 'input-charset-conversion
927 A list of conversion specifications, specifying conversion of
928 characters in one charset to another when decoding is performed.
929 Each specification is a list of two elements: the source charset,
930 and the destination charset.
932 'output-charset-conversion
933 A list of conversion specifications, specifying conversion of
934 characters in one charset to another when encoding is performed.
935 The form of each specification is the same as for
936 'input-charset-conversion.
939 The following additional properties are recognized (and required)
943 CCL program used for decoding (converting to internal format).
946 CCL program used for encoding (converting to external format).
948 (name, type, doc_string, props))
950 Lisp_Coding_System *codesys;
951 enum coding_system_type ty;
952 int need_to_setup_eol_systems = 1;
954 /* Convert type to constant */
955 if (NILP (type) || EQ (type, Qundecided))
956 { ty = CODESYS_AUTODETECT; }
958 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
959 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
960 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
961 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
962 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
963 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
964 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
966 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
968 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
971 signal_simple_error ("Invalid coding system type", type);
975 codesys = allocate_coding_system (ty, name);
977 if (NILP (doc_string))
978 doc_string = build_string ("");
980 CHECK_STRING (doc_string);
981 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
984 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
986 if (EQ (key, Qmnemonic))
989 CHECK_STRING (value);
990 CODING_SYSTEM_MNEMONIC (codesys) = value;
993 else if (EQ (key, Qeol_type))
995 need_to_setup_eol_systems = NILP (value);
998 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1001 else if (EQ (key, Qpost_read_conversion))
1002 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1003 else if (EQ (key, Qpre_write_conversion))
1004 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1006 else if (EQ (key, Qdisable_composition))
1007 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1008 else if (EQ (key, Quse_entity_reference))
1009 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1012 else if (ty == CODESYS_ISO2022)
1014 #define FROB_INITIAL_CHARSET(charset_num) \
1015 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1016 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1018 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1019 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1020 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1021 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1023 #define FROB_FORCE_CHARSET(charset_num) \
1024 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1026 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1027 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1028 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1029 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1031 #define FROB_BOOLEAN_PROPERTY(prop) \
1032 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1034 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1035 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1036 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1037 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1038 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1039 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1040 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1042 else if (EQ (key, Qinput_charset_conversion))
1044 codesys->iso2022.input_conv =
1045 Dynarr_new (charset_conversion_spec);
1046 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1049 else if (EQ (key, Qoutput_charset_conversion))
1051 codesys->iso2022.output_conv =
1052 Dynarr_new (charset_conversion_spec);
1053 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1057 else if (EQ (key, Qccs_priority_list))
1059 codesys->ccs_priority_list = value;
1063 signal_simple_error ("Unrecognized property", key);
1066 else if (ty == CODESYS_UTF8)
1068 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1069 else if (EQ (key, Qcharset_g1))
1070 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1071 else if (EQ (key, Qcharset_g2))
1072 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1074 signal_simple_error ("Unrecognized property", key);
1076 else if (ty == CODESYS_BIG5)
1078 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1079 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1081 signal_simple_error ("Unrecognized property", key);
1084 else if (EQ (type, Qccl))
1087 struct ccl_program test_ccl;
1090 /* Check key first. */
1091 if (EQ (key, Qdecode))
1092 suffix = "-ccl-decode";
1093 else if (EQ (key, Qencode))
1094 suffix = "-ccl-encode";
1096 signal_simple_error ("Unrecognized property", key);
1098 /* If value is vector, register it as a ccl program
1099 associated with an newly created symbol for
1100 backward compatibility. */
1101 if (VECTORP (value))
1103 sym = Fintern (concat2 (Fsymbol_name (name),
1104 build_string (suffix)),
1106 Fregister_ccl_program (sym, value);
1110 CHECK_SYMBOL (value);
1113 /* check if the given ccl programs are valid. */
1114 if (setup_ccl_program (&test_ccl, sym) < 0)
1115 signal_simple_error ("Invalid CCL program", value);
1117 if (EQ (key, Qdecode))
1118 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1119 else if (EQ (key, Qencode))
1120 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1125 signal_simple_error ("Unrecognized property", key);
1129 if (need_to_setup_eol_systems)
1130 setup_eol_coding_systems (codesys);
1133 Lisp_Object codesys_obj;
1134 XSETCODING_SYSTEM (codesys_obj, codesys);
1135 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1140 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1141 Copy OLD-CODING-SYSTEM to NEW-NAME.
1142 If NEW-NAME does not name an existing coding system, a new one will
1145 (old_coding_system, new_name))
1147 Lisp_Object new_coding_system;
1148 old_coding_system = Fget_coding_system (old_coding_system);
1149 new_coding_system = Ffind_coding_system (new_name);
1150 if (NILP (new_coding_system))
1152 XSETCODING_SYSTEM (new_coding_system,
1153 allocate_coding_system
1154 (XCODING_SYSTEM_TYPE (old_coding_system),
1156 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1160 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1161 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1162 memcpy (((char *) to ) + sizeof (to->header),
1163 ((char *) from) + sizeof (from->header),
1164 sizeof (*from) - sizeof (from->header));
1165 to->name = new_name;
1167 return new_coding_system;
1170 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1171 Return t if OBJECT names a coding system, and is not a coding system alias.
1175 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1179 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1180 Return t if OBJECT is a coding system alias.
1181 All coding system aliases are created by `define-coding-system-alias'.
1185 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1189 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1190 Return the coding-system symbol for which symbol ALIAS is an alias.
1194 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1195 if (SYMBOLP (aliasee))
1198 signal_simple_error ("Symbol is not a coding system alias", alias);
1199 return Qnil; /* To keep the compiler happy */
1203 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1205 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1209 /* A maphash function, for removing dangling coding system aliases. */
1211 dangling_coding_system_alias_p (Lisp_Object alias,
1212 Lisp_Object aliasee,
1213 void *dangling_aliases)
1215 if (SYMBOLP (aliasee)
1216 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1218 (*(int *) dangling_aliases)++;
1225 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1226 Define symbol ALIAS as an alias for coding system ALIASEE.
1228 You can use this function to redefine an alias that has already been defined,
1229 but you cannot redefine a name which is the canonical name for a coding system.
1230 \(a canonical name of a coding system is what is returned when you call
1231 `coding-system-name' on a coding system).
1233 ALIASEE itself can be an alias, which allows you to define nested aliases.
1235 You are forbidden, however, from creating alias loops or `dangling' aliases.
1236 These will be detected, and an error will be signaled if you attempt to do so.
1238 If ALIASEE is nil, then ALIAS will simply be undefined.
1240 See also `coding-system-alias-p', `coding-system-aliasee',
1241 and `coding-system-canonical-name-p'.
1245 Lisp_Object real_coding_system, probe;
1247 CHECK_SYMBOL (alias);
1249 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1251 ("Symbol is the canonical name of a coding system and cannot be redefined",
1256 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1257 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1258 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1260 Fremhash (alias, Vcoding_system_hash_table);
1262 /* Undefine subsidiary aliases,
1263 presumably created by a previous call to this function */
1264 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1265 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1266 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1268 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1269 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1270 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1273 /* Undefine dangling coding system aliases. */
1275 int dangling_aliases;
1278 dangling_aliases = 0;
1279 elisp_map_remhash (dangling_coding_system_alias_p,
1280 Vcoding_system_hash_table,
1282 } while (dangling_aliases > 0);
1288 if (CODING_SYSTEMP (aliasee))
1289 aliasee = XCODING_SYSTEM_NAME (aliasee);
1291 /* Checks that aliasee names a coding-system */
1292 real_coding_system = Fget_coding_system (aliasee);
1294 /* Check for coding system alias loops */
1295 if (EQ (alias, aliasee))
1296 alias_loop: signal_simple_error_2
1297 ("Attempt to create a coding system alias loop", alias, aliasee);
1299 for (probe = aliasee;
1301 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1303 if (EQ (probe, alias))
1307 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1309 /* Set up aliases for subsidiaries.
1310 #### There must be a better way to handle subsidiary coding systems. */
1312 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1314 for (i = 0; i < countof (suffixes); i++)
1316 Lisp_Object alias_subsidiary =
1317 append_suffix_to_symbol (alias, suffixes[i]);
1318 Lisp_Object aliasee_subsidiary =
1319 append_suffix_to_symbol (aliasee, suffixes[i]);
1321 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1322 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1325 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1326 but it doesn't look intentional, so I'd rather return something
1327 meaningful or nothing at all. */
1332 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1334 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1335 Lisp_Object new_coding_system;
1337 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1338 return coding_system;
1342 case EOL_AUTODETECT: return coding_system;
1343 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1344 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1345 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1346 default: abort (); return Qnil;
1349 return NILP (new_coding_system) ? coding_system : new_coding_system;
1352 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1353 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1355 (coding_system, eol_type))
1357 coding_system = Fget_coding_system (coding_system);
1359 return subsidiary_coding_system (coding_system,
1360 symbol_to_eol_type (eol_type));
1364 /************************************************************************/
1365 /* Coding system accessors */
1366 /************************************************************************/
1368 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1369 Return the doc string for CODING-SYSTEM.
1373 coding_system = Fget_coding_system (coding_system);
1374 return XCODING_SYSTEM_DOC_STRING (coding_system);
1377 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1378 Return the type of CODING-SYSTEM.
1382 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1385 case CODESYS_AUTODETECT: return Qundecided;
1387 case CODESYS_SHIFT_JIS: return Qshift_jis;
1388 case CODESYS_ISO2022: return Qiso2022;
1389 case CODESYS_BIG5: return Qbig5;
1390 case CODESYS_UCS4: return Qucs4;
1391 case CODESYS_UTF16: return Qutf16;
1392 case CODESYS_UTF8: return Qutf8;
1393 case CODESYS_CCL: return Qccl;
1395 case CODESYS_NO_CONVERSION: return Qno_conversion;
1397 case CODESYS_INTERNAL: return Qinternal;
1404 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1407 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1409 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1412 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1413 Return initial charset of CODING-SYSTEM designated to GNUM.
1416 (coding_system, gnum))
1418 coding_system = Fget_coding_system (coding_system);
1421 return coding_system_charset (coding_system, XINT (gnum));
1425 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1426 Return the PROP property of CODING-SYSTEM.
1428 (coding_system, prop))
1431 enum coding_system_type type;
1433 coding_system = Fget_coding_system (coding_system);
1434 CHECK_SYMBOL (prop);
1435 type = XCODING_SYSTEM_TYPE (coding_system);
1437 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1438 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1441 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1443 case CODESYS_PROP_ALL_OK:
1446 case CODESYS_PROP_ISO2022:
1447 if (type != CODESYS_ISO2022)
1449 ("Property only valid in ISO2022 coding systems",
1453 case CODESYS_PROP_CCL:
1454 if (type != CODESYS_CCL)
1456 ("Property only valid in CCL coding systems",
1466 signal_simple_error ("Unrecognized property", prop);
1468 if (EQ (prop, Qname))
1469 return XCODING_SYSTEM_NAME (coding_system);
1470 else if (EQ (prop, Qtype))
1471 return Fcoding_system_type (coding_system);
1472 else if (EQ (prop, Qdoc_string))
1473 return XCODING_SYSTEM_DOC_STRING (coding_system);
1474 else if (EQ (prop, Qmnemonic))
1475 return XCODING_SYSTEM_MNEMONIC (coding_system);
1476 else if (EQ (prop, Qeol_type))
1477 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1478 else if (EQ (prop, Qeol_lf))
1479 return XCODING_SYSTEM_EOL_LF (coding_system);
1480 else if (EQ (prop, Qeol_crlf))
1481 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1482 else if (EQ (prop, Qeol_cr))
1483 return XCODING_SYSTEM_EOL_CR (coding_system);
1484 else if (EQ (prop, Qpost_read_conversion))
1485 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1486 else if (EQ (prop, Qpre_write_conversion))
1487 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1490 else if (EQ (prop, Qdisable_composition))
1491 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1492 else if (EQ (prop, Quse_entity_reference))
1493 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1494 else if (EQ (prop, Qccs_priority_list))
1495 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1497 else if (type == CODESYS_ISO2022)
1499 if (EQ (prop, Qcharset_g0))
1500 return coding_system_charset (coding_system, 0);
1501 else if (EQ (prop, Qcharset_g1))
1502 return coding_system_charset (coding_system, 1);
1503 else if (EQ (prop, Qcharset_g2))
1504 return coding_system_charset (coding_system, 2);
1505 else if (EQ (prop, Qcharset_g3))
1506 return coding_system_charset (coding_system, 3);
1508 #define FORCE_CHARSET(charset_num) \
1509 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1510 (coding_system, charset_num) ? Qt : Qnil)
1512 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1513 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1514 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1515 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1517 #define LISP_BOOLEAN(prop) \
1518 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1520 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1521 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1522 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1523 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1524 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1525 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1526 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1528 else if (EQ (prop, Qinput_charset_conversion))
1530 unparse_charset_conversion_specs
1531 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1532 else if (EQ (prop, Qoutput_charset_conversion))
1534 unparse_charset_conversion_specs
1535 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1539 else if (type == CODESYS_CCL)
1541 if (EQ (prop, Qdecode))
1542 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1543 else if (EQ (prop, Qencode))
1544 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1552 return Qnil; /* not reached */
1556 /************************************************************************/
1557 /* Coding category functions */
1558 /************************************************************************/
1561 decode_coding_category (Lisp_Object symbol)
1565 CHECK_SYMBOL (symbol);
1566 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1567 if (EQ (coding_category_symbol[i], symbol))
1570 signal_simple_error ("Unrecognized coding category", symbol);
1571 return 0; /* not reached */
1574 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1575 Return a list of all recognized coding categories.
1580 Lisp_Object list = Qnil;
1582 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1583 list = Fcons (coding_category_symbol[i], list);
1587 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1588 Change the priority order of the coding categories.
1589 LIST should be list of coding categories, in descending order of
1590 priority. Unspecified coding categories will be lower in priority
1591 than all specified ones, in the same relative order they were in
1596 int category_to_priority[CODING_CATEGORY_LAST];
1600 /* First generate a list that maps coding categories to priorities. */
1602 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1603 category_to_priority[i] = -1;
1605 /* Highest priority comes from the specified list. */
1607 EXTERNAL_LIST_LOOP (rest, list)
1609 int cat = decode_coding_category (XCAR (rest));
1611 if (category_to_priority[cat] >= 0)
1612 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1613 category_to_priority[cat] = i++;
1616 /* Now go through the existing categories by priority to retrieve
1617 the categories not yet specified and preserve their priority
1619 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1621 int cat = fcd->coding_category_by_priority[j];
1622 if (category_to_priority[cat] < 0)
1623 category_to_priority[cat] = i++;
1626 /* Now we need to construct the inverse of the mapping we just
1629 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1630 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1632 /* Phew! That was confusing. */
1636 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1637 Return a list of coding categories in descending order of priority.
1642 Lisp_Object list = Qnil;
1644 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1645 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1650 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1651 Change the coding system associated with a coding category.
1653 (coding_category, coding_system))
1655 int cat = decode_coding_category (coding_category);
1657 coding_system = Fget_coding_system (coding_system);
1658 fcd->coding_category_system[cat] = coding_system;
1662 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1663 Return the coding system associated with a coding category.
1667 int cat = decode_coding_category (coding_category);
1668 Lisp_Object sys = fcd->coding_category_system[cat];
1671 return XCODING_SYSTEM_NAME (sys);
1676 /************************************************************************/
1677 /* Detecting the encoding of data */
1678 /************************************************************************/
1680 struct detection_state
1682 eol_type_t eol_type;
1725 struct iso2022_decoder iso;
1727 int high_byte_count;
1728 unsigned int saw_single_shift:1;
1741 acceptable_control_char_p (int c)
1745 /* Allow and ignore control characters that you might
1746 reasonably see in a text file */
1751 case 8: /* backspace */
1752 case 11: /* vertical tab */
1753 case 12: /* form feed */
1754 case 26: /* MS-DOS C-z junk */
1755 case 31: /* '^_' -- for info */
1763 mask_has_at_most_one_bit_p (int mask)
1765 /* Perhaps the only thing useful you learn from intensive Microsoft
1766 technical interviews */
1767 return (mask & (mask - 1)) == 0;
1771 detect_eol_type (struct detection_state *st, const Extbyte *src,
1772 Lstream_data_count n)
1776 unsigned char c = *(unsigned char *)src++;
1779 if (st->eol.just_saw_cr)
1781 else if (st->eol.seen_anything)
1784 else if (st->eol.just_saw_cr)
1787 st->eol.just_saw_cr = 1;
1789 st->eol.just_saw_cr = 0;
1790 st->eol.seen_anything = 1;
1793 return EOL_AUTODETECT;
1796 /* Attempt to determine the encoding and EOL type of the given text.
1797 Before calling this function for the first type, you must initialize
1798 st->eol_type as appropriate and initialize st->mask to ~0.
1800 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1803 st->mask holds the determined coding category mask, or ~0 if only
1804 ASCII has been seen so far.
1808 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1809 is present in st->mask
1810 1 == definitive answers are here for both st->eol_type and st->mask
1814 detect_coding_type (struct detection_state *st, const Extbyte *src,
1815 Lstream_data_count n, int just_do_eol)
1817 if (st->eol_type == EOL_AUTODETECT)
1818 st->eol_type = detect_eol_type (st, src, n);
1821 return st->eol_type != EOL_AUTODETECT;
1823 if (!st->seen_non_ascii)
1825 for (; n; n--, src++)
1827 unsigned char c = *(unsigned char *) src;
1828 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1830 st->seen_non_ascii = 1;
1832 st->shift_jis.mask = ~0;
1835 st->utf16.mask = ~0;
1837 st->iso2022.mask = ~0;
1847 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1848 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1849 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1850 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1851 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1852 st->big5.mask = detect_coding_big5 (st, src, n);
1853 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1854 st->utf8.mask = detect_coding_utf8 (st, src, n);
1855 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1856 st->utf16.mask = detect_coding_utf16 (st, src, n);
1857 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1858 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1861 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1862 | st->utf8.mask | st->ucs4.mask;
1865 int retval = mask_has_at_most_one_bit_p (st->mask);
1866 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1867 return retval && st->eol_type != EOL_AUTODETECT;
1872 coding_system_from_mask (int mask)
1876 /* If the file was entirely or basically ASCII, use the
1877 default value of `buffer-file-coding-system'. */
1878 Lisp_Object retval =
1879 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1882 retval = Ffind_coding_system (retval);
1886 (Qbad_variable, Qwarning,
1887 "Invalid `default-buffer-file-coding-system', set to nil");
1888 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1892 retval = Fget_coding_system (Qraw_text);
1900 mask = postprocess_iso2022_mask (mask);
1902 /* Look through the coding categories by priority and find
1903 the first one that is allowed. */
1904 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1906 cat = fcd->coding_category_by_priority[i];
1907 if ((mask & (1 << cat)) &&
1908 !NILP (fcd->coding_category_system[cat]))
1912 return fcd->coding_category_system[cat];
1914 return Fget_coding_system (Qraw_text);
1918 /* Given a seekable read stream and potential coding system and EOL type
1919 as specified, do any autodetection that is called for. If the
1920 coding system and/or EOL type are not `autodetect', they will be left
1921 alone; but this function will never return an autodetect coding system
1924 This function does not automatically fetch subsidiary coding systems;
1925 that should be unnecessary with the explicit eol-type argument. */
1927 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1928 /* number of leading lines to check for a coding cookie */
1929 #define LINES_TO_CHECK 2
1932 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1933 eol_type_t *eol_type_in_out)
1935 struct detection_state decst;
1937 if (*eol_type_in_out == EOL_AUTODETECT)
1938 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1941 decst.eol_type = *eol_type_in_out;
1944 /* If autodetection is called for, do it now. */
1945 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1946 || *eol_type_in_out == EOL_AUTODETECT)
1949 Lisp_Object coding_system = Qnil;
1951 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1953 int lines_checked = 0;
1955 /* Look for initial "-*-"; mode line prefix */
1957 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1959 && lines_checked < LINES_TO_CHECK;
1961 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1963 Extbyte *local_vars_beg = p + 3;
1964 /* Look for final "-*-"; mode line suffix */
1965 for (p = local_vars_beg,
1966 scan_end = buf + nread - LENGTH ("-*-");
1968 && lines_checked < LINES_TO_CHECK;
1970 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1972 Extbyte *suffix = p;
1973 /* Look for "coding:" */
1974 for (p = local_vars_beg,
1975 scan_end = suffix - LENGTH ("coding:?");
1978 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1979 && (p == local_vars_beg
1980 || (*(p-1) == ' ' ||
1986 p += LENGTH ("coding:");
1987 while (*p == ' ' || *p == '\t') p++;
1989 /* Get coding system name */
1990 save = *suffix; *suffix = '\0';
1991 /* Characters valid in a MIME charset name (rfc 1521),
1992 and in a Lisp symbol name. */
1993 n = strspn ( (char *) p,
1994 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1995 "abcdefghijklmnopqrstuvwxyz"
2001 save = p[n]; p[n] = '\0';
2003 Ffind_coding_system (intern ((char *) p));
2010 /* #### file must use standard EOLs or we miss 2d line */
2011 /* #### not to mention this is broken for UTF-16 DOS files */
2012 else if (*p == '\n' || *p == '\r')
2015 /* skip past multibyte (DOS) newline */
2016 if (*p == '\r' && *(p+1) == '\n') p++;
2020 /* #### file must use standard EOLs or we miss 2d line */
2021 /* #### not to mention this is broken for UTF-16 DOS files */
2022 else if (*p == '\n' || *p == '\r')
2025 /* skip past multibyte (DOS) newline */
2026 if (*p == '\r' && *(p+1) == '\n') p++;
2029 if (NILP (coding_system))
2032 if (detect_coding_type (&decst, buf, nread,
2033 XCODING_SYSTEM_TYPE (*codesys_in_out)
2034 != CODESYS_AUTODETECT))
2036 nread = Lstream_read (stream, buf, sizeof (buf));
2042 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2043 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2046 if (detect_coding_type (&decst, buf, nread, 1))
2048 nread = Lstream_read (stream, buf, sizeof (buf));
2054 *eol_type_in_out = decst.eol_type;
2055 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2057 if (NILP (coding_system))
2058 *codesys_in_out = coding_system_from_mask (decst.mask);
2060 *codesys_in_out = coding_system;
2064 /* If we absolutely can't determine the EOL type, just assume LF. */
2065 if (*eol_type_in_out == EOL_AUTODETECT)
2066 *eol_type_in_out = EOL_LF;
2068 Lstream_rewind (stream);
2071 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2072 Detect coding system of the text in the region between START and END.
2073 Return a list of possible coding systems ordered by priority.
2074 If only ASCII characters are found, return 'undecided or one of
2075 its subsidiary coding systems according to a detected end-of-line
2076 type. Optional arg BUFFER defaults to the current buffer.
2078 (start, end, buffer))
2080 Lisp_Object val = Qnil;
2081 struct buffer *buf = decode_buffer (buffer, 0);
2083 Lisp_Object instream, lb_instream;
2084 Lstream *istr, *lb_istr;
2085 struct detection_state decst;
2086 struct gcpro gcpro1, gcpro2;
2088 get_buffer_range_char (buf, start, end, &b, &e, 0);
2089 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2090 lb_istr = XLSTREAM (lb_instream);
2091 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2092 istr = XLSTREAM (instream);
2093 GCPRO2 (instream, lb_instream);
2095 decst.eol_type = EOL_AUTODETECT;
2099 Extbyte random_buffer[4096];
2100 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2104 if (detect_coding_type (&decst, random_buffer, nread, 0))
2108 if (decst.mask == ~0)
2109 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2117 decst.mask = postprocess_iso2022_mask (decst.mask);
2119 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2121 int sys = fcd->coding_category_by_priority[i];
2122 if (decst.mask & (1 << sys))
2124 Lisp_Object codesys = fcd->coding_category_system[sys];
2125 if (!NILP (codesys))
2126 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2127 val = Fcons (codesys, val);
2131 Lstream_close (istr);
2133 Lstream_delete (istr);
2134 Lstream_delete (lb_istr);
2139 /************************************************************************/
2140 /* Converting to internal Mule format ("decoding") */
2141 /************************************************************************/
2143 /* A decoding stream is a stream used for decoding text (i.e.
2144 converting from some external format to internal format).
2145 The decoding-stream object keeps track of the actual coding
2146 stream, the stream that is at the other end, and data that
2147 needs to be persistent across the lifetime of the stream. */
2149 /* Handle the EOL stuff related to just-read-in character C.
2150 EOL_TYPE is the EOL type of the coding stream.
2151 FLAGS is the current value of FLAGS in the coding stream, and may
2152 be modified by this macro. (The macro only looks at the
2153 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2154 bytes are to be written. You need to also define a local goto
2155 label "label_continue_loop" that is at the end of the main
2156 character-reading loop.
2158 If C is a CR character, then this macro handles it entirely and
2159 jumps to label_continue_loop. Otherwise, this macro does not add
2160 anything to DST, and continues normally. You should continue
2161 processing C normally after this macro. */
2163 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2167 if (eol_type == EOL_CR) \
2168 Dynarr_add (dst, '\n'); \
2169 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2170 Dynarr_add (dst, c); \
2172 flags |= CODING_STATE_CR; \
2173 goto label_continue_loop; \
2175 else if (flags & CODING_STATE_CR) \
2176 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2178 Dynarr_add (dst, '\r'); \
2179 flags &= ~CODING_STATE_CR; \
2183 /* C should be a binary character in the range 0 - 255; convert
2184 to internal format and add to Dynarr DST. */
2187 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2189 if (BYTE_ASCII_P (c)) \
2190 Dynarr_add (dst, c); \
2193 Dynarr_add (dst, (c >> 6) | 0xc0); \
2194 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2198 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2200 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2204 Dynarr_add (dst, c);
2206 else if ( c <= 0x7ff )
2208 Dynarr_add (dst, (c >> 6) | 0xc0);
2209 Dynarr_add (dst, (c & 0x3f) | 0x80);
2211 else if ( c <= 0xffff )
2213 Dynarr_add (dst, (c >> 12) | 0xe0);
2214 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2215 Dynarr_add (dst, (c & 0x3f) | 0x80);
2217 else if ( c <= 0x1fffff )
2219 Dynarr_add (dst, (c >> 18) | 0xf0);
2220 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2221 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2222 Dynarr_add (dst, (c & 0x3f) | 0x80);
2224 else if ( c <= 0x3ffffff )
2226 Dynarr_add (dst, (c >> 24) | 0xf8);
2227 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2228 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2229 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2230 Dynarr_add (dst, (c & 0x3f) | 0x80);
2234 Dynarr_add (dst, (c >> 30) | 0xfc);
2235 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2236 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2237 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2238 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2239 Dynarr_add (dst, (c & 0x3f) | 0x80);
2243 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2245 if (BYTE_ASCII_P (c)) \
2246 Dynarr_add (dst, c); \
2247 else if (BYTE_C1_P (c)) \
2249 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2250 Dynarr_add (dst, c + 0x20); \
2254 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2255 Dynarr_add (dst, c); \
2260 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2264 DECODE_ADD_BINARY_CHAR (ch, dst); \
2269 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2271 if (flags & CODING_STATE_END) \
2273 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2274 if (flags & CODING_STATE_CR) \
2275 Dynarr_add (dst, '\r'); \
2279 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2281 #define ER_BUF_SIZE 24
2283 struct decoding_stream
2285 /* Coding system that governs the conversion. */
2286 Lisp_Coding_System *codesys;
2288 /* Stream that we read the encoded data from or
2289 write the decoded data to. */
2292 /* If we are reading, then we can return only a fixed amount of
2293 data, so if the conversion resulted in too much data, we store it
2294 here for retrieval the next time around. */
2295 unsigned_char_dynarr *runoff;
2297 /* FLAGS holds flags indicating the current state of the decoding.
2298 Some of these flags are dependent on the coding system. */
2301 /* CPOS holds a partially built-up code-point of character. */
2304 /* EOL_TYPE specifies the type of end-of-line conversion that
2305 currently applies. We need to keep this separate from the
2306 EOL type stored in CODESYS because the latter might indicate
2307 automatic EOL-type detection while the former will always
2308 indicate a particular EOL type. */
2309 eol_type_t eol_type;
2311 /* Additional ISO2022 information. We define the structure above
2312 because it's also needed by the detection routines. */
2313 struct iso2022_decoder iso2022;
2315 /* Additional information (the state of the running CCL program)
2316 used by the CCL decoder. */
2317 struct ccl_program ccl;
2319 /* counter for UTF-8 or UCS-4 */
2320 unsigned char counter;
2323 unsigned char er_counter;
2324 unsigned char er_buf[ER_BUF_SIZE];
2326 unsigned combined_char_count;
2327 Emchar combined_chars[16];
2328 Lisp_Object combining_table;
2330 struct detection_state decst;
2333 static Lstream_data_count decoding_reader (Lstream *stream,
2334 unsigned char *data, Lstream_data_count size);
2335 static Lstream_data_count decoding_writer (Lstream *stream,
2336 const unsigned char *data, Lstream_data_count size);
2337 static int decoding_rewinder (Lstream *stream);
2338 static int decoding_seekable_p (Lstream *stream);
2339 static int decoding_flusher (Lstream *stream);
2340 static int decoding_closer (Lstream *stream);
2342 static Lisp_Object decoding_marker (Lisp_Object stream);
2344 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2345 sizeof (struct decoding_stream));
2348 decoding_marker (Lisp_Object stream)
2350 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2351 Lisp_Object str_obj;
2353 /* We do not need to mark the coding systems or charsets stored
2354 within the stream because they are stored in a global list
2355 and automatically marked. */
2357 XSETLSTREAM (str_obj, str);
2358 mark_object (str_obj);
2359 if (str->imp->marker)
2360 return (str->imp->marker) (str_obj);
2365 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2366 so we read data from the other end, decode it, and store it into DATA. */
2368 static Lstream_data_count
2369 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2371 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2372 unsigned char *orig_data = data;
2373 Lstream_data_count read_size;
2374 int error_occurred = 0;
2376 /* We need to interface to mule_decode(), which expects to take some
2377 amount of data and store the result into a Dynarr. We have
2378 mule_decode() store into str->runoff, and take data from there
2381 /* We loop until we have enough data, reading chunks from the other
2382 end and decoding it. */
2385 /* Take data from the runoff if we can. Make sure to take at
2386 most SIZE bytes, and delete the data from the runoff. */
2387 if (Dynarr_length (str->runoff) > 0)
2389 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2390 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2391 Dynarr_delete_many (str->runoff, 0, chunk);
2397 break; /* No more room for data */
2399 if (str->flags & CODING_STATE_END)
2400 /* This means that on the previous iteration, we hit the EOF on
2401 the other end. We loop once more so that mule_decode() can
2402 output any final stuff it may be holding, or any "go back
2403 to a sane state" escape sequences. (This latter makes sense
2404 during encoding.) */
2407 /* Exhausted the runoff, so get some more. DATA has at least
2408 SIZE bytes left of storage in it, so it's OK to read directly
2409 into it. (We'll be overwriting above, after we've decoded it
2410 into the runoff.) */
2411 read_size = Lstream_read (str->other_end, data, size);
2418 /* There might be some more end data produced in the translation.
2419 See the comment above. */
2420 str->flags |= CODING_STATE_END;
2421 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2424 if (data - orig_data == 0)
2425 return error_occurred ? -1 : 0;
2427 return data - orig_data;
2430 static Lstream_data_count
2431 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2433 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2434 Lstream_data_count retval;
2436 /* Decode all our data into the runoff, and then attempt to write
2437 it all out to the other end. Remove whatever chunk we succeeded
2439 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2440 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2441 Dynarr_length (str->runoff));
2443 Dynarr_delete_many (str->runoff, 0, retval);
2444 /* Do NOT return retval. The return value indicates how much
2445 of the incoming data was written, not how many bytes were
2451 reset_decoding_stream (struct decoding_stream *str)
2454 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2456 Lisp_Object coding_system;
2457 XSETCODING_SYSTEM (coding_system, str->codesys);
2458 reset_iso2022 (coding_system, &str->iso2022);
2460 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2462 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2467 str->er_counter = 0;
2468 str->combined_char_count = 0;
2469 str->combining_table = Qnil;
2471 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2472 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2475 str->decst.eol_type = EOL_AUTODETECT;
2476 str->decst.mask = ~0;
2478 str->flags = str->cpos = 0;
2482 decoding_rewinder (Lstream *stream)
2484 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2485 reset_decoding_stream (str);
2486 Dynarr_reset (str->runoff);
2487 return Lstream_rewind (str->other_end);
2491 decoding_seekable_p (Lstream *stream)
2493 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2494 return Lstream_seekable_p (str->other_end);
2498 decoding_flusher (Lstream *stream)
2500 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2501 return Lstream_flush (str->other_end);
2505 decoding_closer (Lstream *stream)
2507 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2508 if (stream->flags & LSTREAM_FL_WRITE)
2510 str->flags |= CODING_STATE_END;
2511 decoding_writer (stream, 0, 0);
2513 Dynarr_free (str->runoff);
2515 #ifdef ENABLE_COMPOSITE_CHARS
2516 if (str->iso2022.composite_chars)
2517 Dynarr_free (str->iso2022.composite_chars);
2520 return Lstream_close (str->other_end);
2524 decoding_stream_coding_system (Lstream *stream)
2526 Lisp_Object coding_system;
2527 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2529 XSETCODING_SYSTEM (coding_system, str->codesys);
2530 return subsidiary_coding_system (coding_system, str->eol_type);
2534 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2536 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2537 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2539 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2540 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2541 reset_decoding_stream (str);
2544 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2545 stream for writing, no automatic code detection will be performed.
2546 The reason for this is that automatic code detection requires a
2547 seekable input. Things will also fail if you open a decoding
2548 stream for reading using a non-fully-specified coding system and
2549 a non-seekable input stream. */
2552 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2555 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2556 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2560 str->other_end = stream;
2561 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2562 str->eol_type = EOL_AUTODETECT;
2563 if (!strcmp (mode, "r")
2564 && Lstream_seekable_p (stream))
2565 /* We can determine the coding system now. */
2566 determine_real_coding_system (stream, &codesys, &str->eol_type);
2567 set_decoding_stream_coding_system (lstr, codesys);
2568 str->decst.eol_type = str->eol_type;
2569 str->decst.mask = ~0;
2570 XSETLSTREAM (obj, lstr);
2575 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2577 return make_decoding_stream_1 (stream, codesys, "r");
2581 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2583 return make_decoding_stream_1 (stream, codesys, "w");
2586 /* Note: the decode_coding_* functions all take the same
2587 arguments as mule_decode(), which is to say some SRC data of
2588 size N, which is to be stored into dynamic array DST.
2589 DECODING is the stream within which the decoding is
2590 taking place, but no data is actually read from or
2591 written to that stream; that is handled in decoding_reader()
2592 or decoding_writer(). This allows the same functions to
2593 be used for both reading and writing. */
2596 mule_decode (Lstream *decoding, const Extbyte *src,
2597 unsigned_char_dynarr *dst, Lstream_data_count n)
2599 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2601 /* If necessary, do encoding-detection now. We do this when
2602 we're a writing stream or a non-seekable reading stream,
2603 meaning that we can't just process the whole input,
2604 rewind, and start over. */
2606 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2607 str->eol_type == EOL_AUTODETECT)
2609 Lisp_Object codesys;
2611 XSETCODING_SYSTEM (codesys, str->codesys);
2612 detect_coding_type (&str->decst, src, n,
2613 CODING_SYSTEM_TYPE (str->codesys) !=
2614 CODESYS_AUTODETECT);
2615 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2616 str->decst.mask != ~0)
2617 /* #### This is cheesy. What we really ought to do is
2618 buffer up a certain amount of data so as to get a
2619 less random result. */
2620 codesys = coding_system_from_mask (str->decst.mask);
2621 str->eol_type = str->decst.eol_type;
2622 if (XCODING_SYSTEM (codesys) != str->codesys)
2624 /* Preserve the CODING_STATE_END flag in case it was set.
2625 If we erase it, bad things might happen. */
2626 int was_end = str->flags & CODING_STATE_END;
2627 set_decoding_stream_coding_system (decoding, codesys);
2629 str->flags |= CODING_STATE_END;
2633 switch (CODING_SYSTEM_TYPE (str->codesys))
2636 case CODESYS_INTERNAL:
2637 Dynarr_add_many (dst, src, n);
2640 case CODESYS_AUTODETECT:
2641 /* If we got this far and still haven't decided on the coding
2642 system, then do no conversion. */
2643 case CODESYS_NO_CONVERSION:
2644 decode_coding_no_conversion (decoding, src, dst, n);
2647 case CODESYS_SHIFT_JIS:
2648 decode_coding_sjis (decoding, src, dst, n);
2651 decode_coding_big5 (decoding, src, dst, n);
2654 decode_coding_ucs4 (decoding, src, dst, n);
2657 decode_coding_utf16 (decoding, src, dst, n);
2660 decode_coding_utf8 (decoding, src, dst, n);
2663 str->ccl.last_block = str->flags & CODING_STATE_END;
2664 /* When applying ccl program to stream, MUST NOT set NULL
2666 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2667 dst, n, 0, CCL_MODE_DECODING);
2669 case CODESYS_ISO2022:
2670 decode_coding_iso2022 (decoding, src, dst, n);
2678 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2679 Decode the text between START and END which is encoded in CODING-SYSTEM.
2680 This is useful if you've read in encoded text from a file without decoding
2681 it (e.g. you read in a JIS-formatted file but used the `binary' or
2682 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2683 Return length of decoded text.
2684 BUFFER defaults to the current buffer if unspecified.
2686 (start, end, coding_system, buffer))
2689 struct buffer *buf = decode_buffer (buffer, 0);
2690 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2691 Lstream *istr, *ostr;
2692 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2694 get_buffer_range_char (buf, start, end, &b, &e, 0);
2696 barf_if_buffer_read_only (buf, b, e);
2698 coding_system = Fget_coding_system (coding_system);
2699 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2700 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2701 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2703 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2704 Fget_coding_system (Qbinary));
2705 istr = XLSTREAM (instream);
2706 ostr = XLSTREAM (outstream);
2707 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2709 /* The chain of streams looks like this:
2711 [BUFFER] <----- send through
2712 ------> [ENCODE AS BINARY]
2713 ------> [DECODE AS SPECIFIED]
2719 char tempbuf[1024]; /* some random amount */
2720 Bufpos newpos, even_newer_pos;
2721 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2722 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2726 newpos = lisp_buffer_stream_startpos (istr);
2727 Lstream_write (ostr, tempbuf, size_in_bytes);
2728 even_newer_pos = lisp_buffer_stream_startpos (istr);
2729 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2732 Lstream_close (istr);
2733 Lstream_close (ostr);
2735 Lstream_delete (istr);
2736 Lstream_delete (ostr);
2737 Lstream_delete (XLSTREAM (de_outstream));
2738 Lstream_delete (XLSTREAM (lb_outstream));
2743 /************************************************************************/
2744 /* Converting to an external encoding ("encoding") */
2745 /************************************************************************/
2747 /* An encoding stream is an output stream. When you create the
2748 stream, you specify the coding system that governs the encoding
2749 and another stream that the resulting encoded data is to be
2750 sent to, and then start sending data to it. */
2752 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2754 struct encoding_stream
2756 /* Coding system that governs the conversion. */
2757 Lisp_Coding_System *codesys;
2759 /* Stream that we read the encoded data from or
2760 write the decoded data to. */
2763 /* If we are reading, then we can return only a fixed amount of
2764 data, so if the conversion resulted in too much data, we store it
2765 here for retrieval the next time around. */
2766 unsigned_char_dynarr *runoff;
2768 /* FLAGS holds flags indicating the current state of the encoding.
2769 Some of these flags are dependent on the coding system. */
2772 /* CH holds a partially built-up character. Since we only deal
2773 with one- and two-byte characters at the moment, we only use
2774 this to store the first byte of a two-byte character. */
2777 /* Additional information used by the ISO2022 encoder. */
2780 /* CHARSET holds the character sets currently assigned to the G0
2781 through G3 registers. It is initialized from the array
2782 INITIAL_CHARSET in CODESYS. */
2783 Lisp_Object charset[4];
2785 /* Which registers are currently invoked into the left (GL) and
2786 right (GR) halves of the 8-bit encoding space? */
2787 int register_left, register_right;
2789 /* Whether we need to explicitly designate the charset in the
2790 G? register before using it. It is initialized from the
2791 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2792 unsigned char force_charset_on_output[4];
2794 /* Other state variables that need to be preserved across
2796 Lisp_Object current_charset;
2798 int current_char_boundary;
2801 void (*encode_char) (struct encoding_stream *str, Emchar c,
2802 unsigned_char_dynarr *dst, unsigned int *flags);
2803 void (*finish) (struct encoding_stream *str,
2804 unsigned_char_dynarr *dst, unsigned int *flags);
2806 /* Additional information (the state of the running CCL program)
2807 used by the CCL encoder. */
2808 struct ccl_program ccl;
2812 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2813 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2814 Lstream_data_count size);
2815 static int encoding_rewinder (Lstream *stream);
2816 static int encoding_seekable_p (Lstream *stream);
2817 static int encoding_flusher (Lstream *stream);
2818 static int encoding_closer (Lstream *stream);
2820 static Lisp_Object encoding_marker (Lisp_Object stream);
2822 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2823 sizeof (struct encoding_stream));
2826 encoding_marker (Lisp_Object stream)
2828 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2829 Lisp_Object str_obj;
2831 /* We do not need to mark the coding systems or charsets stored
2832 within the stream because they are stored in a global list
2833 and automatically marked. */
2835 XSETLSTREAM (str_obj, str);
2836 mark_object (str_obj);
2837 if (str->imp->marker)
2838 return (str->imp->marker) (str_obj);
2843 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2844 so we read data from the other end, encode it, and store it into DATA. */
2846 static Lstream_data_count
2847 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2849 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2850 unsigned char *orig_data = data;
2851 Lstream_data_count read_size;
2852 int error_occurred = 0;
2854 /* We need to interface to mule_encode(), which expects to take some
2855 amount of data and store the result into a Dynarr. We have
2856 mule_encode() store into str->runoff, and take data from there
2859 /* We loop until we have enough data, reading chunks from the other
2860 end and encoding it. */
2863 /* Take data from the runoff if we can. Make sure to take at
2864 most SIZE bytes, and delete the data from the runoff. */
2865 if (Dynarr_length (str->runoff) > 0)
2867 int chunk = min ((int) size, Dynarr_length (str->runoff));
2868 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2869 Dynarr_delete_many (str->runoff, 0, chunk);
2875 break; /* No more room for data */
2877 if (str->flags & CODING_STATE_END)
2878 /* This means that on the previous iteration, we hit the EOF on
2879 the other end. We loop once more so that mule_encode() can
2880 output any final stuff it may be holding, or any "go back
2881 to a sane state" escape sequences. (This latter makes sense
2882 during encoding.) */
2885 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2886 left of storage in it, so it's OK to read directly into it.
2887 (We'll be overwriting above, after we've encoded it into the
2889 read_size = Lstream_read (str->other_end, data, size);
2896 /* There might be some more end data produced in the translation.
2897 See the comment above. */
2898 str->flags |= CODING_STATE_END;
2899 mule_encode (stream, data, str->runoff, read_size);
2902 if (data == orig_data)
2903 return error_occurred ? -1 : 0;
2905 return data - orig_data;
2908 static Lstream_data_count
2909 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2911 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2912 Lstream_data_count retval;
2914 /* Encode all our data into the runoff, and then attempt to write
2915 it all out to the other end. Remove whatever chunk we succeeded
2917 mule_encode (stream, data, str->runoff, size);
2918 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2919 Dynarr_length (str->runoff));
2921 Dynarr_delete_many (str->runoff, 0, retval);
2922 /* Do NOT return retval. The return value indicates how much
2923 of the incoming data was written, not how many bytes were
2929 reset_encoding_stream (struct encoding_stream *str)
2932 switch (CODING_SYSTEM_TYPE (str->codesys))
2934 case CODESYS_ISO2022:
2938 str->encode_char = &char_encode_iso2022;
2939 str->finish = &char_finish_iso2022;
2940 for (i = 0; i < 4; i++)
2942 str->iso2022.charset[i] =
2943 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2944 str->iso2022.force_charset_on_output[i] =
2945 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2947 str->iso2022.register_left = 0;
2948 str->iso2022.register_right = 1;
2949 str->iso2022.current_charset = Qnil;
2950 str->iso2022.current_half = 0;
2954 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2957 str->encode_char = &char_encode_utf8;
2958 str->finish = &char_finish_utf8;
2961 str->encode_char = &char_encode_utf16;
2962 str->finish = &char_finish_utf16;
2965 str->encode_char = &char_encode_ucs4;
2966 str->finish = &char_finish_ucs4;
2968 case CODESYS_SHIFT_JIS:
2969 str->encode_char = &char_encode_shift_jis;
2970 str->finish = &char_finish_shift_jis;
2973 str->encode_char = &char_encode_big5;
2974 str->finish = &char_finish_big5;
2980 str->iso2022.current_char_boundary = 0;
2981 str->flags = str->ch = 0;
2985 encoding_rewinder (Lstream *stream)
2987 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2988 reset_encoding_stream (str);
2989 Dynarr_reset (str->runoff);
2990 return Lstream_rewind (str->other_end);
2994 encoding_seekable_p (Lstream *stream)
2996 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2997 return Lstream_seekable_p (str->other_end);
3001 encoding_flusher (Lstream *stream)
3003 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3004 return Lstream_flush (str->other_end);
3008 encoding_closer (Lstream *stream)
3010 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3011 if (stream->flags & LSTREAM_FL_WRITE)
3013 str->flags |= CODING_STATE_END;
3014 encoding_writer (stream, 0, 0);
3016 Dynarr_free (str->runoff);
3017 return Lstream_close (str->other_end);
3021 encoding_stream_coding_system (Lstream *stream)
3023 Lisp_Object coding_system;
3024 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3026 XSETCODING_SYSTEM (coding_system, str->codesys);
3027 return coding_system;
3031 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3033 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3034 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3036 reset_encoding_stream (str);
3040 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3043 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3044 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3048 str->runoff = Dynarr_new (unsigned_char);
3049 str->other_end = stream;
3050 set_encoding_stream_coding_system (lstr, codesys);
3051 XSETLSTREAM (obj, lstr);
3056 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3058 return make_encoding_stream_1 (stream, codesys, "r");
3062 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3064 return make_encoding_stream_1 (stream, codesys, "w");
3067 /* Convert N bytes of internally-formatted data stored in SRC to an
3068 external format, according to the encoding stream ENCODING.
3069 Store the encoded data into DST. */
3072 mule_encode (Lstream *encoding, const Bufbyte *src,
3073 unsigned_char_dynarr *dst, Lstream_data_count n)
3075 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3077 switch (CODING_SYSTEM_TYPE (str->codesys))
3080 case CODESYS_INTERNAL:
3081 Dynarr_add_many (dst, src, n);
3084 case CODESYS_AUTODETECT:
3085 /* If we got this far and still haven't decided on the coding
3086 system, then do no conversion. */
3087 case CODESYS_NO_CONVERSION:
3088 encode_coding_no_conversion (encoding, src, dst, n);
3092 str->ccl.last_block = str->flags & CODING_STATE_END;
3093 /* When applying ccl program to stream, MUST NOT set NULL
3095 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3096 dst, n, 0, CCL_MODE_ENCODING);
3100 text_encode_generic (encoding, src, dst, n);
3104 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3105 Encode the text between START and END using CODING-SYSTEM.
3106 This will, for example, convert Japanese characters into stuff such as
3107 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3108 text. BUFFER defaults to the current buffer if unspecified.
3110 (start, end, coding_system, buffer))
3113 struct buffer *buf = decode_buffer (buffer, 0);
3114 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3115 Lstream *istr, *ostr;
3116 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3118 get_buffer_range_char (buf, start, end, &b, &e, 0);
3120 barf_if_buffer_read_only (buf, b, e);
3122 coding_system = Fget_coding_system (coding_system);
3123 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3124 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3125 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3126 Fget_coding_system (Qbinary));
3127 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3129 istr = XLSTREAM (instream);
3130 ostr = XLSTREAM (outstream);
3131 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3132 /* The chain of streams looks like this:
3134 [BUFFER] <----- send through
3135 ------> [ENCODE AS SPECIFIED]
3136 ------> [DECODE AS BINARY]
3141 char tempbuf[1024]; /* some random amount */
3142 Bufpos newpos, even_newer_pos;
3143 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3144 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3148 newpos = lisp_buffer_stream_startpos (istr);
3149 Lstream_write (ostr, tempbuf, size_in_bytes);
3150 even_newer_pos = lisp_buffer_stream_startpos (istr);
3151 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3157 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3158 Lstream_close (istr);
3159 Lstream_close (ostr);
3161 Lstream_delete (istr);
3162 Lstream_delete (ostr);
3163 Lstream_delete (XLSTREAM (de_outstream));
3164 Lstream_delete (XLSTREAM (lb_outstream));
3165 return make_int (retlen);
3172 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3173 unsigned_char_dynarr *dst, Lstream_data_count n)
3176 unsigned char char_boundary;
3177 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3178 unsigned int flags = str->flags;
3179 Emchar ch = str->ch;
3181 char_boundary = str->iso2022.current_char_boundary;
3187 if (char_boundary == 0)
3215 (*str->encode_char) (str, c, dst, &flags);
3217 else if (char_boundary == 1)
3219 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3225 ch = (ch << 6) | (c & 0x3f);
3230 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3232 (*str->finish) (str, dst, &flags);
3237 str->iso2022.current_char_boundary = char_boundary;
3242 /************************************************************************/
3243 /* entity reference */
3244 /************************************************************************/
3247 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3249 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3251 if ( str->er_counter > 0)
3253 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3254 str->er_counter = 0;
3258 EXFUN (Fregexp_quote, 1);
3260 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3261 unsigned_char_dynarr* dst);
3263 decode_add_er_char (struct decoding_stream *str, Emchar c,
3264 unsigned_char_dynarr* dst)
3266 if (str->er_counter == 0)
3268 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3271 str->er_buf[0] = '&';
3275 DECODE_ADD_UCS_CHAR (c, dst);
3279 Lisp_Object string = make_string (str->er_buf,
3286 Lisp_Object char_type;
3289 for ( rest = Vcoded_charset_entity_reference_alist;
3290 !NILP (rest); rest = Fcdr (rest) )
3296 char_type = XCDR (ccs);
3301 if (NILP (ccs = Ffind_charset (ccs)))
3310 pat = Fregexp_quote (pat);
3317 pat = concat3 (build_string ("^&"),
3318 pat, build_string ("\\([0-9]+\\)$"));
3321 else if (EQ (ret, Qx))
3323 pat = concat3 (build_string ("^&"),
3324 pat, build_string ("\\([0-9a-f]+\\)$"));
3327 else if (EQ (ret, QX))
3329 pat = concat3 (build_string ("^&"),
3330 pat, build_string ("\\([0-9A-F]+\\)$"));
3336 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3339 = XINT (Fstring_to_number
3340 (Fsubstring (string,
3341 Fmatch_beginning (make_int (1)),
3342 Fmatch_end (make_int (1))),
3346 ? DECODE_CHAR (ccs, code, 0)
3347 : decode_builtin_char (ccs, code);
3350 DECODE_ADD_UCS_CHAR (chr, dst);
3353 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3354 Dynarr_add (dst, ';');
3360 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3361 string, Qnil, Qnil)))
3364 = XUINT (Fstring_to_number
3365 (Fsubstring (string,
3366 Fmatch_beginning (make_int (1)),
3367 Fmatch_end (make_int (1))),
3370 DECODE_ADD_UCS_CHAR (code, dst);
3374 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3375 Dynarr_add (dst, ';');
3378 str->er_counter = 0;
3380 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3382 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3383 str->er_counter = 0;
3384 DECODE_ADD_UCS_CHAR (c, dst);
3387 str->er_buf[str->er_counter++] = c;
3390 void char_encode_as_entity_reference (Emchar ch, char* buf);
3392 char_encode_as_entity_reference (Emchar ch, char* buf)
3394 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3397 Lisp_Object char_type;
3398 int format_columns, idx;
3399 char format[ER_BUF_SIZE];
3401 while (!NILP (rest))
3407 char_type = XCDR (ccs);
3412 if (!NILP (ccs = Ffind_charset (ccs)))
3414 int code_point = charset_code_point (ccs, ch, 0);
3416 if ( (code_point >= 0)
3417 && (NILP (char_type)
3418 || DECODE_CHAR (ccs, code_point, 0) != ch) )
3424 if ( STRINGP (ret) &&
3425 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3428 strncpy (&format[1], XSTRING_DATA (ret), idx);
3438 format[idx++] = '%';
3439 format_columns = XINT (ret);
3440 if ( (2 <= format_columns) && (format_columns <= 8)
3441 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3443 format [idx++] = '0';
3444 format [idx++] = '0' + format_columns;
3453 format [idx++] = 'd';
3454 else if (EQ (ret, Qx))
3455 format [idx++] = 'x';
3456 else if (EQ (ret, QX))
3457 format [idx++] = 'X';
3460 format [idx++] = ';';
3463 sprintf (buf, format, code_point);
3470 sprintf (buf, "&MCS-%08X;", ch);
3474 /************************************************************************/
3475 /* character composition */
3476 /************************************************************************/
3477 extern Lisp_Object Qcomposition;
3480 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3482 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3486 for (i = 0; i < str->combined_char_count; i++)
3487 decode_add_er_char (str, str->combined_chars[i], dst);
3488 str->combined_char_count = 0;
3489 str->combining_table = Qnil;
3492 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3493 unsigned_char_dynarr* dst);
3495 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3496 Emchar character, unsigned_char_dynarr* dst)
3498 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3499 decode_add_er_char (str, character, dst);
3500 else if (!CONSP (str->combining_table))
3503 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3507 decode_add_er_char (str, character, dst);
3510 str->combined_chars[0] = character;
3511 str->combined_char_count = 1;
3512 str->combining_table = ret;
3518 = Fcdr (Fassq (make_char (character), str->combining_table));
3522 Emchar char2 = XCHARVAL (ret);
3523 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3528 decode_add_er_char (str, char2, dst);
3529 str->combined_char_count = 0;
3530 str->combining_table = Qnil;
3534 str->combined_chars[0] = char2;
3535 str->combined_char_count = 1;
3536 str->combining_table = ret2;
3541 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3544 COMPOSE_FLUSH_CHARS (str, dst);
3546 decode_add_er_char (str, character, dst);
3549 str->combined_chars[0] = character;
3550 str->combined_char_count = 1;
3551 str->combining_table = ret;
3556 #else /* not UTF2000 */
3557 #define COMPOSE_FLUSH_CHARS(str, dst)
3558 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3559 #endif /* UTF2000 */
3562 /************************************************************************/
3563 /* Shift-JIS methods */
3564 /************************************************************************/
3566 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3567 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3568 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3569 encoded by "position-code + 0x80". A character of JISX0208
3570 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3571 position-codes are divided and shifted so that it fit in the range
3574 --- CODE RANGE of Shift-JIS ---
3575 (character set) (range)
3577 JISX0201-Kana 0xA0 .. 0xDF
3578 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3579 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3580 -------------------------------
3584 /* Is this the first byte of a Shift-JIS two-byte char? */
3586 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3587 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3589 /* Is this the second byte of a Shift-JIS two-byte char? */
3591 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3592 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3594 #define BYTE_SJIS_KATAKANA_P(c) \
3595 ((c) >= 0xA1 && (c) <= 0xDF)
3598 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3602 unsigned char c = *(unsigned char *)src++;
3603 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3605 if (st->shift_jis.in_second_byte)
3607 st->shift_jis.in_second_byte = 0;
3611 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3612 st->shift_jis.in_second_byte = 1;
3614 return CODING_CATEGORY_SHIFT_JIS_MASK;
3617 /* Convert Shift-JIS data to internal format. */
3620 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3621 unsigned_char_dynarr *dst, Lstream_data_count n)
3623 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3624 unsigned int flags = str->flags;
3625 unsigned int cpos = str->cpos;
3626 eol_type_t eol_type = str->eol_type;
3630 unsigned char c = *(unsigned char *)src++;
3634 /* Previous character was first byte of Shift-JIS Kanji char. */
3635 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3637 unsigned char e1, e2;
3639 DECODE_SJIS (cpos, c, e1, e2);
3641 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3645 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3646 Dynarr_add (dst, e1);
3647 Dynarr_add (dst, e2);
3652 DECODE_ADD_BINARY_CHAR (cpos, dst);
3653 DECODE_ADD_BINARY_CHAR (c, dst);
3659 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3660 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3662 else if (BYTE_SJIS_KATAKANA_P (c))
3665 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3668 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3669 Dynarr_add (dst, c);
3674 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3678 DECODE_ADD_BINARY_CHAR (c, dst);
3680 label_continue_loop:;
3683 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3689 /* Convert internal character representation to Shift_JIS. */
3692 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3693 unsigned_char_dynarr *dst, unsigned int *flags)
3695 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3699 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3700 Dynarr_add (dst, '\r');
3701 if (eol_type != EOL_CR)
3702 Dynarr_add (dst, ch);
3706 unsigned int s1, s2;
3708 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3710 if (code_point >= 0)
3711 Dynarr_add (dst, code_point);
3712 else if ((code_point
3713 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3716 ENCODE_SJIS ((code_point >> 8) | 0x80,
3717 (code_point & 0xFF) | 0x80, s1, s2);
3718 Dynarr_add (dst, s1);
3719 Dynarr_add (dst, s2);
3721 else if ((code_point
3722 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3724 Dynarr_add (dst, code_point | 0x80);
3725 else if ((code_point
3726 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3729 ENCODE_SJIS ((code_point >> 8) | 0x80,
3730 (code_point & 0xFF) | 0x80, s1, s2);
3731 Dynarr_add (dst, s1);
3732 Dynarr_add (dst, s2);
3734 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3736 Dynarr_add (dst, code_point);
3738 Dynarr_add (dst, '?');
3740 Lisp_Object charset;
3741 unsigned int c1, c2;
3743 BREAKUP_CHAR (ch, charset, c1, c2);
3745 if (EQ(charset, Vcharset_katakana_jisx0201))
3747 Dynarr_add (dst, c1 | 0x80);
3751 Dynarr_add (dst, c1);
3753 else if (EQ(charset, Vcharset_japanese_jisx0208))
3755 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3756 Dynarr_add (dst, s1);
3757 Dynarr_add (dst, s2);
3760 Dynarr_add (dst, '?');
3766 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3767 unsigned int *flags)
3771 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3772 Decode a JISX0208 character of Shift-JIS coding-system.
3773 CODE is the character code in Shift-JIS as a cons of type bytes.
3774 Return the corresponding character.
3778 unsigned char c1, c2, s1, s2;
3781 CHECK_INT (XCAR (code));
3782 CHECK_INT (XCDR (code));
3783 s1 = XINT (XCAR (code));
3784 s2 = XINT (XCDR (code));
3785 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3786 BYTE_SJIS_TWO_BYTE_2_P (s2))
3788 DECODE_SJIS (s1, s2, c1, c2);
3789 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3790 c1 & 0x7F, c2 & 0x7F));
3796 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3797 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3798 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3802 Lisp_Object charset;
3805 CHECK_CHAR_COERCE_INT (character);
3806 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3807 if (EQ (charset, Vcharset_japanese_jisx0208))
3809 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3810 return Fcons (make_int (s1), make_int (s2));
3817 /************************************************************************/
3819 /************************************************************************/
3821 /* BIG5 is a coding system encoding two character sets: ASCII and
3822 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3823 character set and is encoded in two-byte.
3825 --- CODE RANGE of BIG5 ---
3826 (character set) (range)
3828 Big5 (1st byte) 0xA1 .. 0xFE
3829 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3830 --------------------------
3832 Since the number of characters in Big5 is larger than maximum
3833 characters in Emacs' charset (96x96), it can't be handled as one
3834 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3835 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3836 contains frequently used characters and the latter contains less
3837 frequently used characters. */
3840 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3841 ((c) >= 0x81 && (c) <= 0xFE)
3843 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3844 ((c) >= 0xA1 && (c) <= 0xFE)
3847 /* Is this the second byte of a Shift-JIS two-byte char? */
3849 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3850 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3852 /* Number of Big5 characters which have the same code in 1st byte. */
3854 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3856 /* Code conversion macros. These are macros because they are used in
3857 inner loops during code conversion.
3859 Note that temporary variables in macros introduce the classic
3860 dynamic-scoping problems with variable names. We use capital-
3861 lettered variables in the assumption that XEmacs does not use
3862 capital letters in variables except in a very formalized way
3865 /* Convert Big5 code (b1, b2) into its internal string representation
3868 /* There is a much simpler way to split the Big5 charset into two.
3869 For the moment I'm going to leave the algorithm as-is because it
3870 claims to separate out the most-used characters into a single
3871 charset, which perhaps will lead to optimizations in various
3874 The way the algorithm works is something like this:
3876 Big5 can be viewed as a 94x157 charset, where the row is
3877 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3878 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3879 the split between low and high column numbers is apparently
3880 meaningless; ascending rows produce less and less frequent chars.
3881 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3882 the first charset, and the upper half (0xC9 .. 0xFE) to the
3883 second. To do the conversion, we convert the character into
3884 a single number where 0 .. 156 is the first row, 157 .. 313
3885 is the second, etc. That way, the characters are ordered by
3886 decreasing frequency. Then we just chop the space in two
3887 and coerce the result into a 94x94 space.
3890 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3892 int B1 = b1, B2 = b2; \
3894 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3898 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3902 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3903 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3905 c1 = I / (0xFF - 0xA1) + 0xA1; \
3906 c2 = I % (0xFF - 0xA1) + 0xA1; \
3909 /* Convert the internal string representation of a Big5 character
3910 (lb, c1, c2) into Big5 code (b1, b2). */
3912 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3914 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3916 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3918 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3920 b1 = I / BIG5_SAME_ROW + 0xA1; \
3921 b2 = I % BIG5_SAME_ROW; \
3922 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3926 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3930 unsigned char c = *(unsigned char *)src++;
3931 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3933 || (c >= 0x80 && c <= 0xA0)
3937 if (st->big5.in_second_byte)
3939 st->big5.in_second_byte = 0;
3940 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3950 st->big5.in_second_byte = 1;
3952 return CODING_CATEGORY_BIG5_MASK;
3955 /* Convert Big5 data to internal format. */
3958 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3959 unsigned_char_dynarr *dst, Lstream_data_count n)
3961 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3962 unsigned int flags = str->flags;
3963 unsigned int cpos = str->cpos;
3964 eol_type_t eol_type = str->eol_type;
3967 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3968 (decoding)->codesys, 1);
3973 unsigned char c = *(unsigned char *)src++;
3976 /* Previous character was first byte of Big5 char. */
3977 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3980 int code_point = (cpos << 8) | c;
3981 Emchar char_id = decode_defined_char (ccs, code_point, 0);
3985 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3986 DECODE_ADD_UCS_CHAR (char_id, dst);
3988 unsigned char b1, b2, b3;
3989 DECODE_BIG5 (cpos, c, b1, b2, b3);
3990 Dynarr_add (dst, b1);
3991 Dynarr_add (dst, b2);
3992 Dynarr_add (dst, b3);
3997 DECODE_ADD_BINARY_CHAR (cpos, dst);
3998 DECODE_ADD_BINARY_CHAR (c, dst);
4004 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4005 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4007 decode_flush_er_chars (str, dst);
4012 decode_flush_er_chars (str, dst);
4013 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4014 DECODE_ADD_BINARY_CHAR (c, dst);
4018 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4019 decode_add_er_char (str, c, dst);
4022 label_continue_loop:;
4025 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4026 if (flags & CODING_STATE_END)
4028 decode_flush_er_chars (str, dst);
4029 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4030 if (flags & CODING_STATE_CR)
4031 Dynarr_add (dst, '\r');
4038 /* Convert internally-formatted data to Big5. */
4041 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4042 unsigned_char_dynarr *dst, unsigned int *flags)
4044 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4048 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4049 Dynarr_add (dst, '\r');
4050 if (eol_type != EOL_CR)
4051 Dynarr_add (dst, ch);
4058 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4060 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4061 Dynarr_add (dst, code_point);
4062 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4064 Dynarr_add (dst, code_point >> 8);
4065 Dynarr_add (dst, code_point & 0xFF);
4067 else if ((code_point
4068 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4070 Dynarr_add (dst, code_point >> 8);
4071 Dynarr_add (dst, code_point & 0xFF);
4073 else if ((code_point
4074 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4077 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4078 + ((code_point & 0xFF) - 33);
4079 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4080 unsigned char b2 = I % BIG5_SAME_ROW;
4082 b2 += b2 < 0x3F ? 0x40 : 0x62;
4083 Dynarr_add (dst, b1);
4084 Dynarr_add (dst, b2);
4086 else if ((code_point
4087 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4090 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4091 + ((code_point & 0xFF) - 33);
4092 unsigned char b1, b2;
4094 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4095 b1 = I / BIG5_SAME_ROW + 0xA1;
4096 b2 = I % BIG5_SAME_ROW;
4097 b2 += b2 < 0x3F ? 0x40 : 0x62;
4098 Dynarr_add (dst, b1);
4099 Dynarr_add (dst, b2);
4101 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4105 char_encode_as_entity_reference (ch, buf);
4106 Dynarr_add_many (dst, buf, strlen (buf));
4109 Dynarr_add (dst, '?');
4116 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4117 unsigned int *flags)
4122 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4123 Decode a Big5 character CODE of BIG5 coding-system.
4124 CODE is the character code in BIG5, a cons of two integers.
4125 Return the corresponding character.
4129 unsigned char c1, c2, b1, b2;
4132 CHECK_INT (XCAR (code));
4133 CHECK_INT (XCDR (code));
4134 b1 = XINT (XCAR (code));
4135 b2 = XINT (XCDR (code));
4136 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4137 BYTE_BIG5_TWO_BYTE_2_P (b2))
4139 Charset_ID leading_byte;
4140 Lisp_Object charset;
4141 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4142 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4143 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4149 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4150 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4151 Return the corresponding character code in Big5.
4155 Lisp_Object charset;
4158 CHECK_CHAR_COERCE_INT (character);
4159 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4160 if (EQ (charset, Vcharset_chinese_big5_1) ||
4161 EQ (charset, Vcharset_chinese_big5_2))
4163 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4165 return Fcons (make_int (b1), make_int (b2));
4172 /************************************************************************/
4174 /************************************************************************/
4177 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4181 unsigned char c = *(unsigned char *)src++;
4182 switch (st->ucs4.in_byte)
4191 st->ucs4.in_byte = 0;
4197 return CODING_CATEGORY_UCS4_MASK;
4201 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4202 unsigned_char_dynarr *dst, Lstream_data_count n)
4204 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4205 unsigned int flags = str->flags;
4206 unsigned int cpos = str->cpos;
4207 unsigned char counter = str->counter;
4211 unsigned char c = *(unsigned char *)src++;
4219 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4224 cpos = ( cpos << 8 ) | c;
4228 if (counter & CODING_STATE_END)
4229 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4233 str->counter = counter;
4237 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4238 unsigned_char_dynarr *dst, unsigned int *flags)
4240 Dynarr_add (dst, ch >> 24);
4241 Dynarr_add (dst, ch >> 16);
4242 Dynarr_add (dst, ch >> 8);
4243 Dynarr_add (dst, ch );
4247 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4248 unsigned int *flags)
4253 /************************************************************************/
4254 /* UTF-16 methods */
4255 /************************************************************************/
4258 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4260 return CODING_CATEGORY_UTF16_MASK;
4264 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4265 unsigned_char_dynarr *dst, Lstream_data_count n)
4267 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4268 unsigned int flags = str->flags;
4269 unsigned int cpos = str->cpos;
4270 unsigned char counter = str->counter & 3;
4271 unsigned char byte_order = str->counter >> 2;
4272 eol_type_t eol_type = str->eol_type;
4276 unsigned char c = *(unsigned char *)src++;
4282 else if (counter == 1)
4286 if (byte_order == 0)
4287 code = (c << 8) | cpos;
4289 code = (cpos << 8) | c;
4292 code = ((code & 0xFF) << 8) | (code >> 8);
4293 if ( byte_order == 0 )
4298 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4309 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4310 DECODE_ADD_UCS_CHAR (code, dst);
4314 else if (counter == 2)
4316 cpos = (cpos << 8) | c;
4324 ? (c << 8) | (cpos & 0xFF)
4325 : ((cpos & 0xFF) << 8) | c;
4327 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4332 label_continue_loop:;
4334 if (counter & CODING_STATE_END)
4335 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4339 str->counter = (byte_order << 2) | counter;
4343 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4344 unsigned_char_dynarr *dst, unsigned int *flags)
4348 Dynarr_add (dst, ch);
4349 Dynarr_add (dst, ch >> 8);
4353 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4354 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4356 Dynarr_add (dst, y);
4357 Dynarr_add (dst, y >> 8);
4358 Dynarr_add (dst, z);
4359 Dynarr_add (dst, z >> 8);
4364 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4365 unsigned int *flags)
4370 /************************************************************************/
4372 /************************************************************************/
4375 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4379 unsigned char c = *(unsigned char *)src++;
4380 switch (st->utf8.in_byte)
4383 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4386 st->utf8.in_byte = 5;
4388 st->utf8.in_byte = 4;
4390 st->utf8.in_byte = 3;
4392 st->utf8.in_byte = 2;
4394 st->utf8.in_byte = 1;
4399 if ((c & 0xc0) != 0x80)
4405 return CODING_CATEGORY_UTF8_MASK;
4409 decode_output_utf8_partial_char (unsigned char counter,
4411 unsigned_char_dynarr *dst)
4414 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4415 else if (counter == 4)
4417 if (cpos < (1 << 6))
4418 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4421 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4422 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4425 else if (counter == 3)
4427 if (cpos < (1 << 6))
4428 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4429 else if (cpos < (1 << 12))
4431 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4432 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4436 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4437 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4438 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4441 else if (counter == 2)
4443 if (cpos < (1 << 6))
4444 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4445 else if (cpos < (1 << 12))
4447 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4448 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4450 else if (cpos < (1 << 18))
4452 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4453 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4454 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4458 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4459 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4460 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4461 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4466 if (cpos < (1 << 6))
4467 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4468 else if (cpos < (1 << 12))
4470 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4471 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4473 else if (cpos < (1 << 18))
4475 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4476 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4477 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4479 else if (cpos < (1 << 24))
4481 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4482 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4483 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4484 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4488 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4489 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4490 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4491 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4492 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4498 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4499 unsigned_char_dynarr *dst, Lstream_data_count n)
4501 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4502 unsigned int flags = str->flags;
4503 unsigned int cpos = str->cpos;
4504 eol_type_t eol_type = str->eol_type;
4505 unsigned char counter = str->counter;
4508 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4509 (decoding)->codesys, 0);
4514 unsigned char c = *(unsigned char *)src++;
4519 COMPOSE_FLUSH_CHARS (str, dst);
4520 decode_flush_er_chars (str, dst);
4521 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4522 DECODE_ADD_UCS_CHAR (c, dst);
4524 else if ( c < 0xC0 )
4525 /* decode_add_er_char (str, c, dst); */
4526 COMPOSE_ADD_CHAR (str, c, dst);
4529 /* decode_flush_er_chars (str, dst); */
4535 else if ( c < 0xF0 )
4540 else if ( c < 0xF8 )
4545 else if ( c < 0xFC )
4557 else if ( (c & 0xC0) == 0x80 )
4559 cpos = ( cpos << 6 ) | ( c & 0x3f );
4566 char_id = decode_defined_char (ccs, cpos, 0);
4573 COMPOSE_ADD_CHAR (str, char_id, dst);
4582 COMPOSE_FLUSH_CHARS (str, dst);
4583 decode_flush_er_chars (str, dst);
4584 decode_output_utf8_partial_char (counter, cpos, dst);
4585 DECODE_ADD_BINARY_CHAR (c, dst);
4589 label_continue_loop:;
4592 if (flags & CODING_STATE_END)
4594 COMPOSE_FLUSH_CHARS (str, dst);
4595 decode_flush_er_chars (str, dst);
4598 decode_output_utf8_partial_char (counter, cpos, dst);
4605 str->counter = counter;
4609 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4610 unsigned_char_dynarr *dst, unsigned int *flags)
4612 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4616 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4617 Dynarr_add (dst, '\r');
4618 if (eol_type != EOL_CR)
4619 Dynarr_add (dst, ch);
4621 else if (ch <= 0x7f)
4623 Dynarr_add (dst, ch);
4628 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4629 int code_point = charset_code_point (ucs_ccs, ch, 0);
4631 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4634 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4638 && INTP (ret = Fchar_feature (make_char (ch),
4641 code_point = XINT (ret);
4642 else if ( !NILP (map =
4643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4645 && INTP (ret = Fchar_feature (make_char (ch),
4648 code_point = XINT (ret);
4649 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4653 char_encode_as_entity_reference (ch, buf);
4654 Dynarr_add_many (dst, buf, strlen (buf));
4660 if (code_point <= 0x7ff)
4662 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4663 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4665 else if (code_point <= 0xffff)
4667 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4668 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4669 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4671 else if (code_point <= 0x1fffff)
4673 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4674 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4675 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4676 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4678 else if (code_point <= 0x3ffffff)
4680 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4681 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4682 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4683 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4684 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4688 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4689 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4690 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4691 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4692 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4693 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4699 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4700 unsigned int *flags)
4705 /************************************************************************/
4706 /* ISO2022 methods */
4707 /************************************************************************/
4709 /* The following note describes the coding system ISO2022 briefly.
4710 Since the intention of this note is to help understand the
4711 functions in this file, some parts are NOT ACCURATE or OVERLY
4712 SIMPLIFIED. For thorough understanding, please refer to the
4713 original document of ISO2022.
4715 ISO2022 provides many mechanisms to encode several character sets
4716 in 7-bit and 8-bit environments. For 7-bit environments, all text
4717 is encoded using bytes less than 128. This may make the encoded
4718 text a little bit longer, but the text passes more easily through
4719 several gateways, some of which strip off MSB (Most Signigant Bit).
4721 There are two kinds of character sets: control character set and
4722 graphic character set. The former contains control characters such
4723 as `newline' and `escape' to provide control functions (control
4724 functions are also provided by escape sequences). The latter
4725 contains graphic characters such as 'A' and '-'. Emacs recognizes
4726 two control character sets and many graphic character sets.
4728 Graphic character sets are classified into one of the following
4729 four classes, according to the number of bytes (DIMENSION) and
4730 number of characters in one dimension (CHARS) of the set:
4731 - DIMENSION1_CHARS94
4732 - DIMENSION1_CHARS96
4733 - DIMENSION2_CHARS94
4734 - DIMENSION2_CHARS96
4736 In addition, each character set is assigned an identification tag,
4737 unique for each set, called "final character" (denoted as <F>
4738 hereafter). The <F> of each character set is decided by ECMA(*)
4739 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4740 (0x30..0x3F are for private use only).
4742 Note (*): ECMA = European Computer Manufacturers Association
4744 Here are examples of graphic character set [NAME(<F>)]:
4745 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4746 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4747 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4748 o DIMENSION2_CHARS96 -- none for the moment
4750 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4751 C0 [0x00..0x1F] -- control character plane 0
4752 GL [0x20..0x7F] -- graphic character plane 0
4753 C1 [0x80..0x9F] -- control character plane 1
4754 GR [0xA0..0xFF] -- graphic character plane 1
4756 A control character set is directly designated and invoked to C0 or
4757 C1 by an escape sequence. The most common case is that:
4758 - ISO646's control character set is designated/invoked to C0, and
4759 - ISO6429's control character set is designated/invoked to C1,
4760 and usually these designations/invocations are omitted in encoded
4761 text. In a 7-bit environment, only C0 can be used, and a control
4762 character for C1 is encoded by an appropriate escape sequence to
4763 fit into the environment. All control characters for C1 are
4764 defined to have corresponding escape sequences.
4766 A graphic character set is at first designated to one of four
4767 graphic registers (G0 through G3), then these graphic registers are
4768 invoked to GL or GR. These designations and invocations can be
4769 done independently. The most common case is that G0 is invoked to
4770 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4771 these invocations and designations are omitted in encoded text.
4772 In a 7-bit environment, only GL can be used.
4774 When a graphic character set of CHARS94 is invoked to GL, codes
4775 0x20 and 0x7F of the GL area work as control characters SPACE and
4776 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4779 There are two ways of invocation: locking-shift and single-shift.
4780 With locking-shift, the invocation lasts until the next different
4781 invocation, whereas with single-shift, the invocation affects the
4782 following character only and doesn't affect the locking-shift
4783 state. Invocations are done by the following control characters or
4786 ----------------------------------------------------------------------
4787 abbrev function cntrl escape seq description
4788 ----------------------------------------------------------------------
4789 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4790 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4791 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4792 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4793 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4794 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4795 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4796 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4797 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4798 ----------------------------------------------------------------------
4799 (*) These are not used by any known coding system.
4801 Control characters for these functions are defined by macros
4802 ISO_CODE_XXX in `coding.h'.
4804 Designations are done by the following escape sequences:
4805 ----------------------------------------------------------------------
4806 escape sequence description
4807 ----------------------------------------------------------------------
4808 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4809 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4810 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4811 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4812 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4813 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4814 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4815 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4816 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4817 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4818 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4819 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4820 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4821 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4822 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4823 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4824 ----------------------------------------------------------------------
4826 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4827 of dimension 1, chars 94, and final character <F>, etc...
4829 Note (*): Although these designations are not allowed in ISO2022,
4830 Emacs accepts them on decoding, and produces them on encoding
4831 CHARS96 character sets in a coding system which is characterized as
4832 7-bit environment, non-locking-shift, and non-single-shift.
4834 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4835 '(' can be omitted. We refer to this as "short-form" hereafter.
4837 Now you may notice that there are a lot of ways for encoding the
4838 same multilingual text in ISO2022. Actually, there exist many
4839 coding systems such as Compound Text (used in X11's inter client
4840 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4841 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4842 localized platforms), and all of these are variants of ISO2022.
4844 In addition to the above, Emacs handles two more kinds of escape
4845 sequences: ISO6429's direction specification and Emacs' private
4846 sequence for specifying character composition.
4848 ISO6429's direction specification takes the following form:
4849 o CSI ']' -- end of the current direction
4850 o CSI '0' ']' -- end of the current direction
4851 o CSI '1' ']' -- start of left-to-right text
4852 o CSI '2' ']' -- start of right-to-left text
4853 The control character CSI (0x9B: control sequence introducer) is
4854 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4856 Character composition specification takes the following form:
4857 o ESC '0' -- start character composition
4858 o ESC '1' -- end character composition
4859 Since these are not standard escape sequences of any ISO standard,
4860 their use with these meanings is restricted to Emacs only. */
4863 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4867 for (i = 0; i < 4; i++)
4869 if (!NILP (coding_system))
4871 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4873 iso->charset[i] = Qt;
4874 iso->invalid_designated[i] = 0;
4876 iso->esc = ISO_ESC_NOTHING;
4877 iso->esc_bytes_index = 0;
4878 iso->register_left = 0;
4879 iso->register_right = 1;
4880 iso->switched_dir_and_no_valid_charset_yet = 0;
4881 iso->invalid_switch_dir = 0;
4882 iso->output_direction_sequence = 0;
4883 iso->output_literally = 0;
4884 #ifdef ENABLE_COMPOSITE_CHARS
4885 if (iso->composite_chars)
4886 Dynarr_reset (iso->composite_chars);
4891 fit_to_be_escape_quoted (unsigned char c)
4908 /* Parse one byte of an ISO2022 escape sequence.
4909 If the result is an invalid escape sequence, return 0 and
4910 do not change anything in STR. Otherwise, if the result is
4911 an incomplete escape sequence, update ISO2022.ESC and
4912 ISO2022.ESC_BYTES and return -1. Otherwise, update
4913 all the state variables (but not ISO2022.ESC_BYTES) and
4916 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4917 or invocation of an invalid character set and treat that as
4918 an unrecognized escape sequence. */
4921 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4922 unsigned char c, unsigned int *flags,
4923 int check_invalid_charsets)
4925 /* (1) If we're at the end of a designation sequence, CS is the
4926 charset being designated and REG is the register to designate
4929 (2) If we're at the end of a locking-shift sequence, REG is
4930 the register to invoke and HALF (0 == left, 1 == right) is
4931 the half to invoke it into.
4933 (3) If we're at the end of a single-shift sequence, REG is
4934 the register to invoke. */
4935 Lisp_Object cs = Qnil;
4938 /* NOTE: This code does goto's all over the fucking place.
4939 The reason for this is that we're basically implementing
4940 a state machine here, and hierarchical languages like C
4941 don't really provide a clean way of doing this. */
4943 if (! (*flags & CODING_STATE_ESCAPE))
4944 /* At beginning of escape sequence; we need to reset our
4945 escape-state variables. */
4946 iso->esc = ISO_ESC_NOTHING;
4948 iso->output_literally = 0;
4949 iso->output_direction_sequence = 0;
4953 case ISO_ESC_NOTHING:
4954 iso->esc_bytes_index = 0;
4957 case ISO_CODE_ESC: /* Start escape sequence */
4958 *flags |= CODING_STATE_ESCAPE;
4962 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4963 *flags |= CODING_STATE_ESCAPE;
4964 iso->esc = ISO_ESC_5_11;
4967 case ISO_CODE_SO: /* locking shift 1 */
4970 case ISO_CODE_SI: /* locking shift 0 */
4974 case ISO_CODE_SS2: /* single shift */
4977 case ISO_CODE_SS3: /* single shift */
4981 default: /* Other control characters */
4988 /**** single shift ****/
4990 case 'N': /* single shift 2 */
4993 case 'O': /* single shift 3 */
4997 /**** locking shift ****/
4999 case '~': /* locking shift 1 right */
5002 case 'n': /* locking shift 2 */
5005 case '}': /* locking shift 2 right */
5008 case 'o': /* locking shift 3 */
5011 case '|': /* locking shift 3 right */
5015 #ifdef ENABLE_COMPOSITE_CHARS
5016 /**** composite ****/
5019 iso->esc = ISO_ESC_START_COMPOSITE;
5020 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5021 CODING_STATE_COMPOSITE;
5025 iso->esc = ISO_ESC_END_COMPOSITE;
5026 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5027 ~CODING_STATE_COMPOSITE;
5029 #endif /* ENABLE_COMPOSITE_CHARS */
5031 /**** directionality ****/
5034 iso->esc = ISO_ESC_5_11;
5037 /**** designation ****/
5039 case '$': /* multibyte charset prefix */
5040 iso->esc = ISO_ESC_2_4;
5044 if (0x28 <= c && c <= 0x2F)
5046 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5050 /* This function is called with CODESYS equal to nil when
5051 doing coding-system detection. */
5053 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5054 && fit_to_be_escape_quoted (c))
5056 iso->esc = ISO_ESC_LITERAL;
5057 *flags &= CODING_STATE_ISO2022_LOCK;
5067 /**** directionality ****/
5069 case ISO_ESC_5_11: /* ISO6429 direction control */
5072 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5073 goto directionality;
5075 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5076 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5077 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5081 case ISO_ESC_5_11_0:
5084 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5085 goto directionality;
5089 case ISO_ESC_5_11_1:
5092 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5093 goto directionality;
5097 case ISO_ESC_5_11_2:
5100 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5101 goto directionality;
5106 iso->esc = ISO_ESC_DIRECTIONALITY;
5107 /* Various junk here to attempt to preserve the direction sequences
5108 literally in the text if they would otherwise be swallowed due
5109 to invalid designations that don't show up as actual charset
5110 changes in the text. */
5111 if (iso->invalid_switch_dir)
5113 /* We already inserted a direction switch literally into the
5114 text. We assume (#### this may not be right) that the
5115 next direction switch is the one going the other way,
5116 and we need to output that literally as well. */
5117 iso->output_literally = 1;
5118 iso->invalid_switch_dir = 0;
5124 /* If we are in the thrall of an invalid designation,
5125 then stick the directionality sequence literally into the
5126 output stream so it ends up in the original text again. */
5127 for (jj = 0; jj < 4; jj++)
5128 if (iso->invalid_designated[jj])
5132 iso->output_literally = 1;
5133 iso->invalid_switch_dir = 1;
5136 /* Indicate that we haven't yet seen a valid designation,
5137 so that if a switch-dir is directly followed by an
5138 invalid designation, both get inserted literally. */
5139 iso->switched_dir_and_no_valid_charset_yet = 1;
5144 /**** designation ****/
5147 if (0x28 <= c && c <= 0x2F)
5149 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5152 if (0x40 <= c && c <= 0x42)
5155 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5156 *flags & CODING_STATE_R2L ?
5157 CHARSET_RIGHT_TO_LEFT :
5158 CHARSET_LEFT_TO_RIGHT);
5169 if (c < '0' || c > '~')
5170 return 0; /* bad final byte */
5172 if (iso->esc >= ISO_ESC_2_8 &&
5173 iso->esc <= ISO_ESC_2_15)
5175 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5176 single = 1; /* single-byte */
5177 reg = (iso->esc - ISO_ESC_2_8) & 3;
5179 else if (iso->esc >= ISO_ESC_2_4_8 &&
5180 iso->esc <= ISO_ESC_2_4_15)
5182 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5183 single = -1; /* multi-byte */
5184 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5188 /* Can this ever be reached? -slb */
5192 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5193 *flags & CODING_STATE_R2L ?
5194 CHARSET_RIGHT_TO_LEFT :
5195 CHARSET_LEFT_TO_RIGHT);
5201 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5205 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5206 /* can't invoke something that ain't there. */
5208 iso->esc = ISO_ESC_SINGLE_SHIFT;
5209 *flags &= CODING_STATE_ISO2022_LOCK;
5211 *flags |= CODING_STATE_SS2;
5213 *flags |= CODING_STATE_SS3;
5217 if (check_invalid_charsets &&
5218 !CHARSETP (iso->charset[reg]))
5219 /* can't invoke something that ain't there. */
5222 iso->register_right = reg;
5224 iso->register_left = reg;
5225 *flags &= CODING_STATE_ISO2022_LOCK;
5226 iso->esc = ISO_ESC_LOCKING_SHIFT;
5230 if (NILP (cs) && check_invalid_charsets)
5232 iso->invalid_designated[reg] = 1;
5233 iso->charset[reg] = Vcharset_ascii;
5234 iso->esc = ISO_ESC_DESIGNATE;
5235 *flags &= CODING_STATE_ISO2022_LOCK;
5236 iso->output_literally = 1;
5237 if (iso->switched_dir_and_no_valid_charset_yet)
5239 /* We encountered a switch-direction followed by an
5240 invalid designation. Ensure that the switch-direction
5241 gets outputted; otherwise it will probably get eaten
5242 when the text is written out again. */
5243 iso->switched_dir_and_no_valid_charset_yet = 0;
5244 iso->output_direction_sequence = 1;
5245 /* And make sure that the switch-dir going the other
5246 way gets outputted, as well. */
5247 iso->invalid_switch_dir = 1;
5251 /* This function is called with CODESYS equal to nil when
5252 doing coding-system detection. */
5253 if (!NILP (codesys))
5255 charset_conversion_spec_dynarr *dyn =
5256 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5262 for (i = 0; i < Dynarr_length (dyn); i++)
5264 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5265 if (EQ (cs, spec->from_charset))
5266 cs = spec->to_charset;
5271 iso->charset[reg] = cs;
5272 iso->esc = ISO_ESC_DESIGNATE;
5273 *flags &= CODING_STATE_ISO2022_LOCK;
5274 if (iso->invalid_designated[reg])
5276 iso->invalid_designated[reg] = 0;
5277 iso->output_literally = 1;
5279 if (iso->switched_dir_and_no_valid_charset_yet)
5280 iso->switched_dir_and_no_valid_charset_yet = 0;
5285 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5289 /* #### There are serious deficiencies in the recognition mechanism
5290 here. This needs to be much smarter if it's going to cut it.
5291 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5292 it should be detected as Latin-1.
5293 All the ISO2022 stuff in this file should be synced up with the
5294 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5295 Perhaps we should wait till R2L works in FSF Emacs? */
5297 if (!st->iso2022.initted)
5299 reset_iso2022 (Qnil, &st->iso2022.iso);
5300 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5301 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5302 CODING_CATEGORY_ISO_8_1_MASK |
5303 CODING_CATEGORY_ISO_8_2_MASK |
5304 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5305 st->iso2022.flags = 0;
5306 st->iso2022.high_byte_count = 0;
5307 st->iso2022.saw_single_shift = 0;
5308 st->iso2022.initted = 1;
5311 mask = st->iso2022.mask;
5315 unsigned char c = *(unsigned char *)src++;
5318 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5319 st->iso2022.high_byte_count++;
5323 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5325 if (st->iso2022.high_byte_count & 1)
5326 /* odd number of high bytes; assume not iso-8-2 */
5327 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5329 st->iso2022.high_byte_count = 0;
5330 st->iso2022.saw_single_shift = 0;
5332 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5334 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5335 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5336 { /* control chars */
5339 /* Allow and ignore control characters that you might
5340 reasonably see in a text file */
5345 case 8: /* backspace */
5346 case 11: /* vertical tab */
5347 case 12: /* form feed */
5348 case 26: /* MS-DOS C-z junk */
5349 case 31: /* '^_' -- for info */
5350 goto label_continue_loop;
5357 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5360 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5361 &st->iso2022.flags, 0))
5363 switch (st->iso2022.iso.esc)
5365 case ISO_ESC_DESIGNATE:
5366 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5367 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5369 case ISO_ESC_LOCKING_SHIFT:
5370 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5371 goto ran_out_of_chars;
5372 case ISO_ESC_SINGLE_SHIFT:
5373 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5374 st->iso2022.saw_single_shift = 1;
5383 goto ran_out_of_chars;
5386 label_continue_loop:;
5395 postprocess_iso2022_mask (int mask)
5397 /* #### kind of cheesy */
5398 /* If seven-bit ISO is allowed, then assume that the encoding is
5399 entirely seven-bit and turn off the eight-bit ones. */
5400 if (mask & CODING_CATEGORY_ISO_7_MASK)
5401 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5402 CODING_CATEGORY_ISO_8_1_MASK |
5403 CODING_CATEGORY_ISO_8_2_MASK);
5407 /* If FLAGS is a null pointer or specifies right-to-left motion,
5408 output a switch-dir-to-left-to-right sequence to DST.
5409 Also update FLAGS if it is not a null pointer.
5410 If INTERNAL_P is set, we are outputting in internal format and
5411 need to handle the CSI differently. */
5414 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5415 unsigned_char_dynarr *dst,
5416 unsigned int *flags,
5419 if (!flags || (*flags & CODING_STATE_R2L))
5421 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5423 Dynarr_add (dst, ISO_CODE_ESC);
5424 Dynarr_add (dst, '[');
5426 else if (internal_p)
5427 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5429 Dynarr_add (dst, ISO_CODE_CSI);
5430 Dynarr_add (dst, '0');
5431 Dynarr_add (dst, ']');
5433 *flags &= ~CODING_STATE_R2L;
5437 /* If FLAGS is a null pointer or specifies a direction different from
5438 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5439 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5440 sequence to DST. Also update FLAGS if it is not a null pointer.
5441 If INTERNAL_P is set, we are outputting in internal format and
5442 need to handle the CSI differently. */
5445 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5446 unsigned_char_dynarr *dst, unsigned int *flags,
5449 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5450 direction == CHARSET_LEFT_TO_RIGHT)
5451 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5452 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5453 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5454 direction == CHARSET_RIGHT_TO_LEFT)
5456 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5458 Dynarr_add (dst, ISO_CODE_ESC);
5459 Dynarr_add (dst, '[');
5461 else if (internal_p)
5462 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5464 Dynarr_add (dst, ISO_CODE_CSI);
5465 Dynarr_add (dst, '2');
5466 Dynarr_add (dst, ']');
5468 *flags |= CODING_STATE_R2L;
5472 /* Convert ISO2022-format data to internal format. */
5475 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5476 unsigned_char_dynarr *dst, Lstream_data_count n)
5478 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5479 unsigned int flags = str->flags;
5480 unsigned int cpos = str->cpos;
5481 unsigned char counter = str->counter;
5482 eol_type_t eol_type = str->eol_type;
5483 #ifdef ENABLE_COMPOSITE_CHARS
5484 unsigned_char_dynarr *real_dst = dst;
5486 Lisp_Object coding_system;
5488 XSETCODING_SYSTEM (coding_system, str->codesys);
5490 #ifdef ENABLE_COMPOSITE_CHARS
5491 if (flags & CODING_STATE_COMPOSITE)
5492 dst = str->iso2022.composite_chars;
5493 #endif /* ENABLE_COMPOSITE_CHARS */
5497 unsigned char c = *(unsigned char *)src++;
5498 if (flags & CODING_STATE_ESCAPE)
5499 { /* Within ESC sequence */
5500 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5505 switch (str->iso2022.esc)
5507 #ifdef ENABLE_COMPOSITE_CHARS
5508 case ISO_ESC_START_COMPOSITE:
5509 if (str->iso2022.composite_chars)
5510 Dynarr_reset (str->iso2022.composite_chars);
5512 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5513 dst = str->iso2022.composite_chars;
5515 case ISO_ESC_END_COMPOSITE:
5517 Bufbyte comstr[MAX_EMCHAR_LEN];
5519 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5520 Dynarr_length (dst));
5522 len = set_charptr_emchar (comstr, emch);
5523 Dynarr_add_many (dst, comstr, len);
5526 #endif /* ENABLE_COMPOSITE_CHARS */
5528 case ISO_ESC_LITERAL:
5529 COMPOSE_FLUSH_CHARS (str, dst);
5530 decode_flush_er_chars (str, dst);
5531 DECODE_ADD_BINARY_CHAR (c, dst);
5535 /* Everything else handled already */
5540 /* Attempted error recovery. */
5541 if (str->iso2022.output_direction_sequence)
5542 ensure_correct_direction (flags & CODING_STATE_R2L ?
5543 CHARSET_RIGHT_TO_LEFT :
5544 CHARSET_LEFT_TO_RIGHT,
5545 str->codesys, dst, 0, 1);
5546 /* More error recovery. */
5547 if (!retval || str->iso2022.output_literally)
5549 /* Output the (possibly invalid) sequence */
5551 COMPOSE_FLUSH_CHARS (str, dst);
5552 decode_flush_er_chars (str, dst);
5553 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5554 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5555 flags &= CODING_STATE_ISO2022_LOCK;
5557 n++, src--;/* Repeat the loop with the same character. */
5560 /* No sense in reprocessing the final byte of the
5561 escape sequence; it could mess things up anyway.
5563 COMPOSE_FLUSH_CHARS (str, dst);
5564 decode_flush_er_chars (str, dst);
5565 DECODE_ADD_BINARY_CHAR (c, dst);
5571 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5572 { /* Control characters */
5574 /***** Error-handling *****/
5576 /* If we were in the middle of a character, dump out the
5577 partial character. */
5580 COMPOSE_FLUSH_CHARS (str, dst);
5581 decode_flush_er_chars (str, dst);
5585 DECODE_ADD_BINARY_CHAR
5586 ((unsigned char)(cpos >> (counter * 8)), dst);
5591 /* If we just saw a single-shift character, dump it out.
5592 This may dump out the wrong sort of single-shift character,
5593 but least it will give an indication that something went
5595 if (flags & CODING_STATE_SS2)
5597 COMPOSE_FLUSH_CHARS (str, dst);
5598 decode_flush_er_chars (str, dst);
5599 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5600 flags &= ~CODING_STATE_SS2;
5602 if (flags & CODING_STATE_SS3)
5604 COMPOSE_FLUSH_CHARS (str, dst);
5605 decode_flush_er_chars (str, dst);
5606 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5607 flags &= ~CODING_STATE_SS3;
5610 /***** Now handle the control characters. *****/
5616 COMPOSE_FLUSH_CHARS (str, dst);
5617 decode_flush_er_chars (str, dst);
5618 if (eol_type == EOL_CR)
5619 Dynarr_add (dst, '\n');
5620 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5621 Dynarr_add (dst, c);
5623 flags |= CODING_STATE_CR;
5624 goto label_continue_loop;
5626 else if (flags & CODING_STATE_CR)
5627 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5629 Dynarr_add (dst, '\r');
5630 flags &= ~CODING_STATE_CR;
5633 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5636 flags &= CODING_STATE_ISO2022_LOCK;
5638 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5640 COMPOSE_FLUSH_CHARS (str, dst);
5641 decode_flush_er_chars (str, dst);
5642 DECODE_ADD_BINARY_CHAR (c, dst);
5646 { /* Graphic characters */
5647 Lisp_Object charset;
5656 COMPOSE_FLUSH_CHARS (str, dst);
5657 decode_flush_er_chars (str, dst);
5658 if (eol_type == EOL_CR)
5659 Dynarr_add (dst, '\n');
5660 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5661 Dynarr_add (dst, c);
5663 flags |= CODING_STATE_CR;
5664 goto label_continue_loop;
5666 else if (flags & CODING_STATE_CR)
5667 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5669 Dynarr_add (dst, '\r');
5670 flags &= ~CODING_STATE_CR;
5673 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5676 /* Now determine the charset. */
5677 reg = ((flags & CODING_STATE_SS2) ? 2
5678 : (flags & CODING_STATE_SS3) ? 3
5679 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5680 : str->iso2022.register_left);
5681 charset = str->iso2022.charset[reg];
5683 /* Error checking: */
5684 if (! CHARSETP (charset)
5685 || str->iso2022.invalid_designated[reg]
5686 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5687 && XCHARSET_CHARS (charset) == 94))
5688 /* Mrmph. We are trying to invoke a register that has no
5689 or an invalid charset in it, or trying to add a character
5690 outside the range of the charset. Insert that char literally
5691 to preserve it for the output. */
5693 COMPOSE_FLUSH_CHARS (str, dst);
5694 decode_flush_er_chars (str, dst);
5698 DECODE_ADD_BINARY_CHAR
5699 ((unsigned char)(cpos >> (counter * 8)), dst);
5702 DECODE_ADD_BINARY_CHAR (c, dst);
5707 /* Things are probably hunky-dorey. */
5709 /* Fetch reverse charset, maybe. */
5710 if (((flags & CODING_STATE_R2L) &&
5711 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5713 (!(flags & CODING_STATE_R2L) &&
5714 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5716 Lisp_Object new_charset =
5717 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5718 if (!NILP (new_charset))
5719 charset = new_charset;
5724 if (XCHARSET_DIMENSION (charset) == counter)
5726 COMPOSE_ADD_CHAR (str,
5727 DECODE_CHAR (charset,
5728 ((cpos & 0x7F7F7F) << 8)
5735 cpos = (cpos << 8) | c;
5737 lb = XCHARSET_LEADING_BYTE (charset);
5738 switch (XCHARSET_REP_BYTES (charset))
5741 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5742 Dynarr_add (dst, c & 0x7F);
5745 case 2: /* one-byte official */
5746 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5747 Dynarr_add (dst, lb);
5748 Dynarr_add (dst, c | 0x80);
5751 case 3: /* one-byte private or two-byte official */
5752 if (XCHARSET_PRIVATE_P (charset))
5754 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5755 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5756 Dynarr_add (dst, lb);
5757 Dynarr_add (dst, c | 0x80);
5763 Dynarr_add (dst, lb);
5764 Dynarr_add (dst, ch | 0x80);
5765 Dynarr_add (dst, c | 0x80);
5773 default: /* two-byte private */
5776 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5777 Dynarr_add (dst, lb);
5778 Dynarr_add (dst, ch | 0x80);
5779 Dynarr_add (dst, c | 0x80);
5789 flags &= CODING_STATE_ISO2022_LOCK;
5792 label_continue_loop:;
5795 if (flags & CODING_STATE_END)
5797 COMPOSE_FLUSH_CHARS (str, dst);
5798 decode_flush_er_chars (str, dst);
5799 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5803 str->counter = counter;
5807 /***** ISO2022 encoder *****/
5809 /* Designate CHARSET into register REG. */
5812 iso2022_designate (Lisp_Object charset, unsigned char reg,
5813 struct encoding_stream *str, unsigned_char_dynarr *dst)
5815 static const char inter94[] = "()*+";
5816 static const char inter96[] = ",-./";
5817 unsigned short chars;
5818 unsigned char dimension;
5819 unsigned char final;
5820 Lisp_Object old_charset = str->iso2022.charset[reg];
5822 str->iso2022.charset[reg] = charset;
5823 if (!CHARSETP (charset))
5824 /* charset might be an initial nil or t. */
5826 chars = XCHARSET_CHARS (charset);
5827 dimension = XCHARSET_DIMENSION (charset);
5828 final = XCHARSET_FINAL (charset);
5829 if (!str->iso2022.force_charset_on_output[reg] &&
5830 CHARSETP (old_charset) &&
5831 XCHARSET_CHARS (old_charset) == chars &&
5832 XCHARSET_DIMENSION (old_charset) == dimension &&
5833 XCHARSET_FINAL (old_charset) == final)
5836 str->iso2022.force_charset_on_output[reg] = 0;
5839 charset_conversion_spec_dynarr *dyn =
5840 str->codesys->iso2022.output_conv;
5846 for (i = 0; i < Dynarr_length (dyn); i++)
5848 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5849 if (EQ (charset, spec->from_charset))
5850 charset = spec->to_charset;
5855 Dynarr_add (dst, ISO_CODE_ESC);
5860 Dynarr_add (dst, inter94[reg]);
5863 Dynarr_add (dst, '$');
5865 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5868 Dynarr_add (dst, inter94[reg]);
5873 Dynarr_add (dst, inter96[reg]);
5876 Dynarr_add (dst, '$');
5877 Dynarr_add (dst, inter96[reg]);
5881 Dynarr_add (dst, final);
5885 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5887 if (str->iso2022.register_left != 0)
5889 Dynarr_add (dst, ISO_CODE_SI);
5890 str->iso2022.register_left = 0;
5895 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5897 if (str->iso2022.register_left != 1)
5899 Dynarr_add (dst, ISO_CODE_SO);
5900 str->iso2022.register_left = 1;
5905 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5906 unsigned_char_dynarr *dst, unsigned int *flags)
5908 unsigned char charmask;
5909 Lisp_Coding_System* codesys = str->codesys;
5910 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5912 Lisp_Object charset = str->iso2022.current_charset;
5913 int half = str->iso2022.current_half;
5914 int code_point = -1;
5918 restore_left_to_right_direction (codesys, dst, flags, 0);
5920 /* Make sure G0 contains ASCII */
5921 if ((ch > ' ' && ch < ISO_CODE_DEL)
5922 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5924 ensure_normal_shift (str, dst);
5925 iso2022_designate (Vcharset_ascii, 0, str, dst);
5928 /* If necessary, restore everything to the default state
5930 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5932 restore_left_to_right_direction (codesys, dst, flags, 0);
5934 ensure_normal_shift (str, dst);
5936 for (i = 0; i < 4; i++)
5938 Lisp_Object initial_charset =
5939 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5940 iso2022_designate (initial_charset, i, str, dst);
5945 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5946 Dynarr_add (dst, '\r');
5947 if (eol_type != EOL_CR)
5948 Dynarr_add (dst, ch);
5952 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5953 && fit_to_be_escape_quoted (ch))
5954 Dynarr_add (dst, ISO_CODE_ESC);
5955 Dynarr_add (dst, ch);
5958 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5960 charmask = (half == 0 ? 0x00 : 0x80);
5962 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5963 && fit_to_be_escape_quoted (ch))
5964 Dynarr_add (dst, ISO_CODE_ESC);
5965 /* you asked for it ... */
5966 Dynarr_add (dst, ch);
5972 /* Now determine which register to use. */
5974 for (i = 0; i < 4; i++)
5976 if ((CHARSETP (charset = str->iso2022.charset[i])
5977 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
5981 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5982 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
5990 Lisp_Object original_default_coded_charset_priority_list
5991 = Vdefault_coded_charset_priority_list;
5992 Vdefault_coded_charset_priority_list
5993 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
5994 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5996 code_point = ENCODE_CHAR (ch, charset);
5997 if (XCHARSET_FINAL (charset))
5999 Vdefault_coded_charset_priority_list
6000 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6001 Vdefault_coded_charset_priority_list));
6003 Vdefault_coded_charset_priority_list
6004 = original_default_coded_charset_priority_list;
6005 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6007 code_point = ENCODE_CHAR (ch, charset);
6008 if (XCHARSET_FINAL (charset))
6010 Vdefault_coded_charset_priority_list
6011 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6012 Vdefault_coded_charset_priority_list));
6014 code_point = ENCODE_CHAR (ch, charset);
6015 if (!XCHARSET_FINAL (charset))
6017 charset = Vcharset_ascii;
6021 Vdefault_coded_charset_priority_list
6022 = original_default_coded_charset_priority_list;
6024 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6025 codesys, dst, flags, 0);
6029 if (XCHARSET_GRAPHIC (charset) != 0)
6031 if (!NILP (str->iso2022.charset[1]) &&
6032 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6033 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6035 else if (!NILP (str->iso2022.charset[2]))
6037 else if (!NILP (str->iso2022.charset[3]))
6046 iso2022_designate (charset, reg, str, dst);
6048 /* Now invoke that register. */
6052 ensure_normal_shift (str, dst);
6056 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6058 ensure_shift_out (str, dst);
6065 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6067 Dynarr_add (dst, ISO_CODE_ESC);
6068 Dynarr_add (dst, 'N');
6073 Dynarr_add (dst, ISO_CODE_SS2);
6078 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6080 Dynarr_add (dst, ISO_CODE_ESC);
6081 Dynarr_add (dst, 'O');
6086 Dynarr_add (dst, ISO_CODE_SS3);
6094 charmask = (half == 0 ? 0x00 : 0x80);
6096 switch (XCHARSET_DIMENSION (charset))
6099 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6102 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6103 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6106 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6107 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6108 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6111 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6112 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6113 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6114 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6120 str->iso2022.current_charset = charset;
6121 str->iso2022.current_half = half;
6125 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6126 unsigned int *flags)
6128 Lisp_Coding_System* codesys = str->codesys;
6131 restore_left_to_right_direction (codesys, dst, flags, 0);
6132 ensure_normal_shift (str, dst);
6133 for (i = 0; i < 4; i++)
6135 Lisp_Object initial_charset
6136 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6137 iso2022_designate (initial_charset, i, str, dst);
6142 /************************************************************************/
6143 /* No-conversion methods */
6144 /************************************************************************/
6146 /* This is used when reading in "binary" files -- i.e. files that may
6147 contain all 256 possible byte values and that are not to be
6148 interpreted as being in any particular decoding. */
6150 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6151 unsigned_char_dynarr *dst, Lstream_data_count n)
6153 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6154 unsigned int flags = str->flags;
6155 unsigned int cpos = str->cpos;
6156 eol_type_t eol_type = str->eol_type;
6160 unsigned char c = *(unsigned char *)src++;
6162 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6163 DECODE_ADD_BINARY_CHAR (c, dst);
6164 label_continue_loop:;
6167 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6174 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6175 unsigned_char_dynarr *dst, Lstream_data_count n)
6178 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6179 unsigned int flags = str->flags;
6180 unsigned int ch = str->ch;
6181 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6183 unsigned char char_boundary = str->iso2022.current_char_boundary;
6190 if (char_boundary == 0)
6196 else if ( c >= 0xf8 )
6201 else if ( c >= 0xf0 )
6206 else if ( c >= 0xe0 )
6211 else if ( c >= 0xc0 )
6221 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6222 Dynarr_add (dst, '\r');
6223 if (eol_type != EOL_CR)
6224 Dynarr_add (dst, c);
6227 Dynarr_add (dst, c);
6230 else if (char_boundary == 1)
6232 ch = ( ch << 6 ) | ( c & 0x3f );
6233 Dynarr_add (dst, ch & 0xff);
6238 ch = ( ch << 6 ) | ( c & 0x3f );
6241 #else /* not UTF2000 */
6244 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6245 Dynarr_add (dst, '\r');
6246 if (eol_type != EOL_CR)
6247 Dynarr_add (dst, '\n');
6250 else if (BYTE_ASCII_P (c))
6253 Dynarr_add (dst, c);
6255 else if (BUFBYTE_LEADING_BYTE_P (c))
6258 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6259 c == LEADING_BYTE_CONTROL_1)
6262 Dynarr_add (dst, '~'); /* untranslatable character */
6266 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6267 Dynarr_add (dst, c);
6268 else if (ch == LEADING_BYTE_CONTROL_1)
6271 Dynarr_add (dst, c - 0x20);
6273 /* else it should be the second or third byte of an
6274 untranslatable character, so ignore it */
6277 #endif /* not UTF2000 */
6283 str->iso2022.current_char_boundary = char_boundary;
6289 /************************************************************************/
6290 /* Initialization */
6291 /************************************************************************/
6294 syms_of_file_coding (void)
6296 INIT_LRECORD_IMPLEMENTATION (coding_system);
6298 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6300 DEFSUBR (Fcoding_system_p);
6301 DEFSUBR (Ffind_coding_system);
6302 DEFSUBR (Fget_coding_system);
6303 DEFSUBR (Fcoding_system_list);
6304 DEFSUBR (Fcoding_system_name);
6305 DEFSUBR (Fmake_coding_system);
6306 DEFSUBR (Fcopy_coding_system);
6307 DEFSUBR (Fcoding_system_canonical_name_p);
6308 DEFSUBR (Fcoding_system_alias_p);
6309 DEFSUBR (Fcoding_system_aliasee);
6310 DEFSUBR (Fdefine_coding_system_alias);
6311 DEFSUBR (Fsubsidiary_coding_system);
6313 DEFSUBR (Fcoding_system_type);
6314 DEFSUBR (Fcoding_system_doc_string);
6316 DEFSUBR (Fcoding_system_charset);
6318 DEFSUBR (Fcoding_system_property);
6320 DEFSUBR (Fcoding_category_list);
6321 DEFSUBR (Fset_coding_priority_list);
6322 DEFSUBR (Fcoding_priority_list);
6323 DEFSUBR (Fset_coding_category_system);
6324 DEFSUBR (Fcoding_category_system);
6326 DEFSUBR (Fdetect_coding_region);
6327 DEFSUBR (Fdecode_coding_region);
6328 DEFSUBR (Fencode_coding_region);
6330 DEFSUBR (Fdecode_shift_jis_char);
6331 DEFSUBR (Fencode_shift_jis_char);
6332 DEFSUBR (Fdecode_big5_char);
6333 DEFSUBR (Fencode_big5_char);
6335 defsymbol (&Qcoding_systemp, "coding-system-p");
6336 defsymbol (&Qno_conversion, "no-conversion");
6337 defsymbol (&Qraw_text, "raw-text");
6339 defsymbol (&Qbig5, "big5");
6340 defsymbol (&Qshift_jis, "shift-jis");
6341 defsymbol (&Qucs4, "ucs-4");
6342 defsymbol (&Qutf8, "utf-8");
6343 defsymbol (&Qutf16, "utf-16");
6344 defsymbol (&Qccl, "ccl");
6345 defsymbol (&Qiso2022, "iso2022");
6347 defsymbol (&Qmnemonic, "mnemonic");
6348 defsymbol (&Qeol_type, "eol-type");
6349 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6350 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6352 defsymbol (&Qcr, "cr");
6353 defsymbol (&Qlf, "lf");
6354 defsymbol (&Qcrlf, "crlf");
6355 defsymbol (&Qeol_cr, "eol-cr");
6356 defsymbol (&Qeol_lf, "eol-lf");
6357 defsymbol (&Qeol_crlf, "eol-crlf");
6359 defsymbol (&Qcharset_g0, "charset-g0");
6360 defsymbol (&Qcharset_g1, "charset-g1");
6361 defsymbol (&Qcharset_g2, "charset-g2");
6362 defsymbol (&Qcharset_g3, "charset-g3");
6363 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6364 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6365 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6366 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6367 defsymbol (&Qno_iso6429, "no-iso6429");
6368 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6369 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6371 defsymbol (&Qshort, "short");
6372 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6373 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6374 defsymbol (&Qseven, "seven");
6375 defsymbol (&Qlock_shift, "lock-shift");
6376 defsymbol (&Qescape_quoted, "escape-quoted");
6379 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6380 defsymbol (&Qdisable_composition, "disable-composition");
6381 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6382 defsymbol (&Quse_entity_reference, "use-entity-reference");
6383 defsymbol (&Qd, "d");
6384 defsymbol (&Qx, "x");
6385 defsymbol (&QX, "X");
6387 defsymbol (&Qencode, "encode");
6388 defsymbol (&Qdecode, "decode");
6391 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6393 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6395 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6397 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6399 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6401 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6403 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6405 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6407 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6409 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6412 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6417 lstream_type_create_file_coding (void)
6419 LSTREAM_HAS_METHOD (decoding, reader);
6420 LSTREAM_HAS_METHOD (decoding, writer);
6421 LSTREAM_HAS_METHOD (decoding, rewinder);
6422 LSTREAM_HAS_METHOD (decoding, seekable_p);
6423 LSTREAM_HAS_METHOD (decoding, flusher);
6424 LSTREAM_HAS_METHOD (decoding, closer);
6425 LSTREAM_HAS_METHOD (decoding, marker);
6427 LSTREAM_HAS_METHOD (encoding, reader);
6428 LSTREAM_HAS_METHOD (encoding, writer);
6429 LSTREAM_HAS_METHOD (encoding, rewinder);
6430 LSTREAM_HAS_METHOD (encoding, seekable_p);
6431 LSTREAM_HAS_METHOD (encoding, flusher);
6432 LSTREAM_HAS_METHOD (encoding, closer);
6433 LSTREAM_HAS_METHOD (encoding, marker);
6437 vars_of_file_coding (void)
6441 fcd = xnew (struct file_coding_dump);
6442 dump_add_root_struct_ptr (&fcd, &fcd_description);
6444 /* Initialize to something reasonable ... */
6445 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6447 fcd->coding_category_system[i] = Qnil;
6448 fcd->coding_category_by_priority[i] = i;
6451 Fprovide (intern ("file-coding"));
6453 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6454 Coding system used for TTY keyboard input.
6455 Not used under a windowing system.
6457 Vkeyboard_coding_system = Qnil;
6459 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6460 Coding system used for TTY display output.
6461 Not used under a windowing system.
6463 Vterminal_coding_system = Qnil;
6465 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6466 Overriding coding system used when reading from a file or process.
6467 You should bind this variable with `let', but do not set it globally.
6468 If this is non-nil, it specifies the coding system that will be used
6469 to decode input on read operations, such as from a file or process.
6470 It overrides `buffer-file-coding-system-for-read',
6471 `insert-file-contents-pre-hook', etc. Use those variables instead of
6472 this one for permanent changes to the environment. */ );
6473 Vcoding_system_for_read = Qnil;
6475 DEFVAR_LISP ("coding-system-for-write",
6476 &Vcoding_system_for_write /*
6477 Overriding coding system used when writing to a file or process.
6478 You should bind this variable with `let', but do not set it globally.
6479 If this is non-nil, it specifies the coding system that will be used
6480 to encode output for write operations, such as to a file or process.
6481 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6482 Use those variables instead of this one for permanent changes to the
6484 Vcoding_system_for_write = Qnil;
6486 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6487 Coding system used to convert pathnames when accessing files.
6489 Vfile_name_coding_system = Qnil;
6491 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6492 &Vcoded_charset_entity_reference_alist /*
6493 Alist of coded-charset vs corresponding entity-reference.
6494 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6495 CCS is coded-charset.
6496 CODE-COLUMNS is columns of code-point of entity-reference.
6497 CODE-TYPE is format type of code-point of entity-reference.
6498 `d' means decimal value and `x' means hexadecimal value.
6500 Vcoded_charset_entity_reference_alist = Qnil;
6502 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6503 Non-nil means the buffer contents are regarded as multi-byte form
6504 of characters, not a binary code. This affects the display, file I/O,
6505 and behaviors of various editing commands.
6507 Setting this to nil does not do anything.
6509 enable_multibyte_characters = 1;
6513 complex_vars_of_file_coding (void)
6515 staticpro (&Vcoding_system_hash_table);
6516 Vcoding_system_hash_table =
6517 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6519 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6520 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6522 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6524 struct codesys_prop csp; \
6526 csp.prop_type = (Prop_Type); \
6527 Dynarr_add (the_codesys_prop_dynarr, csp); \
6530 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6531 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6532 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6533 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6534 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6535 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6536 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6538 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6539 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6540 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6541 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6542 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6543 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6544 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6545 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6546 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6547 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6548 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6549 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6550 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6551 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6552 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6553 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6554 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6556 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6559 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6560 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6562 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6563 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6566 /* Need to create this here or we're really screwed. */
6568 (Qraw_text, Qno_conversion,
6569 build_string ("Raw text, which means it converts only line-break-codes."),
6570 list2 (Qmnemonic, build_string ("Raw")));
6573 (Qbinary, Qno_conversion,
6574 build_string ("Binary, which means it does not convert anything."),
6575 list4 (Qeol_type, Qlf,
6576 Qmnemonic, build_string ("Binary")));
6582 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6583 list2 (Qmnemonic, build_string ("MTF8")));
6586 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6588 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6590 Fdefine_coding_system_alias (Qterminal, Qbinary);
6591 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6593 /* Need this for bootstrapping */
6594 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6595 Fget_coding_system (Qraw_text);
6598 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6599 = Fget_coding_system (Qutf_8_mcs);
6602 #if defined(MULE) && !defined(UTF2000)
6606 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6607 fcd->ucs_to_mule_table[i] = Qnil;
6609 staticpro (&mule_to_ucs_table);
6610 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6611 #endif /* defined(MULE) && !defined(UTF2000) */