1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002,2003,2004,2005,2008 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
40 #include "file-coding.h"
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf16, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Qccs_priority_list;
111 Lisp_Object Quse_entity_reference;
112 Lisp_Object Qd, Qx, QX;
114 Lisp_Object Qencode, Qdecode;
116 Lisp_Object Vcoding_system_hash_table;
118 int enable_multibyte_characters;
121 /* Additional information used by the ISO2022 decoder and detector. */
122 struct iso2022_decoder
124 /* CHARSET holds the character sets currently assigned to the G0
125 through G3 variables. It is initialized from the array
126 INITIAL_CHARSET in CODESYS. */
127 Lisp_Object charset[4];
129 /* Which registers are currently invoked into the left (GL) and
130 right (GR) halves of the 8-bit encoding space? */
131 int register_left, register_right;
133 /* ISO_ESC holds a value indicating part of an escape sequence
134 that has already been seen. */
135 enum iso_esc_flag esc;
137 /* This records the bytes we've seen so far in an escape sequence,
138 in case the sequence is invalid (we spit out the bytes unchanged). */
139 unsigned char esc_bytes[8];
141 /* Index for next byte to store in ISO escape sequence. */
144 #ifdef ENABLE_COMPOSITE_CHARS
145 /* Stuff seen so far when composing a string. */
146 unsigned_char_dynarr *composite_chars;
149 /* If we saw an invalid designation sequence for a particular
150 register, we flag it here and switch to ASCII. The next time we
151 see a valid designation for this register, we turn off the flag
152 and do the designation normally, but pretend the sequence was
153 invalid. The effect of all this is that (most of the time) the
154 escape sequences for both the switch to the unknown charset, and
155 the switch back to the known charset, get inserted literally into
156 the buffer and saved out as such. The hope is that we can
157 preserve the escape sequences so that the resulting written out
158 file makes sense. If we don't do any of this, the designation
159 to the invalid charset will be preserved but that switch back
160 to the known charset will probably get eaten because it was
161 the same charset that was already present in the register. */
162 unsigned char invalid_designated[4];
164 /* We try to do similar things as above for direction-switching
165 sequences. If we encountered a direction switch while an
166 invalid designation was present, or an invalid designation
167 just after a direction switch (i.e. no valid designation
168 encountered yet), we insert the direction-switch escape
169 sequence literally into the output stream, and later on
170 insert the corresponding direction-restoring escape sequence
172 unsigned int switched_dir_and_no_valid_charset_yet :1;
173 unsigned int invalid_switch_dir :1;
175 /* Tells the decoder to output the escape sequence literally
176 even though it was valid. Used in the games we play to
177 avoid lossage when we encounter invalid designations. */
178 unsigned int output_literally :1;
179 /* We encountered a direction switch followed by an invalid
180 designation. We didn't output the direction switch
181 literally because we didn't know about the invalid designation;
182 but we have to do so now. */
183 unsigned int output_direction_sequence :1;
186 EXFUN (Fcopy_coding_system, 2);
188 struct detection_state;
191 text_encode_generic (Lstream *encoding, const Bufbyte *src,
192 unsigned_char_dynarr *dst, Lstream_data_count n);
194 static int detect_coding_sjis (struct detection_state *st,
195 const Extbyte *src, Lstream_data_count n);
196 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
197 unsigned_char_dynarr *dst, Lstream_data_count n);
198 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
199 unsigned_char_dynarr *dst, unsigned int *flags);
200 void char_finish_shift_jis (struct encoding_stream *str,
201 unsigned_char_dynarr *dst, unsigned int *flags);
203 static int detect_coding_big5 (struct detection_state *st,
204 const Extbyte *src, Lstream_data_count n);
205 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
206 unsigned_char_dynarr *dst, Lstream_data_count n);
207 void char_encode_big5 (struct encoding_stream *str, Emchar c,
208 unsigned_char_dynarr *dst, unsigned int *flags);
209 void char_finish_big5 (struct encoding_stream *str,
210 unsigned_char_dynarr *dst, unsigned int *flags);
212 static int detect_coding_ucs4 (struct detection_state *st,
213 const Extbyte *src, Lstream_data_count n);
214 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
215 unsigned_char_dynarr *dst, Lstream_data_count n);
216 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
217 unsigned_char_dynarr *dst, unsigned int *flags);
218 void char_finish_ucs4 (struct encoding_stream *str,
219 unsigned_char_dynarr *dst, unsigned int *flags);
221 static int detect_coding_utf16 (struct detection_state *st,
222 const Extbyte *src, Lstream_data_count n);
223 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
224 unsigned_char_dynarr *dst, Lstream_data_count n);
225 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
226 unsigned_char_dynarr *dst, unsigned int *flags);
227 void char_finish_utf16 (struct encoding_stream *str,
228 unsigned_char_dynarr *dst, unsigned int *flags);
230 static int detect_coding_utf8 (struct detection_state *st,
231 const Extbyte *src, Lstream_data_count n);
232 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
233 unsigned_char_dynarr *dst, Lstream_data_count n);
234 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
235 unsigned_char_dynarr *dst, unsigned int *flags);
236 void char_finish_utf8 (struct encoding_stream *str,
237 unsigned_char_dynarr *dst, unsigned int *flags);
239 static int postprocess_iso2022_mask (int mask);
240 static void reset_iso2022 (Lisp_Object coding_system,
241 struct iso2022_decoder *iso);
242 static int detect_coding_iso2022 (struct detection_state *st,
243 const Extbyte *src, Lstream_data_count n);
244 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
245 unsigned_char_dynarr *dst, Lstream_data_count n);
246 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
247 unsigned_char_dynarr *dst, unsigned int *flags);
248 void char_finish_iso2022 (struct encoding_stream *str,
249 unsigned_char_dynarr *dst, unsigned int *flags);
251 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
252 unsigned_char_dynarr *dst, Lstream_data_count n);
253 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
254 unsigned_char_dynarr *dst, Lstream_data_count n);
255 static void mule_decode (Lstream *decoding, const Extbyte *src,
256 unsigned_char_dynarr *dst, Lstream_data_count n);
257 static void mule_encode (Lstream *encoding, const Bufbyte *src,
258 unsigned_char_dynarr *dst, Lstream_data_count n);
260 typedef struct codesys_prop codesys_prop;
269 Dynarr_declare (codesys_prop);
270 } codesys_prop_dynarr;
272 static const struct lrecord_description codesys_prop_description_1[] = {
273 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
277 static const struct struct_description codesys_prop_description = {
278 sizeof (codesys_prop),
279 codesys_prop_description_1
282 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
283 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
287 static const struct struct_description codesys_prop_dynarr_description = {
288 sizeof (codesys_prop_dynarr),
289 codesys_prop_dynarr_description_1
292 codesys_prop_dynarr *the_codesys_prop_dynarr;
294 enum codesys_prop_enum
297 CODESYS_PROP_ISO2022,
302 /************************************************************************/
303 /* Coding system functions */
304 /************************************************************************/
306 static Lisp_Object mark_coding_system (Lisp_Object);
307 static void print_coding_system (Lisp_Object, Lisp_Object, int);
308 static void finalize_coding_system (void *header, int for_disksave);
311 static const struct lrecord_description ccs_description_1[] = {
312 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
313 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
317 static const struct struct_description ccs_description = {
318 sizeof (charset_conversion_spec),
322 static const struct lrecord_description ccsd_description_1[] = {
323 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
327 static const struct struct_description ccsd_description = {
328 sizeof (charset_conversion_spec_dynarr),
333 static const struct lrecord_description coding_system_description[] = {
334 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
343 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
344 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
345 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
346 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
347 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
349 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
355 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
356 mark_coding_system, print_coding_system,
357 finalize_coding_system,
358 0, 0, coding_system_description,
362 mark_coding_system (Lisp_Object obj)
364 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
366 mark_object (CODING_SYSTEM_NAME (codesys));
367 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
368 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
369 mark_object (CODING_SYSTEM_EOL_LF (codesys));
370 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
371 mark_object (CODING_SYSTEM_EOL_CR (codesys));
373 switch (CODING_SYSTEM_TYPE (codesys))
377 case CODESYS_ISO2022:
378 for (i = 0; i < 4; i++)
379 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
380 if (codesys->iso2022.input_conv)
382 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
384 struct charset_conversion_spec *ccs =
385 Dynarr_atp (codesys->iso2022.input_conv, i);
386 mark_object (ccs->from_charset);
387 mark_object (ccs->to_charset);
390 if (codesys->iso2022.output_conv)
392 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
394 struct charset_conversion_spec *ccs =
395 Dynarr_atp (codesys->iso2022.output_conv, i);
396 mark_object (ccs->from_charset);
397 mark_object (ccs->to_charset);
404 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
405 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
410 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
411 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
418 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
420 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
422 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
426 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
429 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
431 error ("printing unreadable object #<coding_system 0x%x>",
434 write_c_string ("#<coding_system ", printcharfun);
435 print_internal (c->name, printcharfun, 1);
436 write_c_string (">", printcharfun);
440 finalize_coding_system (void *header, int for_disksave)
442 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
443 /* Since coding systems never go away, this function is not
444 necessary. But it would be necessary if we changed things
445 so that coding systems could go away. */
446 if (!for_disksave) /* see comment in lstream.c */
448 switch (CODING_SYSTEM_TYPE (c))
451 case CODESYS_ISO2022:
452 if (c->iso2022.input_conv)
454 Dynarr_free (c->iso2022.input_conv);
455 c->iso2022.input_conv = 0;
457 if (c->iso2022.output_conv)
459 Dynarr_free (c->iso2022.output_conv);
460 c->iso2022.output_conv = 0;
471 symbol_to_eol_type (Lisp_Object symbol)
473 CHECK_SYMBOL (symbol);
474 if (NILP (symbol)) return EOL_AUTODETECT;
475 if (EQ (symbol, Qlf)) return EOL_LF;
476 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
477 if (EQ (symbol, Qcr)) return EOL_CR;
479 signal_simple_error ("Unrecognized eol type", symbol);
480 return EOL_AUTODETECT; /* not reached */
484 eol_type_to_symbol (eol_type_t type)
489 case EOL_LF: return Qlf;
490 case EOL_CRLF: return Qcrlf;
491 case EOL_CR: return Qcr;
492 case EOL_AUTODETECT: return Qnil;
497 setup_eol_coding_systems (Lisp_Coding_System *codesys)
499 Lisp_Object codesys_obj;
500 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
501 char *codesys_name = (char *) alloca (len + 7);
503 char *codesys_mnemonic=0;
505 Lisp_Object codesys_name_sym, sub_codesys_obj;
509 XSETCODING_SYSTEM (codesys_obj, codesys);
511 memcpy (codesys_name,
512 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
514 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
516 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
517 codesys_mnemonic = (char *) alloca (mlen + 7);
518 memcpy (codesys_mnemonic,
519 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
522 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
523 strcpy (codesys_name + len, "-" op_sys); \
525 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
526 codesys_name_sym = intern (codesys_name); \
527 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
528 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
530 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
531 build_string (codesys_mnemonic); \
532 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
535 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
536 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
537 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
540 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
541 Return t if OBJECT is a coding system.
542 A coding system is an object that defines how text containing multiple
543 character sets is encoded into a stream of (typically 8-bit) bytes.
544 The coding system is used to decode the stream into a series of
545 characters (which may be from multiple charsets) when the text is read
546 from a file or process, and is used to encode the text back into the
547 same format when it is written out to a file or process.
549 For example, many ISO2022-compliant coding systems (such as Compound
550 Text, which is used for inter-client data under the X Window System)
551 use escape sequences to switch between different charsets -- Japanese
552 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
553 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
554 `make-coding-system' for more information.
556 Coding systems are normally identified using a symbol, and the
557 symbol is accepted in place of the actual coding system object whenever
558 a coding system is called for. (This is similar to how faces work.)
562 return CODING_SYSTEMP (object) ? Qt : Qnil;
565 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
566 Retrieve the coding system of the given name.
568 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
569 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
570 If there is no such coding system, nil is returned. Otherwise the
571 associated coding system object is returned.
573 (coding_system_or_name))
575 if (NILP (coding_system_or_name))
576 coding_system_or_name = Qbinary;
577 else if (CODING_SYSTEMP (coding_system_or_name))
578 return coding_system_or_name;
580 CHECK_SYMBOL (coding_system_or_name);
584 coding_system_or_name =
585 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
587 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
588 return coding_system_or_name;
592 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
593 Retrieve the coding system of the given name.
594 Same as `find-coding-system' except that if there is no such
595 coding system, an error is signaled instead of returning nil.
599 Lisp_Object coding_system = Ffind_coding_system (name);
601 if (NILP (coding_system))
602 signal_simple_error ("No such coding system", name);
603 return coding_system;
606 /* We store the coding systems in hash tables with the names as the key and the
607 actual coding system object as the value. Occasionally we need to use them
608 in a list format. These routines provide us with that. */
609 struct coding_system_list_closure
611 Lisp_Object *coding_system_list;
615 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
616 void *coding_system_list_closure)
618 /* This function can GC */
619 struct coding_system_list_closure *cscl =
620 (struct coding_system_list_closure *) coding_system_list_closure;
621 Lisp_Object *coding_system_list = cscl->coding_system_list;
623 *coding_system_list = Fcons (key, *coding_system_list);
627 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
628 Return a list of the names of all defined coding systems.
632 Lisp_Object coding_system_list = Qnil;
634 struct coding_system_list_closure coding_system_list_closure;
636 GCPRO1 (coding_system_list);
637 coding_system_list_closure.coding_system_list = &coding_system_list;
638 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
639 &coding_system_list_closure);
642 return coding_system_list;
645 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
646 Return the name of the given coding system.
650 coding_system = Fget_coding_system (coding_system);
651 return XCODING_SYSTEM_NAME (coding_system);
654 static Lisp_Coding_System *
655 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
657 Lisp_Coding_System *codesys =
658 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
660 zero_lcrecord (codesys);
661 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
662 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
663 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
664 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
665 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
666 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
667 CODING_SYSTEM_TYPE (codesys) = type;
668 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
671 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
673 if (type == CODESYS_ISO2022)
676 for (i = 0; i < 4; i++)
677 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
680 if (type == CODESYS_UTF8)
682 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
684 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
686 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
688 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
691 else if (type == CODESYS_BIG5)
693 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
695 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
696 = Vcharset_chinese_big5;
697 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
699 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
703 else if (type == CODESYS_CCL)
705 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
706 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
709 CODING_SYSTEM_NAME (codesys) = name;
715 /* Given a list of charset conversion specs as specified in a Lisp
716 program, parse it into STORE_HERE. */
719 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
720 Lisp_Object spec_list)
724 EXTERNAL_LIST_LOOP (rest, spec_list)
726 Lisp_Object car = XCAR (rest);
727 Lisp_Object from, to;
728 struct charset_conversion_spec spec;
730 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
731 signal_simple_error ("Invalid charset conversion spec", car);
732 from = Fget_charset (XCAR (car));
733 to = Fget_charset (XCAR (XCDR (car)));
734 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
735 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
736 signal_simple_error_2
737 ("Attempted conversion between different charset types",
739 spec.from_charset = from;
740 spec.to_charset = to;
742 Dynarr_add (store_here, spec);
746 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
747 specs, return the equivalent as the Lisp programmer would see it.
749 If LOAD_HERE is 0, return Qnil. */
752 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
759 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
761 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
762 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
765 return Fnreverse (result);
770 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
771 Register symbol NAME as a coding system.
773 TYPE describes the conversion method used and should be one of
776 Automatic conversion. XEmacs attempts to detect the coding system
779 No conversion. Use this for binary files and such. On output,
780 graphic characters that are not in ASCII or Latin-1 will be
781 replaced by a ?. (For a no-conversion-encoded buffer, these
782 characters will only be present if you explicitly insert them.)
784 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
786 ISO 10646 UCS-4 encoding.
788 ISO 10646 UTF-8 encoding.
790 Any ISO2022-compliant encoding. Among other things, this includes
791 JIS (the Japanese encoding commonly used for e-mail), EUC (the
792 standard Unix encoding for Japanese and other languages), and
793 Compound Text (the encoding used in X11). You can specify more
794 specific information about the conversion with the PROPS argument.
796 Big5 (the encoding commonly used for Taiwanese).
798 The conversion is performed using a user-written pseudo-code
799 program. CCL (Code Conversion Language) is the name of this
802 Write out or read in the raw contents of the memory representing
803 the buffer's text. This is primarily useful for debugging
804 purposes, and is only enabled when XEmacs has been compiled with
805 DEBUG_XEMACS defined (via the --debug configure option).
806 WARNING: Reading in a file using 'internal conversion can result
807 in an internal inconsistency in the memory representing a
808 buffer's text, which will produce unpredictable results and may
809 cause XEmacs to crash. Under normal circumstances you should
810 never use 'internal conversion.
812 DOC-STRING is a string describing the coding system.
814 PROPS is a property list, describing the specific nature of the
815 character set. Recognized properties are:
818 String to be displayed in the modeline when this coding system is
822 End-of-line conversion to be used. It should be one of
825 Automatically detect the end-of-line type (LF, CRLF,
826 or CR). Also generate subsidiary coding systems named
827 `NAME-unix', `NAME-dos', and `NAME-mac', that are
828 identical to this coding system but have an EOL-TYPE
829 value of 'lf, 'crlf, and 'cr, respectively.
831 The end of a line is marked externally using ASCII LF.
832 Since this is also the way that XEmacs represents an
833 end-of-line internally, specifying this option results
834 in no end-of-line conversion. This is the standard
835 format for Unix text files.
837 The end of a line is marked externally using ASCII
838 CRLF. This is the standard format for MS-DOS text
841 The end of a line is marked externally using ASCII CR.
842 This is the standard format for Macintosh text files.
844 Automatically detect the end-of-line type but do not
845 generate subsidiary coding systems. (This value is
846 converted to nil when stored internally, and
847 `coding-system-property' will return nil.)
850 If non-nil, composition/decomposition for combining characters
853 'use-entity-reference
854 If non-nil, SGML style entity-reference is used for non-system-characters.
856 'post-read-conversion
857 Function called after a file has been read in, to perform the
858 decoding. Called with two arguments, START and END, denoting
859 a region of the current buffer to be decoded.
861 'pre-write-conversion
862 Function called before a file is written out, to perform the
863 encoding. Called with two arguments, START and END, denoting
864 a region of the current buffer to be encoded.
867 The following additional properties are recognized if TYPE is 'iso2022:
873 The character set initially designated to the G0 - G3 registers.
874 The value should be one of
876 -- A charset object (designate that character set)
877 -- nil (do not ever use this register)
878 -- t (no character set is initially designated to
879 the register, but may be later on; this automatically
880 sets the corresponding `force-g*-on-output' property)
886 If non-nil, send an explicit designation sequence on output before
887 using the specified register.
890 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
891 "ESC $ B" on output in place of the full designation sequences
892 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
895 If non-nil, don't designate ASCII to G0 at each end of line on output.
896 Setting this to non-nil also suppresses other state-resetting that
897 normally happens at the end of a line.
900 If non-nil, don't designate ASCII to G0 before control chars on output.
903 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
907 If non-nil, use locking-shift (SO/SI) instead of single-shift
908 or designation by escape sequence.
911 If non-nil, don't use ISO6429's direction specification.
914 If non-nil, literal control characters that are the same as
915 the beginning of a recognized ISO2022 or ISO6429 escape sequence
916 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
917 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
918 so that they can be properly distinguished from an escape sequence.
919 (Note that doing this results in a non-portable encoding.) This
920 encoding flag is used for byte-compiled files. Note that ESC
921 is a good choice for a quoting character because there are no
922 escape sequences whose second byte is a character from the Control-0
923 or Control-1 character sets; this is explicitly disallowed by the
926 'input-charset-conversion
927 A list of conversion specifications, specifying conversion of
928 characters in one charset to another when decoding is performed.
929 Each specification is a list of two elements: the source charset,
930 and the destination charset.
932 'output-charset-conversion
933 A list of conversion specifications, specifying conversion of
934 characters in one charset to another when encoding is performed.
935 The form of each specification is the same as for
936 'input-charset-conversion.
939 The following additional properties are recognized (and required)
943 CCL program used for decoding (converting to internal format).
946 CCL program used for encoding (converting to external format).
948 (name, type, doc_string, props))
950 Lisp_Coding_System *codesys;
951 enum coding_system_type ty;
952 int need_to_setup_eol_systems = 1;
954 /* Convert type to constant */
955 if (NILP (type) || EQ (type, Qundecided))
956 { ty = CODESYS_AUTODETECT; }
958 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
959 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
960 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
961 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
962 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
963 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
964 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
966 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
968 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
971 signal_simple_error ("Invalid coding system type", type);
975 codesys = allocate_coding_system (ty, name);
977 if (NILP (doc_string))
978 doc_string = build_string ("");
980 CHECK_STRING (doc_string);
981 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
984 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
986 if (EQ (key, Qmnemonic))
989 CHECK_STRING (value);
990 CODING_SYSTEM_MNEMONIC (codesys) = value;
993 else if (EQ (key, Qeol_type))
995 need_to_setup_eol_systems = NILP (value);
998 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1001 else if (EQ (key, Qpost_read_conversion))
1002 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1003 else if (EQ (key, Qpre_write_conversion))
1004 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1006 else if (EQ (key, Qdisable_composition))
1007 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1008 else if (EQ (key, Quse_entity_reference))
1009 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1012 else if (ty == CODESYS_ISO2022)
1014 #define FROB_INITIAL_CHARSET(charset_num) \
1015 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1016 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1018 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1019 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1020 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1021 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1023 #define FROB_FORCE_CHARSET(charset_num) \
1024 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1026 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1027 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1028 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1029 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1031 #define FROB_BOOLEAN_PROPERTY(prop) \
1032 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1034 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1035 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1036 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1037 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1038 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1039 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1040 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1042 else if (EQ (key, Qinput_charset_conversion))
1044 codesys->iso2022.input_conv =
1045 Dynarr_new (charset_conversion_spec);
1046 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1049 else if (EQ (key, Qoutput_charset_conversion))
1051 codesys->iso2022.output_conv =
1052 Dynarr_new (charset_conversion_spec);
1053 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1057 else if (EQ (key, Qccs_priority_list))
1059 codesys->ccs_priority_list = value;
1063 signal_simple_error ("Unrecognized property", key);
1066 else if (ty == CODESYS_UTF8)
1068 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1069 else if (EQ (key, Qcharset_g1))
1070 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1071 else if (EQ (key, Qcharset_g2))
1072 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1074 signal_simple_error ("Unrecognized property", key);
1076 else if (ty == CODESYS_BIG5)
1078 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1079 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1081 signal_simple_error ("Unrecognized property", key);
1084 else if (EQ (type, Qccl))
1087 struct ccl_program test_ccl;
1090 /* Check key first. */
1091 if (EQ (key, Qdecode))
1092 suffix = "-ccl-decode";
1093 else if (EQ (key, Qencode))
1094 suffix = "-ccl-encode";
1096 signal_simple_error ("Unrecognized property", key);
1098 /* If value is vector, register it as a ccl program
1099 associated with an newly created symbol for
1100 backward compatibility. */
1101 if (VECTORP (value))
1103 sym = Fintern (concat2 (Fsymbol_name (name),
1104 build_string (suffix)),
1106 Fregister_ccl_program (sym, value);
1110 CHECK_SYMBOL (value);
1113 /* check if the given ccl programs are valid. */
1114 if (setup_ccl_program (&test_ccl, sym) < 0)
1115 signal_simple_error ("Invalid CCL program", value);
1117 if (EQ (key, Qdecode))
1118 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1119 else if (EQ (key, Qencode))
1120 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1125 signal_simple_error ("Unrecognized property", key);
1129 if (need_to_setup_eol_systems)
1130 setup_eol_coding_systems (codesys);
1133 Lisp_Object codesys_obj;
1134 XSETCODING_SYSTEM (codesys_obj, codesys);
1135 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1140 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1141 Copy OLD-CODING-SYSTEM to NEW-NAME.
1142 If NEW-NAME does not name an existing coding system, a new one will
1145 (old_coding_system, new_name))
1147 Lisp_Object new_coding_system;
1148 old_coding_system = Fget_coding_system (old_coding_system);
1149 new_coding_system = Ffind_coding_system (new_name);
1150 if (NILP (new_coding_system))
1152 XSETCODING_SYSTEM (new_coding_system,
1153 allocate_coding_system
1154 (XCODING_SYSTEM_TYPE (old_coding_system),
1156 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1160 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1161 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1162 memcpy (((char *) to ) + sizeof (to->header),
1163 ((char *) from) + sizeof (from->header),
1164 sizeof (*from) - sizeof (from->header));
1165 to->name = new_name;
1167 return new_coding_system;
1170 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1171 Return t if OBJECT names a coding system, and is not a coding system alias.
1175 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1179 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1180 Return t if OBJECT is a coding system alias.
1181 All coding system aliases are created by `define-coding-system-alias'.
1185 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1189 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1190 Return the coding-system symbol for which symbol ALIAS is an alias.
1194 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1195 if (SYMBOLP (aliasee))
1198 signal_simple_error ("Symbol is not a coding system alias", alias);
1199 return Qnil; /* To keep the compiler happy */
1203 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1205 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1209 /* A maphash function, for removing dangling coding system aliases. */
1211 dangling_coding_system_alias_p (Lisp_Object alias,
1212 Lisp_Object aliasee,
1213 void *dangling_aliases)
1215 if (SYMBOLP (aliasee)
1216 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1218 (*(int *) dangling_aliases)++;
1225 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1226 Define symbol ALIAS as an alias for coding system ALIASEE.
1228 You can use this function to redefine an alias that has already been defined,
1229 but you cannot redefine a name which is the canonical name for a coding system.
1230 \(a canonical name of a coding system is what is returned when you call
1231 `coding-system-name' on a coding system).
1233 ALIASEE itself can be an alias, which allows you to define nested aliases.
1235 You are forbidden, however, from creating alias loops or `dangling' aliases.
1236 These will be detected, and an error will be signaled if you attempt to do so.
1238 If ALIASEE is nil, then ALIAS will simply be undefined.
1240 See also `coding-system-alias-p', `coding-system-aliasee',
1241 and `coding-system-canonical-name-p'.
1245 Lisp_Object real_coding_system, probe;
1247 CHECK_SYMBOL (alias);
1249 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1251 ("Symbol is the canonical name of a coding system and cannot be redefined",
1256 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1257 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1258 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1260 Fremhash (alias, Vcoding_system_hash_table);
1262 /* Undefine subsidiary aliases,
1263 presumably created by a previous call to this function */
1264 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1265 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1266 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1268 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1269 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1270 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1273 /* Undefine dangling coding system aliases. */
1275 int dangling_aliases;
1278 dangling_aliases = 0;
1279 elisp_map_remhash (dangling_coding_system_alias_p,
1280 Vcoding_system_hash_table,
1282 } while (dangling_aliases > 0);
1288 if (CODING_SYSTEMP (aliasee))
1289 aliasee = XCODING_SYSTEM_NAME (aliasee);
1291 /* Checks that aliasee names a coding-system */
1292 real_coding_system = Fget_coding_system (aliasee);
1294 /* Check for coding system alias loops */
1295 if (EQ (alias, aliasee))
1296 alias_loop: signal_simple_error_2
1297 ("Attempt to create a coding system alias loop", alias, aliasee);
1299 for (probe = aliasee;
1301 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1303 if (EQ (probe, alias))
1307 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1309 /* Set up aliases for subsidiaries.
1310 #### There must be a better way to handle subsidiary coding systems. */
1312 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1314 for (i = 0; i < countof (suffixes); i++)
1316 Lisp_Object alias_subsidiary =
1317 append_suffix_to_symbol (alias, suffixes[i]);
1318 Lisp_Object aliasee_subsidiary =
1319 append_suffix_to_symbol (aliasee, suffixes[i]);
1321 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1322 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1325 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1326 but it doesn't look intentional, so I'd rather return something
1327 meaningful or nothing at all. */
1332 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1334 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1335 Lisp_Object new_coding_system;
1337 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1338 return coding_system;
1342 case EOL_AUTODETECT: return coding_system;
1343 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1344 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1345 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1346 default: abort (); return Qnil;
1349 return NILP (new_coding_system) ? coding_system : new_coding_system;
1352 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1353 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1355 (coding_system, eol_type))
1357 coding_system = Fget_coding_system (coding_system);
1359 return subsidiary_coding_system (coding_system,
1360 symbol_to_eol_type (eol_type));
1364 /************************************************************************/
1365 /* Coding system accessors */
1366 /************************************************************************/
1368 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1369 Return the doc string for CODING-SYSTEM.
1373 coding_system = Fget_coding_system (coding_system);
1374 return XCODING_SYSTEM_DOC_STRING (coding_system);
1377 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1378 Return the type of CODING-SYSTEM.
1382 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1385 case CODESYS_AUTODETECT: return Qundecided;
1387 case CODESYS_SHIFT_JIS: return Qshift_jis;
1388 case CODESYS_ISO2022: return Qiso2022;
1389 case CODESYS_BIG5: return Qbig5;
1390 case CODESYS_UCS4: return Qucs4;
1391 case CODESYS_UTF16: return Qutf16;
1392 case CODESYS_UTF8: return Qutf8;
1393 case CODESYS_CCL: return Qccl;
1395 case CODESYS_NO_CONVERSION: return Qno_conversion;
1397 case CODESYS_INTERNAL: return Qinternal;
1404 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1407 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1409 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1412 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1413 Return initial charset of CODING-SYSTEM designated to GNUM.
1416 (coding_system, gnum))
1418 coding_system = Fget_coding_system (coding_system);
1421 return coding_system_charset (coding_system, XINT (gnum));
1425 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1426 Return the PROP property of CODING-SYSTEM.
1428 (coding_system, prop))
1431 enum coding_system_type type;
1433 coding_system = Fget_coding_system (coding_system);
1434 CHECK_SYMBOL (prop);
1435 type = XCODING_SYSTEM_TYPE (coding_system);
1437 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1438 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1441 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1443 case CODESYS_PROP_ALL_OK:
1446 case CODESYS_PROP_ISO2022:
1447 if (type != CODESYS_ISO2022)
1449 ("Property only valid in ISO2022 coding systems",
1453 case CODESYS_PROP_CCL:
1454 if (type != CODESYS_CCL)
1456 ("Property only valid in CCL coding systems",
1466 signal_simple_error ("Unrecognized property", prop);
1468 if (EQ (prop, Qname))
1469 return XCODING_SYSTEM_NAME (coding_system);
1470 else if (EQ (prop, Qtype))
1471 return Fcoding_system_type (coding_system);
1472 else if (EQ (prop, Qdoc_string))
1473 return XCODING_SYSTEM_DOC_STRING (coding_system);
1474 else if (EQ (prop, Qmnemonic))
1475 return XCODING_SYSTEM_MNEMONIC (coding_system);
1476 else if (EQ (prop, Qeol_type))
1477 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1478 else if (EQ (prop, Qeol_lf))
1479 return XCODING_SYSTEM_EOL_LF (coding_system);
1480 else if (EQ (prop, Qeol_crlf))
1481 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1482 else if (EQ (prop, Qeol_cr))
1483 return XCODING_SYSTEM_EOL_CR (coding_system);
1484 else if (EQ (prop, Qpost_read_conversion))
1485 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1486 else if (EQ (prop, Qpre_write_conversion))
1487 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1490 else if (EQ (prop, Qdisable_composition))
1491 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1492 else if (EQ (prop, Quse_entity_reference))
1493 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1494 else if (EQ (prop, Qccs_priority_list))
1495 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1497 else if (type == CODESYS_ISO2022)
1499 if (EQ (prop, Qcharset_g0))
1500 return coding_system_charset (coding_system, 0);
1501 else if (EQ (prop, Qcharset_g1))
1502 return coding_system_charset (coding_system, 1);
1503 else if (EQ (prop, Qcharset_g2))
1504 return coding_system_charset (coding_system, 2);
1505 else if (EQ (prop, Qcharset_g3))
1506 return coding_system_charset (coding_system, 3);
1508 #define FORCE_CHARSET(charset_num) \
1509 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1510 (coding_system, charset_num) ? Qt : Qnil)
1512 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1513 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1514 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1515 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1517 #define LISP_BOOLEAN(prop) \
1518 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1520 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1521 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1522 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1523 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1524 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1525 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1526 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1528 else if (EQ (prop, Qinput_charset_conversion))
1530 unparse_charset_conversion_specs
1531 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1532 else if (EQ (prop, Qoutput_charset_conversion))
1534 unparse_charset_conversion_specs
1535 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1539 else if (type == CODESYS_CCL)
1541 if (EQ (prop, Qdecode))
1542 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1543 else if (EQ (prop, Qencode))
1544 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1552 return Qnil; /* not reached */
1556 /************************************************************************/
1557 /* Coding category functions */
1558 /************************************************************************/
1561 decode_coding_category (Lisp_Object symbol)
1565 CHECK_SYMBOL (symbol);
1566 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1567 if (EQ (coding_category_symbol[i], symbol))
1570 signal_simple_error ("Unrecognized coding category", symbol);
1571 return 0; /* not reached */
1574 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1575 Return a list of all recognized coding categories.
1580 Lisp_Object list = Qnil;
1582 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1583 list = Fcons (coding_category_symbol[i], list);
1587 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1588 Change the priority order of the coding categories.
1589 LIST should be list of coding categories, in descending order of
1590 priority. Unspecified coding categories will be lower in priority
1591 than all specified ones, in the same relative order they were in
1596 int category_to_priority[CODING_CATEGORY_LAST];
1600 /* First generate a list that maps coding categories to priorities. */
1602 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1603 category_to_priority[i] = -1;
1605 /* Highest priority comes from the specified list. */
1607 EXTERNAL_LIST_LOOP (rest, list)
1609 int cat = decode_coding_category (XCAR (rest));
1611 if (category_to_priority[cat] >= 0)
1612 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1613 category_to_priority[cat] = i++;
1616 /* Now go through the existing categories by priority to retrieve
1617 the categories not yet specified and preserve their priority
1619 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1621 int cat = fcd->coding_category_by_priority[j];
1622 if (category_to_priority[cat] < 0)
1623 category_to_priority[cat] = i++;
1626 /* Now we need to construct the inverse of the mapping we just
1629 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1630 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1632 /* Phew! That was confusing. */
1636 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1637 Return a list of coding categories in descending order of priority.
1642 Lisp_Object list = Qnil;
1644 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1645 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1650 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1651 Change the coding system associated with a coding category.
1653 (coding_category, coding_system))
1655 int cat = decode_coding_category (coding_category);
1657 coding_system = Fget_coding_system (coding_system);
1658 fcd->coding_category_system[cat] = coding_system;
1662 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1663 Return the coding system associated with a coding category.
1667 int cat = decode_coding_category (coding_category);
1668 Lisp_Object sys = fcd->coding_category_system[cat];
1671 return XCODING_SYSTEM_NAME (sys);
1676 /************************************************************************/
1677 /* Detecting the encoding of data */
1678 /************************************************************************/
1680 struct detection_state
1682 eol_type_t eol_type;
1725 struct iso2022_decoder iso;
1727 int high_byte_count;
1728 unsigned int saw_single_shift:1;
1741 acceptable_control_char_p (int c)
1745 /* Allow and ignore control characters that you might
1746 reasonably see in a text file */
1751 case 8: /* backspace */
1752 case 11: /* vertical tab */
1753 case 12: /* form feed */
1754 case 26: /* MS-DOS C-z junk */
1755 case 31: /* '^_' -- for info */
1763 mask_has_at_most_one_bit_p (int mask)
1765 /* Perhaps the only thing useful you learn from intensive Microsoft
1766 technical interviews */
1767 return (mask & (mask - 1)) == 0;
1771 detect_eol_type (struct detection_state *st, const Extbyte *src,
1772 Lstream_data_count n)
1776 unsigned char c = *(unsigned char *)src++;
1779 if (st->eol.just_saw_cr)
1781 else if (st->eol.seen_anything)
1784 else if (st->eol.just_saw_cr)
1787 st->eol.just_saw_cr = 1;
1789 st->eol.just_saw_cr = 0;
1790 st->eol.seen_anything = 1;
1793 return EOL_AUTODETECT;
1796 /* Attempt to determine the encoding and EOL type of the given text.
1797 Before calling this function for the first type, you must initialize
1798 st->eol_type as appropriate and initialize st->mask to ~0.
1800 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1803 st->mask holds the determined coding category mask, or ~0 if only
1804 ASCII has been seen so far.
1808 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1809 is present in st->mask
1810 1 == definitive answers are here for both st->eol_type and st->mask
1814 detect_coding_type (struct detection_state *st, const Extbyte *src,
1815 Lstream_data_count n, int just_do_eol)
1817 if (st->eol_type == EOL_AUTODETECT)
1818 st->eol_type = detect_eol_type (st, src, n);
1821 return st->eol_type != EOL_AUTODETECT;
1823 if (!st->seen_non_ascii)
1825 for (; n; n--, src++)
1827 unsigned char c = *(unsigned char *) src;
1828 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1830 st->seen_non_ascii = 1;
1832 st->shift_jis.mask = ~0;
1835 st->utf16.mask = ~0;
1837 st->iso2022.mask = ~0;
1847 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1848 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1849 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1850 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1851 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1852 st->big5.mask = detect_coding_big5 (st, src, n);
1853 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1854 st->utf8.mask = detect_coding_utf8 (st, src, n);
1855 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1856 st->utf16.mask = detect_coding_utf16 (st, src, n);
1857 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1858 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1861 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1862 | st->utf8.mask | st->ucs4.mask;
1865 int retval = mask_has_at_most_one_bit_p (st->mask);
1866 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1867 return retval && st->eol_type != EOL_AUTODETECT;
1872 coding_system_from_mask (int mask)
1876 /* If the file was entirely or basically ASCII, use the
1877 default value of `buffer-file-coding-system'. */
1878 Lisp_Object retval =
1879 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1882 retval = Ffind_coding_system (retval);
1886 (Qbad_variable, Qwarning,
1887 "Invalid `default-buffer-file-coding-system', set to nil");
1888 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1892 retval = Fget_coding_system (Qraw_text);
1900 mask = postprocess_iso2022_mask (mask);
1902 /* Look through the coding categories by priority and find
1903 the first one that is allowed. */
1904 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1906 cat = fcd->coding_category_by_priority[i];
1907 if ((mask & (1 << cat)) &&
1908 !NILP (fcd->coding_category_system[cat]))
1912 return fcd->coding_category_system[cat];
1914 return Fget_coding_system (Qraw_text);
1918 /* Given a seekable read stream and potential coding system and EOL type
1919 as specified, do any autodetection that is called for. If the
1920 coding system and/or EOL type are not `autodetect', they will be left
1921 alone; but this function will never return an autodetect coding system
1924 This function does not automatically fetch subsidiary coding systems;
1925 that should be unnecessary with the explicit eol-type argument. */
1927 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1928 /* number of leading lines to check for a coding cookie */
1929 #define LINES_TO_CHECK 2
1932 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1933 eol_type_t *eol_type_in_out)
1935 struct detection_state decst;
1937 if (*eol_type_in_out == EOL_AUTODETECT)
1938 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1941 decst.eol_type = *eol_type_in_out;
1944 /* If autodetection is called for, do it now. */
1945 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1946 || *eol_type_in_out == EOL_AUTODETECT)
1949 Lisp_Object coding_system = Qnil;
1951 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1953 int lines_checked = 0;
1955 /* Look for initial "-*-"; mode line prefix */
1957 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1959 && lines_checked < LINES_TO_CHECK;
1961 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1963 Extbyte *local_vars_beg = p + 3;
1964 /* Look for final "-*-"; mode line suffix */
1965 for (p = local_vars_beg,
1966 scan_end = buf + nread - LENGTH ("-*-");
1968 && lines_checked < LINES_TO_CHECK;
1970 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1972 Extbyte *suffix = p;
1973 /* Look for "coding:" */
1974 for (p = local_vars_beg,
1975 scan_end = suffix - LENGTH ("coding:?");
1978 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1979 && (p == local_vars_beg
1980 || (*(p-1) == ' ' ||
1986 p += LENGTH ("coding:");
1987 while (*p == ' ' || *p == '\t') p++;
1989 /* Get coding system name */
1990 save = *suffix; *suffix = '\0';
1991 /* Characters valid in a MIME charset name (rfc 1521),
1992 and in a Lisp symbol name. */
1993 n = strspn ( (char *) p,
1994 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1995 "abcdefghijklmnopqrstuvwxyz"
2001 save = p[n]; p[n] = '\0';
2003 Ffind_coding_system (intern ((char *) p));
2010 /* #### file must use standard EOLs or we miss 2d line */
2011 /* #### not to mention this is broken for UTF-16 DOS files */
2012 else if (*p == '\n' || *p == '\r')
2015 /* skip past multibyte (DOS) newline */
2016 if (*p == '\r' && *(p+1) == '\n') p++;
2020 /* #### file must use standard EOLs or we miss 2d line */
2021 /* #### not to mention this is broken for UTF-16 DOS files */
2022 else if (*p == '\n' || *p == '\r')
2025 /* skip past multibyte (DOS) newline */
2026 if (*p == '\r' && *(p+1) == '\n') p++;
2029 if (NILP (coding_system))
2032 if (detect_coding_type (&decst, buf, nread,
2033 XCODING_SYSTEM_TYPE (*codesys_in_out)
2034 != CODESYS_AUTODETECT))
2036 nread = Lstream_read (stream, buf, sizeof (buf));
2042 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2043 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2046 if (detect_coding_type (&decst, buf, nread, 1))
2048 nread = Lstream_read (stream, buf, sizeof (buf));
2054 *eol_type_in_out = decst.eol_type;
2055 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2057 if (NILP (coding_system))
2058 *codesys_in_out = coding_system_from_mask (decst.mask);
2060 *codesys_in_out = coding_system;
2064 /* If we absolutely can't determine the EOL type, just assume LF. */
2065 if (*eol_type_in_out == EOL_AUTODETECT)
2066 *eol_type_in_out = EOL_LF;
2068 Lstream_rewind (stream);
2071 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2072 Detect coding system of the text in the region between START and END.
2073 Return a list of possible coding systems ordered by priority.
2074 If only ASCII characters are found, return 'undecided or one of
2075 its subsidiary coding systems according to a detected end-of-line
2076 type. Optional arg BUFFER defaults to the current buffer.
2078 (start, end, buffer))
2080 Lisp_Object val = Qnil;
2081 struct buffer *buf = decode_buffer (buffer, 0);
2083 Lisp_Object instream, lb_instream;
2084 Lstream *istr, *lb_istr;
2085 struct detection_state decst;
2086 struct gcpro gcpro1, gcpro2;
2088 get_buffer_range_char (buf, start, end, &b, &e, 0);
2089 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2090 lb_istr = XLSTREAM (lb_instream);
2091 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2092 istr = XLSTREAM (instream);
2093 GCPRO2 (instream, lb_instream);
2095 decst.eol_type = EOL_AUTODETECT;
2099 Extbyte random_buffer[4096];
2100 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2104 if (detect_coding_type (&decst, random_buffer, nread, 0))
2108 if (decst.mask == ~0)
2109 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2117 decst.mask = postprocess_iso2022_mask (decst.mask);
2119 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2121 int sys = fcd->coding_category_by_priority[i];
2122 if (decst.mask & (1 << sys))
2124 Lisp_Object codesys = fcd->coding_category_system[sys];
2125 if (!NILP (codesys))
2126 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2127 val = Fcons (codesys, val);
2131 Lstream_close (istr);
2133 Lstream_delete (istr);
2134 Lstream_delete (lb_istr);
2139 /************************************************************************/
2140 /* Converting to internal Mule format ("decoding") */
2141 /************************************************************************/
2143 /* A decoding stream is a stream used for decoding text (i.e.
2144 converting from some external format to internal format).
2145 The decoding-stream object keeps track of the actual coding
2146 stream, the stream that is at the other end, and data that
2147 needs to be persistent across the lifetime of the stream. */
2149 /* Handle the EOL stuff related to just-read-in character C.
2150 EOL_TYPE is the EOL type of the coding stream.
2151 FLAGS is the current value of FLAGS in the coding stream, and may
2152 be modified by this macro. (The macro only looks at the
2153 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2154 bytes are to be written. You need to also define a local goto
2155 label "label_continue_loop" that is at the end of the main
2156 character-reading loop.
2158 If C is a CR character, then this macro handles it entirely and
2159 jumps to label_continue_loop. Otherwise, this macro does not add
2160 anything to DST, and continues normally. You should continue
2161 processing C normally after this macro. */
2163 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2167 if (eol_type == EOL_CR) \
2168 Dynarr_add (dst, '\n'); \
2169 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2170 Dynarr_add (dst, c); \
2172 flags |= CODING_STATE_CR; \
2173 goto label_continue_loop; \
2175 else if (flags & CODING_STATE_CR) \
2176 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2178 Dynarr_add (dst, '\r'); \
2179 flags &= ~CODING_STATE_CR; \
2183 /* C should be a binary character in the range 0 - 255; convert
2184 to internal format and add to Dynarr DST. */
2187 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2189 if (BYTE_ASCII_P (c)) \
2190 Dynarr_add (dst, c); \
2193 Dynarr_add (dst, (c >> 6) | 0xc0); \
2194 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2198 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2200 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2204 Dynarr_add (dst, c);
2206 else if ( c <= 0x7ff )
2208 Dynarr_add (dst, (c >> 6) | 0xc0);
2209 Dynarr_add (dst, (c & 0x3f) | 0x80);
2211 else if ( c <= 0xffff )
2213 Dynarr_add (dst, (c >> 12) | 0xe0);
2214 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2215 Dynarr_add (dst, (c & 0x3f) | 0x80);
2217 else if ( c <= 0x1fffff )
2219 Dynarr_add (dst, (c >> 18) | 0xf0);
2220 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2221 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2222 Dynarr_add (dst, (c & 0x3f) | 0x80);
2224 else if ( c <= 0x3ffffff )
2226 Dynarr_add (dst, (c >> 24) | 0xf8);
2227 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2228 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2229 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2230 Dynarr_add (dst, (c & 0x3f) | 0x80);
2234 Dynarr_add (dst, (c >> 30) | 0xfc);
2235 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2236 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2237 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2238 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2239 Dynarr_add (dst, (c & 0x3f) | 0x80);
2243 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2245 if (BYTE_ASCII_P (c)) \
2246 Dynarr_add (dst, c); \
2247 else if (BYTE_C1_P (c)) \
2249 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2250 Dynarr_add (dst, c + 0x20); \
2254 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2255 Dynarr_add (dst, c); \
2260 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2264 DECODE_ADD_BINARY_CHAR (ch, dst); \
2269 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2271 if (flags & CODING_STATE_END) \
2273 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2274 if (flags & CODING_STATE_CR) \
2275 Dynarr_add (dst, '\r'); \
2279 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2281 #define ER_BUF_SIZE 24
2283 struct decoding_stream
2285 /* Coding system that governs the conversion. */
2286 Lisp_Coding_System *codesys;
2288 /* Stream that we read the encoded data from or
2289 write the decoded data to. */
2292 /* If we are reading, then we can return only a fixed amount of
2293 data, so if the conversion resulted in too much data, we store it
2294 here for retrieval the next time around. */
2295 unsigned_char_dynarr *runoff;
2297 /* FLAGS holds flags indicating the current state of the decoding.
2298 Some of these flags are dependent on the coding system. */
2301 /* CPOS holds a partially built-up code-point of character. */
2304 /* EOL_TYPE specifies the type of end-of-line conversion that
2305 currently applies. We need to keep this separate from the
2306 EOL type stored in CODESYS because the latter might indicate
2307 automatic EOL-type detection while the former will always
2308 indicate a particular EOL type. */
2309 eol_type_t eol_type;
2311 /* Additional ISO2022 information. We define the structure above
2312 because it's also needed by the detection routines. */
2313 struct iso2022_decoder iso2022;
2315 /* Additional information (the state of the running CCL program)
2316 used by the CCL decoder. */
2317 struct ccl_program ccl;
2319 /* counter for UTF-8 or UCS-4 */
2320 unsigned char counter;
2324 unsigned char er_counter;
2325 unsigned char er_buf[ER_BUF_SIZE];
2327 unsigned combined_char_count;
2328 Emchar combined_chars[16];
2329 Lisp_Object combining_table;
2331 struct detection_state decst;
2334 static Lstream_data_count decoding_reader (Lstream *stream,
2335 unsigned char *data, Lstream_data_count size);
2336 static Lstream_data_count decoding_writer (Lstream *stream,
2337 const unsigned char *data, Lstream_data_count size);
2338 static int decoding_rewinder (Lstream *stream);
2339 static int decoding_seekable_p (Lstream *stream);
2340 static int decoding_flusher (Lstream *stream);
2341 static int decoding_closer (Lstream *stream);
2343 static Lisp_Object decoding_marker (Lisp_Object stream);
2345 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2346 sizeof (struct decoding_stream));
2349 decoding_marker (Lisp_Object stream)
2351 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2352 Lisp_Object str_obj;
2354 /* We do not need to mark the coding systems or charsets stored
2355 within the stream because they are stored in a global list
2356 and automatically marked. */
2358 XSETLSTREAM (str_obj, str);
2359 mark_object (str_obj);
2360 if (str->imp->marker)
2361 return (str->imp->marker) (str_obj);
2366 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2367 so we read data from the other end, decode it, and store it into DATA. */
2369 static Lstream_data_count
2370 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2372 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2373 unsigned char *orig_data = data;
2374 Lstream_data_count read_size;
2375 int error_occurred = 0;
2377 /* We need to interface to mule_decode(), which expects to take some
2378 amount of data and store the result into a Dynarr. We have
2379 mule_decode() store into str->runoff, and take data from there
2382 /* We loop until we have enough data, reading chunks from the other
2383 end and decoding it. */
2386 /* Take data from the runoff if we can. Make sure to take at
2387 most SIZE bytes, and delete the data from the runoff. */
2388 if (Dynarr_length (str->runoff) > 0)
2390 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2391 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2392 Dynarr_delete_many (str->runoff, 0, chunk);
2398 break; /* No more room for data */
2400 if (str->flags & CODING_STATE_END)
2401 /* This means that on the previous iteration, we hit the EOF on
2402 the other end. We loop once more so that mule_decode() can
2403 output any final stuff it may be holding, or any "go back
2404 to a sane state" escape sequences. (This latter makes sense
2405 during encoding.) */
2408 /* Exhausted the runoff, so get some more. DATA has at least
2409 SIZE bytes left of storage in it, so it's OK to read directly
2410 into it. (We'll be overwriting above, after we've decoded it
2411 into the runoff.) */
2412 read_size = Lstream_read (str->other_end, data, size);
2419 /* There might be some more end data produced in the translation.
2420 See the comment above. */
2421 str->flags |= CODING_STATE_END;
2422 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2425 if (data - orig_data == 0)
2426 return error_occurred ? -1 : 0;
2428 return data - orig_data;
2431 static Lstream_data_count
2432 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2434 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2435 Lstream_data_count retval;
2437 /* Decode all our data into the runoff, and then attempt to write
2438 it all out to the other end. Remove whatever chunk we succeeded
2440 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2441 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2442 Dynarr_length (str->runoff));
2444 Dynarr_delete_many (str->runoff, 0, retval);
2445 /* Do NOT return retval. The return value indicates how much
2446 of the incoming data was written, not how many bytes were
2452 reset_decoding_stream (struct decoding_stream *str)
2455 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2457 Lisp_Object coding_system;
2458 XSETCODING_SYSTEM (coding_system, str->codesys);
2459 reset_iso2022 (coding_system, &str->iso2022);
2461 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2463 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2469 str->er_counter = 0;
2470 str->combined_char_count = 0;
2471 str->combining_table = Qnil;
2473 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2474 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2477 str->decst.eol_type = EOL_AUTODETECT;
2478 str->decst.mask = ~0;
2480 str->flags = str->cpos = 0;
2484 decoding_rewinder (Lstream *stream)
2486 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2487 reset_decoding_stream (str);
2488 Dynarr_reset (str->runoff);
2489 return Lstream_rewind (str->other_end);
2493 decoding_seekable_p (Lstream *stream)
2495 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2496 return Lstream_seekable_p (str->other_end);
2500 decoding_flusher (Lstream *stream)
2502 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2503 return Lstream_flush (str->other_end);
2507 decoding_closer (Lstream *stream)
2509 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2510 if (stream->flags & LSTREAM_FL_WRITE)
2512 str->flags |= CODING_STATE_END;
2513 decoding_writer (stream, 0, 0);
2515 Dynarr_free (str->runoff);
2517 #ifdef ENABLE_COMPOSITE_CHARS
2518 if (str->iso2022.composite_chars)
2519 Dynarr_free (str->iso2022.composite_chars);
2522 return Lstream_close (str->other_end);
2526 decoding_stream_coding_system (Lstream *stream)
2528 Lisp_Object coding_system;
2529 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2531 XSETCODING_SYSTEM (coding_system, str->codesys);
2532 return subsidiary_coding_system (coding_system, str->eol_type);
2536 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2538 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2539 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2541 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2542 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2543 reset_decoding_stream (str);
2546 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2547 stream for writing, no automatic code detection will be performed.
2548 The reason for this is that automatic code detection requires a
2549 seekable input. Things will also fail if you open a decoding
2550 stream for reading using a non-fully-specified coding system and
2551 a non-seekable input stream. */
2554 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2557 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2558 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2562 str->other_end = stream;
2563 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2564 str->eol_type = EOL_AUTODETECT;
2565 if (!strcmp (mode, "r")
2566 && Lstream_seekable_p (stream))
2567 /* We can determine the coding system now. */
2568 determine_real_coding_system (stream, &codesys, &str->eol_type);
2569 set_decoding_stream_coding_system (lstr, codesys);
2570 str->decst.eol_type = str->eol_type;
2571 str->decst.mask = ~0;
2572 XSETLSTREAM (obj, lstr);
2577 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2579 return make_decoding_stream_1 (stream, codesys, "r");
2583 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2585 return make_decoding_stream_1 (stream, codesys, "w");
2588 /* Note: the decode_coding_* functions all take the same
2589 arguments as mule_decode(), which is to say some SRC data of
2590 size N, which is to be stored into dynamic array DST.
2591 DECODING is the stream within which the decoding is
2592 taking place, but no data is actually read from or
2593 written to that stream; that is handled in decoding_reader()
2594 or decoding_writer(). This allows the same functions to
2595 be used for both reading and writing. */
2598 mule_decode (Lstream *decoding, const Extbyte *src,
2599 unsigned_char_dynarr *dst, Lstream_data_count n)
2601 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2603 /* If necessary, do encoding-detection now. We do this when
2604 we're a writing stream or a non-seekable reading stream,
2605 meaning that we can't just process the whole input,
2606 rewind, and start over. */
2608 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2609 str->eol_type == EOL_AUTODETECT)
2611 Lisp_Object codesys;
2613 XSETCODING_SYSTEM (codesys, str->codesys);
2614 detect_coding_type (&str->decst, src, n,
2615 CODING_SYSTEM_TYPE (str->codesys) !=
2616 CODESYS_AUTODETECT);
2617 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2618 str->decst.mask != ~0)
2619 /* #### This is cheesy. What we really ought to do is
2620 buffer up a certain amount of data so as to get a
2621 less random result. */
2622 codesys = coding_system_from_mask (str->decst.mask);
2623 str->eol_type = str->decst.eol_type;
2624 if (XCODING_SYSTEM (codesys) != str->codesys)
2626 /* Preserve the CODING_STATE_END flag in case it was set.
2627 If we erase it, bad things might happen. */
2628 int was_end = str->flags & CODING_STATE_END;
2629 set_decoding_stream_coding_system (decoding, codesys);
2631 str->flags |= CODING_STATE_END;
2635 switch (CODING_SYSTEM_TYPE (str->codesys))
2638 case CODESYS_INTERNAL:
2639 Dynarr_add_many (dst, src, n);
2642 case CODESYS_AUTODETECT:
2643 /* If we got this far and still haven't decided on the coding
2644 system, then do no conversion. */
2645 case CODESYS_NO_CONVERSION:
2646 decode_coding_no_conversion (decoding, src, dst, n);
2649 case CODESYS_SHIFT_JIS:
2650 decode_coding_sjis (decoding, src, dst, n);
2653 decode_coding_big5 (decoding, src, dst, n);
2656 decode_coding_ucs4 (decoding, src, dst, n);
2659 decode_coding_utf16 (decoding, src, dst, n);
2662 decode_coding_utf8 (decoding, src, dst, n);
2665 str->ccl.last_block = str->flags & CODING_STATE_END;
2666 /* When applying ccl program to stream, MUST NOT set NULL
2668 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2669 dst, n, 0, CCL_MODE_DECODING);
2671 case CODESYS_ISO2022:
2672 decode_coding_iso2022 (decoding, src, dst, n);
2680 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2681 Decode the text between START and END which is encoded in CODING-SYSTEM.
2682 This is useful if you've read in encoded text from a file without decoding
2683 it (e.g. you read in a JIS-formatted file but used the `binary' or
2684 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2685 Return length of decoded text.
2686 BUFFER defaults to the current buffer if unspecified.
2688 (start, end, coding_system, buffer))
2691 struct buffer *buf = decode_buffer (buffer, 0);
2692 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2693 Lstream *istr, *ostr;
2694 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2696 get_buffer_range_char (buf, start, end, &b, &e, 0);
2698 barf_if_buffer_read_only (buf, b, e);
2700 coding_system = Fget_coding_system (coding_system);
2701 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2702 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2703 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2705 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2706 Fget_coding_system (Qbinary));
2707 istr = XLSTREAM (instream);
2708 ostr = XLSTREAM (outstream);
2709 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2711 /* The chain of streams looks like this:
2713 [BUFFER] <----- send through
2714 ------> [ENCODE AS BINARY]
2715 ------> [DECODE AS SPECIFIED]
2721 char tempbuf[1024]; /* some random amount */
2722 Bufpos newpos, even_newer_pos;
2723 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2724 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2728 newpos = lisp_buffer_stream_startpos (istr);
2729 Lstream_write (ostr, tempbuf, size_in_bytes);
2730 even_newer_pos = lisp_buffer_stream_startpos (istr);
2731 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2734 Lstream_close (istr);
2735 Lstream_close (ostr);
2737 Lstream_delete (istr);
2738 Lstream_delete (ostr);
2739 Lstream_delete (XLSTREAM (de_outstream));
2740 Lstream_delete (XLSTREAM (lb_outstream));
2745 /************************************************************************/
2746 /* Converting to an external encoding ("encoding") */
2747 /************************************************************************/
2749 /* An encoding stream is an output stream. When you create the
2750 stream, you specify the coding system that governs the encoding
2751 and another stream that the resulting encoded data is to be
2752 sent to, and then start sending data to it. */
2754 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2756 struct encoding_stream
2758 /* Coding system that governs the conversion. */
2759 Lisp_Coding_System *codesys;
2761 /* Stream that we read the encoded data from or
2762 write the decoded data to. */
2765 /* If we are reading, then we can return only a fixed amount of
2766 data, so if the conversion resulted in too much data, we store it
2767 here for retrieval the next time around. */
2768 unsigned_char_dynarr *runoff;
2770 /* FLAGS holds flags indicating the current state of the encoding.
2771 Some of these flags are dependent on the coding system. */
2774 /* CH holds a partially built-up character. Since we only deal
2775 with one- and two-byte characters at the moment, we only use
2776 this to store the first byte of a two-byte character. */
2779 /* Additional information used by the ISO2022 encoder. */
2782 /* CHARSET holds the character sets currently assigned to the G0
2783 through G3 registers. It is initialized from the array
2784 INITIAL_CHARSET in CODESYS. */
2785 Lisp_Object charset[4];
2787 /* Which registers are currently invoked into the left (GL) and
2788 right (GR) halves of the 8-bit encoding space? */
2789 int register_left, register_right;
2791 /* Whether we need to explicitly designate the charset in the
2792 G? register before using it. It is initialized from the
2793 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2794 unsigned char force_charset_on_output[4];
2796 /* Other state variables that need to be preserved across
2798 Lisp_Object current_charset;
2800 int current_char_boundary;
2803 void (*encode_char) (struct encoding_stream *str, Emchar c,
2804 unsigned_char_dynarr *dst, unsigned int *flags);
2805 void (*finish) (struct encoding_stream *str,
2806 unsigned_char_dynarr *dst, unsigned int *flags);
2808 /* Additional information (the state of the running CCL program)
2809 used by the CCL encoder. */
2810 struct ccl_program ccl;
2814 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2815 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2816 Lstream_data_count size);
2817 static int encoding_rewinder (Lstream *stream);
2818 static int encoding_seekable_p (Lstream *stream);
2819 static int encoding_flusher (Lstream *stream);
2820 static int encoding_closer (Lstream *stream);
2822 static Lisp_Object encoding_marker (Lisp_Object stream);
2824 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2825 sizeof (struct encoding_stream));
2828 encoding_marker (Lisp_Object stream)
2830 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2831 Lisp_Object str_obj;
2833 /* We do not need to mark the coding systems or charsets stored
2834 within the stream because they are stored in a global list
2835 and automatically marked. */
2837 XSETLSTREAM (str_obj, str);
2838 mark_object (str_obj);
2839 if (str->imp->marker)
2840 return (str->imp->marker) (str_obj);
2845 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2846 so we read data from the other end, encode it, and store it into DATA. */
2848 static Lstream_data_count
2849 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2851 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2852 unsigned char *orig_data = data;
2853 Lstream_data_count read_size;
2854 int error_occurred = 0;
2856 /* We need to interface to mule_encode(), which expects to take some
2857 amount of data and store the result into a Dynarr. We have
2858 mule_encode() store into str->runoff, and take data from there
2861 /* We loop until we have enough data, reading chunks from the other
2862 end and encoding it. */
2865 /* Take data from the runoff if we can. Make sure to take at
2866 most SIZE bytes, and delete the data from the runoff. */
2867 if (Dynarr_length (str->runoff) > 0)
2869 int chunk = min ((int) size, Dynarr_length (str->runoff));
2870 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2871 Dynarr_delete_many (str->runoff, 0, chunk);
2877 break; /* No more room for data */
2879 if (str->flags & CODING_STATE_END)
2880 /* This means that on the previous iteration, we hit the EOF on
2881 the other end. We loop once more so that mule_encode() can
2882 output any final stuff it may be holding, or any "go back
2883 to a sane state" escape sequences. (This latter makes sense
2884 during encoding.) */
2887 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2888 left of storage in it, so it's OK to read directly into it.
2889 (We'll be overwriting above, after we've encoded it into the
2891 read_size = Lstream_read (str->other_end, data, size);
2898 /* There might be some more end data produced in the translation.
2899 See the comment above. */
2900 str->flags |= CODING_STATE_END;
2901 mule_encode (stream, data, str->runoff, read_size);
2904 if (data == orig_data)
2905 return error_occurred ? -1 : 0;
2907 return data - orig_data;
2910 static Lstream_data_count
2911 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2913 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2914 Lstream_data_count retval;
2916 /* Encode all our data into the runoff, and then attempt to write
2917 it all out to the other end. Remove whatever chunk we succeeded
2919 mule_encode (stream, data, str->runoff, size);
2920 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2921 Dynarr_length (str->runoff));
2923 Dynarr_delete_many (str->runoff, 0, retval);
2924 /* Do NOT return retval. The return value indicates how much
2925 of the incoming data was written, not how many bytes were
2931 reset_encoding_stream (struct encoding_stream *str)
2934 switch (CODING_SYSTEM_TYPE (str->codesys))
2936 case CODESYS_ISO2022:
2940 str->encode_char = &char_encode_iso2022;
2941 str->finish = &char_finish_iso2022;
2942 for (i = 0; i < 4; i++)
2944 str->iso2022.charset[i] =
2945 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2946 str->iso2022.force_charset_on_output[i] =
2947 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2949 str->iso2022.register_left = 0;
2950 str->iso2022.register_right = 1;
2951 str->iso2022.current_charset = Qnil;
2952 str->iso2022.current_half = 0;
2956 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2959 str->encode_char = &char_encode_utf8;
2960 str->finish = &char_finish_utf8;
2963 str->encode_char = &char_encode_utf16;
2964 str->finish = &char_finish_utf16;
2967 str->encode_char = &char_encode_ucs4;
2968 str->finish = &char_finish_ucs4;
2970 case CODESYS_SHIFT_JIS:
2971 str->encode_char = &char_encode_shift_jis;
2972 str->finish = &char_finish_shift_jis;
2975 str->encode_char = &char_encode_big5;
2976 str->finish = &char_finish_big5;
2982 str->iso2022.current_char_boundary = 0;
2983 str->flags = str->ch = 0;
2987 encoding_rewinder (Lstream *stream)
2989 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2990 reset_encoding_stream (str);
2991 Dynarr_reset (str->runoff);
2992 return Lstream_rewind (str->other_end);
2996 encoding_seekable_p (Lstream *stream)
2998 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2999 return Lstream_seekable_p (str->other_end);
3003 encoding_flusher (Lstream *stream)
3005 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3006 return Lstream_flush (str->other_end);
3010 encoding_closer (Lstream *stream)
3012 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3013 if (stream->flags & LSTREAM_FL_WRITE)
3015 str->flags |= CODING_STATE_END;
3016 encoding_writer (stream, 0, 0);
3018 Dynarr_free (str->runoff);
3019 return Lstream_close (str->other_end);
3023 encoding_stream_coding_system (Lstream *stream)
3025 Lisp_Object coding_system;
3026 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3028 XSETCODING_SYSTEM (coding_system, str->codesys);
3029 return coding_system;
3033 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3035 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3036 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3038 reset_encoding_stream (str);
3042 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3045 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3046 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3050 str->runoff = Dynarr_new (unsigned_char);
3051 str->other_end = stream;
3052 set_encoding_stream_coding_system (lstr, codesys);
3053 XSETLSTREAM (obj, lstr);
3058 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3060 return make_encoding_stream_1 (stream, codesys, "r");
3064 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3066 return make_encoding_stream_1 (stream, codesys, "w");
3069 /* Convert N bytes of internally-formatted data stored in SRC to an
3070 external format, according to the encoding stream ENCODING.
3071 Store the encoded data into DST. */
3074 mule_encode (Lstream *encoding, const Bufbyte *src,
3075 unsigned_char_dynarr *dst, Lstream_data_count n)
3077 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3079 switch (CODING_SYSTEM_TYPE (str->codesys))
3082 case CODESYS_INTERNAL:
3083 Dynarr_add_many (dst, src, n);
3086 case CODESYS_AUTODETECT:
3087 /* If we got this far and still haven't decided on the coding
3088 system, then do no conversion. */
3089 case CODESYS_NO_CONVERSION:
3090 encode_coding_no_conversion (encoding, src, dst, n);
3094 str->ccl.last_block = str->flags & CODING_STATE_END;
3095 /* When applying ccl program to stream, MUST NOT set NULL
3097 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3098 dst, n, 0, CCL_MODE_ENCODING);
3102 text_encode_generic (encoding, src, dst, n);
3106 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3107 Encode the text between START and END using CODING-SYSTEM.
3108 This will, for example, convert Japanese characters into stuff such as
3109 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3110 text. BUFFER defaults to the current buffer if unspecified.
3112 (start, end, coding_system, buffer))
3115 struct buffer *buf = decode_buffer (buffer, 0);
3116 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3117 Lstream *istr, *ostr;
3118 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3120 get_buffer_range_char (buf, start, end, &b, &e, 0);
3122 barf_if_buffer_read_only (buf, b, e);
3124 coding_system = Fget_coding_system (coding_system);
3125 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3126 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3127 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3128 Fget_coding_system (Qbinary));
3129 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3131 istr = XLSTREAM (instream);
3132 ostr = XLSTREAM (outstream);
3133 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3134 /* The chain of streams looks like this:
3136 [BUFFER] <----- send through
3137 ------> [ENCODE AS SPECIFIED]
3138 ------> [DECODE AS BINARY]
3143 char tempbuf[1024]; /* some random amount */
3144 Bufpos newpos, even_newer_pos;
3145 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3146 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3150 newpos = lisp_buffer_stream_startpos (istr);
3151 Lstream_write (ostr, tempbuf, size_in_bytes);
3152 even_newer_pos = lisp_buffer_stream_startpos (istr);
3153 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3159 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3160 Lstream_close (istr);
3161 Lstream_close (ostr);
3163 Lstream_delete (istr);
3164 Lstream_delete (ostr);
3165 Lstream_delete (XLSTREAM (de_outstream));
3166 Lstream_delete (XLSTREAM (lb_outstream));
3167 return make_int (retlen);
3174 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3175 unsigned_char_dynarr *dst, Lstream_data_count n)
3178 unsigned char char_boundary;
3179 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3180 unsigned int flags = str->flags;
3181 Emchar ch = str->ch;
3183 char_boundary = str->iso2022.current_char_boundary;
3189 if (char_boundary == 0)
3217 (*str->encode_char) (str, c, dst, &flags);
3219 else if (char_boundary == 1)
3221 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3227 ch = (ch << 6) | (c & 0x3f);
3232 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3234 (*str->finish) (str, dst, &flags);
3239 str->iso2022.current_char_boundary = char_boundary;
3244 /************************************************************************/
3245 /* entity reference */
3246 /************************************************************************/
3249 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3251 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3253 if ( str->er_counter > 0)
3255 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3256 str->er_counter = 0;
3260 EXFUN (Fregexp_quote, 1);
3262 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3263 unsigned_char_dynarr* dst);
3265 decode_add_er_char (struct decoding_stream *str, Emchar c,
3266 unsigned_char_dynarr* dst)
3268 if (str->er_counter == 0)
3270 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3273 str->er_buf[0] = '&';
3277 DECODE_ADD_UCS_CHAR (c, dst);
3281 Lisp_Object string = make_string (str->er_buf,
3288 Lisp_Object char_type;
3291 for ( rest = Vcoded_charset_entity_reference_alist;
3292 !NILP (rest); rest = Fcdr (rest) )
3298 char_type = XCDR (ccs);
3303 if (NILP (ccs = Ffind_charset (ccs)))
3312 pat = Fregexp_quote (pat);
3319 pat = concat3 (build_string ("^&"),
3320 pat, build_string ("\\([0-9]+\\)$"));
3323 else if (EQ (ret, Qx))
3325 pat = concat3 (build_string ("^&"),
3326 pat, build_string ("\\([0-9a-f]+\\)$"));
3329 else if (EQ (ret, QX))
3331 pat = concat3 (build_string ("^&"),
3332 pat, build_string ("\\([0-9A-F]+\\)$"));
3338 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3341 = XINT (Fstring_to_number
3342 (Fsubstring (string,
3343 Fmatch_beginning (make_int (1)),
3344 Fmatch_end (make_int (1))),
3348 ? DECODE_CHAR (ccs, code, 0)
3349 : decode_builtin_char (ccs, code);
3352 DECODE_ADD_UCS_CHAR (chr, dst);
3355 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3356 Dynarr_add (dst, ';');
3362 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3363 string, Qnil, Qnil)))
3366 = XUINT (Fstring_to_number
3367 (Fsubstring (string,
3368 Fmatch_beginning (make_int (1)),
3369 Fmatch_end (make_int (1))),
3372 DECODE_ADD_UCS_CHAR (code, dst);
3376 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3377 Dynarr_add (dst, ';');
3380 str->er_counter = 0;
3382 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3384 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3385 str->er_counter = 0;
3386 DECODE_ADD_UCS_CHAR (c, dst);
3389 str->er_buf[str->er_counter++] = c;
3392 void char_encode_as_entity_reference (Emchar ch, char* buf);
3394 char_encode_as_entity_reference (Emchar ch, char* buf)
3396 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3399 Lisp_Object char_type;
3400 int format_columns, idx;
3401 char format[ER_BUF_SIZE];
3403 while (!NILP (rest))
3409 char_type = XCDR (ccs);
3414 if (!NILP (ccs = Ffind_charset (ccs)))
3416 int code_point = charset_code_point (ccs, ch, 0);
3418 if ( (code_point >= 0)
3419 && (NILP (char_type)
3420 || DECODE_CHAR (ccs, code_point, 0) != ch) )
3426 if ( STRINGP (ret) &&
3427 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3430 strncpy (&format[1], XSTRING_DATA (ret), idx);
3440 format[idx++] = '%';
3441 format_columns = XINT (ret);
3442 if ( (2 <= format_columns) && (format_columns <= 8)
3443 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3445 format [idx++] = '0';
3446 format [idx++] = '0' + format_columns;
3455 format [idx++] = 'd';
3456 else if (EQ (ret, Qx))
3457 format [idx++] = 'x';
3458 else if (EQ (ret, QX))
3459 format [idx++] = 'X';
3462 format [idx++] = ';';
3465 sprintf (buf, format, code_point);
3472 sprintf (buf, "&MCS-%08X;", ch);
3476 /************************************************************************/
3477 /* character composition */
3478 /************************************************************************/
3479 extern Lisp_Object Qcomposition;
3482 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3484 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3488 for (i = 0; i < str->combined_char_count; i++)
3489 decode_add_er_char (str, str->combined_chars[i], dst);
3490 str->combined_char_count = 0;
3491 str->combining_table = Qnil;
3494 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3495 unsigned_char_dynarr* dst);
3497 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3498 Emchar character, unsigned_char_dynarr* dst)
3500 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3501 decode_add_er_char (str, character, dst);
3502 else if (!CONSP (str->combining_table))
3505 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3509 decode_add_er_char (str, character, dst);
3512 str->combined_chars[0] = character;
3513 str->combined_char_count = 1;
3514 str->combining_table = ret;
3520 = Fcdr (Fassq (make_char (character), str->combining_table));
3524 Emchar char2 = XCHARVAL (ret);
3525 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3530 decode_add_er_char (str, char2, dst);
3531 str->combined_char_count = 0;
3532 str->combining_table = Qnil;
3536 str->combined_chars[0] = char2;
3537 str->combined_char_count = 1;
3538 str->combining_table = ret2;
3543 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3546 COMPOSE_FLUSH_CHARS (str, dst);
3548 decode_add_er_char (str, character, dst);
3551 str->combined_chars[0] = character;
3552 str->combined_char_count = 1;
3553 str->combining_table = ret;
3558 #else /* not UTF2000 */
3559 #define COMPOSE_FLUSH_CHARS(str, dst)
3560 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3561 #endif /* UTF2000 */
3564 /************************************************************************/
3565 /* Shift-JIS methods */
3566 /************************************************************************/
3568 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3569 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3570 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3571 encoded by "position-code + 0x80". A character of JISX0208
3572 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3573 position-codes are divided and shifted so that it fit in the range
3576 --- CODE RANGE of Shift-JIS ---
3577 (character set) (range)
3579 JISX0201-Kana 0xA0 .. 0xDF
3580 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3581 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3582 -------------------------------
3586 /* Is this the first byte of a Shift-JIS two-byte char? */
3588 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3589 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3591 /* Is this the second byte of a Shift-JIS two-byte char? */
3593 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3594 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3596 #define BYTE_SJIS_KATAKANA_P(c) \
3597 ((c) >= 0xA1 && (c) <= 0xDF)
3600 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3604 unsigned char c = *(unsigned char *)src++;
3605 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3607 if (st->shift_jis.in_second_byte)
3609 st->shift_jis.in_second_byte = 0;
3613 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3614 st->shift_jis.in_second_byte = 1;
3616 return CODING_CATEGORY_SHIFT_JIS_MASK;
3619 /* Convert Shift-JIS data to internal format. */
3622 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3623 unsigned_char_dynarr *dst, Lstream_data_count n)
3625 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3626 unsigned int flags = str->flags;
3627 unsigned int cpos = str->cpos;
3628 eol_type_t eol_type = str->eol_type;
3632 unsigned char c = *(unsigned char *)src++;
3636 /* Previous character was first byte of Shift-JIS Kanji char. */
3637 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3639 unsigned char e1, e2;
3641 DECODE_SJIS (cpos, c, e1, e2);
3643 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3647 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3648 Dynarr_add (dst, e1);
3649 Dynarr_add (dst, e2);
3654 DECODE_ADD_BINARY_CHAR (cpos, dst);
3655 DECODE_ADD_BINARY_CHAR (c, dst);
3661 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3662 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3664 else if (BYTE_SJIS_KATAKANA_P (c))
3667 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3670 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3671 Dynarr_add (dst, c);
3676 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3680 DECODE_ADD_BINARY_CHAR (c, dst);
3682 label_continue_loop:;
3685 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3691 /* Convert internal character representation to Shift_JIS. */
3694 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3695 unsigned_char_dynarr *dst, unsigned int *flags)
3697 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3701 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3702 Dynarr_add (dst, '\r');
3703 if (eol_type != EOL_CR)
3704 Dynarr_add (dst, ch);
3708 unsigned int s1, s2;
3710 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3712 if (code_point >= 0)
3713 Dynarr_add (dst, code_point);
3714 else if ((code_point
3715 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3718 ENCODE_SJIS ((code_point >> 8) | 0x80,
3719 (code_point & 0xFF) | 0x80, s1, s2);
3720 Dynarr_add (dst, s1);
3721 Dynarr_add (dst, s2);
3723 else if ((code_point
3724 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3726 Dynarr_add (dst, code_point | 0x80);
3727 else if ((code_point
3728 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3731 ENCODE_SJIS ((code_point >> 8) | 0x80,
3732 (code_point & 0xFF) | 0x80, s1, s2);
3733 Dynarr_add (dst, s1);
3734 Dynarr_add (dst, s2);
3736 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3738 Dynarr_add (dst, code_point);
3740 Dynarr_add (dst, '?');
3742 Lisp_Object charset;
3743 unsigned int c1, c2;
3745 BREAKUP_CHAR (ch, charset, c1, c2);
3747 if (EQ(charset, Vcharset_katakana_jisx0201))
3749 Dynarr_add (dst, c1 | 0x80);
3753 Dynarr_add (dst, c1);
3755 else if (EQ(charset, Vcharset_japanese_jisx0208))
3757 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3758 Dynarr_add (dst, s1);
3759 Dynarr_add (dst, s2);
3762 Dynarr_add (dst, '?');
3768 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3769 unsigned int *flags)
3773 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3774 Decode a JISX0208 character of Shift-JIS coding-system.
3775 CODE is the character code in Shift-JIS as a cons of type bytes.
3776 Return the corresponding character.
3780 unsigned char c1, c2, s1, s2;
3783 CHECK_INT (XCAR (code));
3784 CHECK_INT (XCDR (code));
3785 s1 = XINT (XCAR (code));
3786 s2 = XINT (XCDR (code));
3787 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3788 BYTE_SJIS_TWO_BYTE_2_P (s2))
3790 DECODE_SJIS (s1, s2, c1, c2);
3791 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3792 c1 & 0x7F, c2 & 0x7F));
3798 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3799 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3800 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3804 Lisp_Object charset;
3807 CHECK_CHAR_COERCE_INT (character);
3808 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3809 if (EQ (charset, Vcharset_japanese_jisx0208))
3811 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3812 return Fcons (make_int (s1), make_int (s2));
3819 /************************************************************************/
3821 /************************************************************************/
3823 /* BIG5 is a coding system encoding two character sets: ASCII and
3824 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3825 character set and is encoded in two-byte.
3827 --- CODE RANGE of BIG5 ---
3828 (character set) (range)
3830 Big5 (1st byte) 0xA1 .. 0xFE
3831 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3832 --------------------------
3834 Since the number of characters in Big5 is larger than maximum
3835 characters in Emacs' charset (96x96), it can't be handled as one
3836 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3837 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3838 contains frequently used characters and the latter contains less
3839 frequently used characters. */
3842 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3843 ((c) >= 0x81 && (c) <= 0xFE)
3845 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3846 ((c) >= 0xA1 && (c) <= 0xFE)
3849 /* Is this the second byte of a Shift-JIS two-byte char? */
3851 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3852 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3854 /* Number of Big5 characters which have the same code in 1st byte. */
3856 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3858 /* Code conversion macros. These are macros because they are used in
3859 inner loops during code conversion.
3861 Note that temporary variables in macros introduce the classic
3862 dynamic-scoping problems with variable names. We use capital-
3863 lettered variables in the assumption that XEmacs does not use
3864 capital letters in variables except in a very formalized way
3867 /* Convert Big5 code (b1, b2) into its internal string representation
3870 /* There is a much simpler way to split the Big5 charset into two.
3871 For the moment I'm going to leave the algorithm as-is because it
3872 claims to separate out the most-used characters into a single
3873 charset, which perhaps will lead to optimizations in various
3876 The way the algorithm works is something like this:
3878 Big5 can be viewed as a 94x157 charset, where the row is
3879 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3880 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3881 the split between low and high column numbers is apparently
3882 meaningless; ascending rows produce less and less frequent chars.
3883 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3884 the first charset, and the upper half (0xC9 .. 0xFE) to the
3885 second. To do the conversion, we convert the character into
3886 a single number where 0 .. 156 is the first row, 157 .. 313
3887 is the second, etc. That way, the characters are ordered by
3888 decreasing frequency. Then we just chop the space in two
3889 and coerce the result into a 94x94 space.
3892 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3894 int B1 = b1, B2 = b2; \
3896 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3900 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3904 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3905 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3907 c1 = I / (0xFF - 0xA1) + 0xA1; \
3908 c2 = I % (0xFF - 0xA1) + 0xA1; \
3911 /* Convert the internal string representation of a Big5 character
3912 (lb, c1, c2) into Big5 code (b1, b2). */
3914 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3916 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3918 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3920 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3922 b1 = I / BIG5_SAME_ROW + 0xA1; \
3923 b2 = I % BIG5_SAME_ROW; \
3924 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3928 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3932 unsigned char c = *(unsigned char *)src++;
3933 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3935 || (c >= 0x80 && c <= 0xA0)
3939 if (st->big5.in_second_byte)
3941 st->big5.in_second_byte = 0;
3942 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3952 st->big5.in_second_byte = 1;
3954 return CODING_CATEGORY_BIG5_MASK;
3957 /* Convert Big5 data to internal format. */
3960 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3961 unsigned_char_dynarr *dst, Lstream_data_count n)
3963 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3964 unsigned int flags = str->flags;
3965 unsigned int cpos = str->cpos;
3966 eol_type_t eol_type = str->eol_type;
3969 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3970 (decoding)->codesys, 1);
3975 unsigned char c = *(unsigned char *)src++;
3978 /* Previous character was first byte of Big5 char. */
3979 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3982 int code_point = (cpos << 8) | c;
3983 Emchar char_id = decode_defined_char (ccs, code_point, 0);
3987 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3988 DECODE_ADD_UCS_CHAR (char_id, dst);
3990 unsigned char b1, b2, b3;
3991 DECODE_BIG5 (cpos, c, b1, b2, b3);
3992 Dynarr_add (dst, b1);
3993 Dynarr_add (dst, b2);
3994 Dynarr_add (dst, b3);
3999 DECODE_ADD_BINARY_CHAR (cpos, dst);
4000 DECODE_ADD_BINARY_CHAR (c, dst);
4006 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4007 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4009 decode_flush_er_chars (str, dst);
4014 decode_flush_er_chars (str, dst);
4015 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4016 DECODE_ADD_BINARY_CHAR (c, dst);
4020 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4021 decode_add_er_char (str, c, dst);
4024 label_continue_loop:;
4027 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4028 if (flags & CODING_STATE_END)
4030 decode_flush_er_chars (str, dst);
4031 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4032 if (flags & CODING_STATE_CR)
4033 Dynarr_add (dst, '\r');
4040 /* Convert internally-formatted data to Big5. */
4043 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4044 unsigned_char_dynarr *dst, unsigned int *flags)
4046 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4050 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4051 Dynarr_add (dst, '\r');
4052 if (eol_type != EOL_CR)
4053 Dynarr_add (dst, ch);
4060 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4062 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4063 Dynarr_add (dst, code_point);
4064 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4066 Dynarr_add (dst, code_point >> 8);
4067 Dynarr_add (dst, code_point & 0xFF);
4069 else if ((code_point
4070 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4072 Dynarr_add (dst, code_point >> 8);
4073 Dynarr_add (dst, code_point & 0xFF);
4075 else if ((code_point
4076 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4079 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4080 + ((code_point & 0xFF) - 33);
4081 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4082 unsigned char b2 = I % BIG5_SAME_ROW;
4084 b2 += b2 < 0x3F ? 0x40 : 0x62;
4085 Dynarr_add (dst, b1);
4086 Dynarr_add (dst, b2);
4088 else if ((code_point
4089 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4092 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4093 + ((code_point & 0xFF) - 33);
4094 unsigned char b1, b2;
4096 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4097 b1 = I / BIG5_SAME_ROW + 0xA1;
4098 b2 = I % BIG5_SAME_ROW;
4099 b2 += b2 < 0x3F ? 0x40 : 0x62;
4100 Dynarr_add (dst, b1);
4101 Dynarr_add (dst, b2);
4103 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4107 char_encode_as_entity_reference (ch, buf);
4108 Dynarr_add_many (dst, buf, strlen (buf));
4111 Dynarr_add (dst, '?');
4118 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4119 unsigned int *flags)
4124 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4125 Decode a Big5 character CODE of BIG5 coding-system.
4126 CODE is the character code in BIG5, a cons of two integers.
4127 Return the corresponding character.
4131 unsigned char c1, c2, b1, b2;
4134 CHECK_INT (XCAR (code));
4135 CHECK_INT (XCDR (code));
4136 b1 = XINT (XCAR (code));
4137 b2 = XINT (XCDR (code));
4138 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4139 BYTE_BIG5_TWO_BYTE_2_P (b2))
4141 Charset_ID leading_byte;
4142 Lisp_Object charset;
4143 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4144 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4145 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4151 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4152 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4153 Return the corresponding character code in Big5.
4157 Lisp_Object charset;
4160 CHECK_CHAR_COERCE_INT (character);
4161 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4162 if (EQ (charset, Vcharset_chinese_big5_1) ||
4163 EQ (charset, Vcharset_chinese_big5_2))
4165 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4167 return Fcons (make_int (b1), make_int (b2));
4174 /************************************************************************/
4176 /************************************************************************/
4179 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4183 unsigned char c = *(unsigned char *)src++;
4184 switch (st->ucs4.in_byte)
4193 st->ucs4.in_byte = 0;
4199 return CODING_CATEGORY_UCS4_MASK;
4203 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4204 unsigned_char_dynarr *dst, Lstream_data_count n)
4206 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4207 unsigned int flags = str->flags;
4208 unsigned int cpos = str->cpos;
4209 unsigned char counter = str->counter;
4213 unsigned char c = *(unsigned char *)src++;
4221 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4226 cpos = ( cpos << 8 ) | c;
4230 if (counter & CODING_STATE_END)
4231 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4235 str->counter = counter;
4239 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4240 unsigned_char_dynarr *dst, unsigned int *flags)
4242 Dynarr_add (dst, ch >> 24);
4243 Dynarr_add (dst, ch >> 16);
4244 Dynarr_add (dst, ch >> 8);
4245 Dynarr_add (dst, ch );
4249 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4250 unsigned int *flags)
4255 /************************************************************************/
4256 /* UTF-16 methods */
4257 /************************************************************************/
4260 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4262 return CODING_CATEGORY_UTF16_MASK;
4266 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4267 unsigned_char_dynarr *dst, Lstream_data_count n)
4269 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4270 unsigned int flags = str->flags;
4271 unsigned int cpos = str->cpos;
4272 unsigned char counter = str->counter & 3;
4273 unsigned char byte_order = str->counter >> 2;
4274 eol_type_t eol_type = str->eol_type;
4278 unsigned char c = *(unsigned char *)src++;
4284 else if (counter == 1)
4288 if (byte_order == 0)
4289 code = (c << 8) | cpos;
4291 code = (cpos << 8) | c;
4294 code = ((code & 0xFF) << 8) | (code >> 8);
4295 if ( byte_order == 0 )
4300 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4311 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4312 DECODE_ADD_UCS_CHAR (code, dst);
4316 else if (counter == 2)
4318 cpos = (cpos << 8) | c;
4326 ? (c << 8) | (cpos & 0xFF)
4327 : ((cpos & 0xFF) << 8) | c;
4329 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4334 label_continue_loop:;
4336 if (counter & CODING_STATE_END)
4337 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4341 str->counter = (byte_order << 2) | counter;
4345 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4346 unsigned_char_dynarr *dst, unsigned int *flags)
4350 Dynarr_add (dst, ch);
4351 Dynarr_add (dst, ch >> 8);
4355 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4356 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4358 Dynarr_add (dst, y);
4359 Dynarr_add (dst, y >> 8);
4360 Dynarr_add (dst, z);
4361 Dynarr_add (dst, z >> 8);
4366 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4367 unsigned int *flags)
4372 /************************************************************************/
4374 /************************************************************************/
4377 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4381 unsigned char c = *(unsigned char *)src++;
4382 switch (st->utf8.in_byte)
4385 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4388 st->utf8.in_byte = 5;
4390 st->utf8.in_byte = 4;
4392 st->utf8.in_byte = 3;
4394 st->utf8.in_byte = 2;
4396 st->utf8.in_byte = 1;
4401 if ((c & 0xc0) != 0x80)
4407 return CODING_CATEGORY_UTF8_MASK;
4411 decode_output_utf8_partial_char (unsigned char counter,
4413 unsigned_char_dynarr *dst)
4416 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4417 else if (counter == 4)
4419 if (cpos < (1 << 6))
4420 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4423 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4424 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4427 else if (counter == 3)
4429 if (cpos < (1 << 6))
4430 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4431 else if (cpos < (1 << 12))
4433 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4434 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4438 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4439 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4440 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4443 else if (counter == 2)
4445 if (cpos < (1 << 6))
4446 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4447 else if (cpos < (1 << 12))
4449 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4450 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4452 else if (cpos < (1 << 18))
4454 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4455 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4456 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4460 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4461 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4462 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4463 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4468 if (cpos < (1 << 6))
4469 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4470 else if (cpos < (1 << 12))
4472 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4473 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4475 else if (cpos < (1 << 18))
4477 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4478 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4479 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4481 else if (cpos < (1 << 24))
4483 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4484 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4485 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4486 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4490 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4491 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4492 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4493 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4494 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4500 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4501 unsigned_char_dynarr *dst, Lstream_data_count n)
4503 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4504 unsigned int flags = str->flags;
4505 unsigned int cpos = str->cpos;
4506 eol_type_t eol_type = str->eol_type;
4507 unsigned char counter = str->counter;
4509 int bom_flag = str->bom_flag;
4511 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4512 (decoding)->codesys, 0);
4517 unsigned char c = *(unsigned char *)src++;
4522 COMPOSE_FLUSH_CHARS (str, dst);
4523 decode_flush_er_chars (str, dst);
4524 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4526 if ( bom_flag == 0 )
4529 DECODE_ADD_UCS_CHAR (c, dst);
4531 else if ( c < 0xC0 )
4533 if ( bom_flag == 0 )
4536 /* decode_add_er_char (str, c, dst); */
4537 COMPOSE_ADD_CHAR (str, c, dst);
4541 /* decode_flush_er_chars (str, dst); */
4547 else if ( c < 0xF0 )
4552 else if ( c < 0xF8 )
4557 else if ( c < 0xFC )
4569 else if ( (c & 0xC0) == 0x80 )
4571 cpos = ( cpos << 6 ) | ( c & 0x3f );
4576 if ( bom_flag == 0 )
4578 if ( cpos == 0xFEFF )
4589 char_id = decode_defined_char (ccs, cpos, 0);
4596 COMPOSE_ADD_CHAR (str, char_id, dst);
4606 COMPOSE_FLUSH_CHARS (str, dst);
4607 decode_flush_er_chars (str, dst);
4608 decode_output_utf8_partial_char (counter, cpos, dst);
4609 DECODE_ADD_BINARY_CHAR (c, dst);
4613 label_continue_loop:;
4616 if (flags & CODING_STATE_END)
4618 COMPOSE_FLUSH_CHARS (str, dst);
4619 decode_flush_er_chars (str, dst);
4622 decode_output_utf8_partial_char (counter, cpos, dst);
4629 str->counter = counter;
4631 str->bom_flag = bom_flag;
4636 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4637 unsigned_char_dynarr *dst, unsigned int *flags)
4639 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4643 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4644 Dynarr_add (dst, '\r');
4645 if (eol_type != EOL_CR)
4646 Dynarr_add (dst, ch);
4648 else if (ch <= 0x7f)
4650 Dynarr_add (dst, ch);
4655 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4656 int code_point = charset_code_point (ucs_ccs, ch, 0);
4658 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4661 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4665 && INTP (ret = Fchar_feature (make_char (ch),
4668 code_point = XINT (ret);
4669 else if ( !NILP (map =
4670 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4672 && INTP (ret = Fchar_feature (make_char (ch),
4675 code_point = XINT (ret);
4676 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4680 char_encode_as_entity_reference (ch, buf);
4681 Dynarr_add_many (dst, buf, strlen (buf));
4687 if (code_point <= 0x7ff)
4689 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4690 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4692 else if (code_point <= 0xffff)
4694 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4695 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4696 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4698 else if (code_point <= 0x1fffff)
4700 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4701 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4702 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4703 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4705 else if (code_point <= 0x3ffffff)
4707 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4708 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4709 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4710 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4711 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4715 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4716 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4717 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4718 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4719 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4720 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4726 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4727 unsigned int *flags)
4732 /************************************************************************/
4733 /* ISO2022 methods */
4734 /************************************************************************/
4736 /* The following note describes the coding system ISO2022 briefly.
4737 Since the intention of this note is to help understand the
4738 functions in this file, some parts are NOT ACCURATE or OVERLY
4739 SIMPLIFIED. For thorough understanding, please refer to the
4740 original document of ISO2022.
4742 ISO2022 provides many mechanisms to encode several character sets
4743 in 7-bit and 8-bit environments. For 7-bit environments, all text
4744 is encoded using bytes less than 128. This may make the encoded
4745 text a little bit longer, but the text passes more easily through
4746 several gateways, some of which strip off MSB (Most Signigant Bit).
4748 There are two kinds of character sets: control character set and
4749 graphic character set. The former contains control characters such
4750 as `newline' and `escape' to provide control functions (control
4751 functions are also provided by escape sequences). The latter
4752 contains graphic characters such as 'A' and '-'. Emacs recognizes
4753 two control character sets and many graphic character sets.
4755 Graphic character sets are classified into one of the following
4756 four classes, according to the number of bytes (DIMENSION) and
4757 number of characters in one dimension (CHARS) of the set:
4758 - DIMENSION1_CHARS94
4759 - DIMENSION1_CHARS96
4760 - DIMENSION2_CHARS94
4761 - DIMENSION2_CHARS96
4763 In addition, each character set is assigned an identification tag,
4764 unique for each set, called "final character" (denoted as <F>
4765 hereafter). The <F> of each character set is decided by ECMA(*)
4766 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4767 (0x30..0x3F are for private use only).
4769 Note (*): ECMA = European Computer Manufacturers Association
4771 Here are examples of graphic character set [NAME(<F>)]:
4772 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4773 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4774 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4775 o DIMENSION2_CHARS96 -- none for the moment
4777 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4778 C0 [0x00..0x1F] -- control character plane 0
4779 GL [0x20..0x7F] -- graphic character plane 0
4780 C1 [0x80..0x9F] -- control character plane 1
4781 GR [0xA0..0xFF] -- graphic character plane 1
4783 A control character set is directly designated and invoked to C0 or
4784 C1 by an escape sequence. The most common case is that:
4785 - ISO646's control character set is designated/invoked to C0, and
4786 - ISO6429's control character set is designated/invoked to C1,
4787 and usually these designations/invocations are omitted in encoded
4788 text. In a 7-bit environment, only C0 can be used, and a control
4789 character for C1 is encoded by an appropriate escape sequence to
4790 fit into the environment. All control characters for C1 are
4791 defined to have corresponding escape sequences.
4793 A graphic character set is at first designated to one of four
4794 graphic registers (G0 through G3), then these graphic registers are
4795 invoked to GL or GR. These designations and invocations can be
4796 done independently. The most common case is that G0 is invoked to
4797 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4798 these invocations and designations are omitted in encoded text.
4799 In a 7-bit environment, only GL can be used.
4801 When a graphic character set of CHARS94 is invoked to GL, codes
4802 0x20 and 0x7F of the GL area work as control characters SPACE and
4803 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4806 There are two ways of invocation: locking-shift and single-shift.
4807 With locking-shift, the invocation lasts until the next different
4808 invocation, whereas with single-shift, the invocation affects the
4809 following character only and doesn't affect the locking-shift
4810 state. Invocations are done by the following control characters or
4813 ----------------------------------------------------------------------
4814 abbrev function cntrl escape seq description
4815 ----------------------------------------------------------------------
4816 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4817 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4818 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4819 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4820 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4821 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4822 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4823 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4824 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4825 ----------------------------------------------------------------------
4826 (*) These are not used by any known coding system.
4828 Control characters for these functions are defined by macros
4829 ISO_CODE_XXX in `coding.h'.
4831 Designations are done by the following escape sequences:
4832 ----------------------------------------------------------------------
4833 escape sequence description
4834 ----------------------------------------------------------------------
4835 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4836 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4837 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4838 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4839 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4840 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4841 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4842 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4843 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4844 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4845 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4846 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4847 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4848 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4849 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4850 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4851 ----------------------------------------------------------------------
4853 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4854 of dimension 1, chars 94, and final character <F>, etc...
4856 Note (*): Although these designations are not allowed in ISO2022,
4857 Emacs accepts them on decoding, and produces them on encoding
4858 CHARS96 character sets in a coding system which is characterized as
4859 7-bit environment, non-locking-shift, and non-single-shift.
4861 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4862 '(' can be omitted. We refer to this as "short-form" hereafter.
4864 Now you may notice that there are a lot of ways for encoding the
4865 same multilingual text in ISO2022. Actually, there exist many
4866 coding systems such as Compound Text (used in X11's inter client
4867 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4868 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4869 localized platforms), and all of these are variants of ISO2022.
4871 In addition to the above, Emacs handles two more kinds of escape
4872 sequences: ISO6429's direction specification and Emacs' private
4873 sequence for specifying character composition.
4875 ISO6429's direction specification takes the following form:
4876 o CSI ']' -- end of the current direction
4877 o CSI '0' ']' -- end of the current direction
4878 o CSI '1' ']' -- start of left-to-right text
4879 o CSI '2' ']' -- start of right-to-left text
4880 The control character CSI (0x9B: control sequence introducer) is
4881 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4883 Character composition specification takes the following form:
4884 o ESC '0' -- start character composition
4885 o ESC '1' -- end character composition
4886 Since these are not standard escape sequences of any ISO standard,
4887 their use with these meanings is restricted to Emacs only. */
4890 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4894 for (i = 0; i < 4; i++)
4896 if (!NILP (coding_system))
4898 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4900 iso->charset[i] = Qt;
4901 iso->invalid_designated[i] = 0;
4903 iso->esc = ISO_ESC_NOTHING;
4904 iso->esc_bytes_index = 0;
4905 iso->register_left = 0;
4906 iso->register_right = 1;
4907 iso->switched_dir_and_no_valid_charset_yet = 0;
4908 iso->invalid_switch_dir = 0;
4909 iso->output_direction_sequence = 0;
4910 iso->output_literally = 0;
4911 #ifdef ENABLE_COMPOSITE_CHARS
4912 if (iso->composite_chars)
4913 Dynarr_reset (iso->composite_chars);
4918 fit_to_be_escape_quoted (unsigned char c)
4935 /* Parse one byte of an ISO2022 escape sequence.
4936 If the result is an invalid escape sequence, return 0 and
4937 do not change anything in STR. Otherwise, if the result is
4938 an incomplete escape sequence, update ISO2022.ESC and
4939 ISO2022.ESC_BYTES and return -1. Otherwise, update
4940 all the state variables (but not ISO2022.ESC_BYTES) and
4943 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4944 or invocation of an invalid character set and treat that as
4945 an unrecognized escape sequence. */
4948 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4949 unsigned char c, unsigned int *flags,
4950 int check_invalid_charsets)
4952 /* (1) If we're at the end of a designation sequence, CS is the
4953 charset being designated and REG is the register to designate
4956 (2) If we're at the end of a locking-shift sequence, REG is
4957 the register to invoke and HALF (0 == left, 1 == right) is
4958 the half to invoke it into.
4960 (3) If we're at the end of a single-shift sequence, REG is
4961 the register to invoke. */
4962 Lisp_Object cs = Qnil;
4965 /* NOTE: This code does goto's all over the fucking place.
4966 The reason for this is that we're basically implementing
4967 a state machine here, and hierarchical languages like C
4968 don't really provide a clean way of doing this. */
4970 if (! (*flags & CODING_STATE_ESCAPE))
4971 /* At beginning of escape sequence; we need to reset our
4972 escape-state variables. */
4973 iso->esc = ISO_ESC_NOTHING;
4975 iso->output_literally = 0;
4976 iso->output_direction_sequence = 0;
4980 case ISO_ESC_NOTHING:
4981 iso->esc_bytes_index = 0;
4984 case ISO_CODE_ESC: /* Start escape sequence */
4985 *flags |= CODING_STATE_ESCAPE;
4989 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4990 *flags |= CODING_STATE_ESCAPE;
4991 iso->esc = ISO_ESC_5_11;
4994 case ISO_CODE_SO: /* locking shift 1 */
4997 case ISO_CODE_SI: /* locking shift 0 */
5001 case ISO_CODE_SS2: /* single shift */
5004 case ISO_CODE_SS3: /* single shift */
5008 default: /* Other control characters */
5015 /**** single shift ****/
5017 case 'N': /* single shift 2 */
5020 case 'O': /* single shift 3 */
5024 /**** locking shift ****/
5026 case '~': /* locking shift 1 right */
5029 case 'n': /* locking shift 2 */
5032 case '}': /* locking shift 2 right */
5035 case 'o': /* locking shift 3 */
5038 case '|': /* locking shift 3 right */
5042 #ifdef ENABLE_COMPOSITE_CHARS
5043 /**** composite ****/
5046 iso->esc = ISO_ESC_START_COMPOSITE;
5047 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5048 CODING_STATE_COMPOSITE;
5052 iso->esc = ISO_ESC_END_COMPOSITE;
5053 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5054 ~CODING_STATE_COMPOSITE;
5056 #endif /* ENABLE_COMPOSITE_CHARS */
5058 /**** directionality ****/
5061 iso->esc = ISO_ESC_5_11;
5064 /**** designation ****/
5066 case '$': /* multibyte charset prefix */
5067 iso->esc = ISO_ESC_2_4;
5071 if (0x28 <= c && c <= 0x2F)
5073 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5077 /* This function is called with CODESYS equal to nil when
5078 doing coding-system detection. */
5080 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5081 && fit_to_be_escape_quoted (c))
5083 iso->esc = ISO_ESC_LITERAL;
5084 *flags &= CODING_STATE_ISO2022_LOCK;
5094 /**** directionality ****/
5096 case ISO_ESC_5_11: /* ISO6429 direction control */
5099 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5100 goto directionality;
5102 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5103 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5104 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5108 case ISO_ESC_5_11_0:
5111 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5112 goto directionality;
5116 case ISO_ESC_5_11_1:
5119 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5120 goto directionality;
5124 case ISO_ESC_5_11_2:
5127 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5128 goto directionality;
5133 iso->esc = ISO_ESC_DIRECTIONALITY;
5134 /* Various junk here to attempt to preserve the direction sequences
5135 literally in the text if they would otherwise be swallowed due
5136 to invalid designations that don't show up as actual charset
5137 changes in the text. */
5138 if (iso->invalid_switch_dir)
5140 /* We already inserted a direction switch literally into the
5141 text. We assume (#### this may not be right) that the
5142 next direction switch is the one going the other way,
5143 and we need to output that literally as well. */
5144 iso->output_literally = 1;
5145 iso->invalid_switch_dir = 0;
5151 /* If we are in the thrall of an invalid designation,
5152 then stick the directionality sequence literally into the
5153 output stream so it ends up in the original text again. */
5154 for (jj = 0; jj < 4; jj++)
5155 if (iso->invalid_designated[jj])
5159 iso->output_literally = 1;
5160 iso->invalid_switch_dir = 1;
5163 /* Indicate that we haven't yet seen a valid designation,
5164 so that if a switch-dir is directly followed by an
5165 invalid designation, both get inserted literally. */
5166 iso->switched_dir_and_no_valid_charset_yet = 1;
5171 /**** designation ****/
5174 if (0x28 <= c && c <= 0x2F)
5176 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5179 if (0x40 <= c && c <= 0x42)
5182 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5183 *flags & CODING_STATE_R2L ?
5184 CHARSET_RIGHT_TO_LEFT :
5185 CHARSET_LEFT_TO_RIGHT);
5196 if (c < '0' || c > '~')
5197 return 0; /* bad final byte */
5199 if (iso->esc >= ISO_ESC_2_8 &&
5200 iso->esc <= ISO_ESC_2_15)
5202 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5203 single = 1; /* single-byte */
5204 reg = (iso->esc - ISO_ESC_2_8) & 3;
5206 else if (iso->esc >= ISO_ESC_2_4_8 &&
5207 iso->esc <= ISO_ESC_2_4_15)
5209 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5210 single = -1; /* multi-byte */
5211 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5215 /* Can this ever be reached? -slb */
5219 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5220 *flags & CODING_STATE_R2L ?
5221 CHARSET_RIGHT_TO_LEFT :
5222 CHARSET_LEFT_TO_RIGHT);
5228 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5232 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5233 /* can't invoke something that ain't there. */
5235 iso->esc = ISO_ESC_SINGLE_SHIFT;
5236 *flags &= CODING_STATE_ISO2022_LOCK;
5238 *flags |= CODING_STATE_SS2;
5240 *flags |= CODING_STATE_SS3;
5244 if (check_invalid_charsets &&
5245 !CHARSETP (iso->charset[reg]))
5246 /* can't invoke something that ain't there. */
5249 iso->register_right = reg;
5251 iso->register_left = reg;
5252 *flags &= CODING_STATE_ISO2022_LOCK;
5253 iso->esc = ISO_ESC_LOCKING_SHIFT;
5257 if (NILP (cs) && check_invalid_charsets)
5259 iso->invalid_designated[reg] = 1;
5260 iso->charset[reg] = Vcharset_ascii;
5261 iso->esc = ISO_ESC_DESIGNATE;
5262 *flags &= CODING_STATE_ISO2022_LOCK;
5263 iso->output_literally = 1;
5264 if (iso->switched_dir_and_no_valid_charset_yet)
5266 /* We encountered a switch-direction followed by an
5267 invalid designation. Ensure that the switch-direction
5268 gets outputted; otherwise it will probably get eaten
5269 when the text is written out again. */
5270 iso->switched_dir_and_no_valid_charset_yet = 0;
5271 iso->output_direction_sequence = 1;
5272 /* And make sure that the switch-dir going the other
5273 way gets outputted, as well. */
5274 iso->invalid_switch_dir = 1;
5278 /* This function is called with CODESYS equal to nil when
5279 doing coding-system detection. */
5280 if (!NILP (codesys))
5282 charset_conversion_spec_dynarr *dyn =
5283 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5289 for (i = 0; i < Dynarr_length (dyn); i++)
5291 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5292 if (EQ (cs, spec->from_charset))
5293 cs = spec->to_charset;
5298 iso->charset[reg] = cs;
5299 iso->esc = ISO_ESC_DESIGNATE;
5300 *flags &= CODING_STATE_ISO2022_LOCK;
5301 if (iso->invalid_designated[reg])
5303 iso->invalid_designated[reg] = 0;
5304 iso->output_literally = 1;
5306 if (iso->switched_dir_and_no_valid_charset_yet)
5307 iso->switched_dir_and_no_valid_charset_yet = 0;
5312 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5316 /* #### There are serious deficiencies in the recognition mechanism
5317 here. This needs to be much smarter if it's going to cut it.
5318 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5319 it should be detected as Latin-1.
5320 All the ISO2022 stuff in this file should be synced up with the
5321 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5322 Perhaps we should wait till R2L works in FSF Emacs? */
5324 if (!st->iso2022.initted)
5326 reset_iso2022 (Qnil, &st->iso2022.iso);
5327 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5328 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5329 CODING_CATEGORY_ISO_8_1_MASK |
5330 CODING_CATEGORY_ISO_8_2_MASK |
5331 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5332 st->iso2022.flags = 0;
5333 st->iso2022.high_byte_count = 0;
5334 st->iso2022.saw_single_shift = 0;
5335 st->iso2022.initted = 1;
5338 mask = st->iso2022.mask;
5342 unsigned char c = *(unsigned char *)src++;
5345 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5346 st->iso2022.high_byte_count++;
5350 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5352 if (st->iso2022.high_byte_count & 1)
5353 /* odd number of high bytes; assume not iso-8-2 */
5354 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5356 st->iso2022.high_byte_count = 0;
5357 st->iso2022.saw_single_shift = 0;
5359 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5361 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5362 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5363 { /* control chars */
5366 /* Allow and ignore control characters that you might
5367 reasonably see in a text file */
5372 case 8: /* backspace */
5373 case 11: /* vertical tab */
5374 case 12: /* form feed */
5375 case 26: /* MS-DOS C-z junk */
5376 case 31: /* '^_' -- for info */
5377 goto label_continue_loop;
5384 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5387 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5388 &st->iso2022.flags, 0))
5390 switch (st->iso2022.iso.esc)
5392 case ISO_ESC_DESIGNATE:
5393 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5394 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5396 case ISO_ESC_LOCKING_SHIFT:
5397 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5398 goto ran_out_of_chars;
5399 case ISO_ESC_SINGLE_SHIFT:
5400 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5401 st->iso2022.saw_single_shift = 1;
5410 goto ran_out_of_chars;
5413 label_continue_loop:;
5422 postprocess_iso2022_mask (int mask)
5424 /* #### kind of cheesy */
5425 /* If seven-bit ISO is allowed, then assume that the encoding is
5426 entirely seven-bit and turn off the eight-bit ones. */
5427 if (mask & CODING_CATEGORY_ISO_7_MASK)
5428 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5429 CODING_CATEGORY_ISO_8_1_MASK |
5430 CODING_CATEGORY_ISO_8_2_MASK);
5434 /* If FLAGS is a null pointer or specifies right-to-left motion,
5435 output a switch-dir-to-left-to-right sequence to DST.
5436 Also update FLAGS if it is not a null pointer.
5437 If INTERNAL_P is set, we are outputting in internal format and
5438 need to handle the CSI differently. */
5441 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5442 unsigned_char_dynarr *dst,
5443 unsigned int *flags,
5446 if (!flags || (*flags & CODING_STATE_R2L))
5448 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5450 Dynarr_add (dst, ISO_CODE_ESC);
5451 Dynarr_add (dst, '[');
5453 else if (internal_p)
5454 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5456 Dynarr_add (dst, ISO_CODE_CSI);
5457 Dynarr_add (dst, '0');
5458 Dynarr_add (dst, ']');
5460 *flags &= ~CODING_STATE_R2L;
5464 /* If FLAGS is a null pointer or specifies a direction different from
5465 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5466 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5467 sequence to DST. Also update FLAGS if it is not a null pointer.
5468 If INTERNAL_P is set, we are outputting in internal format and
5469 need to handle the CSI differently. */
5472 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5473 unsigned_char_dynarr *dst, unsigned int *flags,
5476 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5477 direction == CHARSET_LEFT_TO_RIGHT)
5478 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5479 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5480 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5481 direction == CHARSET_RIGHT_TO_LEFT)
5483 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5485 Dynarr_add (dst, ISO_CODE_ESC);
5486 Dynarr_add (dst, '[');
5488 else if (internal_p)
5489 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5491 Dynarr_add (dst, ISO_CODE_CSI);
5492 Dynarr_add (dst, '2');
5493 Dynarr_add (dst, ']');
5495 *flags |= CODING_STATE_R2L;
5499 /* Convert ISO2022-format data to internal format. */
5502 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5503 unsigned_char_dynarr *dst, Lstream_data_count n)
5505 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5506 unsigned int flags = str->flags;
5507 unsigned int cpos = str->cpos;
5508 unsigned char counter = str->counter;
5509 eol_type_t eol_type = str->eol_type;
5510 #ifdef ENABLE_COMPOSITE_CHARS
5511 unsigned_char_dynarr *real_dst = dst;
5513 Lisp_Object coding_system;
5515 XSETCODING_SYSTEM (coding_system, str->codesys);
5517 #ifdef ENABLE_COMPOSITE_CHARS
5518 if (flags & CODING_STATE_COMPOSITE)
5519 dst = str->iso2022.composite_chars;
5520 #endif /* ENABLE_COMPOSITE_CHARS */
5524 unsigned char c = *(unsigned char *)src++;
5525 if (flags & CODING_STATE_ESCAPE)
5526 { /* Within ESC sequence */
5527 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5532 switch (str->iso2022.esc)
5534 #ifdef ENABLE_COMPOSITE_CHARS
5535 case ISO_ESC_START_COMPOSITE:
5536 if (str->iso2022.composite_chars)
5537 Dynarr_reset (str->iso2022.composite_chars);
5539 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5540 dst = str->iso2022.composite_chars;
5542 case ISO_ESC_END_COMPOSITE:
5544 Bufbyte comstr[MAX_EMCHAR_LEN];
5546 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5547 Dynarr_length (dst));
5549 len = set_charptr_emchar (comstr, emch);
5550 Dynarr_add_many (dst, comstr, len);
5553 #endif /* ENABLE_COMPOSITE_CHARS */
5555 case ISO_ESC_LITERAL:
5556 COMPOSE_FLUSH_CHARS (str, dst);
5557 decode_flush_er_chars (str, dst);
5558 DECODE_ADD_BINARY_CHAR (c, dst);
5562 /* Everything else handled already */
5567 /* Attempted error recovery. */
5568 if (str->iso2022.output_direction_sequence)
5569 ensure_correct_direction (flags & CODING_STATE_R2L ?
5570 CHARSET_RIGHT_TO_LEFT :
5571 CHARSET_LEFT_TO_RIGHT,
5572 str->codesys, dst, 0, 1);
5573 /* More error recovery. */
5574 if (!retval || str->iso2022.output_literally)
5576 /* Output the (possibly invalid) sequence */
5578 COMPOSE_FLUSH_CHARS (str, dst);
5579 decode_flush_er_chars (str, dst);
5580 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5581 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5582 flags &= CODING_STATE_ISO2022_LOCK;
5584 n++, src--;/* Repeat the loop with the same character. */
5587 /* No sense in reprocessing the final byte of the
5588 escape sequence; it could mess things up anyway.
5590 COMPOSE_FLUSH_CHARS (str, dst);
5591 decode_flush_er_chars (str, dst);
5592 DECODE_ADD_BINARY_CHAR (c, dst);
5598 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5599 { /* Control characters */
5601 /***** Error-handling *****/
5603 /* If we were in the middle of a character, dump out the
5604 partial character. */
5607 COMPOSE_FLUSH_CHARS (str, dst);
5608 decode_flush_er_chars (str, dst);
5612 DECODE_ADD_BINARY_CHAR
5613 ((unsigned char)(cpos >> (counter * 8)), dst);
5618 /* If we just saw a single-shift character, dump it out.
5619 This may dump out the wrong sort of single-shift character,
5620 but least it will give an indication that something went
5622 if (flags & CODING_STATE_SS2)
5624 COMPOSE_FLUSH_CHARS (str, dst);
5625 decode_flush_er_chars (str, dst);
5626 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5627 flags &= ~CODING_STATE_SS2;
5629 if (flags & CODING_STATE_SS3)
5631 COMPOSE_FLUSH_CHARS (str, dst);
5632 decode_flush_er_chars (str, dst);
5633 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5634 flags &= ~CODING_STATE_SS3;
5637 /***** Now handle the control characters. *****/
5643 COMPOSE_FLUSH_CHARS (str, dst);
5644 decode_flush_er_chars (str, dst);
5645 if (eol_type == EOL_CR)
5646 Dynarr_add (dst, '\n');
5647 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5648 Dynarr_add (dst, c);
5650 flags |= CODING_STATE_CR;
5651 goto label_continue_loop;
5653 else if (flags & CODING_STATE_CR)
5654 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5656 Dynarr_add (dst, '\r');
5657 flags &= ~CODING_STATE_CR;
5660 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5663 flags &= CODING_STATE_ISO2022_LOCK;
5665 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5667 COMPOSE_FLUSH_CHARS (str, dst);
5668 decode_flush_er_chars (str, dst);
5669 DECODE_ADD_BINARY_CHAR (c, dst);
5673 { /* Graphic characters */
5674 Lisp_Object charset;
5683 COMPOSE_FLUSH_CHARS (str, dst);
5684 decode_flush_er_chars (str, dst);
5685 if (eol_type == EOL_CR)
5686 Dynarr_add (dst, '\n');
5687 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5688 Dynarr_add (dst, c);
5690 flags |= CODING_STATE_CR;
5691 goto label_continue_loop;
5693 else if (flags & CODING_STATE_CR)
5694 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5696 Dynarr_add (dst, '\r');
5697 flags &= ~CODING_STATE_CR;
5700 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5703 /* Now determine the charset. */
5704 reg = ((flags & CODING_STATE_SS2) ? 2
5705 : (flags & CODING_STATE_SS3) ? 3
5706 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5707 : str->iso2022.register_left);
5708 charset = str->iso2022.charset[reg];
5710 /* Error checking: */
5711 if (! CHARSETP (charset)
5712 || str->iso2022.invalid_designated[reg]
5713 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5714 && XCHARSET_CHARS (charset) == 94))
5715 /* Mrmph. We are trying to invoke a register that has no
5716 or an invalid charset in it, or trying to add a character
5717 outside the range of the charset. Insert that char literally
5718 to preserve it for the output. */
5720 COMPOSE_FLUSH_CHARS (str, dst);
5721 decode_flush_er_chars (str, dst);
5725 DECODE_ADD_BINARY_CHAR
5726 ((unsigned char)(cpos >> (counter * 8)), dst);
5729 DECODE_ADD_BINARY_CHAR (c, dst);
5734 /* Things are probably hunky-dorey. */
5736 /* Fetch reverse charset, maybe. */
5737 if (((flags & CODING_STATE_R2L) &&
5738 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5740 (!(flags & CODING_STATE_R2L) &&
5741 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5743 Lisp_Object new_charset =
5744 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5745 if (!NILP (new_charset))
5746 charset = new_charset;
5751 if (XCHARSET_DIMENSION (charset) == counter)
5753 COMPOSE_ADD_CHAR (str,
5754 DECODE_CHAR (charset,
5755 ((cpos & 0x7F7F7F) << 8)
5762 cpos = (cpos << 8) | c;
5764 lb = XCHARSET_LEADING_BYTE (charset);
5765 switch (XCHARSET_REP_BYTES (charset))
5768 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5769 Dynarr_add (dst, c & 0x7F);
5772 case 2: /* one-byte official */
5773 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5774 Dynarr_add (dst, lb);
5775 Dynarr_add (dst, c | 0x80);
5778 case 3: /* one-byte private or two-byte official */
5779 if (XCHARSET_PRIVATE_P (charset))
5781 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5782 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5783 Dynarr_add (dst, lb);
5784 Dynarr_add (dst, c | 0x80);
5790 Dynarr_add (dst, lb);
5791 Dynarr_add (dst, ch | 0x80);
5792 Dynarr_add (dst, c | 0x80);
5800 default: /* two-byte private */
5803 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5804 Dynarr_add (dst, lb);
5805 Dynarr_add (dst, ch | 0x80);
5806 Dynarr_add (dst, c | 0x80);
5816 flags &= CODING_STATE_ISO2022_LOCK;
5819 label_continue_loop:;
5822 if (flags & CODING_STATE_END)
5824 COMPOSE_FLUSH_CHARS (str, dst);
5825 decode_flush_er_chars (str, dst);
5826 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5830 str->counter = counter;
5834 /***** ISO2022 encoder *****/
5836 /* Designate CHARSET into register REG. */
5839 iso2022_designate (Lisp_Object charset, unsigned char reg,
5840 struct encoding_stream *str, unsigned_char_dynarr *dst)
5842 static const char inter94[] = "()*+";
5843 static const char inter96[] = ",-./";
5844 unsigned short chars;
5845 unsigned char dimension;
5846 unsigned char final;
5847 Lisp_Object old_charset = str->iso2022.charset[reg];
5849 str->iso2022.charset[reg] = charset;
5850 if (!CHARSETP (charset))
5851 /* charset might be an initial nil or t. */
5853 chars = XCHARSET_CHARS (charset);
5854 dimension = XCHARSET_DIMENSION (charset);
5855 final = XCHARSET_FINAL (charset);
5856 if (!str->iso2022.force_charset_on_output[reg] &&
5857 CHARSETP (old_charset) &&
5858 XCHARSET_CHARS (old_charset) == chars &&
5859 XCHARSET_DIMENSION (old_charset) == dimension &&
5860 XCHARSET_FINAL (old_charset) == final)
5863 str->iso2022.force_charset_on_output[reg] = 0;
5866 charset_conversion_spec_dynarr *dyn =
5867 str->codesys->iso2022.output_conv;
5873 for (i = 0; i < Dynarr_length (dyn); i++)
5875 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5876 if (EQ (charset, spec->from_charset))
5877 charset = spec->to_charset;
5882 Dynarr_add (dst, ISO_CODE_ESC);
5887 Dynarr_add (dst, inter94[reg]);
5890 Dynarr_add (dst, '$');
5892 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5895 Dynarr_add (dst, inter94[reg]);
5900 Dynarr_add (dst, inter96[reg]);
5903 Dynarr_add (dst, '$');
5904 Dynarr_add (dst, inter96[reg]);
5908 Dynarr_add (dst, final);
5912 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5914 if (str->iso2022.register_left != 0)
5916 Dynarr_add (dst, ISO_CODE_SI);
5917 str->iso2022.register_left = 0;
5922 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5924 if (str->iso2022.register_left != 1)
5926 Dynarr_add (dst, ISO_CODE_SO);
5927 str->iso2022.register_left = 1;
5932 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5933 unsigned_char_dynarr *dst, unsigned int *flags)
5935 unsigned char charmask;
5936 Lisp_Coding_System* codesys = str->codesys;
5937 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5939 Lisp_Object charset = str->iso2022.current_charset;
5940 int half = str->iso2022.current_half;
5941 int code_point = -1;
5945 restore_left_to_right_direction (codesys, dst, flags, 0);
5947 /* Make sure G0 contains ASCII */
5948 if ((ch > ' ' && ch < ISO_CODE_DEL)
5949 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5951 ensure_normal_shift (str, dst);
5952 iso2022_designate (Vcharset_ascii, 0, str, dst);
5955 /* If necessary, restore everything to the default state
5957 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5959 restore_left_to_right_direction (codesys, dst, flags, 0);
5961 ensure_normal_shift (str, dst);
5963 for (i = 0; i < 4; i++)
5965 Lisp_Object initial_charset =
5966 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5967 iso2022_designate (initial_charset, i, str, dst);
5972 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5973 Dynarr_add (dst, '\r');
5974 if (eol_type != EOL_CR)
5975 Dynarr_add (dst, ch);
5979 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5980 && fit_to_be_escape_quoted (ch))
5981 Dynarr_add (dst, ISO_CODE_ESC);
5982 Dynarr_add (dst, ch);
5985 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5987 charmask = (half == 0 ? 0x00 : 0x80);
5989 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5990 && fit_to_be_escape_quoted (ch))
5991 Dynarr_add (dst, ISO_CODE_ESC);
5992 /* you asked for it ... */
5993 Dynarr_add (dst, ch);
5999 /* Now determine which register to use. */
6001 for (i = 0; i < 4; i++)
6003 if ((CHARSETP (charset = str->iso2022.charset[i])
6004 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6008 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6009 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6017 Lisp_Object original_default_coded_charset_priority_list
6018 = Vdefault_coded_charset_priority_list;
6019 Vdefault_coded_charset_priority_list
6020 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6021 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6023 code_point = ENCODE_CHAR (ch, charset);
6024 if (XCHARSET_FINAL (charset))
6026 Vdefault_coded_charset_priority_list
6027 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6028 Vdefault_coded_charset_priority_list));
6030 Vdefault_coded_charset_priority_list
6031 = original_default_coded_charset_priority_list;
6032 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6034 code_point = ENCODE_CHAR (ch, charset);
6035 if (XCHARSET_FINAL (charset))
6037 Vdefault_coded_charset_priority_list
6038 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6039 Vdefault_coded_charset_priority_list));
6041 code_point = ENCODE_CHAR (ch, charset);
6042 if (!XCHARSET_FINAL (charset))
6044 charset = Vcharset_ascii;
6048 Vdefault_coded_charset_priority_list
6049 = original_default_coded_charset_priority_list;
6051 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6052 codesys, dst, flags, 0);
6056 if (XCHARSET_GRAPHIC (charset) != 0)
6058 if (!NILP (str->iso2022.charset[1]) &&
6059 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6060 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6062 else if (!NILP (str->iso2022.charset[2]))
6064 else if (!NILP (str->iso2022.charset[3]))
6073 iso2022_designate (charset, reg, str, dst);
6075 /* Now invoke that register. */
6079 ensure_normal_shift (str, dst);
6083 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6085 ensure_shift_out (str, dst);
6092 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6094 Dynarr_add (dst, ISO_CODE_ESC);
6095 Dynarr_add (dst, 'N');
6100 Dynarr_add (dst, ISO_CODE_SS2);
6105 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6107 Dynarr_add (dst, ISO_CODE_ESC);
6108 Dynarr_add (dst, 'O');
6113 Dynarr_add (dst, ISO_CODE_SS3);
6121 charmask = (half == 0 ? 0x00 : 0x80);
6123 switch (XCHARSET_DIMENSION (charset))
6126 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6129 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6130 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6133 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6134 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6135 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6138 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6139 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6140 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6141 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6147 str->iso2022.current_charset = charset;
6148 str->iso2022.current_half = half;
6152 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6153 unsigned int *flags)
6155 Lisp_Coding_System* codesys = str->codesys;
6158 restore_left_to_right_direction (codesys, dst, flags, 0);
6159 ensure_normal_shift (str, dst);
6160 for (i = 0; i < 4; i++)
6162 Lisp_Object initial_charset
6163 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6164 iso2022_designate (initial_charset, i, str, dst);
6169 /************************************************************************/
6170 /* No-conversion methods */
6171 /************************************************************************/
6173 /* This is used when reading in "binary" files -- i.e. files that may
6174 contain all 256 possible byte values and that are not to be
6175 interpreted as being in any particular decoding. */
6177 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6178 unsigned_char_dynarr *dst, Lstream_data_count n)
6180 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6181 unsigned int flags = str->flags;
6182 unsigned int cpos = str->cpos;
6183 eol_type_t eol_type = str->eol_type;
6187 unsigned char c = *(unsigned char *)src++;
6189 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6190 DECODE_ADD_BINARY_CHAR (c, dst);
6191 label_continue_loop:;
6194 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6201 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6202 unsigned_char_dynarr *dst, Lstream_data_count n)
6205 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6206 unsigned int flags = str->flags;
6207 unsigned int ch = str->ch;
6208 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6210 unsigned char char_boundary = str->iso2022.current_char_boundary;
6217 if (char_boundary == 0)
6223 else if ( c >= 0xf8 )
6228 else if ( c >= 0xf0 )
6233 else if ( c >= 0xe0 )
6238 else if ( c >= 0xc0 )
6248 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6249 Dynarr_add (dst, '\r');
6250 if (eol_type != EOL_CR)
6251 Dynarr_add (dst, c);
6254 Dynarr_add (dst, c);
6257 else if (char_boundary == 1)
6259 ch = ( ch << 6 ) | ( c & 0x3f );
6260 Dynarr_add (dst, ch & 0xff);
6265 ch = ( ch << 6 ) | ( c & 0x3f );
6268 #else /* not UTF2000 */
6271 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6272 Dynarr_add (dst, '\r');
6273 if (eol_type != EOL_CR)
6274 Dynarr_add (dst, '\n');
6277 else if (BYTE_ASCII_P (c))
6280 Dynarr_add (dst, c);
6282 else if (BUFBYTE_LEADING_BYTE_P (c))
6285 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6286 c == LEADING_BYTE_CONTROL_1)
6289 Dynarr_add (dst, '~'); /* untranslatable character */
6293 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6294 Dynarr_add (dst, c);
6295 else if (ch == LEADING_BYTE_CONTROL_1)
6298 Dynarr_add (dst, c - 0x20);
6300 /* else it should be the second or third byte of an
6301 untranslatable character, so ignore it */
6304 #endif /* not UTF2000 */
6310 str->iso2022.current_char_boundary = char_boundary;
6316 /************************************************************************/
6317 /* Initialization */
6318 /************************************************************************/
6321 syms_of_file_coding (void)
6323 INIT_LRECORD_IMPLEMENTATION (coding_system);
6325 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6327 DEFSUBR (Fcoding_system_p);
6328 DEFSUBR (Ffind_coding_system);
6329 DEFSUBR (Fget_coding_system);
6330 DEFSUBR (Fcoding_system_list);
6331 DEFSUBR (Fcoding_system_name);
6332 DEFSUBR (Fmake_coding_system);
6333 DEFSUBR (Fcopy_coding_system);
6334 DEFSUBR (Fcoding_system_canonical_name_p);
6335 DEFSUBR (Fcoding_system_alias_p);
6336 DEFSUBR (Fcoding_system_aliasee);
6337 DEFSUBR (Fdefine_coding_system_alias);
6338 DEFSUBR (Fsubsidiary_coding_system);
6340 DEFSUBR (Fcoding_system_type);
6341 DEFSUBR (Fcoding_system_doc_string);
6343 DEFSUBR (Fcoding_system_charset);
6345 DEFSUBR (Fcoding_system_property);
6347 DEFSUBR (Fcoding_category_list);
6348 DEFSUBR (Fset_coding_priority_list);
6349 DEFSUBR (Fcoding_priority_list);
6350 DEFSUBR (Fset_coding_category_system);
6351 DEFSUBR (Fcoding_category_system);
6353 DEFSUBR (Fdetect_coding_region);
6354 DEFSUBR (Fdecode_coding_region);
6355 DEFSUBR (Fencode_coding_region);
6357 DEFSUBR (Fdecode_shift_jis_char);
6358 DEFSUBR (Fencode_shift_jis_char);
6359 DEFSUBR (Fdecode_big5_char);
6360 DEFSUBR (Fencode_big5_char);
6362 defsymbol (&Qcoding_systemp, "coding-system-p");
6363 defsymbol (&Qno_conversion, "no-conversion");
6364 defsymbol (&Qraw_text, "raw-text");
6366 defsymbol (&Qbig5, "big5");
6367 defsymbol (&Qshift_jis, "shift-jis");
6368 defsymbol (&Qucs4, "ucs-4");
6369 defsymbol (&Qutf8, "utf-8");
6370 defsymbol (&Qutf16, "utf-16");
6371 defsymbol (&Qccl, "ccl");
6372 defsymbol (&Qiso2022, "iso2022");
6374 defsymbol (&Qmnemonic, "mnemonic");
6375 defsymbol (&Qeol_type, "eol-type");
6376 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6377 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6379 defsymbol (&Qcr, "cr");
6380 defsymbol (&Qlf, "lf");
6381 defsymbol (&Qcrlf, "crlf");
6382 defsymbol (&Qeol_cr, "eol-cr");
6383 defsymbol (&Qeol_lf, "eol-lf");
6384 defsymbol (&Qeol_crlf, "eol-crlf");
6386 defsymbol (&Qcharset_g0, "charset-g0");
6387 defsymbol (&Qcharset_g1, "charset-g1");
6388 defsymbol (&Qcharset_g2, "charset-g2");
6389 defsymbol (&Qcharset_g3, "charset-g3");
6390 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6391 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6392 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6393 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6394 defsymbol (&Qno_iso6429, "no-iso6429");
6395 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6396 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6398 defsymbol (&Qshort, "short");
6399 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6400 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6401 defsymbol (&Qseven, "seven");
6402 defsymbol (&Qlock_shift, "lock-shift");
6403 defsymbol (&Qescape_quoted, "escape-quoted");
6406 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6407 defsymbol (&Qdisable_composition, "disable-composition");
6408 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6409 defsymbol (&Quse_entity_reference, "use-entity-reference");
6410 defsymbol (&Qd, "d");
6411 defsymbol (&Qx, "x");
6412 defsymbol (&QX, "X");
6414 defsymbol (&Qencode, "encode");
6415 defsymbol (&Qdecode, "decode");
6418 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6420 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6422 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6424 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6426 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6428 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6430 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6432 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6434 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6436 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6439 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6444 lstream_type_create_file_coding (void)
6446 LSTREAM_HAS_METHOD (decoding, reader);
6447 LSTREAM_HAS_METHOD (decoding, writer);
6448 LSTREAM_HAS_METHOD (decoding, rewinder);
6449 LSTREAM_HAS_METHOD (decoding, seekable_p);
6450 LSTREAM_HAS_METHOD (decoding, flusher);
6451 LSTREAM_HAS_METHOD (decoding, closer);
6452 LSTREAM_HAS_METHOD (decoding, marker);
6454 LSTREAM_HAS_METHOD (encoding, reader);
6455 LSTREAM_HAS_METHOD (encoding, writer);
6456 LSTREAM_HAS_METHOD (encoding, rewinder);
6457 LSTREAM_HAS_METHOD (encoding, seekable_p);
6458 LSTREAM_HAS_METHOD (encoding, flusher);
6459 LSTREAM_HAS_METHOD (encoding, closer);
6460 LSTREAM_HAS_METHOD (encoding, marker);
6464 vars_of_file_coding (void)
6468 fcd = xnew (struct file_coding_dump);
6469 dump_add_root_struct_ptr (&fcd, &fcd_description);
6471 /* Initialize to something reasonable ... */
6472 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6474 fcd->coding_category_system[i] = Qnil;
6475 fcd->coding_category_by_priority[i] = i;
6478 Fprovide (intern ("file-coding"));
6480 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6481 Coding system used for TTY keyboard input.
6482 Not used under a windowing system.
6484 Vkeyboard_coding_system = Qnil;
6486 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6487 Coding system used for TTY display output.
6488 Not used under a windowing system.
6490 Vterminal_coding_system = Qnil;
6492 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6493 Overriding coding system used when reading from a file or process.
6494 You should bind this variable with `let', but do not set it globally.
6495 If this is non-nil, it specifies the coding system that will be used
6496 to decode input on read operations, such as from a file or process.
6497 It overrides `buffer-file-coding-system-for-read',
6498 `insert-file-contents-pre-hook', etc. Use those variables instead of
6499 this one for permanent changes to the environment. */ );
6500 Vcoding_system_for_read = Qnil;
6502 DEFVAR_LISP ("coding-system-for-write",
6503 &Vcoding_system_for_write /*
6504 Overriding coding system used when writing to a file or process.
6505 You should bind this variable with `let', but do not set it globally.
6506 If this is non-nil, it specifies the coding system that will be used
6507 to encode output for write operations, such as to a file or process.
6508 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6509 Use those variables instead of this one for permanent changes to the
6511 Vcoding_system_for_write = Qnil;
6513 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6514 Coding system used to convert pathnames when accessing files.
6516 Vfile_name_coding_system = Qnil;
6518 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6519 &Vcoded_charset_entity_reference_alist /*
6520 Alist of coded-charset vs corresponding entity-reference.
6521 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6522 CCS is coded-charset.
6523 CODE-COLUMNS is columns of code-point of entity-reference.
6524 CODE-TYPE is format type of code-point of entity-reference.
6525 `d' means decimal value and `x' means hexadecimal value.
6527 Vcoded_charset_entity_reference_alist = Qnil;
6529 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6530 Non-nil means the buffer contents are regarded as multi-byte form
6531 of characters, not a binary code. This affects the display, file I/O,
6532 and behaviors of various editing commands.
6534 Setting this to nil does not do anything.
6536 enable_multibyte_characters = 1;
6540 complex_vars_of_file_coding (void)
6542 staticpro (&Vcoding_system_hash_table);
6543 Vcoding_system_hash_table =
6544 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6546 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6547 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6549 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6551 struct codesys_prop csp; \
6553 csp.prop_type = (Prop_Type); \
6554 Dynarr_add (the_codesys_prop_dynarr, csp); \
6557 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6558 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6559 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6560 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6561 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6562 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6563 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6565 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6566 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6567 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6568 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6569 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6570 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6571 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6572 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6573 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6574 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6575 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6576 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6577 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6578 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6579 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6580 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6581 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6583 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6586 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6587 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6589 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6590 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6593 /* Need to create this here or we're really screwed. */
6595 (Qraw_text, Qno_conversion,
6596 build_string ("Raw text, which means it converts only line-break-codes."),
6597 list2 (Qmnemonic, build_string ("Raw")));
6600 (Qbinary, Qno_conversion,
6601 build_string ("Binary, which means it does not convert anything."),
6602 list4 (Qeol_type, Qlf,
6603 Qmnemonic, build_string ("Binary")));
6609 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6610 list2 (Qmnemonic, build_string ("MTF8")));
6613 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6615 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6617 Fdefine_coding_system_alias (Qterminal, Qbinary);
6618 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6620 /* Need this for bootstrapping */
6621 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6622 Fget_coding_system (Qraw_text);
6625 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6626 = Fget_coding_system (Qutf_8_mcs);
6629 #if defined(MULE) && !defined(UTF2000)
6633 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6634 fcd->ucs_to_mule_table[i] = Qnil;
6636 staticpro (&mule_to_ucs_table);
6637 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6638 #endif /* defined(MULE) && !defined(UTF2000) */