1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002,2003,2004,2005 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
40 #include "file-coding.h"
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf16, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Qccs_priority_list;
111 Lisp_Object Quse_entity_reference;
112 Lisp_Object Qd, Qx, QX;
114 Lisp_Object Qencode, Qdecode;
116 Lisp_Object Vcoding_system_hash_table;
118 int enable_multibyte_characters;
121 /* Additional information used by the ISO2022 decoder and detector. */
122 struct iso2022_decoder
124 /* CHARSET holds the character sets currently assigned to the G0
125 through G3 variables. It is initialized from the array
126 INITIAL_CHARSET in CODESYS. */
127 Lisp_Object charset[4];
129 /* Which registers are currently invoked into the left (GL) and
130 right (GR) halves of the 8-bit encoding space? */
131 int register_left, register_right;
133 /* ISO_ESC holds a value indicating part of an escape sequence
134 that has already been seen. */
135 enum iso_esc_flag esc;
137 /* This records the bytes we've seen so far in an escape sequence,
138 in case the sequence is invalid (we spit out the bytes unchanged). */
139 unsigned char esc_bytes[8];
141 /* Index for next byte to store in ISO escape sequence. */
144 #ifdef ENABLE_COMPOSITE_CHARS
145 /* Stuff seen so far when composing a string. */
146 unsigned_char_dynarr *composite_chars;
149 /* If we saw an invalid designation sequence for a particular
150 register, we flag it here and switch to ASCII. The next time we
151 see a valid designation for this register, we turn off the flag
152 and do the designation normally, but pretend the sequence was
153 invalid. The effect of all this is that (most of the time) the
154 escape sequences for both the switch to the unknown charset, and
155 the switch back to the known charset, get inserted literally into
156 the buffer and saved out as such. The hope is that we can
157 preserve the escape sequences so that the resulting written out
158 file makes sense. If we don't do any of this, the designation
159 to the invalid charset will be preserved but that switch back
160 to the known charset will probably get eaten because it was
161 the same charset that was already present in the register. */
162 unsigned char invalid_designated[4];
164 /* We try to do similar things as above for direction-switching
165 sequences. If we encountered a direction switch while an
166 invalid designation was present, or an invalid designation
167 just after a direction switch (i.e. no valid designation
168 encountered yet), we insert the direction-switch escape
169 sequence literally into the output stream, and later on
170 insert the corresponding direction-restoring escape sequence
172 unsigned int switched_dir_and_no_valid_charset_yet :1;
173 unsigned int invalid_switch_dir :1;
175 /* Tells the decoder to output the escape sequence literally
176 even though it was valid. Used in the games we play to
177 avoid lossage when we encounter invalid designations. */
178 unsigned int output_literally :1;
179 /* We encountered a direction switch followed by an invalid
180 designation. We didn't output the direction switch
181 literally because we didn't know about the invalid designation;
182 but we have to do so now. */
183 unsigned int output_direction_sequence :1;
186 EXFUN (Fcopy_coding_system, 2);
188 struct detection_state;
191 text_encode_generic (Lstream *encoding, const Bufbyte *src,
192 unsigned_char_dynarr *dst, Lstream_data_count n);
194 static int detect_coding_sjis (struct detection_state *st,
195 const Extbyte *src, Lstream_data_count n);
196 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
197 unsigned_char_dynarr *dst, Lstream_data_count n);
198 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
199 unsigned_char_dynarr *dst, unsigned int *flags);
200 void char_finish_shift_jis (struct encoding_stream *str,
201 unsigned_char_dynarr *dst, unsigned int *flags);
203 static int detect_coding_big5 (struct detection_state *st,
204 const Extbyte *src, Lstream_data_count n);
205 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
206 unsigned_char_dynarr *dst, Lstream_data_count n);
207 void char_encode_big5 (struct encoding_stream *str, Emchar c,
208 unsigned_char_dynarr *dst, unsigned int *flags);
209 void char_finish_big5 (struct encoding_stream *str,
210 unsigned_char_dynarr *dst, unsigned int *flags);
212 static int detect_coding_ucs4 (struct detection_state *st,
213 const Extbyte *src, Lstream_data_count n);
214 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
215 unsigned_char_dynarr *dst, Lstream_data_count n);
216 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
217 unsigned_char_dynarr *dst, unsigned int *flags);
218 void char_finish_ucs4 (struct encoding_stream *str,
219 unsigned_char_dynarr *dst, unsigned int *flags);
221 static int detect_coding_utf16 (struct detection_state *st,
222 const Extbyte *src, Lstream_data_count n);
223 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
224 unsigned_char_dynarr *dst, Lstream_data_count n);
225 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
226 unsigned_char_dynarr *dst, unsigned int *flags);
227 void char_finish_utf16 (struct encoding_stream *str,
228 unsigned_char_dynarr *dst, unsigned int *flags);
230 static int detect_coding_utf8 (struct detection_state *st,
231 const Extbyte *src, Lstream_data_count n);
232 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
233 unsigned_char_dynarr *dst, Lstream_data_count n);
234 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
235 unsigned_char_dynarr *dst, unsigned int *flags);
236 void char_finish_utf8 (struct encoding_stream *str,
237 unsigned_char_dynarr *dst, unsigned int *flags);
239 static int postprocess_iso2022_mask (int mask);
240 static void reset_iso2022 (Lisp_Object coding_system,
241 struct iso2022_decoder *iso);
242 static int detect_coding_iso2022 (struct detection_state *st,
243 const Extbyte *src, Lstream_data_count n);
244 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
245 unsigned_char_dynarr *dst, Lstream_data_count n);
246 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
247 unsigned_char_dynarr *dst, unsigned int *flags);
248 void char_finish_iso2022 (struct encoding_stream *str,
249 unsigned_char_dynarr *dst, unsigned int *flags);
251 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
252 unsigned_char_dynarr *dst, Lstream_data_count n);
253 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
254 unsigned_char_dynarr *dst, Lstream_data_count n);
255 static void mule_decode (Lstream *decoding, const Extbyte *src,
256 unsigned_char_dynarr *dst, Lstream_data_count n);
257 static void mule_encode (Lstream *encoding, const Bufbyte *src,
258 unsigned_char_dynarr *dst, Lstream_data_count n);
260 typedef struct codesys_prop codesys_prop;
269 Dynarr_declare (codesys_prop);
270 } codesys_prop_dynarr;
272 static const struct lrecord_description codesys_prop_description_1[] = {
273 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
277 static const struct struct_description codesys_prop_description = {
278 sizeof (codesys_prop),
279 codesys_prop_description_1
282 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
283 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
287 static const struct struct_description codesys_prop_dynarr_description = {
288 sizeof (codesys_prop_dynarr),
289 codesys_prop_dynarr_description_1
292 codesys_prop_dynarr *the_codesys_prop_dynarr;
294 enum codesys_prop_enum
297 CODESYS_PROP_ISO2022,
302 /************************************************************************/
303 /* Coding system functions */
304 /************************************************************************/
306 static Lisp_Object mark_coding_system (Lisp_Object);
307 static void print_coding_system (Lisp_Object, Lisp_Object, int);
308 static void finalize_coding_system (void *header, int for_disksave);
311 static const struct lrecord_description ccs_description_1[] = {
312 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
313 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
317 static const struct struct_description ccs_description = {
318 sizeof (charset_conversion_spec),
322 static const struct lrecord_description ccsd_description_1[] = {
323 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
327 static const struct struct_description ccsd_description = {
328 sizeof (charset_conversion_spec_dynarr),
333 static const struct lrecord_description coding_system_description[] = {
334 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
343 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
344 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
345 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
346 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
347 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
349 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
355 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
356 mark_coding_system, print_coding_system,
357 finalize_coding_system,
358 0, 0, coding_system_description,
362 mark_coding_system (Lisp_Object obj)
364 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
366 mark_object (CODING_SYSTEM_NAME (codesys));
367 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
368 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
369 mark_object (CODING_SYSTEM_EOL_LF (codesys));
370 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
371 mark_object (CODING_SYSTEM_EOL_CR (codesys));
373 switch (CODING_SYSTEM_TYPE (codesys))
377 case CODESYS_ISO2022:
378 for (i = 0; i < 4; i++)
379 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
380 if (codesys->iso2022.input_conv)
382 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
384 struct charset_conversion_spec *ccs =
385 Dynarr_atp (codesys->iso2022.input_conv, i);
386 mark_object (ccs->from_charset);
387 mark_object (ccs->to_charset);
390 if (codesys->iso2022.output_conv)
392 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
394 struct charset_conversion_spec *ccs =
395 Dynarr_atp (codesys->iso2022.output_conv, i);
396 mark_object (ccs->from_charset);
397 mark_object (ccs->to_charset);
404 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
405 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
410 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
411 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
418 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
420 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
422 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
426 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
429 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
431 error ("printing unreadable object #<coding_system 0x%x>",
434 write_c_string ("#<coding_system ", printcharfun);
435 print_internal (c->name, printcharfun, 1);
436 write_c_string (">", printcharfun);
440 finalize_coding_system (void *header, int for_disksave)
442 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
443 /* Since coding systems never go away, this function is not
444 necessary. But it would be necessary if we changed things
445 so that coding systems could go away. */
446 if (!for_disksave) /* see comment in lstream.c */
448 switch (CODING_SYSTEM_TYPE (c))
451 case CODESYS_ISO2022:
452 if (c->iso2022.input_conv)
454 Dynarr_free (c->iso2022.input_conv);
455 c->iso2022.input_conv = 0;
457 if (c->iso2022.output_conv)
459 Dynarr_free (c->iso2022.output_conv);
460 c->iso2022.output_conv = 0;
471 symbol_to_eol_type (Lisp_Object symbol)
473 CHECK_SYMBOL (symbol);
474 if (NILP (symbol)) return EOL_AUTODETECT;
475 if (EQ (symbol, Qlf)) return EOL_LF;
476 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
477 if (EQ (symbol, Qcr)) return EOL_CR;
479 signal_simple_error ("Unrecognized eol type", symbol);
480 return EOL_AUTODETECT; /* not reached */
484 eol_type_to_symbol (eol_type_t type)
489 case EOL_LF: return Qlf;
490 case EOL_CRLF: return Qcrlf;
491 case EOL_CR: return Qcr;
492 case EOL_AUTODETECT: return Qnil;
497 setup_eol_coding_systems (Lisp_Coding_System *codesys)
499 Lisp_Object codesys_obj;
500 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
501 char *codesys_name = (char *) alloca (len + 7);
503 char *codesys_mnemonic=0;
505 Lisp_Object codesys_name_sym, sub_codesys_obj;
509 XSETCODING_SYSTEM (codesys_obj, codesys);
511 memcpy (codesys_name,
512 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
514 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
516 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
517 codesys_mnemonic = (char *) alloca (mlen + 7);
518 memcpy (codesys_mnemonic,
519 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
522 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
523 strcpy (codesys_name + len, "-" op_sys); \
525 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
526 codesys_name_sym = intern (codesys_name); \
527 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
528 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
530 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
531 build_string (codesys_mnemonic); \
532 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
535 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
536 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
537 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
540 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
541 Return t if OBJECT is a coding system.
542 A coding system is an object that defines how text containing multiple
543 character sets is encoded into a stream of (typically 8-bit) bytes.
544 The coding system is used to decode the stream into a series of
545 characters (which may be from multiple charsets) when the text is read
546 from a file or process, and is used to encode the text back into the
547 same format when it is written out to a file or process.
549 For example, many ISO2022-compliant coding systems (such as Compound
550 Text, which is used for inter-client data under the X Window System)
551 use escape sequences to switch between different charsets -- Japanese
552 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
553 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
554 `make-coding-system' for more information.
556 Coding systems are normally identified using a symbol, and the
557 symbol is accepted in place of the actual coding system object whenever
558 a coding system is called for. (This is similar to how faces work.)
562 return CODING_SYSTEMP (object) ? Qt : Qnil;
565 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
566 Retrieve the coding system of the given name.
568 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
569 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
570 If there is no such coding system, nil is returned. Otherwise the
571 associated coding system object is returned.
573 (coding_system_or_name))
575 if (NILP (coding_system_or_name))
576 coding_system_or_name = Qbinary;
577 else if (CODING_SYSTEMP (coding_system_or_name))
578 return coding_system_or_name;
580 CHECK_SYMBOL (coding_system_or_name);
584 coding_system_or_name =
585 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
587 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
588 return coding_system_or_name;
592 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
593 Retrieve the coding system of the given name.
594 Same as `find-coding-system' except that if there is no such
595 coding system, an error is signaled instead of returning nil.
599 Lisp_Object coding_system = Ffind_coding_system (name);
601 if (NILP (coding_system))
602 signal_simple_error ("No such coding system", name);
603 return coding_system;
606 /* We store the coding systems in hash tables with the names as the key and the
607 actual coding system object as the value. Occasionally we need to use them
608 in a list format. These routines provide us with that. */
609 struct coding_system_list_closure
611 Lisp_Object *coding_system_list;
615 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
616 void *coding_system_list_closure)
618 /* This function can GC */
619 struct coding_system_list_closure *cscl =
620 (struct coding_system_list_closure *) coding_system_list_closure;
621 Lisp_Object *coding_system_list = cscl->coding_system_list;
623 *coding_system_list = Fcons (key, *coding_system_list);
627 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
628 Return a list of the names of all defined coding systems.
632 Lisp_Object coding_system_list = Qnil;
634 struct coding_system_list_closure coding_system_list_closure;
636 GCPRO1 (coding_system_list);
637 coding_system_list_closure.coding_system_list = &coding_system_list;
638 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
639 &coding_system_list_closure);
642 return coding_system_list;
645 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
646 Return the name of the given coding system.
650 coding_system = Fget_coding_system (coding_system);
651 return XCODING_SYSTEM_NAME (coding_system);
654 static Lisp_Coding_System *
655 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
657 Lisp_Coding_System *codesys =
658 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
660 zero_lcrecord (codesys);
661 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
662 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
663 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
664 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
665 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
666 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
667 CODING_SYSTEM_TYPE (codesys) = type;
668 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
671 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
673 if (type == CODESYS_ISO2022)
676 for (i = 0; i < 4; i++)
677 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
680 if (type == CODESYS_UTF8)
682 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
684 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
686 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
688 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
691 else if (type == CODESYS_BIG5)
693 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
695 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
696 = Vcharset_chinese_big5;
697 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
699 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
703 else if (type == CODESYS_CCL)
705 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
706 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
709 CODING_SYSTEM_NAME (codesys) = name;
715 /* Given a list of charset conversion specs as specified in a Lisp
716 program, parse it into STORE_HERE. */
719 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
720 Lisp_Object spec_list)
724 EXTERNAL_LIST_LOOP (rest, spec_list)
726 Lisp_Object car = XCAR (rest);
727 Lisp_Object from, to;
728 struct charset_conversion_spec spec;
730 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
731 signal_simple_error ("Invalid charset conversion spec", car);
732 from = Fget_charset (XCAR (car));
733 to = Fget_charset (XCAR (XCDR (car)));
734 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
735 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
736 signal_simple_error_2
737 ("Attempted conversion between different charset types",
739 spec.from_charset = from;
740 spec.to_charset = to;
742 Dynarr_add (store_here, spec);
746 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
747 specs, return the equivalent as the Lisp programmer would see it.
749 If LOAD_HERE is 0, return Qnil. */
752 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
759 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
761 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
762 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
765 return Fnreverse (result);
770 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
771 Register symbol NAME as a coding system.
773 TYPE describes the conversion method used and should be one of
776 Automatic conversion. XEmacs attempts to detect the coding system
779 No conversion. Use this for binary files and such. On output,
780 graphic characters that are not in ASCII or Latin-1 will be
781 replaced by a ?. (For a no-conversion-encoded buffer, these
782 characters will only be present if you explicitly insert them.)
784 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
786 ISO 10646 UCS-4 encoding.
788 ISO 10646 UTF-8 encoding.
790 Any ISO2022-compliant encoding. Among other things, this includes
791 JIS (the Japanese encoding commonly used for e-mail), EUC (the
792 standard Unix encoding for Japanese and other languages), and
793 Compound Text (the encoding used in X11). You can specify more
794 specific information about the conversion with the PROPS argument.
796 Big5 (the encoding commonly used for Taiwanese).
798 The conversion is performed using a user-written pseudo-code
799 program. CCL (Code Conversion Language) is the name of this
802 Write out or read in the raw contents of the memory representing
803 the buffer's text. This is primarily useful for debugging
804 purposes, and is only enabled when XEmacs has been compiled with
805 DEBUG_XEMACS defined (via the --debug configure option).
806 WARNING: Reading in a file using 'internal conversion can result
807 in an internal inconsistency in the memory representing a
808 buffer's text, which will produce unpredictable results and may
809 cause XEmacs to crash. Under normal circumstances you should
810 never use 'internal conversion.
812 DOC-STRING is a string describing the coding system.
814 PROPS is a property list, describing the specific nature of the
815 character set. Recognized properties are:
818 String to be displayed in the modeline when this coding system is
822 End-of-line conversion to be used. It should be one of
825 Automatically detect the end-of-line type (LF, CRLF,
826 or CR). Also generate subsidiary coding systems named
827 `NAME-unix', `NAME-dos', and `NAME-mac', that are
828 identical to this coding system but have an EOL-TYPE
829 value of 'lf, 'crlf, and 'cr, respectively.
831 The end of a line is marked externally using ASCII LF.
832 Since this is also the way that XEmacs represents an
833 end-of-line internally, specifying this option results
834 in no end-of-line conversion. This is the standard
835 format for Unix text files.
837 The end of a line is marked externally using ASCII
838 CRLF. This is the standard format for MS-DOS text
841 The end of a line is marked externally using ASCII CR.
842 This is the standard format for Macintosh text files.
844 Automatically detect the end-of-line type but do not
845 generate subsidiary coding systems. (This value is
846 converted to nil when stored internally, and
847 `coding-system-property' will return nil.)
850 If non-nil, composition/decomposition for combining characters
853 'use-entity-reference
854 If non-nil, SGML style entity-reference is used for non-system-characters.
856 'post-read-conversion
857 Function called after a file has been read in, to perform the
858 decoding. Called with two arguments, START and END, denoting
859 a region of the current buffer to be decoded.
861 'pre-write-conversion
862 Function called before a file is written out, to perform the
863 encoding. Called with two arguments, START and END, denoting
864 a region of the current buffer to be encoded.
867 The following additional properties are recognized if TYPE is 'iso2022:
873 The character set initially designated to the G0 - G3 registers.
874 The value should be one of
876 -- A charset object (designate that character set)
877 -- nil (do not ever use this register)
878 -- t (no character set is initially designated to
879 the register, but may be later on; this automatically
880 sets the corresponding `force-g*-on-output' property)
886 If non-nil, send an explicit designation sequence on output before
887 using the specified register.
890 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
891 "ESC $ B" on output in place of the full designation sequences
892 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
895 If non-nil, don't designate ASCII to G0 at each end of line on output.
896 Setting this to non-nil also suppresses other state-resetting that
897 normally happens at the end of a line.
900 If non-nil, don't designate ASCII to G0 before control chars on output.
903 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
907 If non-nil, use locking-shift (SO/SI) instead of single-shift
908 or designation by escape sequence.
911 If non-nil, don't use ISO6429's direction specification.
914 If non-nil, literal control characters that are the same as
915 the beginning of a recognized ISO2022 or ISO6429 escape sequence
916 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
917 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
918 so that they can be properly distinguished from an escape sequence.
919 (Note that doing this results in a non-portable encoding.) This
920 encoding flag is used for byte-compiled files. Note that ESC
921 is a good choice for a quoting character because there are no
922 escape sequences whose second byte is a character from the Control-0
923 or Control-1 character sets; this is explicitly disallowed by the
926 'input-charset-conversion
927 A list of conversion specifications, specifying conversion of
928 characters in one charset to another when decoding is performed.
929 Each specification is a list of two elements: the source charset,
930 and the destination charset.
932 'output-charset-conversion
933 A list of conversion specifications, specifying conversion of
934 characters in one charset to another when encoding is performed.
935 The form of each specification is the same as for
936 'input-charset-conversion.
939 The following additional properties are recognized (and required)
943 CCL program used for decoding (converting to internal format).
946 CCL program used for encoding (converting to external format).
948 (name, type, doc_string, props))
950 Lisp_Coding_System *codesys;
951 enum coding_system_type ty;
952 int need_to_setup_eol_systems = 1;
954 /* Convert type to constant */
955 if (NILP (type) || EQ (type, Qundecided))
956 { ty = CODESYS_AUTODETECT; }
958 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
959 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
960 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
961 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
962 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
963 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
964 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
966 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
968 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
971 signal_simple_error ("Invalid coding system type", type);
975 codesys = allocate_coding_system (ty, name);
977 if (NILP (doc_string))
978 doc_string = build_string ("");
980 CHECK_STRING (doc_string);
981 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
984 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
986 if (EQ (key, Qmnemonic))
989 CHECK_STRING (value);
990 CODING_SYSTEM_MNEMONIC (codesys) = value;
993 else if (EQ (key, Qeol_type))
995 need_to_setup_eol_systems = NILP (value);
998 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1001 else if (EQ (key, Qpost_read_conversion))
1002 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1003 else if (EQ (key, Qpre_write_conversion))
1004 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1006 else if (EQ (key, Qdisable_composition))
1007 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1008 else if (EQ (key, Quse_entity_reference))
1009 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1012 else if (ty == CODESYS_ISO2022)
1014 #define FROB_INITIAL_CHARSET(charset_num) \
1015 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1016 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1018 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1019 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1020 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1021 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1023 #define FROB_FORCE_CHARSET(charset_num) \
1024 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1026 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1027 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1028 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1029 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1031 #define FROB_BOOLEAN_PROPERTY(prop) \
1032 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1034 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1035 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1036 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1037 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1038 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1039 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1040 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1042 else if (EQ (key, Qinput_charset_conversion))
1044 codesys->iso2022.input_conv =
1045 Dynarr_new (charset_conversion_spec);
1046 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1049 else if (EQ (key, Qoutput_charset_conversion))
1051 codesys->iso2022.output_conv =
1052 Dynarr_new (charset_conversion_spec);
1053 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1057 else if (EQ (key, Qccs_priority_list))
1059 codesys->ccs_priority_list = value;
1063 signal_simple_error ("Unrecognized property", key);
1066 else if (ty == CODESYS_UTF8)
1068 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1069 else if (EQ (key, Qcharset_g1))
1070 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1071 else if (EQ (key, Qcharset_g2))
1072 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1074 signal_simple_error ("Unrecognized property", key);
1076 else if (ty == CODESYS_BIG5)
1078 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1079 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1081 signal_simple_error ("Unrecognized property", key);
1084 else if (EQ (type, Qccl))
1087 struct ccl_program test_ccl;
1090 /* Check key first. */
1091 if (EQ (key, Qdecode))
1092 suffix = "-ccl-decode";
1093 else if (EQ (key, Qencode))
1094 suffix = "-ccl-encode";
1096 signal_simple_error ("Unrecognized property", key);
1098 /* If value is vector, register it as a ccl program
1099 associated with an newly created symbol for
1100 backward compatibility. */
1101 if (VECTORP (value))
1103 sym = Fintern (concat2 (Fsymbol_name (name),
1104 build_string (suffix)),
1106 Fregister_ccl_program (sym, value);
1110 CHECK_SYMBOL (value);
1113 /* check if the given ccl programs are valid. */
1114 if (setup_ccl_program (&test_ccl, sym) < 0)
1115 signal_simple_error ("Invalid CCL program", value);
1117 if (EQ (key, Qdecode))
1118 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1119 else if (EQ (key, Qencode))
1120 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1125 signal_simple_error ("Unrecognized property", key);
1129 if (need_to_setup_eol_systems)
1130 setup_eol_coding_systems (codesys);
1133 Lisp_Object codesys_obj;
1134 XSETCODING_SYSTEM (codesys_obj, codesys);
1135 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1140 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1141 Copy OLD-CODING-SYSTEM to NEW-NAME.
1142 If NEW-NAME does not name an existing coding system, a new one will
1145 (old_coding_system, new_name))
1147 Lisp_Object new_coding_system;
1148 old_coding_system = Fget_coding_system (old_coding_system);
1149 new_coding_system = Ffind_coding_system (new_name);
1150 if (NILP (new_coding_system))
1152 XSETCODING_SYSTEM (new_coding_system,
1153 allocate_coding_system
1154 (XCODING_SYSTEM_TYPE (old_coding_system),
1156 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1160 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1161 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1162 memcpy (((char *) to ) + sizeof (to->header),
1163 ((char *) from) + sizeof (from->header),
1164 sizeof (*from) - sizeof (from->header));
1165 to->name = new_name;
1167 return new_coding_system;
1170 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1171 Return t if OBJECT names a coding system, and is not a coding system alias.
1175 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1179 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1180 Return t if OBJECT is a coding system alias.
1181 All coding system aliases are created by `define-coding-system-alias'.
1185 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1189 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1190 Return the coding-system symbol for which symbol ALIAS is an alias.
1194 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1195 if (SYMBOLP (aliasee))
1198 signal_simple_error ("Symbol is not a coding system alias", alias);
1199 return Qnil; /* To keep the compiler happy */
1203 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1205 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1209 /* A maphash function, for removing dangling coding system aliases. */
1211 dangling_coding_system_alias_p (Lisp_Object alias,
1212 Lisp_Object aliasee,
1213 void *dangling_aliases)
1215 if (SYMBOLP (aliasee)
1216 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1218 (*(int *) dangling_aliases)++;
1225 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1226 Define symbol ALIAS as an alias for coding system ALIASEE.
1228 You can use this function to redefine an alias that has already been defined,
1229 but you cannot redefine a name which is the canonical name for a coding system.
1230 \(a canonical name of a coding system is what is returned when you call
1231 `coding-system-name' on a coding system).
1233 ALIASEE itself can be an alias, which allows you to define nested aliases.
1235 You are forbidden, however, from creating alias loops or `dangling' aliases.
1236 These will be detected, and an error will be signaled if you attempt to do so.
1238 If ALIASEE is nil, then ALIAS will simply be undefined.
1240 See also `coding-system-alias-p', `coding-system-aliasee',
1241 and `coding-system-canonical-name-p'.
1245 Lisp_Object real_coding_system, probe;
1247 CHECK_SYMBOL (alias);
1249 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1251 ("Symbol is the canonical name of a coding system and cannot be redefined",
1256 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1257 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1258 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1260 Fremhash (alias, Vcoding_system_hash_table);
1262 /* Undefine subsidiary aliases,
1263 presumably created by a previous call to this function */
1264 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1265 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1266 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1268 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1269 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1270 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1273 /* Undefine dangling coding system aliases. */
1275 int dangling_aliases;
1278 dangling_aliases = 0;
1279 elisp_map_remhash (dangling_coding_system_alias_p,
1280 Vcoding_system_hash_table,
1282 } while (dangling_aliases > 0);
1288 if (CODING_SYSTEMP (aliasee))
1289 aliasee = XCODING_SYSTEM_NAME (aliasee);
1291 /* Checks that aliasee names a coding-system */
1292 real_coding_system = Fget_coding_system (aliasee);
1294 /* Check for coding system alias loops */
1295 if (EQ (alias, aliasee))
1296 alias_loop: signal_simple_error_2
1297 ("Attempt to create a coding system alias loop", alias, aliasee);
1299 for (probe = aliasee;
1301 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1303 if (EQ (probe, alias))
1307 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1309 /* Set up aliases for subsidiaries.
1310 #### There must be a better way to handle subsidiary coding systems. */
1312 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1314 for (i = 0; i < countof (suffixes); i++)
1316 Lisp_Object alias_subsidiary =
1317 append_suffix_to_symbol (alias, suffixes[i]);
1318 Lisp_Object aliasee_subsidiary =
1319 append_suffix_to_symbol (aliasee, suffixes[i]);
1321 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1322 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1325 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1326 but it doesn't look intentional, so I'd rather return something
1327 meaningful or nothing at all. */
1332 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1334 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1335 Lisp_Object new_coding_system;
1337 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1338 return coding_system;
1342 case EOL_AUTODETECT: return coding_system;
1343 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1344 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1345 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1346 default: abort (); return Qnil;
1349 return NILP (new_coding_system) ? coding_system : new_coding_system;
1352 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1353 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1355 (coding_system, eol_type))
1357 coding_system = Fget_coding_system (coding_system);
1359 return subsidiary_coding_system (coding_system,
1360 symbol_to_eol_type (eol_type));
1364 /************************************************************************/
1365 /* Coding system accessors */
1366 /************************************************************************/
1368 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1369 Return the doc string for CODING-SYSTEM.
1373 coding_system = Fget_coding_system (coding_system);
1374 return XCODING_SYSTEM_DOC_STRING (coding_system);
1377 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1378 Return the type of CODING-SYSTEM.
1382 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1385 case CODESYS_AUTODETECT: return Qundecided;
1387 case CODESYS_SHIFT_JIS: return Qshift_jis;
1388 case CODESYS_ISO2022: return Qiso2022;
1389 case CODESYS_BIG5: return Qbig5;
1390 case CODESYS_UCS4: return Qucs4;
1391 case CODESYS_UTF16: return Qutf16;
1392 case CODESYS_UTF8: return Qutf8;
1393 case CODESYS_CCL: return Qccl;
1395 case CODESYS_NO_CONVERSION: return Qno_conversion;
1397 case CODESYS_INTERNAL: return Qinternal;
1404 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1407 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1409 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1412 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1413 Return initial charset of CODING-SYSTEM designated to GNUM.
1416 (coding_system, gnum))
1418 coding_system = Fget_coding_system (coding_system);
1421 return coding_system_charset (coding_system, XINT (gnum));
1425 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1426 Return the PROP property of CODING-SYSTEM.
1428 (coding_system, prop))
1431 enum coding_system_type type;
1433 coding_system = Fget_coding_system (coding_system);
1434 CHECK_SYMBOL (prop);
1435 type = XCODING_SYSTEM_TYPE (coding_system);
1437 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1438 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1441 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1443 case CODESYS_PROP_ALL_OK:
1446 case CODESYS_PROP_ISO2022:
1447 if (type != CODESYS_ISO2022)
1449 ("Property only valid in ISO2022 coding systems",
1453 case CODESYS_PROP_CCL:
1454 if (type != CODESYS_CCL)
1456 ("Property only valid in CCL coding systems",
1466 signal_simple_error ("Unrecognized property", prop);
1468 if (EQ (prop, Qname))
1469 return XCODING_SYSTEM_NAME (coding_system);
1470 else if (EQ (prop, Qtype))
1471 return Fcoding_system_type (coding_system);
1472 else if (EQ (prop, Qdoc_string))
1473 return XCODING_SYSTEM_DOC_STRING (coding_system);
1474 else if (EQ (prop, Qmnemonic))
1475 return XCODING_SYSTEM_MNEMONIC (coding_system);
1476 else if (EQ (prop, Qeol_type))
1477 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1478 else if (EQ (prop, Qeol_lf))
1479 return XCODING_SYSTEM_EOL_LF (coding_system);
1480 else if (EQ (prop, Qeol_crlf))
1481 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1482 else if (EQ (prop, Qeol_cr))
1483 return XCODING_SYSTEM_EOL_CR (coding_system);
1484 else if (EQ (prop, Qpost_read_conversion))
1485 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1486 else if (EQ (prop, Qpre_write_conversion))
1487 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1490 else if (EQ (prop, Qdisable_composition))
1491 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1492 else if (EQ (prop, Quse_entity_reference))
1493 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1494 else if (EQ (prop, Qccs_priority_list))
1495 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1497 else if (type == CODESYS_ISO2022)
1499 if (EQ (prop, Qcharset_g0))
1500 return coding_system_charset (coding_system, 0);
1501 else if (EQ (prop, Qcharset_g1))
1502 return coding_system_charset (coding_system, 1);
1503 else if (EQ (prop, Qcharset_g2))
1504 return coding_system_charset (coding_system, 2);
1505 else if (EQ (prop, Qcharset_g3))
1506 return coding_system_charset (coding_system, 3);
1508 #define FORCE_CHARSET(charset_num) \
1509 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1510 (coding_system, charset_num) ? Qt : Qnil)
1512 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1513 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1514 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1515 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1517 #define LISP_BOOLEAN(prop) \
1518 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1520 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1521 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1522 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1523 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1524 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1525 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1526 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1528 else if (EQ (prop, Qinput_charset_conversion))
1530 unparse_charset_conversion_specs
1531 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1532 else if (EQ (prop, Qoutput_charset_conversion))
1534 unparse_charset_conversion_specs
1535 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1539 else if (type == CODESYS_CCL)
1541 if (EQ (prop, Qdecode))
1542 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1543 else if (EQ (prop, Qencode))
1544 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1552 return Qnil; /* not reached */
1556 /************************************************************************/
1557 /* Coding category functions */
1558 /************************************************************************/
1561 decode_coding_category (Lisp_Object symbol)
1565 CHECK_SYMBOL (symbol);
1566 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1567 if (EQ (coding_category_symbol[i], symbol))
1570 signal_simple_error ("Unrecognized coding category", symbol);
1571 return 0; /* not reached */
1574 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1575 Return a list of all recognized coding categories.
1580 Lisp_Object list = Qnil;
1582 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1583 list = Fcons (coding_category_symbol[i], list);
1587 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1588 Change the priority order of the coding categories.
1589 LIST should be list of coding categories, in descending order of
1590 priority. Unspecified coding categories will be lower in priority
1591 than all specified ones, in the same relative order they were in
1596 int category_to_priority[CODING_CATEGORY_LAST];
1600 /* First generate a list that maps coding categories to priorities. */
1602 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1603 category_to_priority[i] = -1;
1605 /* Highest priority comes from the specified list. */
1607 EXTERNAL_LIST_LOOP (rest, list)
1609 int cat = decode_coding_category (XCAR (rest));
1611 if (category_to_priority[cat] >= 0)
1612 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1613 category_to_priority[cat] = i++;
1616 /* Now go through the existing categories by priority to retrieve
1617 the categories not yet specified and preserve their priority
1619 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1621 int cat = fcd->coding_category_by_priority[j];
1622 if (category_to_priority[cat] < 0)
1623 category_to_priority[cat] = i++;
1626 /* Now we need to construct the inverse of the mapping we just
1629 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1630 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1632 /* Phew! That was confusing. */
1636 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1637 Return a list of coding categories in descending order of priority.
1642 Lisp_Object list = Qnil;
1644 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1645 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1650 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1651 Change the coding system associated with a coding category.
1653 (coding_category, coding_system))
1655 int cat = decode_coding_category (coding_category);
1657 coding_system = Fget_coding_system (coding_system);
1658 fcd->coding_category_system[cat] = coding_system;
1662 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1663 Return the coding system associated with a coding category.
1667 int cat = decode_coding_category (coding_category);
1668 Lisp_Object sys = fcd->coding_category_system[cat];
1671 return XCODING_SYSTEM_NAME (sys);
1676 /************************************************************************/
1677 /* Detecting the encoding of data */
1678 /************************************************************************/
1680 struct detection_state
1682 eol_type_t eol_type;
1725 struct iso2022_decoder iso;
1727 int high_byte_count;
1728 unsigned int saw_single_shift:1;
1741 acceptable_control_char_p (int c)
1745 /* Allow and ignore control characters that you might
1746 reasonably see in a text file */
1751 case 8: /* backspace */
1752 case 11: /* vertical tab */
1753 case 12: /* form feed */
1754 case 26: /* MS-DOS C-z junk */
1755 case 31: /* '^_' -- for info */
1763 mask_has_at_most_one_bit_p (int mask)
1765 /* Perhaps the only thing useful you learn from intensive Microsoft
1766 technical interviews */
1767 return (mask & (mask - 1)) == 0;
1771 detect_eol_type (struct detection_state *st, const Extbyte *src,
1772 Lstream_data_count n)
1776 unsigned char c = *(unsigned char *)src++;
1779 if (st->eol.just_saw_cr)
1781 else if (st->eol.seen_anything)
1784 else if (st->eol.just_saw_cr)
1787 st->eol.just_saw_cr = 1;
1789 st->eol.just_saw_cr = 0;
1790 st->eol.seen_anything = 1;
1793 return EOL_AUTODETECT;
1796 /* Attempt to determine the encoding and EOL type of the given text.
1797 Before calling this function for the first type, you must initialize
1798 st->eol_type as appropriate and initialize st->mask to ~0.
1800 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1803 st->mask holds the determined coding category mask, or ~0 if only
1804 ASCII has been seen so far.
1808 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1809 is present in st->mask
1810 1 == definitive answers are here for both st->eol_type and st->mask
1814 detect_coding_type (struct detection_state *st, const Extbyte *src,
1815 Lstream_data_count n, int just_do_eol)
1817 if (st->eol_type == EOL_AUTODETECT)
1818 st->eol_type = detect_eol_type (st, src, n);
1821 return st->eol_type != EOL_AUTODETECT;
1823 if (!st->seen_non_ascii)
1825 for (; n; n--, src++)
1827 unsigned char c = *(unsigned char *) src;
1828 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1830 st->seen_non_ascii = 1;
1832 st->shift_jis.mask = ~0;
1835 st->utf16.mask = ~0;
1837 st->iso2022.mask = ~0;
1847 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1848 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1849 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1850 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1851 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1852 st->big5.mask = detect_coding_big5 (st, src, n);
1853 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1854 st->utf8.mask = detect_coding_utf8 (st, src, n);
1855 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1856 st->utf16.mask = detect_coding_utf16 (st, src, n);
1857 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1858 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1861 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1862 | st->utf8.mask | st->ucs4.mask;
1865 int retval = mask_has_at_most_one_bit_p (st->mask);
1866 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1867 return retval && st->eol_type != EOL_AUTODETECT;
1872 coding_system_from_mask (int mask)
1876 /* If the file was entirely or basically ASCII, use the
1877 default value of `buffer-file-coding-system'. */
1878 Lisp_Object retval =
1879 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1882 retval = Ffind_coding_system (retval);
1886 (Qbad_variable, Qwarning,
1887 "Invalid `default-buffer-file-coding-system', set to nil");
1888 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1892 retval = Fget_coding_system (Qraw_text);
1900 mask = postprocess_iso2022_mask (mask);
1902 /* Look through the coding categories by priority and find
1903 the first one that is allowed. */
1904 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1906 cat = fcd->coding_category_by_priority[i];
1907 if ((mask & (1 << cat)) &&
1908 !NILP (fcd->coding_category_system[cat]))
1912 return fcd->coding_category_system[cat];
1914 return Fget_coding_system (Qraw_text);
1918 /* Given a seekable read stream and potential coding system and EOL type
1919 as specified, do any autodetection that is called for. If the
1920 coding system and/or EOL type are not `autodetect', they will be left
1921 alone; but this function will never return an autodetect coding system
1924 This function does not automatically fetch subsidiary coding systems;
1925 that should be unnecessary with the explicit eol-type argument. */
1927 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1928 /* number of leading lines to check for a coding cookie */
1929 #define LINES_TO_CHECK 2
1932 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1933 eol_type_t *eol_type_in_out)
1935 struct detection_state decst;
1937 if (*eol_type_in_out == EOL_AUTODETECT)
1938 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1941 decst.eol_type = *eol_type_in_out;
1944 /* If autodetection is called for, do it now. */
1945 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1946 || *eol_type_in_out == EOL_AUTODETECT)
1949 Lisp_Object coding_system = Qnil;
1951 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1953 int lines_checked = 0;
1955 /* Look for initial "-*-"; mode line prefix */
1957 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1959 && lines_checked < LINES_TO_CHECK;
1961 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1963 Extbyte *local_vars_beg = p + 3;
1964 /* Look for final "-*-"; mode line suffix */
1965 for (p = local_vars_beg,
1966 scan_end = buf + nread - LENGTH ("-*-");
1968 && lines_checked < LINES_TO_CHECK;
1970 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1972 Extbyte *suffix = p;
1973 /* Look for "coding:" */
1974 for (p = local_vars_beg,
1975 scan_end = suffix - LENGTH ("coding:?");
1978 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1979 && (p == local_vars_beg
1980 || (*(p-1) == ' ' ||
1986 p += LENGTH ("coding:");
1987 while (*p == ' ' || *p == '\t') p++;
1989 /* Get coding system name */
1990 save = *suffix; *suffix = '\0';
1991 /* Characters valid in a MIME charset name (rfc 1521),
1992 and in a Lisp symbol name. */
1993 n = strspn ( (char *) p,
1994 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1995 "abcdefghijklmnopqrstuvwxyz"
2001 save = p[n]; p[n] = '\0';
2003 Ffind_coding_system (intern ((char *) p));
2010 /* #### file must use standard EOLs or we miss 2d line */
2011 /* #### not to mention this is broken for UTF-16 DOS files */
2012 else if (*p == '\n' || *p == '\r')
2015 /* skip past multibyte (DOS) newline */
2016 if (*p == '\r' && *(p+1) == '\n') p++;
2020 /* #### file must use standard EOLs or we miss 2d line */
2021 /* #### not to mention this is broken for UTF-16 DOS files */
2022 else if (*p == '\n' || *p == '\r')
2025 /* skip past multibyte (DOS) newline */
2026 if (*p == '\r' && *(p+1) == '\n') p++;
2029 if (NILP (coding_system))
2032 if (detect_coding_type (&decst, buf, nread,
2033 XCODING_SYSTEM_TYPE (*codesys_in_out)
2034 != CODESYS_AUTODETECT))
2036 nread = Lstream_read (stream, buf, sizeof (buf));
2042 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2043 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2046 if (detect_coding_type (&decst, buf, nread, 1))
2048 nread = Lstream_read (stream, buf, sizeof (buf));
2054 *eol_type_in_out = decst.eol_type;
2055 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2057 if (NILP (coding_system))
2058 *codesys_in_out = coding_system_from_mask (decst.mask);
2060 *codesys_in_out = coding_system;
2064 /* If we absolutely can't determine the EOL type, just assume LF. */
2065 if (*eol_type_in_out == EOL_AUTODETECT)
2066 *eol_type_in_out = EOL_LF;
2068 Lstream_rewind (stream);
2071 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2072 Detect coding system of the text in the region between START and END.
2073 Return a list of possible coding systems ordered by priority.
2074 If only ASCII characters are found, return 'undecided or one of
2075 its subsidiary coding systems according to a detected end-of-line
2076 type. Optional arg BUFFER defaults to the current buffer.
2078 (start, end, buffer))
2080 Lisp_Object val = Qnil;
2081 struct buffer *buf = decode_buffer (buffer, 0);
2083 Lisp_Object instream, lb_instream;
2084 Lstream *istr, *lb_istr;
2085 struct detection_state decst;
2086 struct gcpro gcpro1, gcpro2;
2088 get_buffer_range_char (buf, start, end, &b, &e, 0);
2089 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2090 lb_istr = XLSTREAM (lb_instream);
2091 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2092 istr = XLSTREAM (instream);
2093 GCPRO2 (instream, lb_instream);
2095 decst.eol_type = EOL_AUTODETECT;
2099 Extbyte random_buffer[4096];
2100 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2104 if (detect_coding_type (&decst, random_buffer, nread, 0))
2108 if (decst.mask == ~0)
2109 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2117 decst.mask = postprocess_iso2022_mask (decst.mask);
2119 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2121 int sys = fcd->coding_category_by_priority[i];
2122 if (decst.mask & (1 << sys))
2124 Lisp_Object codesys = fcd->coding_category_system[sys];
2125 if (!NILP (codesys))
2126 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2127 val = Fcons (codesys, val);
2131 Lstream_close (istr);
2133 Lstream_delete (istr);
2134 Lstream_delete (lb_istr);
2139 /************************************************************************/
2140 /* Converting to internal Mule format ("decoding") */
2141 /************************************************************************/
2143 /* A decoding stream is a stream used for decoding text (i.e.
2144 converting from some external format to internal format).
2145 The decoding-stream object keeps track of the actual coding
2146 stream, the stream that is at the other end, and data that
2147 needs to be persistent across the lifetime of the stream. */
2149 /* Handle the EOL stuff related to just-read-in character C.
2150 EOL_TYPE is the EOL type of the coding stream.
2151 FLAGS is the current value of FLAGS in the coding stream, and may
2152 be modified by this macro. (The macro only looks at the
2153 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2154 bytes are to be written. You need to also define a local goto
2155 label "label_continue_loop" that is at the end of the main
2156 character-reading loop.
2158 If C is a CR character, then this macro handles it entirely and
2159 jumps to label_continue_loop. Otherwise, this macro does not add
2160 anything to DST, and continues normally. You should continue
2161 processing C normally after this macro. */
2163 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2167 if (eol_type == EOL_CR) \
2168 Dynarr_add (dst, '\n'); \
2169 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2170 Dynarr_add (dst, c); \
2172 flags |= CODING_STATE_CR; \
2173 goto label_continue_loop; \
2175 else if (flags & CODING_STATE_CR) \
2176 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2178 Dynarr_add (dst, '\r'); \
2179 flags &= ~CODING_STATE_CR; \
2183 /* C should be a binary character in the range 0 - 255; convert
2184 to internal format and add to Dynarr DST. */
2187 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2189 if (BYTE_ASCII_P (c)) \
2190 Dynarr_add (dst, c); \
2193 Dynarr_add (dst, (c >> 6) | 0xc0); \
2194 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2198 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2200 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2204 Dynarr_add (dst, c);
2206 else if ( c <= 0x7ff )
2208 Dynarr_add (dst, (c >> 6) | 0xc0);
2209 Dynarr_add (dst, (c & 0x3f) | 0x80);
2211 else if ( c <= 0xffff )
2213 Dynarr_add (dst, (c >> 12) | 0xe0);
2214 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2215 Dynarr_add (dst, (c & 0x3f) | 0x80);
2217 else if ( c <= 0x1fffff )
2219 Dynarr_add (dst, (c >> 18) | 0xf0);
2220 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2221 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2222 Dynarr_add (dst, (c & 0x3f) | 0x80);
2224 else if ( c <= 0x3ffffff )
2226 Dynarr_add (dst, (c >> 24) | 0xf8);
2227 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2228 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2229 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2230 Dynarr_add (dst, (c & 0x3f) | 0x80);
2234 Dynarr_add (dst, (c >> 30) | 0xfc);
2235 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2236 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2237 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2238 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2239 Dynarr_add (dst, (c & 0x3f) | 0x80);
2243 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2245 if (BYTE_ASCII_P (c)) \
2246 Dynarr_add (dst, c); \
2247 else if (BYTE_C1_P (c)) \
2249 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2250 Dynarr_add (dst, c + 0x20); \
2254 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2255 Dynarr_add (dst, c); \
2260 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2264 DECODE_ADD_BINARY_CHAR (ch, dst); \
2269 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2271 if (flags & CODING_STATE_END) \
2273 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2274 if (flags & CODING_STATE_CR) \
2275 Dynarr_add (dst, '\r'); \
2279 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2281 #define ER_BUF_SIZE 24
2283 struct decoding_stream
2285 /* Coding system that governs the conversion. */
2286 Lisp_Coding_System *codesys;
2288 /* Stream that we read the encoded data from or
2289 write the decoded data to. */
2292 /* If we are reading, then we can return only a fixed amount of
2293 data, so if the conversion resulted in too much data, we store it
2294 here for retrieval the next time around. */
2295 unsigned_char_dynarr *runoff;
2297 /* FLAGS holds flags indicating the current state of the decoding.
2298 Some of these flags are dependent on the coding system. */
2301 /* CPOS holds a partially built-up code-point of character. */
2304 /* EOL_TYPE specifies the type of end-of-line conversion that
2305 currently applies. We need to keep this separate from the
2306 EOL type stored in CODESYS because the latter might indicate
2307 automatic EOL-type detection while the former will always
2308 indicate a particular EOL type. */
2309 eol_type_t eol_type;
2311 /* Additional ISO2022 information. We define the structure above
2312 because it's also needed by the detection routines. */
2313 struct iso2022_decoder iso2022;
2315 /* Additional information (the state of the running CCL program)
2316 used by the CCL decoder. */
2317 struct ccl_program ccl;
2319 /* counter for UTF-8 or UCS-4 */
2320 unsigned char counter;
2323 unsigned char er_counter;
2324 unsigned char er_buf[ER_BUF_SIZE];
2326 unsigned combined_char_count;
2327 Emchar combined_chars[16];
2328 Lisp_Object combining_table;
2330 struct detection_state decst;
2333 static Lstream_data_count decoding_reader (Lstream *stream,
2334 unsigned char *data, Lstream_data_count size);
2335 static Lstream_data_count decoding_writer (Lstream *stream,
2336 const unsigned char *data, Lstream_data_count size);
2337 static int decoding_rewinder (Lstream *stream);
2338 static int decoding_seekable_p (Lstream *stream);
2339 static int decoding_flusher (Lstream *stream);
2340 static int decoding_closer (Lstream *stream);
2342 static Lisp_Object decoding_marker (Lisp_Object stream);
2344 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2345 sizeof (struct decoding_stream));
2348 decoding_marker (Lisp_Object stream)
2350 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2351 Lisp_Object str_obj;
2353 /* We do not need to mark the coding systems or charsets stored
2354 within the stream because they are stored in a global list
2355 and automatically marked. */
2357 XSETLSTREAM (str_obj, str);
2358 mark_object (str_obj);
2359 if (str->imp->marker)
2360 return (str->imp->marker) (str_obj);
2365 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2366 so we read data from the other end, decode it, and store it into DATA. */
2368 static Lstream_data_count
2369 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2371 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2372 unsigned char *orig_data = data;
2373 Lstream_data_count read_size;
2374 int error_occurred = 0;
2376 /* We need to interface to mule_decode(), which expects to take some
2377 amount of data and store the result into a Dynarr. We have
2378 mule_decode() store into str->runoff, and take data from there
2381 /* We loop until we have enough data, reading chunks from the other
2382 end and decoding it. */
2385 /* Take data from the runoff if we can. Make sure to take at
2386 most SIZE bytes, and delete the data from the runoff. */
2387 if (Dynarr_length (str->runoff) > 0)
2389 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2390 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2391 Dynarr_delete_many (str->runoff, 0, chunk);
2397 break; /* No more room for data */
2399 if (str->flags & CODING_STATE_END)
2400 /* This means that on the previous iteration, we hit the EOF on
2401 the other end. We loop once more so that mule_decode() can
2402 output any final stuff it may be holding, or any "go back
2403 to a sane state" escape sequences. (This latter makes sense
2404 during encoding.) */
2407 /* Exhausted the runoff, so get some more. DATA has at least
2408 SIZE bytes left of storage in it, so it's OK to read directly
2409 into it. (We'll be overwriting above, after we've decoded it
2410 into the runoff.) */
2411 read_size = Lstream_read (str->other_end, data, size);
2418 /* There might be some more end data produced in the translation.
2419 See the comment above. */
2420 str->flags |= CODING_STATE_END;
2421 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2424 if (data - orig_data == 0)
2425 return error_occurred ? -1 : 0;
2427 return data - orig_data;
2430 static Lstream_data_count
2431 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2433 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2434 Lstream_data_count retval;
2436 /* Decode all our data into the runoff, and then attempt to write
2437 it all out to the other end. Remove whatever chunk we succeeded
2439 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2440 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2441 Dynarr_length (str->runoff));
2443 Dynarr_delete_many (str->runoff, 0, retval);
2444 /* Do NOT return retval. The return value indicates how much
2445 of the incoming data was written, not how many bytes were
2451 reset_decoding_stream (struct decoding_stream *str)
2454 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2456 Lisp_Object coding_system;
2457 XSETCODING_SYSTEM (coding_system, str->codesys);
2458 reset_iso2022 (coding_system, &str->iso2022);
2460 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2462 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2467 str->er_counter = 0;
2468 str->combined_char_count = 0;
2469 str->combining_table = Qnil;
2471 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2472 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2475 str->decst.eol_type = EOL_AUTODETECT;
2476 str->decst.mask = ~0;
2478 str->flags = str->cpos = 0;
2482 decoding_rewinder (Lstream *stream)
2484 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2485 reset_decoding_stream (str);
2486 Dynarr_reset (str->runoff);
2487 return Lstream_rewind (str->other_end);
2491 decoding_seekable_p (Lstream *stream)
2493 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2494 return Lstream_seekable_p (str->other_end);
2498 decoding_flusher (Lstream *stream)
2500 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2501 return Lstream_flush (str->other_end);
2505 decoding_closer (Lstream *stream)
2507 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2508 if (stream->flags & LSTREAM_FL_WRITE)
2510 str->flags |= CODING_STATE_END;
2511 decoding_writer (stream, 0, 0);
2513 Dynarr_free (str->runoff);
2515 #ifdef ENABLE_COMPOSITE_CHARS
2516 if (str->iso2022.composite_chars)
2517 Dynarr_free (str->iso2022.composite_chars);
2520 return Lstream_close (str->other_end);
2524 decoding_stream_coding_system (Lstream *stream)
2526 Lisp_Object coding_system;
2527 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2529 XSETCODING_SYSTEM (coding_system, str->codesys);
2530 return subsidiary_coding_system (coding_system, str->eol_type);
2534 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2536 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2537 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2539 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2540 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2541 reset_decoding_stream (str);
2544 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2545 stream for writing, no automatic code detection will be performed.
2546 The reason for this is that automatic code detection requires a
2547 seekable input. Things will also fail if you open a decoding
2548 stream for reading using a non-fully-specified coding system and
2549 a non-seekable input stream. */
2552 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2555 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2556 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2560 str->other_end = stream;
2561 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2562 str->eol_type = EOL_AUTODETECT;
2563 if (!strcmp (mode, "r")
2564 && Lstream_seekable_p (stream))
2565 /* We can determine the coding system now. */
2566 determine_real_coding_system (stream, &codesys, &str->eol_type);
2567 set_decoding_stream_coding_system (lstr, codesys);
2568 str->decst.eol_type = str->eol_type;
2569 str->decst.mask = ~0;
2570 XSETLSTREAM (obj, lstr);
2575 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2577 return make_decoding_stream_1 (stream, codesys, "r");
2581 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2583 return make_decoding_stream_1 (stream, codesys, "w");
2586 /* Note: the decode_coding_* functions all take the same
2587 arguments as mule_decode(), which is to say some SRC data of
2588 size N, which is to be stored into dynamic array DST.
2589 DECODING is the stream within which the decoding is
2590 taking place, but no data is actually read from or
2591 written to that stream; that is handled in decoding_reader()
2592 or decoding_writer(). This allows the same functions to
2593 be used for both reading and writing. */
2596 mule_decode (Lstream *decoding, const Extbyte *src,
2597 unsigned_char_dynarr *dst, Lstream_data_count n)
2599 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2601 /* If necessary, do encoding-detection now. We do this when
2602 we're a writing stream or a non-seekable reading stream,
2603 meaning that we can't just process the whole input,
2604 rewind, and start over. */
2606 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2607 str->eol_type == EOL_AUTODETECT)
2609 Lisp_Object codesys;
2611 XSETCODING_SYSTEM (codesys, str->codesys);
2612 detect_coding_type (&str->decst, src, n,
2613 CODING_SYSTEM_TYPE (str->codesys) !=
2614 CODESYS_AUTODETECT);
2615 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2616 str->decst.mask != ~0)
2617 /* #### This is cheesy. What we really ought to do is
2618 buffer up a certain amount of data so as to get a
2619 less random result. */
2620 codesys = coding_system_from_mask (str->decst.mask);
2621 str->eol_type = str->decst.eol_type;
2622 if (XCODING_SYSTEM (codesys) != str->codesys)
2624 /* Preserve the CODING_STATE_END flag in case it was set.
2625 If we erase it, bad things might happen. */
2626 int was_end = str->flags & CODING_STATE_END;
2627 set_decoding_stream_coding_system (decoding, codesys);
2629 str->flags |= CODING_STATE_END;
2633 switch (CODING_SYSTEM_TYPE (str->codesys))
2636 case CODESYS_INTERNAL:
2637 Dynarr_add_many (dst, src, n);
2640 case CODESYS_AUTODETECT:
2641 /* If we got this far and still haven't decided on the coding
2642 system, then do no conversion. */
2643 case CODESYS_NO_CONVERSION:
2644 decode_coding_no_conversion (decoding, src, dst, n);
2647 case CODESYS_SHIFT_JIS:
2648 decode_coding_sjis (decoding, src, dst, n);
2651 decode_coding_big5 (decoding, src, dst, n);
2654 decode_coding_ucs4 (decoding, src, dst, n);
2657 decode_coding_utf16 (decoding, src, dst, n);
2660 decode_coding_utf8 (decoding, src, dst, n);
2663 str->ccl.last_block = str->flags & CODING_STATE_END;
2664 /* When applying ccl program to stream, MUST NOT set NULL
2666 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2667 dst, n, 0, CCL_MODE_DECODING);
2669 case CODESYS_ISO2022:
2670 decode_coding_iso2022 (decoding, src, dst, n);
2678 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2679 Decode the text between START and END which is encoded in CODING-SYSTEM.
2680 This is useful if you've read in encoded text from a file without decoding
2681 it (e.g. you read in a JIS-formatted file but used the `binary' or
2682 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2683 Return length of decoded text.
2684 BUFFER defaults to the current buffer if unspecified.
2686 (start, end, coding_system, buffer))
2689 struct buffer *buf = decode_buffer (buffer, 0);
2690 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2691 Lstream *istr, *ostr;
2692 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2694 get_buffer_range_char (buf, start, end, &b, &e, 0);
2696 barf_if_buffer_read_only (buf, b, e);
2698 coding_system = Fget_coding_system (coding_system);
2699 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2700 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2701 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2703 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2704 Fget_coding_system (Qbinary));
2705 istr = XLSTREAM (instream);
2706 ostr = XLSTREAM (outstream);
2707 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2709 /* The chain of streams looks like this:
2711 [BUFFER] <----- send through
2712 ------> [ENCODE AS BINARY]
2713 ------> [DECODE AS SPECIFIED]
2719 char tempbuf[1024]; /* some random amount */
2720 Bufpos newpos, even_newer_pos;
2721 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2722 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2726 newpos = lisp_buffer_stream_startpos (istr);
2727 Lstream_write (ostr, tempbuf, size_in_bytes);
2728 even_newer_pos = lisp_buffer_stream_startpos (istr);
2729 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2732 Lstream_close (istr);
2733 Lstream_close (ostr);
2735 Lstream_delete (istr);
2736 Lstream_delete (ostr);
2737 Lstream_delete (XLSTREAM (de_outstream));
2738 Lstream_delete (XLSTREAM (lb_outstream));
2743 /************************************************************************/
2744 /* Converting to an external encoding ("encoding") */
2745 /************************************************************************/
2747 /* An encoding stream is an output stream. When you create the
2748 stream, you specify the coding system that governs the encoding
2749 and another stream that the resulting encoded data is to be
2750 sent to, and then start sending data to it. */
2752 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2754 struct encoding_stream
2756 /* Coding system that governs the conversion. */
2757 Lisp_Coding_System *codesys;
2759 /* Stream that we read the encoded data from or
2760 write the decoded data to. */
2763 /* If we are reading, then we can return only a fixed amount of
2764 data, so if the conversion resulted in too much data, we store it
2765 here for retrieval the next time around. */
2766 unsigned_char_dynarr *runoff;
2768 /* FLAGS holds flags indicating the current state of the encoding.
2769 Some of these flags are dependent on the coding system. */
2772 /* CH holds a partially built-up character. Since we only deal
2773 with one- and two-byte characters at the moment, we only use
2774 this to store the first byte of a two-byte character. */
2777 /* Additional information used by the ISO2022 encoder. */
2780 /* CHARSET holds the character sets currently assigned to the G0
2781 through G3 registers. It is initialized from the array
2782 INITIAL_CHARSET in CODESYS. */
2783 Lisp_Object charset[4];
2785 /* Which registers are currently invoked into the left (GL) and
2786 right (GR) halves of the 8-bit encoding space? */
2787 int register_left, register_right;
2789 /* Whether we need to explicitly designate the charset in the
2790 G? register before using it. It is initialized from the
2791 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2792 unsigned char force_charset_on_output[4];
2794 /* Other state variables that need to be preserved across
2796 Lisp_Object current_charset;
2798 int current_char_boundary;
2801 void (*encode_char) (struct encoding_stream *str, Emchar c,
2802 unsigned_char_dynarr *dst, unsigned int *flags);
2803 void (*finish) (struct encoding_stream *str,
2804 unsigned_char_dynarr *dst, unsigned int *flags);
2806 /* Additional information (the state of the running CCL program)
2807 used by the CCL encoder. */
2808 struct ccl_program ccl;
2812 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2813 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2814 Lstream_data_count size);
2815 static int encoding_rewinder (Lstream *stream);
2816 static int encoding_seekable_p (Lstream *stream);
2817 static int encoding_flusher (Lstream *stream);
2818 static int encoding_closer (Lstream *stream);
2820 static Lisp_Object encoding_marker (Lisp_Object stream);
2822 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2823 sizeof (struct encoding_stream));
2826 encoding_marker (Lisp_Object stream)
2828 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2829 Lisp_Object str_obj;
2831 /* We do not need to mark the coding systems or charsets stored
2832 within the stream because they are stored in a global list
2833 and automatically marked. */
2835 XSETLSTREAM (str_obj, str);
2836 mark_object (str_obj);
2837 if (str->imp->marker)
2838 return (str->imp->marker) (str_obj);
2843 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2844 so we read data from the other end, encode it, and store it into DATA. */
2846 static Lstream_data_count
2847 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2849 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2850 unsigned char *orig_data = data;
2851 Lstream_data_count read_size;
2852 int error_occurred = 0;
2854 /* We need to interface to mule_encode(), which expects to take some
2855 amount of data and store the result into a Dynarr. We have
2856 mule_encode() store into str->runoff, and take data from there
2859 /* We loop until we have enough data, reading chunks from the other
2860 end and encoding it. */
2863 /* Take data from the runoff if we can. Make sure to take at
2864 most SIZE bytes, and delete the data from the runoff. */
2865 if (Dynarr_length (str->runoff) > 0)
2867 int chunk = min ((int) size, Dynarr_length (str->runoff));
2868 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2869 Dynarr_delete_many (str->runoff, 0, chunk);
2875 break; /* No more room for data */
2877 if (str->flags & CODING_STATE_END)
2878 /* This means that on the previous iteration, we hit the EOF on
2879 the other end. We loop once more so that mule_encode() can
2880 output any final stuff it may be holding, or any "go back
2881 to a sane state" escape sequences. (This latter makes sense
2882 during encoding.) */
2885 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2886 left of storage in it, so it's OK to read directly into it.
2887 (We'll be overwriting above, after we've encoded it into the
2889 read_size = Lstream_read (str->other_end, data, size);
2896 /* There might be some more end data produced in the translation.
2897 See the comment above. */
2898 str->flags |= CODING_STATE_END;
2899 mule_encode (stream, data, str->runoff, read_size);
2902 if (data == orig_data)
2903 return error_occurred ? -1 : 0;
2905 return data - orig_data;
2908 static Lstream_data_count
2909 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2911 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2912 Lstream_data_count retval;
2914 /* Encode all our data into the runoff, and then attempt to write
2915 it all out to the other end. Remove whatever chunk we succeeded
2917 mule_encode (stream, data, str->runoff, size);
2918 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2919 Dynarr_length (str->runoff));
2921 Dynarr_delete_many (str->runoff, 0, retval);
2922 /* Do NOT return retval. The return value indicates how much
2923 of the incoming data was written, not how many bytes were
2929 reset_encoding_stream (struct encoding_stream *str)
2932 switch (CODING_SYSTEM_TYPE (str->codesys))
2934 case CODESYS_ISO2022:
2938 str->encode_char = &char_encode_iso2022;
2939 str->finish = &char_finish_iso2022;
2940 for (i = 0; i < 4; i++)
2942 str->iso2022.charset[i] =
2943 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2944 str->iso2022.force_charset_on_output[i] =
2945 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2947 str->iso2022.register_left = 0;
2948 str->iso2022.register_right = 1;
2949 str->iso2022.current_charset = Qnil;
2950 str->iso2022.current_half = 0;
2954 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2957 str->encode_char = &char_encode_utf8;
2958 str->finish = &char_finish_utf8;
2961 str->encode_char = &char_encode_utf16;
2962 str->finish = &char_finish_utf16;
2965 str->encode_char = &char_encode_ucs4;
2966 str->finish = &char_finish_ucs4;
2968 case CODESYS_SHIFT_JIS:
2969 str->encode_char = &char_encode_shift_jis;
2970 str->finish = &char_finish_shift_jis;
2973 str->encode_char = &char_encode_big5;
2974 str->finish = &char_finish_big5;
2980 str->iso2022.current_char_boundary = 0;
2981 str->flags = str->ch = 0;
2985 encoding_rewinder (Lstream *stream)
2987 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2988 reset_encoding_stream (str);
2989 Dynarr_reset (str->runoff);
2990 return Lstream_rewind (str->other_end);
2994 encoding_seekable_p (Lstream *stream)
2996 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2997 return Lstream_seekable_p (str->other_end);
3001 encoding_flusher (Lstream *stream)
3003 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3004 return Lstream_flush (str->other_end);
3008 encoding_closer (Lstream *stream)
3010 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3011 if (stream->flags & LSTREAM_FL_WRITE)
3013 str->flags |= CODING_STATE_END;
3014 encoding_writer (stream, 0, 0);
3016 Dynarr_free (str->runoff);
3017 return Lstream_close (str->other_end);
3021 encoding_stream_coding_system (Lstream *stream)
3023 Lisp_Object coding_system;
3024 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3026 XSETCODING_SYSTEM (coding_system, str->codesys);
3027 return coding_system;
3031 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3033 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3034 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3036 reset_encoding_stream (str);
3040 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3043 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3044 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3048 str->runoff = Dynarr_new (unsigned_char);
3049 str->other_end = stream;
3050 set_encoding_stream_coding_system (lstr, codesys);
3051 XSETLSTREAM (obj, lstr);
3056 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3058 return make_encoding_stream_1 (stream, codesys, "r");
3062 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3064 return make_encoding_stream_1 (stream, codesys, "w");
3067 /* Convert N bytes of internally-formatted data stored in SRC to an
3068 external format, according to the encoding stream ENCODING.
3069 Store the encoded data into DST. */
3072 mule_encode (Lstream *encoding, const Bufbyte *src,
3073 unsigned_char_dynarr *dst, Lstream_data_count n)
3075 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3077 switch (CODING_SYSTEM_TYPE (str->codesys))
3080 case CODESYS_INTERNAL:
3081 Dynarr_add_many (dst, src, n);
3084 case CODESYS_AUTODETECT:
3085 /* If we got this far and still haven't decided on the coding
3086 system, then do no conversion. */
3087 case CODESYS_NO_CONVERSION:
3088 encode_coding_no_conversion (encoding, src, dst, n);
3092 str->ccl.last_block = str->flags & CODING_STATE_END;
3093 /* When applying ccl program to stream, MUST NOT set NULL
3095 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3096 dst, n, 0, CCL_MODE_ENCODING);
3100 text_encode_generic (encoding, src, dst, n);
3104 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3105 Encode the text between START and END using CODING-SYSTEM.
3106 This will, for example, convert Japanese characters into stuff such as
3107 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3108 text. BUFFER defaults to the current buffer if unspecified.
3110 (start, end, coding_system, buffer))
3113 struct buffer *buf = decode_buffer (buffer, 0);
3114 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3115 Lstream *istr, *ostr;
3116 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3118 get_buffer_range_char (buf, start, end, &b, &e, 0);
3120 barf_if_buffer_read_only (buf, b, e);
3122 coding_system = Fget_coding_system (coding_system);
3123 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3124 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3125 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3126 Fget_coding_system (Qbinary));
3127 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3129 istr = XLSTREAM (instream);
3130 ostr = XLSTREAM (outstream);
3131 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3132 /* The chain of streams looks like this:
3134 [BUFFER] <----- send through
3135 ------> [ENCODE AS SPECIFIED]
3136 ------> [DECODE AS BINARY]
3141 char tempbuf[1024]; /* some random amount */
3142 Bufpos newpos, even_newer_pos;
3143 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3144 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3148 newpos = lisp_buffer_stream_startpos (istr);
3149 Lstream_write (ostr, tempbuf, size_in_bytes);
3150 even_newer_pos = lisp_buffer_stream_startpos (istr);
3151 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3157 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3158 Lstream_close (istr);
3159 Lstream_close (ostr);
3161 Lstream_delete (istr);
3162 Lstream_delete (ostr);
3163 Lstream_delete (XLSTREAM (de_outstream));
3164 Lstream_delete (XLSTREAM (lb_outstream));
3165 return make_int (retlen);
3172 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3173 unsigned_char_dynarr *dst, Lstream_data_count n)
3176 unsigned char char_boundary;
3177 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3178 unsigned int flags = str->flags;
3179 Emchar ch = str->ch;
3181 char_boundary = str->iso2022.current_char_boundary;
3187 if (char_boundary == 0)
3215 (*str->encode_char) (str, c, dst, &flags);
3217 else if (char_boundary == 1)
3219 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3225 ch = (ch << 6) | (c & 0x3f);
3230 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3232 (*str->finish) (str, dst, &flags);
3237 str->iso2022.current_char_boundary = char_boundary;
3242 /************************************************************************/
3243 /* entity reference */
3244 /************************************************************************/
3247 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3249 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3251 if ( str->er_counter > 0)
3253 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3254 str->er_counter = 0;
3258 EXFUN (Fregexp_quote, 1);
3260 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3261 unsigned_char_dynarr* dst);
3263 decode_add_er_char (struct decoding_stream *str, Emchar c,
3264 unsigned_char_dynarr* dst)
3266 if (str->er_counter == 0)
3268 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3271 str->er_buf[0] = '&';
3275 DECODE_ADD_UCS_CHAR (c, dst);
3279 Lisp_Object string = make_string (str->er_buf,
3286 Lisp_Object char_type;
3289 for ( rest = Vcoded_charset_entity_reference_alist;
3290 !NILP (rest); rest = Fcdr (rest) )
3296 char_type = XCDR (ccs);
3301 if (NILP (ccs = Ffind_charset (ccs)))
3310 pat = Fregexp_quote (pat);
3317 pat = concat3 (build_string ("^&"),
3318 pat, build_string ("\\([0-9]+\\)$"));
3321 else if (EQ (ret, Qx))
3323 pat = concat3 (build_string ("^&"),
3324 pat, build_string ("\\([0-9a-f]+\\)$"));
3327 else if (EQ (ret, QX))
3329 pat = concat3 (build_string ("^&"),
3330 pat, build_string ("\\([0-9A-F]+\\)$"));
3336 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3339 = XINT (Fstring_to_number
3340 (Fsubstring (string,
3341 Fmatch_beginning (make_int (1)),
3342 Fmatch_end (make_int (1))),
3346 ? DECODE_CHAR (ccs, code, 0)
3347 : decode_builtin_char (ccs, code);
3349 DECODE_ADD_UCS_CHAR (chr, dst);
3353 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3354 string, Qnil, Qnil)))
3357 = XUINT (Fstring_to_number
3358 (Fsubstring (string,
3359 Fmatch_beginning (make_int (1)),
3360 Fmatch_end (make_int (1))),
3363 DECODE_ADD_UCS_CHAR (code, dst);
3367 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3368 Dynarr_add (dst, ';');
3371 str->er_counter = 0;
3373 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3375 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3376 str->er_counter = 0;
3377 DECODE_ADD_UCS_CHAR (c, dst);
3380 str->er_buf[str->er_counter++] = c;
3383 void char_encode_as_entity_reference (Emchar ch, char* buf);
3385 char_encode_as_entity_reference (Emchar ch, char* buf)
3387 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3390 Lisp_Object char_type;
3391 int format_columns, idx;
3392 char format[ER_BUF_SIZE];
3394 while (!NILP (rest))
3400 char_type = XCDR (ccs);
3405 if (!NILP (ccs = Ffind_charset (ccs)))
3407 int code_point = charset_code_point (ccs, ch, 0);
3409 if ( (code_point >= 0)
3410 && (NILP (char_type)
3411 || DECODE_CHAR (ccs, code_point, 0) != ch) )
3417 if ( STRINGP (ret) &&
3418 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3421 strncpy (&format[1], XSTRING_DATA (ret), idx);
3431 format[idx++] = '%';
3432 format_columns = XINT (ret);
3433 if ( (2 <= format_columns) && (format_columns <= 8)
3434 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3436 format [idx++] = '0';
3437 format [idx++] = '0' + format_columns;
3446 format [idx++] = 'd';
3447 else if (EQ (ret, Qx))
3448 format [idx++] = 'x';
3449 else if (EQ (ret, QX))
3450 format [idx++] = 'X';
3453 format [idx++] = ';';
3456 sprintf (buf, format, code_point);
3463 sprintf (buf, "&MCS-%08X;", ch);
3467 /************************************************************************/
3468 /* character composition */
3469 /************************************************************************/
3470 extern Lisp_Object Qcomposition;
3473 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3475 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3479 for (i = 0; i < str->combined_char_count; i++)
3480 decode_add_er_char (str, str->combined_chars[i], dst);
3481 str->combined_char_count = 0;
3482 str->combining_table = Qnil;
3485 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3486 unsigned_char_dynarr* dst);
3488 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3489 Emchar character, unsigned_char_dynarr* dst)
3491 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3492 decode_add_er_char (str, character, dst);
3493 else if (!CONSP (str->combining_table))
3496 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3500 decode_add_er_char (str, character, dst);
3503 str->combined_chars[0] = character;
3504 str->combined_char_count = 1;
3505 str->combining_table = ret;
3511 = Fcdr (Fassq (make_char (character), str->combining_table));
3515 Emchar char2 = XCHARVAL (ret);
3516 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3521 decode_add_er_char (str, char2, dst);
3522 str->combined_char_count = 0;
3523 str->combining_table = Qnil;
3527 str->combined_chars[0] = char2;
3528 str->combined_char_count = 1;
3529 str->combining_table = ret2;
3534 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3537 COMPOSE_FLUSH_CHARS (str, dst);
3539 decode_add_er_char (str, character, dst);
3542 str->combined_chars[0] = character;
3543 str->combined_char_count = 1;
3544 str->combining_table = ret;
3549 #else /* not UTF2000 */
3550 #define COMPOSE_FLUSH_CHARS(str, dst)
3551 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3552 #endif /* UTF2000 */
3555 /************************************************************************/
3556 /* Shift-JIS methods */
3557 /************************************************************************/
3559 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3560 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3561 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3562 encoded by "position-code + 0x80". A character of JISX0208
3563 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3564 position-codes are divided and shifted so that it fit in the range
3567 --- CODE RANGE of Shift-JIS ---
3568 (character set) (range)
3570 JISX0201-Kana 0xA0 .. 0xDF
3571 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3572 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3573 -------------------------------
3577 /* Is this the first byte of a Shift-JIS two-byte char? */
3579 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3580 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3582 /* Is this the second byte of a Shift-JIS two-byte char? */
3584 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3585 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3587 #define BYTE_SJIS_KATAKANA_P(c) \
3588 ((c) >= 0xA1 && (c) <= 0xDF)
3591 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3595 unsigned char c = *(unsigned char *)src++;
3596 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3598 if (st->shift_jis.in_second_byte)
3600 st->shift_jis.in_second_byte = 0;
3604 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3605 st->shift_jis.in_second_byte = 1;
3607 return CODING_CATEGORY_SHIFT_JIS_MASK;
3610 /* Convert Shift-JIS data to internal format. */
3613 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3614 unsigned_char_dynarr *dst, Lstream_data_count n)
3616 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3617 unsigned int flags = str->flags;
3618 unsigned int cpos = str->cpos;
3619 eol_type_t eol_type = str->eol_type;
3623 unsigned char c = *(unsigned char *)src++;
3627 /* Previous character was first byte of Shift-JIS Kanji char. */
3628 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3630 unsigned char e1, e2;
3632 DECODE_SJIS (cpos, c, e1, e2);
3634 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3638 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3639 Dynarr_add (dst, e1);
3640 Dynarr_add (dst, e2);
3645 DECODE_ADD_BINARY_CHAR (cpos, dst);
3646 DECODE_ADD_BINARY_CHAR (c, dst);
3652 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3653 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3655 else if (BYTE_SJIS_KATAKANA_P (c))
3658 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3661 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3662 Dynarr_add (dst, c);
3667 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3671 DECODE_ADD_BINARY_CHAR (c, dst);
3673 label_continue_loop:;
3676 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3682 /* Convert internal character representation to Shift_JIS. */
3685 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3686 unsigned_char_dynarr *dst, unsigned int *flags)
3688 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3692 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3693 Dynarr_add (dst, '\r');
3694 if (eol_type != EOL_CR)
3695 Dynarr_add (dst, ch);
3699 unsigned int s1, s2;
3701 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3703 if (code_point >= 0)
3704 Dynarr_add (dst, code_point);
3705 else if ((code_point
3706 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3709 ENCODE_SJIS ((code_point >> 8) | 0x80,
3710 (code_point & 0xFF) | 0x80, s1, s2);
3711 Dynarr_add (dst, s1);
3712 Dynarr_add (dst, s2);
3714 else if ((code_point
3715 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3717 Dynarr_add (dst, code_point | 0x80);
3718 else if ((code_point
3719 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3722 ENCODE_SJIS ((code_point >> 8) | 0x80,
3723 (code_point & 0xFF) | 0x80, s1, s2);
3724 Dynarr_add (dst, s1);
3725 Dynarr_add (dst, s2);
3727 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3729 Dynarr_add (dst, code_point);
3731 Dynarr_add (dst, '?');
3733 Lisp_Object charset;
3734 unsigned int c1, c2;
3736 BREAKUP_CHAR (ch, charset, c1, c2);
3738 if (EQ(charset, Vcharset_katakana_jisx0201))
3740 Dynarr_add (dst, c1 | 0x80);
3744 Dynarr_add (dst, c1);
3746 else if (EQ(charset, Vcharset_japanese_jisx0208))
3748 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3749 Dynarr_add (dst, s1);
3750 Dynarr_add (dst, s2);
3753 Dynarr_add (dst, '?');
3759 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3760 unsigned int *flags)
3764 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3765 Decode a JISX0208 character of Shift-JIS coding-system.
3766 CODE is the character code in Shift-JIS as a cons of type bytes.
3767 Return the corresponding character.
3771 unsigned char c1, c2, s1, s2;
3774 CHECK_INT (XCAR (code));
3775 CHECK_INT (XCDR (code));
3776 s1 = XINT (XCAR (code));
3777 s2 = XINT (XCDR (code));
3778 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3779 BYTE_SJIS_TWO_BYTE_2_P (s2))
3781 DECODE_SJIS (s1, s2, c1, c2);
3782 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3783 c1 & 0x7F, c2 & 0x7F));
3789 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3790 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3791 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3795 Lisp_Object charset;
3798 CHECK_CHAR_COERCE_INT (character);
3799 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3800 if (EQ (charset, Vcharset_japanese_jisx0208))
3802 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3803 return Fcons (make_int (s1), make_int (s2));
3810 /************************************************************************/
3812 /************************************************************************/
3814 /* BIG5 is a coding system encoding two character sets: ASCII and
3815 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3816 character set and is encoded in two-byte.
3818 --- CODE RANGE of BIG5 ---
3819 (character set) (range)
3821 Big5 (1st byte) 0xA1 .. 0xFE
3822 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3823 --------------------------
3825 Since the number of characters in Big5 is larger than maximum
3826 characters in Emacs' charset (96x96), it can't be handled as one
3827 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3828 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3829 contains frequently used characters and the latter contains less
3830 frequently used characters. */
3833 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3834 ((c) >= 0x81 && (c) <= 0xFE)
3836 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3837 ((c) >= 0xA1 && (c) <= 0xFE)
3840 /* Is this the second byte of a Shift-JIS two-byte char? */
3842 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3843 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3845 /* Number of Big5 characters which have the same code in 1st byte. */
3847 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3849 /* Code conversion macros. These are macros because they are used in
3850 inner loops during code conversion.
3852 Note that temporary variables in macros introduce the classic
3853 dynamic-scoping problems with variable names. We use capital-
3854 lettered variables in the assumption that XEmacs does not use
3855 capital letters in variables except in a very formalized way
3858 /* Convert Big5 code (b1, b2) into its internal string representation
3861 /* There is a much simpler way to split the Big5 charset into two.
3862 For the moment I'm going to leave the algorithm as-is because it
3863 claims to separate out the most-used characters into a single
3864 charset, which perhaps will lead to optimizations in various
3867 The way the algorithm works is something like this:
3869 Big5 can be viewed as a 94x157 charset, where the row is
3870 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3871 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3872 the split between low and high column numbers is apparently
3873 meaningless; ascending rows produce less and less frequent chars.
3874 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3875 the first charset, and the upper half (0xC9 .. 0xFE) to the
3876 second. To do the conversion, we convert the character into
3877 a single number where 0 .. 156 is the first row, 157 .. 313
3878 is the second, etc. That way, the characters are ordered by
3879 decreasing frequency. Then we just chop the space in two
3880 and coerce the result into a 94x94 space.
3883 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3885 int B1 = b1, B2 = b2; \
3887 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3891 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3895 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3896 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3898 c1 = I / (0xFF - 0xA1) + 0xA1; \
3899 c2 = I % (0xFF - 0xA1) + 0xA1; \
3902 /* Convert the internal string representation of a Big5 character
3903 (lb, c1, c2) into Big5 code (b1, b2). */
3905 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3907 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3909 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3911 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3913 b1 = I / BIG5_SAME_ROW + 0xA1; \
3914 b2 = I % BIG5_SAME_ROW; \
3915 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3919 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3923 unsigned char c = *(unsigned char *)src++;
3924 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3926 || (c >= 0x80 && c <= 0xA0)
3930 if (st->big5.in_second_byte)
3932 st->big5.in_second_byte = 0;
3933 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3943 st->big5.in_second_byte = 1;
3945 return CODING_CATEGORY_BIG5_MASK;
3948 /* Convert Big5 data to internal format. */
3951 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3952 unsigned_char_dynarr *dst, Lstream_data_count n)
3954 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3955 unsigned int flags = str->flags;
3956 unsigned int cpos = str->cpos;
3957 eol_type_t eol_type = str->eol_type;
3960 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3961 (decoding)->codesys, 1);
3966 unsigned char c = *(unsigned char *)src++;
3969 /* Previous character was first byte of Big5 char. */
3970 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3973 int code_point = (cpos << 8) | c;
3974 Emchar char_id = decode_defined_char (ccs, code_point, 0);
3978 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3979 DECODE_ADD_UCS_CHAR (char_id, dst);
3981 unsigned char b1, b2, b3;
3982 DECODE_BIG5 (cpos, c, b1, b2, b3);
3983 Dynarr_add (dst, b1);
3984 Dynarr_add (dst, b2);
3985 Dynarr_add (dst, b3);
3990 DECODE_ADD_BINARY_CHAR (cpos, dst);
3991 DECODE_ADD_BINARY_CHAR (c, dst);
3997 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3998 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4000 decode_flush_er_chars (str, dst);
4005 decode_flush_er_chars (str, dst);
4006 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4007 DECODE_ADD_BINARY_CHAR (c, dst);
4011 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4012 decode_add_er_char (str, c, dst);
4015 label_continue_loop:;
4018 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4019 if (flags & CODING_STATE_END)
4021 decode_flush_er_chars (str, dst);
4022 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4023 if (flags & CODING_STATE_CR)
4024 Dynarr_add (dst, '\r');
4031 /* Convert internally-formatted data to Big5. */
4034 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4035 unsigned_char_dynarr *dst, unsigned int *flags)
4037 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4041 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4042 Dynarr_add (dst, '\r');
4043 if (eol_type != EOL_CR)
4044 Dynarr_add (dst, ch);
4051 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4053 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4054 Dynarr_add (dst, code_point);
4055 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4057 Dynarr_add (dst, code_point >> 8);
4058 Dynarr_add (dst, code_point & 0xFF);
4060 else if ((code_point
4061 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4063 Dynarr_add (dst, code_point >> 8);
4064 Dynarr_add (dst, code_point & 0xFF);
4066 else if ((code_point
4067 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4070 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4071 + ((code_point & 0xFF) - 33);
4072 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4073 unsigned char b2 = I % BIG5_SAME_ROW;
4075 b2 += b2 < 0x3F ? 0x40 : 0x62;
4076 Dynarr_add (dst, b1);
4077 Dynarr_add (dst, b2);
4079 else if ((code_point
4080 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4083 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4084 + ((code_point & 0xFF) - 33);
4085 unsigned char b1, b2;
4087 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4088 b1 = I / BIG5_SAME_ROW + 0xA1;
4089 b2 = I % BIG5_SAME_ROW;
4090 b2 += b2 < 0x3F ? 0x40 : 0x62;
4091 Dynarr_add (dst, b1);
4092 Dynarr_add (dst, b2);
4094 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4098 char_encode_as_entity_reference (ch, buf);
4099 Dynarr_add_many (dst, buf, strlen (buf));
4102 Dynarr_add (dst, '?');
4109 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4110 unsigned int *flags)
4115 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4116 Decode a Big5 character CODE of BIG5 coding-system.
4117 CODE is the character code in BIG5, a cons of two integers.
4118 Return the corresponding character.
4122 unsigned char c1, c2, b1, b2;
4125 CHECK_INT (XCAR (code));
4126 CHECK_INT (XCDR (code));
4127 b1 = XINT (XCAR (code));
4128 b2 = XINT (XCDR (code));
4129 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4130 BYTE_BIG5_TWO_BYTE_2_P (b2))
4132 Charset_ID leading_byte;
4133 Lisp_Object charset;
4134 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4135 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4136 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4142 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4143 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4144 Return the corresponding character code in Big5.
4148 Lisp_Object charset;
4151 CHECK_CHAR_COERCE_INT (character);
4152 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4153 if (EQ (charset, Vcharset_chinese_big5_1) ||
4154 EQ (charset, Vcharset_chinese_big5_2))
4156 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4158 return Fcons (make_int (b1), make_int (b2));
4165 /************************************************************************/
4167 /************************************************************************/
4170 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4174 unsigned char c = *(unsigned char *)src++;
4175 switch (st->ucs4.in_byte)
4184 st->ucs4.in_byte = 0;
4190 return CODING_CATEGORY_UCS4_MASK;
4194 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4195 unsigned_char_dynarr *dst, Lstream_data_count n)
4197 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4198 unsigned int flags = str->flags;
4199 unsigned int cpos = str->cpos;
4200 unsigned char counter = str->counter;
4204 unsigned char c = *(unsigned char *)src++;
4212 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4217 cpos = ( cpos << 8 ) | c;
4221 if (counter & CODING_STATE_END)
4222 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4226 str->counter = counter;
4230 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4231 unsigned_char_dynarr *dst, unsigned int *flags)
4233 Dynarr_add (dst, ch >> 24);
4234 Dynarr_add (dst, ch >> 16);
4235 Dynarr_add (dst, ch >> 8);
4236 Dynarr_add (dst, ch );
4240 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4241 unsigned int *flags)
4246 /************************************************************************/
4247 /* UTF-16 methods */
4248 /************************************************************************/
4251 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4253 return CODING_CATEGORY_UTF16_MASK;
4257 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4258 unsigned_char_dynarr *dst, Lstream_data_count n)
4260 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4261 unsigned int flags = str->flags;
4262 unsigned int cpos = str->cpos;
4263 unsigned char counter = str->counter & 3;
4264 unsigned char byte_order = str->counter >> 2;
4265 eol_type_t eol_type = str->eol_type;
4269 unsigned char c = *(unsigned char *)src++;
4275 else if (counter == 1)
4279 if (byte_order == 0)
4280 code = (c << 8) | cpos;
4282 code = (cpos << 8) | c;
4285 code = ((code & 0xFF) << 8) | (code >> 8);
4286 if ( byte_order == 0 )
4291 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4302 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4303 DECODE_ADD_UCS_CHAR (code, dst);
4307 else if (counter == 2)
4309 cpos = (cpos << 8) | c;
4317 ? (c << 8) | (cpos & 0xFF)
4318 : ((cpos & 0xFF) << 8) | c;
4320 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4325 label_continue_loop:;
4327 if (counter & CODING_STATE_END)
4328 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4332 str->counter = (byte_order << 2) | counter;
4336 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4337 unsigned_char_dynarr *dst, unsigned int *flags)
4341 Dynarr_add (dst, ch);
4342 Dynarr_add (dst, ch >> 8);
4346 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4347 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4349 Dynarr_add (dst, y);
4350 Dynarr_add (dst, y >> 8);
4351 Dynarr_add (dst, z);
4352 Dynarr_add (dst, z >> 8);
4357 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4358 unsigned int *flags)
4363 /************************************************************************/
4365 /************************************************************************/
4368 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4372 unsigned char c = *(unsigned char *)src++;
4373 switch (st->utf8.in_byte)
4376 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4379 st->utf8.in_byte = 5;
4381 st->utf8.in_byte = 4;
4383 st->utf8.in_byte = 3;
4385 st->utf8.in_byte = 2;
4387 st->utf8.in_byte = 1;
4392 if ((c & 0xc0) != 0x80)
4398 return CODING_CATEGORY_UTF8_MASK;
4402 decode_output_utf8_partial_char (unsigned char counter,
4404 unsigned_char_dynarr *dst)
4407 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4408 else if (counter == 4)
4410 if (cpos < (1 << 6))
4411 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4414 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4415 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4418 else if (counter == 3)
4420 if (cpos < (1 << 6))
4421 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4422 else if (cpos < (1 << 12))
4424 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4425 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4429 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4430 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4431 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4434 else if (counter == 2)
4436 if (cpos < (1 << 6))
4437 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4438 else if (cpos < (1 << 12))
4440 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4441 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4443 else if (cpos < (1 << 18))
4445 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4446 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4447 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4451 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4452 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4453 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4454 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4459 if (cpos < (1 << 6))
4460 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4461 else if (cpos < (1 << 12))
4463 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4464 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4466 else if (cpos < (1 << 18))
4468 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4469 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4470 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4472 else if (cpos < (1 << 24))
4474 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4475 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4476 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4477 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4481 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4482 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4483 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4484 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4485 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4491 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4492 unsigned_char_dynarr *dst, Lstream_data_count n)
4494 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4495 unsigned int flags = str->flags;
4496 unsigned int cpos = str->cpos;
4497 eol_type_t eol_type = str->eol_type;
4498 unsigned char counter = str->counter;
4501 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4502 (decoding)->codesys, 0);
4507 unsigned char c = *(unsigned char *)src++;
4512 COMPOSE_FLUSH_CHARS (str, dst);
4513 decode_flush_er_chars (str, dst);
4514 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4515 DECODE_ADD_UCS_CHAR (c, dst);
4517 else if ( c < 0xC0 )
4518 /* decode_add_er_char (str, c, dst); */
4519 COMPOSE_ADD_CHAR (str, c, dst);
4522 /* decode_flush_er_chars (str, dst); */
4528 else if ( c < 0xF0 )
4533 else if ( c < 0xF8 )
4538 else if ( c < 0xFC )
4550 else if ( (c & 0xC0) == 0x80 )
4552 cpos = ( cpos << 6 ) | ( c & 0x3f );
4559 char_id = decode_defined_char (ccs, cpos, 0);
4566 COMPOSE_ADD_CHAR (str, char_id, dst);
4575 COMPOSE_FLUSH_CHARS (str, dst);
4576 decode_flush_er_chars (str, dst);
4577 decode_output_utf8_partial_char (counter, cpos, dst);
4578 DECODE_ADD_BINARY_CHAR (c, dst);
4582 label_continue_loop:;
4585 if (flags & CODING_STATE_END)
4587 COMPOSE_FLUSH_CHARS (str, dst);
4588 decode_flush_er_chars (str, dst);
4591 decode_output_utf8_partial_char (counter, cpos, dst);
4598 str->counter = counter;
4602 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4603 unsigned_char_dynarr *dst, unsigned int *flags)
4605 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4609 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4610 Dynarr_add (dst, '\r');
4611 if (eol_type != EOL_CR)
4612 Dynarr_add (dst, ch);
4614 else if (ch <= 0x7f)
4616 Dynarr_add (dst, ch);
4621 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4622 int code_point = charset_code_point (ucs_ccs, ch, 0);
4624 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4627 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4631 && INTP (ret = Fchar_feature (make_char (ch),
4634 code_point = XINT (ret);
4635 else if ( !NILP (map =
4636 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4638 && INTP (ret = Fchar_feature (make_char (ch),
4641 code_point = XINT (ret);
4642 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4646 char_encode_as_entity_reference (ch, buf);
4647 Dynarr_add_many (dst, buf, strlen (buf));
4653 if (code_point <= 0x7ff)
4655 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4656 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4658 else if (code_point <= 0xffff)
4660 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4661 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4662 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4664 else if (code_point <= 0x1fffff)
4666 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4667 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4668 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4669 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4671 else if (code_point <= 0x3ffffff)
4673 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4674 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4675 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4676 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4677 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4681 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4682 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4683 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4684 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4685 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4686 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4692 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4693 unsigned int *flags)
4698 /************************************************************************/
4699 /* ISO2022 methods */
4700 /************************************************************************/
4702 /* The following note describes the coding system ISO2022 briefly.
4703 Since the intention of this note is to help understand the
4704 functions in this file, some parts are NOT ACCURATE or OVERLY
4705 SIMPLIFIED. For thorough understanding, please refer to the
4706 original document of ISO2022.
4708 ISO2022 provides many mechanisms to encode several character sets
4709 in 7-bit and 8-bit environments. For 7-bit environments, all text
4710 is encoded using bytes less than 128. This may make the encoded
4711 text a little bit longer, but the text passes more easily through
4712 several gateways, some of which strip off MSB (Most Signigant Bit).
4714 There are two kinds of character sets: control character set and
4715 graphic character set. The former contains control characters such
4716 as `newline' and `escape' to provide control functions (control
4717 functions are also provided by escape sequences). The latter
4718 contains graphic characters such as 'A' and '-'. Emacs recognizes
4719 two control character sets and many graphic character sets.
4721 Graphic character sets are classified into one of the following
4722 four classes, according to the number of bytes (DIMENSION) and
4723 number of characters in one dimension (CHARS) of the set:
4724 - DIMENSION1_CHARS94
4725 - DIMENSION1_CHARS96
4726 - DIMENSION2_CHARS94
4727 - DIMENSION2_CHARS96
4729 In addition, each character set is assigned an identification tag,
4730 unique for each set, called "final character" (denoted as <F>
4731 hereafter). The <F> of each character set is decided by ECMA(*)
4732 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4733 (0x30..0x3F are for private use only).
4735 Note (*): ECMA = European Computer Manufacturers Association
4737 Here are examples of graphic character set [NAME(<F>)]:
4738 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4739 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4740 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4741 o DIMENSION2_CHARS96 -- none for the moment
4743 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4744 C0 [0x00..0x1F] -- control character plane 0
4745 GL [0x20..0x7F] -- graphic character plane 0
4746 C1 [0x80..0x9F] -- control character plane 1
4747 GR [0xA0..0xFF] -- graphic character plane 1
4749 A control character set is directly designated and invoked to C0 or
4750 C1 by an escape sequence. The most common case is that:
4751 - ISO646's control character set is designated/invoked to C0, and
4752 - ISO6429's control character set is designated/invoked to C1,
4753 and usually these designations/invocations are omitted in encoded
4754 text. In a 7-bit environment, only C0 can be used, and a control
4755 character for C1 is encoded by an appropriate escape sequence to
4756 fit into the environment. All control characters for C1 are
4757 defined to have corresponding escape sequences.
4759 A graphic character set is at first designated to one of four
4760 graphic registers (G0 through G3), then these graphic registers are
4761 invoked to GL or GR. These designations and invocations can be
4762 done independently. The most common case is that G0 is invoked to
4763 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4764 these invocations and designations are omitted in encoded text.
4765 In a 7-bit environment, only GL can be used.
4767 When a graphic character set of CHARS94 is invoked to GL, codes
4768 0x20 and 0x7F of the GL area work as control characters SPACE and
4769 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4772 There are two ways of invocation: locking-shift and single-shift.
4773 With locking-shift, the invocation lasts until the next different
4774 invocation, whereas with single-shift, the invocation affects the
4775 following character only and doesn't affect the locking-shift
4776 state. Invocations are done by the following control characters or
4779 ----------------------------------------------------------------------
4780 abbrev function cntrl escape seq description
4781 ----------------------------------------------------------------------
4782 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4783 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4784 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4785 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4786 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4787 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4788 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4789 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4790 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4791 ----------------------------------------------------------------------
4792 (*) These are not used by any known coding system.
4794 Control characters for these functions are defined by macros
4795 ISO_CODE_XXX in `coding.h'.
4797 Designations are done by the following escape sequences:
4798 ----------------------------------------------------------------------
4799 escape sequence description
4800 ----------------------------------------------------------------------
4801 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4802 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4803 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4804 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4805 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4806 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4807 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4808 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4809 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4810 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4811 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4812 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4813 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4814 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4815 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4816 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4817 ----------------------------------------------------------------------
4819 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4820 of dimension 1, chars 94, and final character <F>, etc...
4822 Note (*): Although these designations are not allowed in ISO2022,
4823 Emacs accepts them on decoding, and produces them on encoding
4824 CHARS96 character sets in a coding system which is characterized as
4825 7-bit environment, non-locking-shift, and non-single-shift.
4827 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4828 '(' can be omitted. We refer to this as "short-form" hereafter.
4830 Now you may notice that there are a lot of ways for encoding the
4831 same multilingual text in ISO2022. Actually, there exist many
4832 coding systems such as Compound Text (used in X11's inter client
4833 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4834 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4835 localized platforms), and all of these are variants of ISO2022.
4837 In addition to the above, Emacs handles two more kinds of escape
4838 sequences: ISO6429's direction specification and Emacs' private
4839 sequence for specifying character composition.
4841 ISO6429's direction specification takes the following form:
4842 o CSI ']' -- end of the current direction
4843 o CSI '0' ']' -- end of the current direction
4844 o CSI '1' ']' -- start of left-to-right text
4845 o CSI '2' ']' -- start of right-to-left text
4846 The control character CSI (0x9B: control sequence introducer) is
4847 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4849 Character composition specification takes the following form:
4850 o ESC '0' -- start character composition
4851 o ESC '1' -- end character composition
4852 Since these are not standard escape sequences of any ISO standard,
4853 their use with these meanings is restricted to Emacs only. */
4856 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4860 for (i = 0; i < 4; i++)
4862 if (!NILP (coding_system))
4864 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4866 iso->charset[i] = Qt;
4867 iso->invalid_designated[i] = 0;
4869 iso->esc = ISO_ESC_NOTHING;
4870 iso->esc_bytes_index = 0;
4871 iso->register_left = 0;
4872 iso->register_right = 1;
4873 iso->switched_dir_and_no_valid_charset_yet = 0;
4874 iso->invalid_switch_dir = 0;
4875 iso->output_direction_sequence = 0;
4876 iso->output_literally = 0;
4877 #ifdef ENABLE_COMPOSITE_CHARS
4878 if (iso->composite_chars)
4879 Dynarr_reset (iso->composite_chars);
4884 fit_to_be_escape_quoted (unsigned char c)
4901 /* Parse one byte of an ISO2022 escape sequence.
4902 If the result is an invalid escape sequence, return 0 and
4903 do not change anything in STR. Otherwise, if the result is
4904 an incomplete escape sequence, update ISO2022.ESC and
4905 ISO2022.ESC_BYTES and return -1. Otherwise, update
4906 all the state variables (but not ISO2022.ESC_BYTES) and
4909 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4910 or invocation of an invalid character set and treat that as
4911 an unrecognized escape sequence. */
4914 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4915 unsigned char c, unsigned int *flags,
4916 int check_invalid_charsets)
4918 /* (1) If we're at the end of a designation sequence, CS is the
4919 charset being designated and REG is the register to designate
4922 (2) If we're at the end of a locking-shift sequence, REG is
4923 the register to invoke and HALF (0 == left, 1 == right) is
4924 the half to invoke it into.
4926 (3) If we're at the end of a single-shift sequence, REG is
4927 the register to invoke. */
4928 Lisp_Object cs = Qnil;
4931 /* NOTE: This code does goto's all over the fucking place.
4932 The reason for this is that we're basically implementing
4933 a state machine here, and hierarchical languages like C
4934 don't really provide a clean way of doing this. */
4936 if (! (*flags & CODING_STATE_ESCAPE))
4937 /* At beginning of escape sequence; we need to reset our
4938 escape-state variables. */
4939 iso->esc = ISO_ESC_NOTHING;
4941 iso->output_literally = 0;
4942 iso->output_direction_sequence = 0;
4946 case ISO_ESC_NOTHING:
4947 iso->esc_bytes_index = 0;
4950 case ISO_CODE_ESC: /* Start escape sequence */
4951 *flags |= CODING_STATE_ESCAPE;
4955 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4956 *flags |= CODING_STATE_ESCAPE;
4957 iso->esc = ISO_ESC_5_11;
4960 case ISO_CODE_SO: /* locking shift 1 */
4963 case ISO_CODE_SI: /* locking shift 0 */
4967 case ISO_CODE_SS2: /* single shift */
4970 case ISO_CODE_SS3: /* single shift */
4974 default: /* Other control characters */
4981 /**** single shift ****/
4983 case 'N': /* single shift 2 */
4986 case 'O': /* single shift 3 */
4990 /**** locking shift ****/
4992 case '~': /* locking shift 1 right */
4995 case 'n': /* locking shift 2 */
4998 case '}': /* locking shift 2 right */
5001 case 'o': /* locking shift 3 */
5004 case '|': /* locking shift 3 right */
5008 #ifdef ENABLE_COMPOSITE_CHARS
5009 /**** composite ****/
5012 iso->esc = ISO_ESC_START_COMPOSITE;
5013 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5014 CODING_STATE_COMPOSITE;
5018 iso->esc = ISO_ESC_END_COMPOSITE;
5019 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5020 ~CODING_STATE_COMPOSITE;
5022 #endif /* ENABLE_COMPOSITE_CHARS */
5024 /**** directionality ****/
5027 iso->esc = ISO_ESC_5_11;
5030 /**** designation ****/
5032 case '$': /* multibyte charset prefix */
5033 iso->esc = ISO_ESC_2_4;
5037 if (0x28 <= c && c <= 0x2F)
5039 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5043 /* This function is called with CODESYS equal to nil when
5044 doing coding-system detection. */
5046 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5047 && fit_to_be_escape_quoted (c))
5049 iso->esc = ISO_ESC_LITERAL;
5050 *flags &= CODING_STATE_ISO2022_LOCK;
5060 /**** directionality ****/
5062 case ISO_ESC_5_11: /* ISO6429 direction control */
5065 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5066 goto directionality;
5068 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5069 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5070 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5074 case ISO_ESC_5_11_0:
5077 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5078 goto directionality;
5082 case ISO_ESC_5_11_1:
5085 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5086 goto directionality;
5090 case ISO_ESC_5_11_2:
5093 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5094 goto directionality;
5099 iso->esc = ISO_ESC_DIRECTIONALITY;
5100 /* Various junk here to attempt to preserve the direction sequences
5101 literally in the text if they would otherwise be swallowed due
5102 to invalid designations that don't show up as actual charset
5103 changes in the text. */
5104 if (iso->invalid_switch_dir)
5106 /* We already inserted a direction switch literally into the
5107 text. We assume (#### this may not be right) that the
5108 next direction switch is the one going the other way,
5109 and we need to output that literally as well. */
5110 iso->output_literally = 1;
5111 iso->invalid_switch_dir = 0;
5117 /* If we are in the thrall of an invalid designation,
5118 then stick the directionality sequence literally into the
5119 output stream so it ends up in the original text again. */
5120 for (jj = 0; jj < 4; jj++)
5121 if (iso->invalid_designated[jj])
5125 iso->output_literally = 1;
5126 iso->invalid_switch_dir = 1;
5129 /* Indicate that we haven't yet seen a valid designation,
5130 so that if a switch-dir is directly followed by an
5131 invalid designation, both get inserted literally. */
5132 iso->switched_dir_and_no_valid_charset_yet = 1;
5137 /**** designation ****/
5140 if (0x28 <= c && c <= 0x2F)
5142 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5145 if (0x40 <= c && c <= 0x42)
5148 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5149 *flags & CODING_STATE_R2L ?
5150 CHARSET_RIGHT_TO_LEFT :
5151 CHARSET_LEFT_TO_RIGHT);
5162 if (c < '0' || c > '~')
5163 return 0; /* bad final byte */
5165 if (iso->esc >= ISO_ESC_2_8 &&
5166 iso->esc <= ISO_ESC_2_15)
5168 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5169 single = 1; /* single-byte */
5170 reg = (iso->esc - ISO_ESC_2_8) & 3;
5172 else if (iso->esc >= ISO_ESC_2_4_8 &&
5173 iso->esc <= ISO_ESC_2_4_15)
5175 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5176 single = -1; /* multi-byte */
5177 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5181 /* Can this ever be reached? -slb */
5185 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5186 *flags & CODING_STATE_R2L ?
5187 CHARSET_RIGHT_TO_LEFT :
5188 CHARSET_LEFT_TO_RIGHT);
5194 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5198 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5199 /* can't invoke something that ain't there. */
5201 iso->esc = ISO_ESC_SINGLE_SHIFT;
5202 *flags &= CODING_STATE_ISO2022_LOCK;
5204 *flags |= CODING_STATE_SS2;
5206 *flags |= CODING_STATE_SS3;
5210 if (check_invalid_charsets &&
5211 !CHARSETP (iso->charset[reg]))
5212 /* can't invoke something that ain't there. */
5215 iso->register_right = reg;
5217 iso->register_left = reg;
5218 *flags &= CODING_STATE_ISO2022_LOCK;
5219 iso->esc = ISO_ESC_LOCKING_SHIFT;
5223 if (NILP (cs) && check_invalid_charsets)
5225 iso->invalid_designated[reg] = 1;
5226 iso->charset[reg] = Vcharset_ascii;
5227 iso->esc = ISO_ESC_DESIGNATE;
5228 *flags &= CODING_STATE_ISO2022_LOCK;
5229 iso->output_literally = 1;
5230 if (iso->switched_dir_and_no_valid_charset_yet)
5232 /* We encountered a switch-direction followed by an
5233 invalid designation. Ensure that the switch-direction
5234 gets outputted; otherwise it will probably get eaten
5235 when the text is written out again. */
5236 iso->switched_dir_and_no_valid_charset_yet = 0;
5237 iso->output_direction_sequence = 1;
5238 /* And make sure that the switch-dir going the other
5239 way gets outputted, as well. */
5240 iso->invalid_switch_dir = 1;
5244 /* This function is called with CODESYS equal to nil when
5245 doing coding-system detection. */
5246 if (!NILP (codesys))
5248 charset_conversion_spec_dynarr *dyn =
5249 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5255 for (i = 0; i < Dynarr_length (dyn); i++)
5257 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5258 if (EQ (cs, spec->from_charset))
5259 cs = spec->to_charset;
5264 iso->charset[reg] = cs;
5265 iso->esc = ISO_ESC_DESIGNATE;
5266 *flags &= CODING_STATE_ISO2022_LOCK;
5267 if (iso->invalid_designated[reg])
5269 iso->invalid_designated[reg] = 0;
5270 iso->output_literally = 1;
5272 if (iso->switched_dir_and_no_valid_charset_yet)
5273 iso->switched_dir_and_no_valid_charset_yet = 0;
5278 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5282 /* #### There are serious deficiencies in the recognition mechanism
5283 here. This needs to be much smarter if it's going to cut it.
5284 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5285 it should be detected as Latin-1.
5286 All the ISO2022 stuff in this file should be synced up with the
5287 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5288 Perhaps we should wait till R2L works in FSF Emacs? */
5290 if (!st->iso2022.initted)
5292 reset_iso2022 (Qnil, &st->iso2022.iso);
5293 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5294 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5295 CODING_CATEGORY_ISO_8_1_MASK |
5296 CODING_CATEGORY_ISO_8_2_MASK |
5297 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5298 st->iso2022.flags = 0;
5299 st->iso2022.high_byte_count = 0;
5300 st->iso2022.saw_single_shift = 0;
5301 st->iso2022.initted = 1;
5304 mask = st->iso2022.mask;
5308 unsigned char c = *(unsigned char *)src++;
5311 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5312 st->iso2022.high_byte_count++;
5316 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5318 if (st->iso2022.high_byte_count & 1)
5319 /* odd number of high bytes; assume not iso-8-2 */
5320 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5322 st->iso2022.high_byte_count = 0;
5323 st->iso2022.saw_single_shift = 0;
5325 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5327 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5328 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5329 { /* control chars */
5332 /* Allow and ignore control characters that you might
5333 reasonably see in a text file */
5338 case 8: /* backspace */
5339 case 11: /* vertical tab */
5340 case 12: /* form feed */
5341 case 26: /* MS-DOS C-z junk */
5342 case 31: /* '^_' -- for info */
5343 goto label_continue_loop;
5350 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5353 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5354 &st->iso2022.flags, 0))
5356 switch (st->iso2022.iso.esc)
5358 case ISO_ESC_DESIGNATE:
5359 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5360 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5362 case ISO_ESC_LOCKING_SHIFT:
5363 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5364 goto ran_out_of_chars;
5365 case ISO_ESC_SINGLE_SHIFT:
5366 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5367 st->iso2022.saw_single_shift = 1;
5376 goto ran_out_of_chars;
5379 label_continue_loop:;
5388 postprocess_iso2022_mask (int mask)
5390 /* #### kind of cheesy */
5391 /* If seven-bit ISO is allowed, then assume that the encoding is
5392 entirely seven-bit and turn off the eight-bit ones. */
5393 if (mask & CODING_CATEGORY_ISO_7_MASK)
5394 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5395 CODING_CATEGORY_ISO_8_1_MASK |
5396 CODING_CATEGORY_ISO_8_2_MASK);
5400 /* If FLAGS is a null pointer or specifies right-to-left motion,
5401 output a switch-dir-to-left-to-right sequence to DST.
5402 Also update FLAGS if it is not a null pointer.
5403 If INTERNAL_P is set, we are outputting in internal format and
5404 need to handle the CSI differently. */
5407 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5408 unsigned_char_dynarr *dst,
5409 unsigned int *flags,
5412 if (!flags || (*flags & CODING_STATE_R2L))
5414 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5416 Dynarr_add (dst, ISO_CODE_ESC);
5417 Dynarr_add (dst, '[');
5419 else if (internal_p)
5420 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5422 Dynarr_add (dst, ISO_CODE_CSI);
5423 Dynarr_add (dst, '0');
5424 Dynarr_add (dst, ']');
5426 *flags &= ~CODING_STATE_R2L;
5430 /* If FLAGS is a null pointer or specifies a direction different from
5431 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5432 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5433 sequence to DST. Also update FLAGS if it is not a null pointer.
5434 If INTERNAL_P is set, we are outputting in internal format and
5435 need to handle the CSI differently. */
5438 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5439 unsigned_char_dynarr *dst, unsigned int *flags,
5442 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5443 direction == CHARSET_LEFT_TO_RIGHT)
5444 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5445 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5446 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5447 direction == CHARSET_RIGHT_TO_LEFT)
5449 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5451 Dynarr_add (dst, ISO_CODE_ESC);
5452 Dynarr_add (dst, '[');
5454 else if (internal_p)
5455 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5457 Dynarr_add (dst, ISO_CODE_CSI);
5458 Dynarr_add (dst, '2');
5459 Dynarr_add (dst, ']');
5461 *flags |= CODING_STATE_R2L;
5465 /* Convert ISO2022-format data to internal format. */
5468 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5469 unsigned_char_dynarr *dst, Lstream_data_count n)
5471 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5472 unsigned int flags = str->flags;
5473 unsigned int cpos = str->cpos;
5474 unsigned char counter = str->counter;
5475 eol_type_t eol_type = str->eol_type;
5476 #ifdef ENABLE_COMPOSITE_CHARS
5477 unsigned_char_dynarr *real_dst = dst;
5479 Lisp_Object coding_system;
5481 XSETCODING_SYSTEM (coding_system, str->codesys);
5483 #ifdef ENABLE_COMPOSITE_CHARS
5484 if (flags & CODING_STATE_COMPOSITE)
5485 dst = str->iso2022.composite_chars;
5486 #endif /* ENABLE_COMPOSITE_CHARS */
5490 unsigned char c = *(unsigned char *)src++;
5491 if (flags & CODING_STATE_ESCAPE)
5492 { /* Within ESC sequence */
5493 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5498 switch (str->iso2022.esc)
5500 #ifdef ENABLE_COMPOSITE_CHARS
5501 case ISO_ESC_START_COMPOSITE:
5502 if (str->iso2022.composite_chars)
5503 Dynarr_reset (str->iso2022.composite_chars);
5505 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5506 dst = str->iso2022.composite_chars;
5508 case ISO_ESC_END_COMPOSITE:
5510 Bufbyte comstr[MAX_EMCHAR_LEN];
5512 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5513 Dynarr_length (dst));
5515 len = set_charptr_emchar (comstr, emch);
5516 Dynarr_add_many (dst, comstr, len);
5519 #endif /* ENABLE_COMPOSITE_CHARS */
5521 case ISO_ESC_LITERAL:
5522 COMPOSE_FLUSH_CHARS (str, dst);
5523 decode_flush_er_chars (str, dst);
5524 DECODE_ADD_BINARY_CHAR (c, dst);
5528 /* Everything else handled already */
5533 /* Attempted error recovery. */
5534 if (str->iso2022.output_direction_sequence)
5535 ensure_correct_direction (flags & CODING_STATE_R2L ?
5536 CHARSET_RIGHT_TO_LEFT :
5537 CHARSET_LEFT_TO_RIGHT,
5538 str->codesys, dst, 0, 1);
5539 /* More error recovery. */
5540 if (!retval || str->iso2022.output_literally)
5542 /* Output the (possibly invalid) sequence */
5544 COMPOSE_FLUSH_CHARS (str, dst);
5545 decode_flush_er_chars (str, dst);
5546 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5547 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5548 flags &= CODING_STATE_ISO2022_LOCK;
5550 n++, src--;/* Repeat the loop with the same character. */
5553 /* No sense in reprocessing the final byte of the
5554 escape sequence; it could mess things up anyway.
5556 COMPOSE_FLUSH_CHARS (str, dst);
5557 decode_flush_er_chars (str, dst);
5558 DECODE_ADD_BINARY_CHAR (c, dst);
5564 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5565 { /* Control characters */
5567 /***** Error-handling *****/
5569 /* If we were in the middle of a character, dump out the
5570 partial character. */
5573 COMPOSE_FLUSH_CHARS (str, dst);
5574 decode_flush_er_chars (str, dst);
5578 DECODE_ADD_BINARY_CHAR
5579 ((unsigned char)(cpos >> (counter * 8)), dst);
5584 /* If we just saw a single-shift character, dump it out.
5585 This may dump out the wrong sort of single-shift character,
5586 but least it will give an indication that something went
5588 if (flags & CODING_STATE_SS2)
5590 COMPOSE_FLUSH_CHARS (str, dst);
5591 decode_flush_er_chars (str, dst);
5592 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5593 flags &= ~CODING_STATE_SS2;
5595 if (flags & CODING_STATE_SS3)
5597 COMPOSE_FLUSH_CHARS (str, dst);
5598 decode_flush_er_chars (str, dst);
5599 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5600 flags &= ~CODING_STATE_SS3;
5603 /***** Now handle the control characters. *****/
5609 COMPOSE_FLUSH_CHARS (str, dst);
5610 decode_flush_er_chars (str, dst);
5611 if (eol_type == EOL_CR)
5612 Dynarr_add (dst, '\n');
5613 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5614 Dynarr_add (dst, c);
5616 flags |= CODING_STATE_CR;
5617 goto label_continue_loop;
5619 else if (flags & CODING_STATE_CR)
5620 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5622 Dynarr_add (dst, '\r');
5623 flags &= ~CODING_STATE_CR;
5626 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5629 flags &= CODING_STATE_ISO2022_LOCK;
5631 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5633 COMPOSE_FLUSH_CHARS (str, dst);
5634 decode_flush_er_chars (str, dst);
5635 DECODE_ADD_BINARY_CHAR (c, dst);
5639 { /* Graphic characters */
5640 Lisp_Object charset;
5649 COMPOSE_FLUSH_CHARS (str, dst);
5650 decode_flush_er_chars (str, dst);
5651 if (eol_type == EOL_CR)
5652 Dynarr_add (dst, '\n');
5653 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5654 Dynarr_add (dst, c);
5656 flags |= CODING_STATE_CR;
5657 goto label_continue_loop;
5659 else if (flags & CODING_STATE_CR)
5660 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5662 Dynarr_add (dst, '\r');
5663 flags &= ~CODING_STATE_CR;
5666 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5669 /* Now determine the charset. */
5670 reg = ((flags & CODING_STATE_SS2) ? 2
5671 : (flags & CODING_STATE_SS3) ? 3
5672 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5673 : str->iso2022.register_left);
5674 charset = str->iso2022.charset[reg];
5676 /* Error checking: */
5677 if (! CHARSETP (charset)
5678 || str->iso2022.invalid_designated[reg]
5679 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5680 && XCHARSET_CHARS (charset) == 94))
5681 /* Mrmph. We are trying to invoke a register that has no
5682 or an invalid charset in it, or trying to add a character
5683 outside the range of the charset. Insert that char literally
5684 to preserve it for the output. */
5686 COMPOSE_FLUSH_CHARS (str, dst);
5687 decode_flush_er_chars (str, dst);
5691 DECODE_ADD_BINARY_CHAR
5692 ((unsigned char)(cpos >> (counter * 8)), dst);
5695 DECODE_ADD_BINARY_CHAR (c, dst);
5700 /* Things are probably hunky-dorey. */
5702 /* Fetch reverse charset, maybe. */
5703 if (((flags & CODING_STATE_R2L) &&
5704 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5706 (!(flags & CODING_STATE_R2L) &&
5707 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5709 Lisp_Object new_charset =
5710 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5711 if (!NILP (new_charset))
5712 charset = new_charset;
5717 if (XCHARSET_DIMENSION (charset) == counter)
5719 COMPOSE_ADD_CHAR (str,
5720 DECODE_CHAR (charset,
5721 ((cpos & 0x7F7F7F) << 8)
5728 cpos = (cpos << 8) | c;
5730 lb = XCHARSET_LEADING_BYTE (charset);
5731 switch (XCHARSET_REP_BYTES (charset))
5734 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5735 Dynarr_add (dst, c & 0x7F);
5738 case 2: /* one-byte official */
5739 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5740 Dynarr_add (dst, lb);
5741 Dynarr_add (dst, c | 0x80);
5744 case 3: /* one-byte private or two-byte official */
5745 if (XCHARSET_PRIVATE_P (charset))
5747 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5748 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5749 Dynarr_add (dst, lb);
5750 Dynarr_add (dst, c | 0x80);
5756 Dynarr_add (dst, lb);
5757 Dynarr_add (dst, ch | 0x80);
5758 Dynarr_add (dst, c | 0x80);
5766 default: /* two-byte private */
5769 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5770 Dynarr_add (dst, lb);
5771 Dynarr_add (dst, ch | 0x80);
5772 Dynarr_add (dst, c | 0x80);
5782 flags &= CODING_STATE_ISO2022_LOCK;
5785 label_continue_loop:;
5788 if (flags & CODING_STATE_END)
5790 COMPOSE_FLUSH_CHARS (str, dst);
5791 decode_flush_er_chars (str, dst);
5792 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5796 str->counter = counter;
5800 /***** ISO2022 encoder *****/
5802 /* Designate CHARSET into register REG. */
5805 iso2022_designate (Lisp_Object charset, unsigned char reg,
5806 struct encoding_stream *str, unsigned_char_dynarr *dst)
5808 static const char inter94[] = "()*+";
5809 static const char inter96[] = ",-./";
5810 unsigned short chars;
5811 unsigned char dimension;
5812 unsigned char final;
5813 Lisp_Object old_charset = str->iso2022.charset[reg];
5815 str->iso2022.charset[reg] = charset;
5816 if (!CHARSETP (charset))
5817 /* charset might be an initial nil or t. */
5819 chars = XCHARSET_CHARS (charset);
5820 dimension = XCHARSET_DIMENSION (charset);
5821 final = XCHARSET_FINAL (charset);
5822 if (!str->iso2022.force_charset_on_output[reg] &&
5823 CHARSETP (old_charset) &&
5824 XCHARSET_CHARS (old_charset) == chars &&
5825 XCHARSET_DIMENSION (old_charset) == dimension &&
5826 XCHARSET_FINAL (old_charset) == final)
5829 str->iso2022.force_charset_on_output[reg] = 0;
5832 charset_conversion_spec_dynarr *dyn =
5833 str->codesys->iso2022.output_conv;
5839 for (i = 0; i < Dynarr_length (dyn); i++)
5841 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5842 if (EQ (charset, spec->from_charset))
5843 charset = spec->to_charset;
5848 Dynarr_add (dst, ISO_CODE_ESC);
5853 Dynarr_add (dst, inter94[reg]);
5856 Dynarr_add (dst, '$');
5858 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5861 Dynarr_add (dst, inter94[reg]);
5866 Dynarr_add (dst, inter96[reg]);
5869 Dynarr_add (dst, '$');
5870 Dynarr_add (dst, inter96[reg]);
5874 Dynarr_add (dst, final);
5878 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5880 if (str->iso2022.register_left != 0)
5882 Dynarr_add (dst, ISO_CODE_SI);
5883 str->iso2022.register_left = 0;
5888 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5890 if (str->iso2022.register_left != 1)
5892 Dynarr_add (dst, ISO_CODE_SO);
5893 str->iso2022.register_left = 1;
5898 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5899 unsigned_char_dynarr *dst, unsigned int *flags)
5901 unsigned char charmask;
5902 Lisp_Coding_System* codesys = str->codesys;
5903 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5905 Lisp_Object charset = str->iso2022.current_charset;
5906 int half = str->iso2022.current_half;
5907 int code_point = -1;
5911 restore_left_to_right_direction (codesys, dst, flags, 0);
5913 /* Make sure G0 contains ASCII */
5914 if ((ch > ' ' && ch < ISO_CODE_DEL)
5915 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5917 ensure_normal_shift (str, dst);
5918 iso2022_designate (Vcharset_ascii, 0, str, dst);
5921 /* If necessary, restore everything to the default state
5923 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5925 restore_left_to_right_direction (codesys, dst, flags, 0);
5927 ensure_normal_shift (str, dst);
5929 for (i = 0; i < 4; i++)
5931 Lisp_Object initial_charset =
5932 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5933 iso2022_designate (initial_charset, i, str, dst);
5938 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5939 Dynarr_add (dst, '\r');
5940 if (eol_type != EOL_CR)
5941 Dynarr_add (dst, ch);
5945 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5946 && fit_to_be_escape_quoted (ch))
5947 Dynarr_add (dst, ISO_CODE_ESC);
5948 Dynarr_add (dst, ch);
5951 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5953 charmask = (half == 0 ? 0x00 : 0x80);
5955 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5956 && fit_to_be_escape_quoted (ch))
5957 Dynarr_add (dst, ISO_CODE_ESC);
5958 /* you asked for it ... */
5959 Dynarr_add (dst, ch);
5965 /* Now determine which register to use. */
5967 for (i = 0; i < 4; i++)
5969 if ((CHARSETP (charset = str->iso2022.charset[i])
5970 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
5974 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5975 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
5983 Lisp_Object original_default_coded_charset_priority_list
5984 = Vdefault_coded_charset_priority_list;
5985 Vdefault_coded_charset_priority_list
5986 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
5987 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5989 code_point = ENCODE_CHAR (ch, charset);
5990 if (XCHARSET_FINAL (charset))
5992 Vdefault_coded_charset_priority_list
5993 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5994 Vdefault_coded_charset_priority_list));
5996 Vdefault_coded_charset_priority_list
5997 = original_default_coded_charset_priority_list;
5998 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6000 code_point = ENCODE_CHAR (ch, charset);
6001 if (XCHARSET_FINAL (charset))
6003 Vdefault_coded_charset_priority_list
6004 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6005 Vdefault_coded_charset_priority_list));
6007 code_point = ENCODE_CHAR (ch, charset);
6008 if (!XCHARSET_FINAL (charset))
6010 charset = Vcharset_ascii;
6014 Vdefault_coded_charset_priority_list
6015 = original_default_coded_charset_priority_list;
6017 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6018 codesys, dst, flags, 0);
6022 if (XCHARSET_GRAPHIC (charset) != 0)
6024 if (!NILP (str->iso2022.charset[1]) &&
6025 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6026 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6028 else if (!NILP (str->iso2022.charset[2]))
6030 else if (!NILP (str->iso2022.charset[3]))
6039 iso2022_designate (charset, reg, str, dst);
6041 /* Now invoke that register. */
6045 ensure_normal_shift (str, dst);
6049 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6051 ensure_shift_out (str, dst);
6058 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6060 Dynarr_add (dst, ISO_CODE_ESC);
6061 Dynarr_add (dst, 'N');
6066 Dynarr_add (dst, ISO_CODE_SS2);
6071 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6073 Dynarr_add (dst, ISO_CODE_ESC);
6074 Dynarr_add (dst, 'O');
6079 Dynarr_add (dst, ISO_CODE_SS3);
6087 charmask = (half == 0 ? 0x00 : 0x80);
6089 switch (XCHARSET_DIMENSION (charset))
6092 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6095 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6096 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6099 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6100 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6101 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6104 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6105 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6106 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6107 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6113 str->iso2022.current_charset = charset;
6114 str->iso2022.current_half = half;
6118 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6119 unsigned int *flags)
6121 Lisp_Coding_System* codesys = str->codesys;
6124 restore_left_to_right_direction (codesys, dst, flags, 0);
6125 ensure_normal_shift (str, dst);
6126 for (i = 0; i < 4; i++)
6128 Lisp_Object initial_charset
6129 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6130 iso2022_designate (initial_charset, i, str, dst);
6135 /************************************************************************/
6136 /* No-conversion methods */
6137 /************************************************************************/
6139 /* This is used when reading in "binary" files -- i.e. files that may
6140 contain all 256 possible byte values and that are not to be
6141 interpreted as being in any particular decoding. */
6143 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6144 unsigned_char_dynarr *dst, Lstream_data_count n)
6146 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6147 unsigned int flags = str->flags;
6148 unsigned int cpos = str->cpos;
6149 eol_type_t eol_type = str->eol_type;
6153 unsigned char c = *(unsigned char *)src++;
6155 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6156 DECODE_ADD_BINARY_CHAR (c, dst);
6157 label_continue_loop:;
6160 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6167 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6168 unsigned_char_dynarr *dst, Lstream_data_count n)
6171 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6172 unsigned int flags = str->flags;
6173 unsigned int ch = str->ch;
6174 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6176 unsigned char char_boundary = str->iso2022.current_char_boundary;
6183 if (char_boundary == 0)
6189 else if ( c >= 0xf8 )
6194 else if ( c >= 0xf0 )
6199 else if ( c >= 0xe0 )
6204 else if ( c >= 0xc0 )
6214 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6215 Dynarr_add (dst, '\r');
6216 if (eol_type != EOL_CR)
6217 Dynarr_add (dst, c);
6220 Dynarr_add (dst, c);
6223 else if (char_boundary == 1)
6225 ch = ( ch << 6 ) | ( c & 0x3f );
6226 Dynarr_add (dst, ch & 0xff);
6231 ch = ( ch << 6 ) | ( c & 0x3f );
6234 #else /* not UTF2000 */
6237 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6238 Dynarr_add (dst, '\r');
6239 if (eol_type != EOL_CR)
6240 Dynarr_add (dst, '\n');
6243 else if (BYTE_ASCII_P (c))
6246 Dynarr_add (dst, c);
6248 else if (BUFBYTE_LEADING_BYTE_P (c))
6251 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6252 c == LEADING_BYTE_CONTROL_1)
6255 Dynarr_add (dst, '~'); /* untranslatable character */
6259 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6260 Dynarr_add (dst, c);
6261 else if (ch == LEADING_BYTE_CONTROL_1)
6264 Dynarr_add (dst, c - 0x20);
6266 /* else it should be the second or third byte of an
6267 untranslatable character, so ignore it */
6270 #endif /* not UTF2000 */
6276 str->iso2022.current_char_boundary = char_boundary;
6282 /************************************************************************/
6283 /* Initialization */
6284 /************************************************************************/
6287 syms_of_file_coding (void)
6289 INIT_LRECORD_IMPLEMENTATION (coding_system);
6291 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6293 DEFSUBR (Fcoding_system_p);
6294 DEFSUBR (Ffind_coding_system);
6295 DEFSUBR (Fget_coding_system);
6296 DEFSUBR (Fcoding_system_list);
6297 DEFSUBR (Fcoding_system_name);
6298 DEFSUBR (Fmake_coding_system);
6299 DEFSUBR (Fcopy_coding_system);
6300 DEFSUBR (Fcoding_system_canonical_name_p);
6301 DEFSUBR (Fcoding_system_alias_p);
6302 DEFSUBR (Fcoding_system_aliasee);
6303 DEFSUBR (Fdefine_coding_system_alias);
6304 DEFSUBR (Fsubsidiary_coding_system);
6306 DEFSUBR (Fcoding_system_type);
6307 DEFSUBR (Fcoding_system_doc_string);
6309 DEFSUBR (Fcoding_system_charset);
6311 DEFSUBR (Fcoding_system_property);
6313 DEFSUBR (Fcoding_category_list);
6314 DEFSUBR (Fset_coding_priority_list);
6315 DEFSUBR (Fcoding_priority_list);
6316 DEFSUBR (Fset_coding_category_system);
6317 DEFSUBR (Fcoding_category_system);
6319 DEFSUBR (Fdetect_coding_region);
6320 DEFSUBR (Fdecode_coding_region);
6321 DEFSUBR (Fencode_coding_region);
6323 DEFSUBR (Fdecode_shift_jis_char);
6324 DEFSUBR (Fencode_shift_jis_char);
6325 DEFSUBR (Fdecode_big5_char);
6326 DEFSUBR (Fencode_big5_char);
6328 defsymbol (&Qcoding_systemp, "coding-system-p");
6329 defsymbol (&Qno_conversion, "no-conversion");
6330 defsymbol (&Qraw_text, "raw-text");
6332 defsymbol (&Qbig5, "big5");
6333 defsymbol (&Qshift_jis, "shift-jis");
6334 defsymbol (&Qucs4, "ucs-4");
6335 defsymbol (&Qutf8, "utf-8");
6336 defsymbol (&Qutf16, "utf-16");
6337 defsymbol (&Qccl, "ccl");
6338 defsymbol (&Qiso2022, "iso2022");
6340 defsymbol (&Qmnemonic, "mnemonic");
6341 defsymbol (&Qeol_type, "eol-type");
6342 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6343 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6345 defsymbol (&Qcr, "cr");
6346 defsymbol (&Qlf, "lf");
6347 defsymbol (&Qcrlf, "crlf");
6348 defsymbol (&Qeol_cr, "eol-cr");
6349 defsymbol (&Qeol_lf, "eol-lf");
6350 defsymbol (&Qeol_crlf, "eol-crlf");
6352 defsymbol (&Qcharset_g0, "charset-g0");
6353 defsymbol (&Qcharset_g1, "charset-g1");
6354 defsymbol (&Qcharset_g2, "charset-g2");
6355 defsymbol (&Qcharset_g3, "charset-g3");
6356 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6357 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6358 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6359 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6360 defsymbol (&Qno_iso6429, "no-iso6429");
6361 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6362 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6364 defsymbol (&Qshort, "short");
6365 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6366 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6367 defsymbol (&Qseven, "seven");
6368 defsymbol (&Qlock_shift, "lock-shift");
6369 defsymbol (&Qescape_quoted, "escape-quoted");
6372 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6373 defsymbol (&Qdisable_composition, "disable-composition");
6374 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6375 defsymbol (&Quse_entity_reference, "use-entity-reference");
6376 defsymbol (&Qd, "d");
6377 defsymbol (&Qx, "x");
6378 defsymbol (&QX, "X");
6380 defsymbol (&Qencode, "encode");
6381 defsymbol (&Qdecode, "decode");
6384 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6386 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6388 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6390 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6392 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6394 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6396 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6398 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6400 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6402 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6405 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6410 lstream_type_create_file_coding (void)
6412 LSTREAM_HAS_METHOD (decoding, reader);
6413 LSTREAM_HAS_METHOD (decoding, writer);
6414 LSTREAM_HAS_METHOD (decoding, rewinder);
6415 LSTREAM_HAS_METHOD (decoding, seekable_p);
6416 LSTREAM_HAS_METHOD (decoding, flusher);
6417 LSTREAM_HAS_METHOD (decoding, closer);
6418 LSTREAM_HAS_METHOD (decoding, marker);
6420 LSTREAM_HAS_METHOD (encoding, reader);
6421 LSTREAM_HAS_METHOD (encoding, writer);
6422 LSTREAM_HAS_METHOD (encoding, rewinder);
6423 LSTREAM_HAS_METHOD (encoding, seekable_p);
6424 LSTREAM_HAS_METHOD (encoding, flusher);
6425 LSTREAM_HAS_METHOD (encoding, closer);
6426 LSTREAM_HAS_METHOD (encoding, marker);
6430 vars_of_file_coding (void)
6434 fcd = xnew (struct file_coding_dump);
6435 dump_add_root_struct_ptr (&fcd, &fcd_description);
6437 /* Initialize to something reasonable ... */
6438 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6440 fcd->coding_category_system[i] = Qnil;
6441 fcd->coding_category_by_priority[i] = i;
6444 Fprovide (intern ("file-coding"));
6446 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6447 Coding system used for TTY keyboard input.
6448 Not used under a windowing system.
6450 Vkeyboard_coding_system = Qnil;
6452 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6453 Coding system used for TTY display output.
6454 Not used under a windowing system.
6456 Vterminal_coding_system = Qnil;
6458 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6459 Overriding coding system used when reading from a file or process.
6460 You should bind this variable with `let', but do not set it globally.
6461 If this is non-nil, it specifies the coding system that will be used
6462 to decode input on read operations, such as from a file or process.
6463 It overrides `buffer-file-coding-system-for-read',
6464 `insert-file-contents-pre-hook', etc. Use those variables instead of
6465 this one for permanent changes to the environment. */ );
6466 Vcoding_system_for_read = Qnil;
6468 DEFVAR_LISP ("coding-system-for-write",
6469 &Vcoding_system_for_write /*
6470 Overriding coding system used when writing to a file or process.
6471 You should bind this variable with `let', but do not set it globally.
6472 If this is non-nil, it specifies the coding system that will be used
6473 to encode output for write operations, such as to a file or process.
6474 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6475 Use those variables instead of this one for permanent changes to the
6477 Vcoding_system_for_write = Qnil;
6479 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6480 Coding system used to convert pathnames when accessing files.
6482 Vfile_name_coding_system = Qnil;
6484 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6485 &Vcoded_charset_entity_reference_alist /*
6486 Alist of coded-charset vs corresponding entity-reference.
6487 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6488 CCS is coded-charset.
6489 CODE-COLUMNS is columns of code-point of entity-reference.
6490 CODE-TYPE is format type of code-point of entity-reference.
6491 `d' means decimal value and `x' means hexadecimal value.
6493 Vcoded_charset_entity_reference_alist = Qnil;
6495 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6496 Non-nil means the buffer contents are regarded as multi-byte form
6497 of characters, not a binary code. This affects the display, file I/O,
6498 and behaviors of various editing commands.
6500 Setting this to nil does not do anything.
6502 enable_multibyte_characters = 1;
6506 complex_vars_of_file_coding (void)
6508 staticpro (&Vcoding_system_hash_table);
6509 Vcoding_system_hash_table =
6510 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6512 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6513 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6515 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6517 struct codesys_prop csp; \
6519 csp.prop_type = (Prop_Type); \
6520 Dynarr_add (the_codesys_prop_dynarr, csp); \
6523 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6524 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6525 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6526 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6527 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6528 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6529 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6531 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6532 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6533 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6534 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6535 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6536 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6537 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6538 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6539 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6540 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6541 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6542 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6543 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6544 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6545 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6546 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6547 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6549 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6552 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6553 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6555 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6556 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6559 /* Need to create this here or we're really screwed. */
6561 (Qraw_text, Qno_conversion,
6562 build_string ("Raw text, which means it converts only line-break-codes."),
6563 list2 (Qmnemonic, build_string ("Raw")));
6566 (Qbinary, Qno_conversion,
6567 build_string ("Binary, which means it does not convert anything."),
6568 list4 (Qeol_type, Qlf,
6569 Qmnemonic, build_string ("Binary")));
6575 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6576 list2 (Qmnemonic, build_string ("MTF8")));
6579 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6581 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6583 Fdefine_coding_system_alias (Qterminal, Qbinary);
6584 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6586 /* Need this for bootstrapping */
6587 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6588 Fget_coding_system (Qraw_text);
6591 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6592 = Fget_coding_system (Qutf_8_mcs);
6595 #if defined(MULE) && !defined(UTF2000)
6599 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6600 fcd->ucs_to_mule_table[i] = Qnil;
6602 staticpro (&mule_to_ucs_table);
6603 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6604 #endif /* defined(MULE) && !defined(UTF2000) */