1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2011
7 This file is part of XEmacs.
9 XEmacs is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with XEmacs; see the file COPYING. If not, write to
21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 Boston, MA 02111-1307, USA. */
24 /* Synched up with: Mule 2.3. Not in FSF. */
26 /* Rewritten by Ben Wing <ben@xemacs.org>. */
27 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
41 #include "file-coding.h"
43 Lisp_Object Qcoding_system_error;
45 Lisp_Object Vkeyboard_coding_system;
46 Lisp_Object Vterminal_coding_system;
47 Lisp_Object Vcoding_system_for_read;
48 Lisp_Object Vcoding_system_for_write;
49 Lisp_Object Vfile_name_coding_system;
51 Lisp_Object Vcoded_charset_entity_reference_alist;
53 /* Table of symbols identifying each coding category. */
54 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
58 struct file_coding_dump {
59 /* Coding system currently associated with each coding category. */
60 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
62 /* Table of all coding categories in decreasing order of priority.
63 This describes a permutation of the possible coding categories. */
64 int coding_category_by_priority[CODING_CATEGORY_LAST];
66 #if defined(MULE) && !defined(UTF2000)
67 Lisp_Object ucs_to_mule_table[65536];
71 static const struct lrecord_description fcd_description_1[] = {
72 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
73 #if defined(MULE) && !defined(UTF2000)
74 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
79 static const struct struct_description fcd_description = {
80 sizeof (struct file_coding_dump),
84 Lisp_Object mule_to_ucs_table;
86 Lisp_Object Qcoding_systemp;
88 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
89 /* Qinternal in general.c */
91 Lisp_Object Qmnemonic, Qeol_type;
92 Lisp_Object Qcr, Qcrlf, Qlf;
93 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
94 Lisp_Object Qpost_read_conversion;
95 Lisp_Object Qpre_write_conversion;
98 Lisp_Object Qucs4, Qutf16, Qutf8;
99 Lisp_Object Qbig5, Qshift_jis;
100 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
101 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
102 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
103 Lisp_Object Qno_iso6429;
104 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
105 Lisp_Object Qescape_quoted;
106 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
109 Lisp_Object Qutf_8_mcs;
110 Lisp_Object Qdisable_composition;
111 Lisp_Object Qccs_priority_list;
112 Lisp_Object Quse_entity_reference;
113 Lisp_Object Qd, Qx, QX;
115 Lisp_Object Qencode, Qdecode;
117 Lisp_Object Vcoding_system_hash_table;
119 int enable_multibyte_characters;
122 /* Additional information used by the ISO2022 decoder and detector. */
123 struct iso2022_decoder
125 /* CHARSET holds the character sets currently assigned to the G0
126 through G3 variables. It is initialized from the array
127 INITIAL_CHARSET in CODESYS. */
128 Lisp_Object charset[4];
130 /* Which registers are currently invoked into the left (GL) and
131 right (GR) halves of the 8-bit encoding space? */
132 int register_left, register_right;
134 /* ISO_ESC holds a value indicating part of an escape sequence
135 that has already been seen. */
136 enum iso_esc_flag esc;
138 /* This records the bytes we've seen so far in an escape sequence,
139 in case the sequence is invalid (we spit out the bytes unchanged). */
140 unsigned char esc_bytes[8];
142 /* Index for next byte to store in ISO escape sequence. */
145 #ifdef ENABLE_COMPOSITE_CHARS
146 /* Stuff seen so far when composing a string. */
147 unsigned_char_dynarr *composite_chars;
150 /* If we saw an invalid designation sequence for a particular
151 register, we flag it here and switch to ASCII. The next time we
152 see a valid designation for this register, we turn off the flag
153 and do the designation normally, but pretend the sequence was
154 invalid. The effect of all this is that (most of the time) the
155 escape sequences for both the switch to the unknown charset, and
156 the switch back to the known charset, get inserted literally into
157 the buffer and saved out as such. The hope is that we can
158 preserve the escape sequences so that the resulting written out
159 file makes sense. If we don't do any of this, the designation
160 to the invalid charset will be preserved but that switch back
161 to the known charset will probably get eaten because it was
162 the same charset that was already present in the register. */
163 unsigned char invalid_designated[4];
165 /* We try to do similar things as above for direction-switching
166 sequences. If we encountered a direction switch while an
167 invalid designation was present, or an invalid designation
168 just after a direction switch (i.e. no valid designation
169 encountered yet), we insert the direction-switch escape
170 sequence literally into the output stream, and later on
171 insert the corresponding direction-restoring escape sequence
173 unsigned int switched_dir_and_no_valid_charset_yet :1;
174 unsigned int invalid_switch_dir :1;
176 /* Tells the decoder to output the escape sequence literally
177 even though it was valid. Used in the games we play to
178 avoid lossage when we encounter invalid designations. */
179 unsigned int output_literally :1;
180 /* We encountered a direction switch followed by an invalid
181 designation. We didn't output the direction switch
182 literally because we didn't know about the invalid designation;
183 but we have to do so now. */
184 unsigned int output_direction_sequence :1;
187 EXFUN (Fcopy_coding_system, 2);
189 struct detection_state;
192 text_encode_generic (Lstream *encoding, const Bufbyte *src,
193 unsigned_char_dynarr *dst, Lstream_data_count n);
195 static int detect_coding_sjis (struct detection_state *st,
196 const Extbyte *src, Lstream_data_count n);
197 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
198 unsigned_char_dynarr *dst, Lstream_data_count n);
199 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
200 unsigned_char_dynarr *dst, unsigned int *flags);
201 void char_finish_shift_jis (struct encoding_stream *str,
202 unsigned_char_dynarr *dst, unsigned int *flags);
204 static int detect_coding_big5 (struct detection_state *st,
205 const Extbyte *src, Lstream_data_count n);
206 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
207 unsigned_char_dynarr *dst, Lstream_data_count n);
208 void char_encode_big5 (struct encoding_stream *str, Emchar c,
209 unsigned_char_dynarr *dst, unsigned int *flags);
210 void char_finish_big5 (struct encoding_stream *str,
211 unsigned_char_dynarr *dst, unsigned int *flags);
213 static int detect_coding_ucs4 (struct detection_state *st,
214 const Extbyte *src, Lstream_data_count n);
215 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
216 unsigned_char_dynarr *dst, Lstream_data_count n);
217 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
218 unsigned_char_dynarr *dst, unsigned int *flags);
219 void char_finish_ucs4 (struct encoding_stream *str,
220 unsigned_char_dynarr *dst, unsigned int *flags);
222 static int detect_coding_utf16 (struct detection_state *st,
223 const Extbyte *src, Lstream_data_count n);
224 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
225 unsigned_char_dynarr *dst, Lstream_data_count n);
226 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
227 unsigned_char_dynarr *dst, unsigned int *flags);
228 void char_finish_utf16 (struct encoding_stream *str,
229 unsigned_char_dynarr *dst, unsigned int *flags);
231 static int detect_coding_utf8 (struct detection_state *st,
232 const Extbyte *src, Lstream_data_count n);
233 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
234 unsigned_char_dynarr *dst, Lstream_data_count n);
235 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
236 unsigned_char_dynarr *dst, unsigned int *flags);
237 void char_finish_utf8 (struct encoding_stream *str,
238 unsigned_char_dynarr *dst, unsigned int *flags);
240 static int postprocess_iso2022_mask (int mask);
241 static void reset_iso2022 (Lisp_Object coding_system,
242 struct iso2022_decoder *iso);
243 static int detect_coding_iso2022 (struct detection_state *st,
244 const Extbyte *src, Lstream_data_count n);
245 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
246 unsigned_char_dynarr *dst, Lstream_data_count n);
247 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
248 unsigned_char_dynarr *dst, unsigned int *flags);
249 void char_finish_iso2022 (struct encoding_stream *str,
250 unsigned_char_dynarr *dst, unsigned int *flags);
252 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
253 unsigned_char_dynarr *dst, Lstream_data_count n);
254 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
255 unsigned_char_dynarr *dst, Lstream_data_count n);
256 static void mule_decode (Lstream *decoding, const Extbyte *src,
257 unsigned_char_dynarr *dst, Lstream_data_count n);
258 static void mule_encode (Lstream *encoding, const Bufbyte *src,
259 unsigned_char_dynarr *dst, Lstream_data_count n);
261 typedef struct codesys_prop codesys_prop;
270 Dynarr_declare (codesys_prop);
271 } codesys_prop_dynarr;
273 static const struct lrecord_description codesys_prop_description_1[] = {
274 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
278 static const struct struct_description codesys_prop_description = {
279 sizeof (codesys_prop),
280 codesys_prop_description_1
283 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
284 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
288 static const struct struct_description codesys_prop_dynarr_description = {
289 sizeof (codesys_prop_dynarr),
290 codesys_prop_dynarr_description_1
293 codesys_prop_dynarr *the_codesys_prop_dynarr;
295 enum codesys_prop_enum
298 CODESYS_PROP_ISO2022,
303 /************************************************************************/
304 /* Coding system functions */
305 /************************************************************************/
307 static Lisp_Object mark_coding_system (Lisp_Object);
308 static void print_coding_system (Lisp_Object, Lisp_Object, int);
309 static void finalize_coding_system (void *header, int for_disksave);
312 static const struct lrecord_description ccs_description_1[] = {
313 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
314 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
318 static const struct struct_description ccs_description = {
319 sizeof (charset_conversion_spec),
323 static const struct lrecord_description ccsd_description_1[] = {
324 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
328 static const struct struct_description ccsd_description = {
329 sizeof (charset_conversion_spec_dynarr),
334 static const struct lrecord_description coding_system_description[] = {
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
341 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
342 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
344 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
345 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
346 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
347 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
348 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
350 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
356 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
357 mark_coding_system, print_coding_system,
358 finalize_coding_system,
359 0, 0, coding_system_description,
363 mark_coding_system (Lisp_Object obj)
365 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
367 mark_object (CODING_SYSTEM_NAME (codesys));
368 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
369 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
370 mark_object (CODING_SYSTEM_EOL_LF (codesys));
371 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
372 mark_object (CODING_SYSTEM_EOL_CR (codesys));
374 switch (CODING_SYSTEM_TYPE (codesys))
378 case CODESYS_ISO2022:
379 for (i = 0; i < 4; i++)
380 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
381 if (codesys->iso2022.input_conv)
383 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
385 struct charset_conversion_spec *ccs =
386 Dynarr_atp (codesys->iso2022.input_conv, i);
387 mark_object (ccs->from_charset);
388 mark_object (ccs->to_charset);
391 if (codesys->iso2022.output_conv)
393 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
395 struct charset_conversion_spec *ccs =
396 Dynarr_atp (codesys->iso2022.output_conv, i);
397 mark_object (ccs->from_charset);
398 mark_object (ccs->to_charset);
405 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
406 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
411 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
412 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
419 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
421 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
423 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
427 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
430 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
432 error ("printing unreadable object #<coding_system 0x%x>",
435 write_c_string ("#<coding_system ", printcharfun);
436 print_internal (c->name, printcharfun, 1);
437 write_c_string (">", printcharfun);
441 finalize_coding_system (void *header, int for_disksave)
443 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
444 /* Since coding systems never go away, this function is not
445 necessary. But it would be necessary if we changed things
446 so that coding systems could go away. */
447 if (!for_disksave) /* see comment in lstream.c */
449 switch (CODING_SYSTEM_TYPE (c))
452 case CODESYS_ISO2022:
453 if (c->iso2022.input_conv)
455 Dynarr_free (c->iso2022.input_conv);
456 c->iso2022.input_conv = 0;
458 if (c->iso2022.output_conv)
460 Dynarr_free (c->iso2022.output_conv);
461 c->iso2022.output_conv = 0;
472 symbol_to_eol_type (Lisp_Object symbol)
474 CHECK_SYMBOL (symbol);
475 if (NILP (symbol)) return EOL_AUTODETECT;
476 if (EQ (symbol, Qlf)) return EOL_LF;
477 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
478 if (EQ (symbol, Qcr)) return EOL_CR;
480 signal_simple_error ("Unrecognized eol type", symbol);
481 return EOL_AUTODETECT; /* not reached */
485 eol_type_to_symbol (eol_type_t type)
490 case EOL_LF: return Qlf;
491 case EOL_CRLF: return Qcrlf;
492 case EOL_CR: return Qcr;
493 case EOL_AUTODETECT: return Qnil;
498 setup_eol_coding_systems (Lisp_Coding_System *codesys)
500 Lisp_Object codesys_obj;
501 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
502 char *codesys_name = (char *) alloca (len + 7);
504 char *codesys_mnemonic=0;
506 Lisp_Object codesys_name_sym, sub_codesys_obj;
510 XSETCODING_SYSTEM (codesys_obj, codesys);
512 memcpy (codesys_name,
513 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
515 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
517 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
518 codesys_mnemonic = (char *) alloca (mlen + 7);
519 memcpy (codesys_mnemonic,
520 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
523 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
524 strcpy (codesys_name + len, "-" op_sys); \
526 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
527 codesys_name_sym = intern (codesys_name); \
528 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
529 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
531 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
532 build_string (codesys_mnemonic); \
533 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
536 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
537 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
538 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
541 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
542 Return t if OBJECT is a coding system.
543 A coding system is an object that defines how text containing multiple
544 character sets is encoded into a stream of (typically 8-bit) bytes.
545 The coding system is used to decode the stream into a series of
546 characters (which may be from multiple charsets) when the text is read
547 from a file or process, and is used to encode the text back into the
548 same format when it is written out to a file or process.
550 For example, many ISO2022-compliant coding systems (such as Compound
551 Text, which is used for inter-client data under the X Window System)
552 use escape sequences to switch between different charsets -- Japanese
553 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
554 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
555 `make-coding-system' for more information.
557 Coding systems are normally identified using a symbol, and the
558 symbol is accepted in place of the actual coding system object whenever
559 a coding system is called for. (This is similar to how faces work.)
563 return CODING_SYSTEMP (object) ? Qt : Qnil;
566 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
567 Retrieve the coding system of the given name.
569 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
570 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
571 If there is no such coding system, nil is returned. Otherwise the
572 associated coding system object is returned.
574 (coding_system_or_name))
576 if (NILP (coding_system_or_name))
577 coding_system_or_name = Qbinary;
578 else if (CODING_SYSTEMP (coding_system_or_name))
579 return coding_system_or_name;
581 CHECK_SYMBOL (coding_system_or_name);
585 coding_system_or_name =
586 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
588 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
589 return coding_system_or_name;
593 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
594 Retrieve the coding system of the given name.
595 Same as `find-coding-system' except that if there is no such
596 coding system, an error is signaled instead of returning nil.
600 Lisp_Object coding_system = Ffind_coding_system (name);
602 if (NILP (coding_system))
603 signal_simple_error ("No such coding system", name);
604 return coding_system;
607 /* We store the coding systems in hash tables with the names as the key and the
608 actual coding system object as the value. Occasionally we need to use them
609 in a list format. These routines provide us with that. */
610 struct coding_system_list_closure
612 Lisp_Object *coding_system_list;
616 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
617 void *coding_system_list_closure)
619 /* This function can GC */
620 struct coding_system_list_closure *cscl =
621 (struct coding_system_list_closure *) coding_system_list_closure;
622 Lisp_Object *coding_system_list = cscl->coding_system_list;
624 *coding_system_list = Fcons (key, *coding_system_list);
628 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
629 Return a list of the names of all defined coding systems.
633 Lisp_Object coding_system_list = Qnil;
635 struct coding_system_list_closure coding_system_list_closure;
637 GCPRO1 (coding_system_list);
638 coding_system_list_closure.coding_system_list = &coding_system_list;
639 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
640 &coding_system_list_closure);
643 return coding_system_list;
646 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
647 Return the name of the given coding system.
651 coding_system = Fget_coding_system (coding_system);
652 return XCODING_SYSTEM_NAME (coding_system);
655 static Lisp_Coding_System *
656 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
658 Lisp_Coding_System *codesys =
659 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
661 zero_lcrecord (codesys);
662 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
663 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
664 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
665 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
666 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
667 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
668 CODING_SYSTEM_TYPE (codesys) = type;
669 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
672 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
674 if (type == CODESYS_ISO2022)
677 for (i = 0; i < 4; i++)
678 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
681 if (type == CODESYS_UTF8)
683 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
685 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
689 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
692 else if (type == CODESYS_BIG5)
694 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
696 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
697 = Vcharset_chinese_big5;
698 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
700 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
704 else if (type == CODESYS_CCL)
706 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
707 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
710 CODING_SYSTEM_NAME (codesys) = name;
716 /* Given a list of charset conversion specs as specified in a Lisp
717 program, parse it into STORE_HERE. */
720 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
721 Lisp_Object spec_list)
725 EXTERNAL_LIST_LOOP (rest, spec_list)
727 Lisp_Object car = XCAR (rest);
728 Lisp_Object from, to;
729 struct charset_conversion_spec spec;
731 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
732 signal_simple_error ("Invalid charset conversion spec", car);
733 from = Fget_charset (XCAR (car));
734 to = Fget_charset (XCAR (XCDR (car)));
735 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
736 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
737 signal_simple_error_2
738 ("Attempted conversion between different charset types",
740 spec.from_charset = from;
741 spec.to_charset = to;
743 Dynarr_add (store_here, spec);
747 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
748 specs, return the equivalent as the Lisp programmer would see it.
750 If LOAD_HERE is 0, return Qnil. */
753 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
760 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
762 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
763 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
766 return Fnreverse (result);
771 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
772 Register symbol NAME as a coding system.
774 TYPE describes the conversion method used and should be one of
777 Automatic conversion. XEmacs attempts to detect the coding system
780 No conversion. Use this for binary files and such. On output,
781 graphic characters that are not in ASCII or Latin-1 will be
782 replaced by a ?. (For a no-conversion-encoded buffer, these
783 characters will only be present if you explicitly insert them.)
785 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
787 ISO 10646 UCS-4 encoding.
789 ISO 10646 UTF-8 encoding.
791 Any ISO2022-compliant encoding. Among other things, this includes
792 JIS (the Japanese encoding commonly used for e-mail), EUC (the
793 standard Unix encoding for Japanese and other languages), and
794 Compound Text (the encoding used in X11). You can specify more
795 specific information about the conversion with the PROPS argument.
797 Big5 (the encoding commonly used for Taiwanese).
799 The conversion is performed using a user-written pseudo-code
800 program. CCL (Code Conversion Language) is the name of this
803 Write out or read in the raw contents of the memory representing
804 the buffer's text. This is primarily useful for debugging
805 purposes, and is only enabled when XEmacs has been compiled with
806 DEBUG_XEMACS defined (via the --debug configure option).
807 WARNING: Reading in a file using 'internal conversion can result
808 in an internal inconsistency in the memory representing a
809 buffer's text, which will produce unpredictable results and may
810 cause XEmacs to crash. Under normal circumstances you should
811 never use 'internal conversion.
813 DOC-STRING is a string describing the coding system.
815 PROPS is a property list, describing the specific nature of the
816 character set. Recognized properties are:
819 String to be displayed in the modeline when this coding system is
823 End-of-line conversion to be used. It should be one of
826 Automatically detect the end-of-line type (LF, CRLF,
827 or CR). Also generate subsidiary coding systems named
828 `NAME-unix', `NAME-dos', and `NAME-mac', that are
829 identical to this coding system but have an EOL-TYPE
830 value of 'lf, 'crlf, and 'cr, respectively.
832 The end of a line is marked externally using ASCII LF.
833 Since this is also the way that XEmacs represents an
834 end-of-line internally, specifying this option results
835 in no end-of-line conversion. This is the standard
836 format for Unix text files.
838 The end of a line is marked externally using ASCII
839 CRLF. This is the standard format for MS-DOS text
842 The end of a line is marked externally using ASCII CR.
843 This is the standard format for Macintosh text files.
845 Automatically detect the end-of-line type but do not
846 generate subsidiary coding systems. (This value is
847 converted to nil when stored internally, and
848 `coding-system-property' will return nil.)
851 If non-nil, composition/decomposition for combining characters
854 'use-entity-reference
855 If non-nil, SGML style entity-reference is used for non-system-characters.
857 'post-read-conversion
858 Function called after a file has been read in, to perform the
859 decoding. Called with two arguments, START and END, denoting
860 a region of the current buffer to be decoded.
862 'pre-write-conversion
863 Function called before a file is written out, to perform the
864 encoding. Called with two arguments, START and END, denoting
865 a region of the current buffer to be encoded.
868 The following additional properties are recognized if TYPE is 'iso2022:
874 The character set initially designated to the G0 - G3 registers.
875 The value should be one of
877 -- A charset object (designate that character set)
878 -- nil (do not ever use this register)
879 -- t (no character set is initially designated to
880 the register, but may be later on; this automatically
881 sets the corresponding `force-g*-on-output' property)
887 If non-nil, send an explicit designation sequence on output before
888 using the specified register.
891 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
892 "ESC $ B" on output in place of the full designation sequences
893 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
896 If non-nil, don't designate ASCII to G0 at each end of line on output.
897 Setting this to non-nil also suppresses other state-resetting that
898 normally happens at the end of a line.
901 If non-nil, don't designate ASCII to G0 before control chars on output.
904 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
908 If non-nil, use locking-shift (SO/SI) instead of single-shift
909 or designation by escape sequence.
912 If non-nil, don't use ISO6429's direction specification.
915 If non-nil, literal control characters that are the same as
916 the beginning of a recognized ISO2022 or ISO6429 escape sequence
917 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
918 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
919 so that they can be properly distinguished from an escape sequence.
920 (Note that doing this results in a non-portable encoding.) This
921 encoding flag is used for byte-compiled files. Note that ESC
922 is a good choice for a quoting character because there are no
923 escape sequences whose second byte is a character from the Control-0
924 or Control-1 character sets; this is explicitly disallowed by the
927 'input-charset-conversion
928 A list of conversion specifications, specifying conversion of
929 characters in one charset to another when decoding is performed.
930 Each specification is a list of two elements: the source charset,
931 and the destination charset.
933 'output-charset-conversion
934 A list of conversion specifications, specifying conversion of
935 characters in one charset to another when encoding is performed.
936 The form of each specification is the same as for
937 'input-charset-conversion.
940 The following additional properties are recognized (and required)
944 CCL program used for decoding (converting to internal format).
947 CCL program used for encoding (converting to external format).
949 (name, type, doc_string, props))
951 Lisp_Coding_System *codesys;
952 enum coding_system_type ty;
953 int need_to_setup_eol_systems = 1;
955 /* Convert type to constant */
956 if (NILP (type) || EQ (type, Qundecided))
957 { ty = CODESYS_AUTODETECT; }
959 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
960 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
961 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
962 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
963 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
964 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
965 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
967 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
969 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
972 signal_simple_error ("Invalid coding system type", type);
976 codesys = allocate_coding_system (ty, name);
978 if (NILP (doc_string))
979 doc_string = build_string ("");
981 CHECK_STRING (doc_string);
982 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
985 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
987 if (EQ (key, Qmnemonic))
990 CHECK_STRING (value);
991 CODING_SYSTEM_MNEMONIC (codesys) = value;
994 else if (EQ (key, Qeol_type))
996 need_to_setup_eol_systems = NILP (value);
999 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1002 else if (EQ (key, Qpost_read_conversion))
1003 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1004 else if (EQ (key, Qpre_write_conversion))
1005 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1007 else if (EQ (key, Qdisable_composition))
1008 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1009 else if (EQ (key, Quse_entity_reference))
1010 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1013 else if (ty == CODESYS_ISO2022)
1015 #define FROB_INITIAL_CHARSET(charset_num) \
1016 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1017 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1019 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1020 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1021 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1022 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1024 #define FROB_FORCE_CHARSET(charset_num) \
1025 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1027 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1028 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1029 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1030 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1032 #define FROB_BOOLEAN_PROPERTY(prop) \
1033 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1035 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1036 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1037 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1038 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1039 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1040 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1041 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1043 else if (EQ (key, Qinput_charset_conversion))
1045 codesys->iso2022.input_conv =
1046 Dynarr_new (charset_conversion_spec);
1047 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1050 else if (EQ (key, Qoutput_charset_conversion))
1052 codesys->iso2022.output_conv =
1053 Dynarr_new (charset_conversion_spec);
1054 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1058 else if (EQ (key, Qccs_priority_list))
1060 codesys->ccs_priority_list = value;
1064 signal_simple_error ("Unrecognized property", key);
1067 else if (ty == CODESYS_UTF8)
1069 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1070 else if (EQ (key, Qcharset_g1))
1071 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1072 else if (EQ (key, Qcharset_g2))
1073 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1075 signal_simple_error ("Unrecognized property", key);
1077 else if (ty == CODESYS_BIG5)
1079 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1080 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1082 signal_simple_error ("Unrecognized property", key);
1085 else if (EQ (type, Qccl))
1088 struct ccl_program test_ccl;
1091 /* Check key first. */
1092 if (EQ (key, Qdecode))
1093 suffix = "-ccl-decode";
1094 else if (EQ (key, Qencode))
1095 suffix = "-ccl-encode";
1097 signal_simple_error ("Unrecognized property", key);
1099 /* If value is vector, register it as a ccl program
1100 associated with an newly created symbol for
1101 backward compatibility. */
1102 if (VECTORP (value))
1104 sym = Fintern (concat2 (Fsymbol_name (name),
1105 build_string (suffix)),
1107 Fregister_ccl_program (sym, value);
1111 CHECK_SYMBOL (value);
1114 /* check if the given ccl programs are valid. */
1115 if (setup_ccl_program (&test_ccl, sym) < 0)
1116 signal_simple_error ("Invalid CCL program", value);
1118 if (EQ (key, Qdecode))
1119 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1120 else if (EQ (key, Qencode))
1121 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1126 signal_simple_error ("Unrecognized property", key);
1130 if (need_to_setup_eol_systems)
1131 setup_eol_coding_systems (codesys);
1134 Lisp_Object codesys_obj;
1135 XSETCODING_SYSTEM (codesys_obj, codesys);
1136 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1141 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1142 Copy OLD-CODING-SYSTEM to NEW-NAME.
1143 If NEW-NAME does not name an existing coding system, a new one will
1146 (old_coding_system, new_name))
1148 Lisp_Object new_coding_system;
1149 old_coding_system = Fget_coding_system (old_coding_system);
1150 new_coding_system = Ffind_coding_system (new_name);
1151 if (NILP (new_coding_system))
1153 XSETCODING_SYSTEM (new_coding_system,
1154 allocate_coding_system
1155 (XCODING_SYSTEM_TYPE (old_coding_system),
1157 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1161 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1162 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1163 memcpy (((char *) to ) + sizeof (to->header),
1164 ((char *) from) + sizeof (from->header),
1165 sizeof (*from) - sizeof (from->header));
1166 to->name = new_name;
1168 return new_coding_system;
1171 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1172 Return t if OBJECT names a coding system, and is not a coding system alias.
1176 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1180 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1181 Return t if OBJECT is a coding system alias.
1182 All coding system aliases are created by `define-coding-system-alias'.
1186 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1190 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1191 Return the coding-system symbol for which symbol ALIAS is an alias.
1195 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1196 if (SYMBOLP (aliasee))
1199 signal_simple_error ("Symbol is not a coding system alias", alias);
1200 return Qnil; /* To keep the compiler happy */
1204 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1206 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1210 /* A maphash function, for removing dangling coding system aliases. */
1212 dangling_coding_system_alias_p (Lisp_Object alias,
1213 Lisp_Object aliasee,
1214 void *dangling_aliases)
1216 if (SYMBOLP (aliasee)
1217 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1219 (*(int *) dangling_aliases)++;
1226 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1227 Define symbol ALIAS as an alias for coding system ALIASEE.
1229 You can use this function to redefine an alias that has already been defined,
1230 but you cannot redefine a name which is the canonical name for a coding system.
1231 \(a canonical name of a coding system is what is returned when you call
1232 `coding-system-name' on a coding system).
1234 ALIASEE itself can be an alias, which allows you to define nested aliases.
1236 You are forbidden, however, from creating alias loops or `dangling' aliases.
1237 These will be detected, and an error will be signaled if you attempt to do so.
1239 If ALIASEE is nil, then ALIAS will simply be undefined.
1241 See also `coding-system-alias-p', `coding-system-aliasee',
1242 and `coding-system-canonical-name-p'.
1246 Lisp_Object real_coding_system, probe;
1248 CHECK_SYMBOL (alias);
1250 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1252 ("Symbol is the canonical name of a coding system and cannot be redefined",
1257 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1258 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1259 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1261 Fremhash (alias, Vcoding_system_hash_table);
1263 /* Undefine subsidiary aliases,
1264 presumably created by a previous call to this function */
1265 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1266 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1267 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1269 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1270 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1271 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1274 /* Undefine dangling coding system aliases. */
1276 int dangling_aliases;
1279 dangling_aliases = 0;
1280 elisp_map_remhash (dangling_coding_system_alias_p,
1281 Vcoding_system_hash_table,
1283 } while (dangling_aliases > 0);
1289 if (CODING_SYSTEMP (aliasee))
1290 aliasee = XCODING_SYSTEM_NAME (aliasee);
1292 /* Checks that aliasee names a coding-system */
1293 real_coding_system = Fget_coding_system (aliasee);
1295 /* Check for coding system alias loops */
1296 if (EQ (alias, aliasee))
1297 alias_loop: signal_simple_error_2
1298 ("Attempt to create a coding system alias loop", alias, aliasee);
1300 for (probe = aliasee;
1302 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1304 if (EQ (probe, alias))
1308 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1310 /* Set up aliases for subsidiaries.
1311 #### There must be a better way to handle subsidiary coding systems. */
1313 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1315 for (i = 0; i < countof (suffixes); i++)
1317 Lisp_Object alias_subsidiary =
1318 append_suffix_to_symbol (alias, suffixes[i]);
1319 Lisp_Object aliasee_subsidiary =
1320 append_suffix_to_symbol (aliasee, suffixes[i]);
1322 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1323 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1326 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1327 but it doesn't look intentional, so I'd rather return something
1328 meaningful or nothing at all. */
1333 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1335 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1336 Lisp_Object new_coding_system;
1338 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1339 return coding_system;
1343 case EOL_AUTODETECT: return coding_system;
1344 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1345 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1346 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1347 default: abort (); return Qnil;
1350 return NILP (new_coding_system) ? coding_system : new_coding_system;
1353 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1354 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1356 (coding_system, eol_type))
1358 coding_system = Fget_coding_system (coding_system);
1360 return subsidiary_coding_system (coding_system,
1361 symbol_to_eol_type (eol_type));
1365 /************************************************************************/
1366 /* Coding system accessors */
1367 /************************************************************************/
1369 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1370 Return the doc string for CODING-SYSTEM.
1374 coding_system = Fget_coding_system (coding_system);
1375 return XCODING_SYSTEM_DOC_STRING (coding_system);
1378 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1379 Return the type of CODING-SYSTEM.
1383 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1386 case CODESYS_AUTODETECT: return Qundecided;
1388 case CODESYS_SHIFT_JIS: return Qshift_jis;
1389 case CODESYS_ISO2022: return Qiso2022;
1390 case CODESYS_BIG5: return Qbig5;
1391 case CODESYS_UCS4: return Qucs4;
1392 case CODESYS_UTF16: return Qutf16;
1393 case CODESYS_UTF8: return Qutf8;
1394 case CODESYS_CCL: return Qccl;
1396 case CODESYS_NO_CONVERSION: return Qno_conversion;
1398 case CODESYS_INTERNAL: return Qinternal;
1405 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1408 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1410 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1413 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1414 Return initial charset of CODING-SYSTEM designated to GNUM.
1417 (coding_system, gnum))
1419 coding_system = Fget_coding_system (coding_system);
1422 return coding_system_charset (coding_system, XINT (gnum));
1426 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1427 Return the PROP property of CODING-SYSTEM.
1429 (coding_system, prop))
1432 enum coding_system_type type;
1434 coding_system = Fget_coding_system (coding_system);
1435 CHECK_SYMBOL (prop);
1436 type = XCODING_SYSTEM_TYPE (coding_system);
1438 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1439 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1442 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1444 case CODESYS_PROP_ALL_OK:
1447 case CODESYS_PROP_ISO2022:
1448 if (type != CODESYS_ISO2022)
1450 ("Property only valid in ISO2022 coding systems",
1454 case CODESYS_PROP_CCL:
1455 if (type != CODESYS_CCL)
1457 ("Property only valid in CCL coding systems",
1467 signal_simple_error ("Unrecognized property", prop);
1469 if (EQ (prop, Qname))
1470 return XCODING_SYSTEM_NAME (coding_system);
1471 else if (EQ (prop, Qtype))
1472 return Fcoding_system_type (coding_system);
1473 else if (EQ (prop, Qdoc_string))
1474 return XCODING_SYSTEM_DOC_STRING (coding_system);
1475 else if (EQ (prop, Qmnemonic))
1476 return XCODING_SYSTEM_MNEMONIC (coding_system);
1477 else if (EQ (prop, Qeol_type))
1478 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1479 else if (EQ (prop, Qeol_lf))
1480 return XCODING_SYSTEM_EOL_LF (coding_system);
1481 else if (EQ (prop, Qeol_crlf))
1482 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1483 else if (EQ (prop, Qeol_cr))
1484 return XCODING_SYSTEM_EOL_CR (coding_system);
1485 else if (EQ (prop, Qpost_read_conversion))
1486 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1487 else if (EQ (prop, Qpre_write_conversion))
1488 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1491 else if (EQ (prop, Qdisable_composition))
1492 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1493 else if (EQ (prop, Quse_entity_reference))
1494 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1495 else if (EQ (prop, Qccs_priority_list))
1496 return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1498 else if (type == CODESYS_ISO2022)
1500 if (EQ (prop, Qcharset_g0))
1501 return coding_system_charset (coding_system, 0);
1502 else if (EQ (prop, Qcharset_g1))
1503 return coding_system_charset (coding_system, 1);
1504 else if (EQ (prop, Qcharset_g2))
1505 return coding_system_charset (coding_system, 2);
1506 else if (EQ (prop, Qcharset_g3))
1507 return coding_system_charset (coding_system, 3);
1509 #define FORCE_CHARSET(charset_num) \
1510 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1511 (coding_system, charset_num) ? Qt : Qnil)
1513 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1514 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1515 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1516 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1518 #define LISP_BOOLEAN(prop) \
1519 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1521 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1522 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1523 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1524 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1525 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1526 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1527 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1529 else if (EQ (prop, Qinput_charset_conversion))
1531 unparse_charset_conversion_specs
1532 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1533 else if (EQ (prop, Qoutput_charset_conversion))
1535 unparse_charset_conversion_specs
1536 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1540 else if (type == CODESYS_CCL)
1542 if (EQ (prop, Qdecode))
1543 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1544 else if (EQ (prop, Qencode))
1545 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1553 return Qnil; /* not reached */
1557 /************************************************************************/
1558 /* Coding category functions */
1559 /************************************************************************/
1562 decode_coding_category (Lisp_Object symbol)
1566 CHECK_SYMBOL (symbol);
1567 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1568 if (EQ (coding_category_symbol[i], symbol))
1571 signal_simple_error ("Unrecognized coding category", symbol);
1572 return 0; /* not reached */
1575 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1576 Return a list of all recognized coding categories.
1581 Lisp_Object list = Qnil;
1583 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1584 list = Fcons (coding_category_symbol[i], list);
1588 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1589 Change the priority order of the coding categories.
1590 LIST should be list of coding categories, in descending order of
1591 priority. Unspecified coding categories will be lower in priority
1592 than all specified ones, in the same relative order they were in
1597 int category_to_priority[CODING_CATEGORY_LAST];
1601 /* First generate a list that maps coding categories to priorities. */
1603 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1604 category_to_priority[i] = -1;
1606 /* Highest priority comes from the specified list. */
1608 EXTERNAL_LIST_LOOP (rest, list)
1610 int cat = decode_coding_category (XCAR (rest));
1612 if (category_to_priority[cat] >= 0)
1613 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1614 category_to_priority[cat] = i++;
1617 /* Now go through the existing categories by priority to retrieve
1618 the categories not yet specified and preserve their priority
1620 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1622 int cat = fcd->coding_category_by_priority[j];
1623 if (category_to_priority[cat] < 0)
1624 category_to_priority[cat] = i++;
1627 /* Now we need to construct the inverse of the mapping we just
1630 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1631 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1633 /* Phew! That was confusing. */
1637 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1638 Return a list of coding categories in descending order of priority.
1643 Lisp_Object list = Qnil;
1645 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1646 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1651 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1652 Change the coding system associated with a coding category.
1654 (coding_category, coding_system))
1656 int cat = decode_coding_category (coding_category);
1658 coding_system = Fget_coding_system (coding_system);
1659 fcd->coding_category_system[cat] = coding_system;
1663 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1664 Return the coding system associated with a coding category.
1668 int cat = decode_coding_category (coding_category);
1669 Lisp_Object sys = fcd->coding_category_system[cat];
1672 return XCODING_SYSTEM_NAME (sys);
1677 /************************************************************************/
1678 /* Detecting the encoding of data */
1679 /************************************************************************/
1681 struct detection_state
1683 eol_type_t eol_type;
1726 struct iso2022_decoder iso;
1728 int high_byte_count;
1729 unsigned int saw_single_shift:1;
1742 acceptable_control_char_p (int c)
1746 /* Allow and ignore control characters that you might
1747 reasonably see in a text file */
1752 case 8: /* backspace */
1753 case 11: /* vertical tab */
1754 case 12: /* form feed */
1755 case 26: /* MS-DOS C-z junk */
1756 case 31: /* '^_' -- for info */
1764 mask_has_at_most_one_bit_p (int mask)
1766 /* Perhaps the only thing useful you learn from intensive Microsoft
1767 technical interviews */
1768 return (mask & (mask - 1)) == 0;
1772 detect_eol_type (struct detection_state *st, const Extbyte *src,
1773 Lstream_data_count n)
1777 unsigned char c = *(unsigned char *)src++;
1780 if (st->eol.just_saw_cr)
1782 else if (st->eol.seen_anything)
1785 else if (st->eol.just_saw_cr)
1788 st->eol.just_saw_cr = 1;
1790 st->eol.just_saw_cr = 0;
1791 st->eol.seen_anything = 1;
1794 return EOL_AUTODETECT;
1797 /* Attempt to determine the encoding and EOL type of the given text.
1798 Before calling this function for the first type, you must initialize
1799 st->eol_type as appropriate and initialize st->mask to ~0.
1801 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1804 st->mask holds the determined coding category mask, or ~0 if only
1805 ASCII has been seen so far.
1809 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1810 is present in st->mask
1811 1 == definitive answers are here for both st->eol_type and st->mask
1815 detect_coding_type (struct detection_state *st, const Extbyte *src,
1816 Lstream_data_count n, int just_do_eol)
1818 if (st->eol_type == EOL_AUTODETECT)
1819 st->eol_type = detect_eol_type (st, src, n);
1822 return st->eol_type != EOL_AUTODETECT;
1824 if (!st->seen_non_ascii)
1826 for (; n; n--, src++)
1828 unsigned char c = *(unsigned char *) src;
1829 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1831 st->seen_non_ascii = 1;
1833 st->shift_jis.mask = ~0;
1836 st->utf16.mask = ~0;
1838 st->iso2022.mask = ~0;
1848 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1849 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1850 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1851 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1852 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1853 st->big5.mask = detect_coding_big5 (st, src, n);
1854 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1855 st->utf8.mask = detect_coding_utf8 (st, src, n);
1856 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1857 st->utf16.mask = detect_coding_utf16 (st, src, n);
1858 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1859 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1862 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1863 | st->utf8.mask | st->ucs4.mask;
1866 int retval = mask_has_at_most_one_bit_p (st->mask);
1867 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1868 return retval && st->eol_type != EOL_AUTODETECT;
1873 coding_system_from_mask (int mask)
1877 /* If the file was entirely or basically ASCII, use the
1878 default value of `buffer-file-coding-system'. */
1879 Lisp_Object retval =
1880 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1883 retval = Ffind_coding_system (retval);
1887 (Qbad_variable, Qwarning,
1888 "Invalid `default-buffer-file-coding-system', set to nil");
1889 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1893 retval = Fget_coding_system (Qraw_text);
1901 mask = postprocess_iso2022_mask (mask);
1903 /* Look through the coding categories by priority and find
1904 the first one that is allowed. */
1905 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1907 cat = fcd->coding_category_by_priority[i];
1908 if ((mask & (1 << cat)) &&
1909 !NILP (fcd->coding_category_system[cat]))
1913 return fcd->coding_category_system[cat];
1915 return Fget_coding_system (Qraw_text);
1919 /* Given a seekable read stream and potential coding system and EOL type
1920 as specified, do any autodetection that is called for. If the
1921 coding system and/or EOL type are not `autodetect', they will be left
1922 alone; but this function will never return an autodetect coding system
1925 This function does not automatically fetch subsidiary coding systems;
1926 that should be unnecessary with the explicit eol-type argument. */
1928 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1929 /* number of leading lines to check for a coding cookie */
1930 #define LINES_TO_CHECK 2
1933 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1934 eol_type_t *eol_type_in_out)
1936 struct detection_state decst;
1938 if (*eol_type_in_out == EOL_AUTODETECT)
1939 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1942 decst.eol_type = *eol_type_in_out;
1945 /* If autodetection is called for, do it now. */
1946 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1947 || *eol_type_in_out == EOL_AUTODETECT)
1950 Lisp_Object coding_system = Qnil;
1952 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1954 int lines_checked = 0;
1956 /* Look for initial "-*-"; mode line prefix */
1958 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1960 && lines_checked < LINES_TO_CHECK;
1962 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1964 Extbyte *local_vars_beg = p + 3;
1965 /* Look for final "-*-"; mode line suffix */
1966 for (p = local_vars_beg,
1967 scan_end = buf + nread - LENGTH ("-*-");
1969 && lines_checked < LINES_TO_CHECK;
1971 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1973 Extbyte *suffix = p;
1974 /* Look for "coding:" */
1975 for (p = local_vars_beg,
1976 scan_end = suffix - LENGTH ("coding:?");
1979 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1980 && (p == local_vars_beg
1981 || (*(p-1) == ' ' ||
1987 p += LENGTH ("coding:");
1988 while (*p == ' ' || *p == '\t') p++;
1990 /* Get coding system name */
1991 save = *suffix; *suffix = '\0';
1992 /* Characters valid in a MIME charset name (rfc 1521),
1993 and in a Lisp symbol name. */
1994 n = strspn ( (char *) p,
1995 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1996 "abcdefghijklmnopqrstuvwxyz"
2002 save = p[n]; p[n] = '\0';
2004 Ffind_coding_system (intern ((char *) p));
2011 /* #### file must use standard EOLs or we miss 2d line */
2012 /* #### not to mention this is broken for UTF-16 DOS files */
2013 else if (*p == '\n' || *p == '\r')
2016 /* skip past multibyte (DOS) newline */
2017 if (*p == '\r' && *(p+1) == '\n') p++;
2021 /* #### file must use standard EOLs or we miss 2d line */
2022 /* #### not to mention this is broken for UTF-16 DOS files */
2023 else if (*p == '\n' || *p == '\r')
2026 /* skip past multibyte (DOS) newline */
2027 if (*p == '\r' && *(p+1) == '\n') p++;
2030 if (NILP (coding_system))
2033 if (detect_coding_type (&decst, buf, nread,
2034 XCODING_SYSTEM_TYPE (*codesys_in_out)
2035 != CODESYS_AUTODETECT))
2037 nread = Lstream_read (stream, buf, sizeof (buf));
2043 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2044 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2047 if (detect_coding_type (&decst, buf, nread, 1))
2049 nread = Lstream_read (stream, buf, sizeof (buf));
2055 *eol_type_in_out = decst.eol_type;
2056 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2058 if (NILP (coding_system))
2059 *codesys_in_out = coding_system_from_mask (decst.mask);
2061 *codesys_in_out = coding_system;
2065 /* If we absolutely can't determine the EOL type, just assume LF. */
2066 if (*eol_type_in_out == EOL_AUTODETECT)
2067 *eol_type_in_out = EOL_LF;
2069 Lstream_rewind (stream);
2072 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2073 Detect coding system of the text in the region between START and END.
2074 Return a list of possible coding systems ordered by priority.
2075 If only ASCII characters are found, return 'undecided or one of
2076 its subsidiary coding systems according to a detected end-of-line
2077 type. Optional arg BUFFER defaults to the current buffer.
2079 (start, end, buffer))
2081 Lisp_Object val = Qnil;
2082 struct buffer *buf = decode_buffer (buffer, 0);
2084 Lisp_Object instream, lb_instream;
2085 Lstream *istr, *lb_istr;
2086 struct detection_state decst;
2087 struct gcpro gcpro1, gcpro2;
2089 get_buffer_range_char (buf, start, end, &b, &e, 0);
2090 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2091 lb_istr = XLSTREAM (lb_instream);
2092 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2093 istr = XLSTREAM (instream);
2094 GCPRO2 (instream, lb_instream);
2096 decst.eol_type = EOL_AUTODETECT;
2100 Extbyte random_buffer[4096];
2101 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2105 if (detect_coding_type (&decst, random_buffer, nread, 0))
2109 if (decst.mask == ~0)
2110 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2118 decst.mask = postprocess_iso2022_mask (decst.mask);
2120 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2122 int sys = fcd->coding_category_by_priority[i];
2123 if (decst.mask & (1 << sys))
2125 Lisp_Object codesys = fcd->coding_category_system[sys];
2126 if (!NILP (codesys))
2127 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2128 val = Fcons (codesys, val);
2132 Lstream_close (istr);
2134 Lstream_delete (istr);
2135 Lstream_delete (lb_istr);
2140 /************************************************************************/
2141 /* Converting to internal Mule format ("decoding") */
2142 /************************************************************************/
2144 /* A decoding stream is a stream used for decoding text (i.e.
2145 converting from some external format to internal format).
2146 The decoding-stream object keeps track of the actual coding
2147 stream, the stream that is at the other end, and data that
2148 needs to be persistent across the lifetime of the stream. */
2150 /* Handle the EOL stuff related to just-read-in character C.
2151 EOL_TYPE is the EOL type of the coding stream.
2152 FLAGS is the current value of FLAGS in the coding stream, and may
2153 be modified by this macro. (The macro only looks at the
2154 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2155 bytes are to be written. You need to also define a local goto
2156 label "label_continue_loop" that is at the end of the main
2157 character-reading loop.
2159 If C is a CR character, then this macro handles it entirely and
2160 jumps to label_continue_loop. Otherwise, this macro does not add
2161 anything to DST, and continues normally. You should continue
2162 processing C normally after this macro. */
2164 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2168 if (eol_type == EOL_CR) \
2169 Dynarr_add (dst, '\n'); \
2170 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2171 Dynarr_add (dst, c); \
2173 flags |= CODING_STATE_CR; \
2174 goto label_continue_loop; \
2176 else if (flags & CODING_STATE_CR) \
2177 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2179 Dynarr_add (dst, '\r'); \
2180 flags &= ~CODING_STATE_CR; \
2184 /* C should be a binary character in the range 0 - 255; convert
2185 to internal format and add to Dynarr DST. */
2188 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2190 if (BYTE_ASCII_P (c)) \
2191 Dynarr_add (dst, c); \
2194 Dynarr_add (dst, (c >> 6) | 0xc0); \
2195 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2199 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2201 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2205 Dynarr_add (dst, c);
2207 else if ( c <= 0x7ff )
2209 Dynarr_add (dst, (c >> 6) | 0xc0);
2210 Dynarr_add (dst, (c & 0x3f) | 0x80);
2212 else if ( c <= 0xffff )
2214 Dynarr_add (dst, (c >> 12) | 0xe0);
2215 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2216 Dynarr_add (dst, (c & 0x3f) | 0x80);
2218 else if ( c <= 0x1fffff )
2220 Dynarr_add (dst, (c >> 18) | 0xf0);
2221 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2222 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2223 Dynarr_add (dst, (c & 0x3f) | 0x80);
2225 else if ( c <= 0x3ffffff )
2227 Dynarr_add (dst, (c >> 24) | 0xf8);
2228 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2229 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2230 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2231 Dynarr_add (dst, (c & 0x3f) | 0x80);
2235 Dynarr_add (dst, (c >> 30) | 0xfc);
2236 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2237 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2238 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2239 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2240 Dynarr_add (dst, (c & 0x3f) | 0x80);
2244 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2246 if (BYTE_ASCII_P (c)) \
2247 Dynarr_add (dst, c); \
2248 else if (BYTE_C1_P (c)) \
2250 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2251 Dynarr_add (dst, c + 0x20); \
2255 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2256 Dynarr_add (dst, c); \
2261 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2265 DECODE_ADD_BINARY_CHAR (ch, dst); \
2270 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2272 if (flags & CODING_STATE_END) \
2274 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2275 if (flags & CODING_STATE_CR) \
2276 Dynarr_add (dst, '\r'); \
2280 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2282 #define ER_BUF_SIZE 24
2284 struct decoding_stream
2286 /* Coding system that governs the conversion. */
2287 Lisp_Coding_System *codesys;
2289 /* Stream that we read the encoded data from or
2290 write the decoded data to. */
2293 /* If we are reading, then we can return only a fixed amount of
2294 data, so if the conversion resulted in too much data, we store it
2295 here for retrieval the next time around. */
2296 unsigned_char_dynarr *runoff;
2298 /* FLAGS holds flags indicating the current state of the decoding.
2299 Some of these flags are dependent on the coding system. */
2302 /* CPOS holds a partially built-up code-point of character. */
2305 /* EOL_TYPE specifies the type of end-of-line conversion that
2306 currently applies. We need to keep this separate from the
2307 EOL type stored in CODESYS because the latter might indicate
2308 automatic EOL-type detection while the former will always
2309 indicate a particular EOL type. */
2310 eol_type_t eol_type;
2312 /* Additional ISO2022 information. We define the structure above
2313 because it's also needed by the detection routines. */
2314 struct iso2022_decoder iso2022;
2316 /* Additional information (the state of the running CCL program)
2317 used by the CCL decoder. */
2318 struct ccl_program ccl;
2320 /* counter for UTF-8 or UCS-4 */
2321 unsigned char counter;
2325 unsigned char er_counter;
2326 unsigned char er_buf[ER_BUF_SIZE];
2328 unsigned combined_char_count;
2329 Emchar combined_chars[16];
2330 Lisp_Object combining_table;
2332 struct detection_state decst;
2335 static Lstream_data_count decoding_reader (Lstream *stream,
2336 unsigned char *data, Lstream_data_count size);
2337 static Lstream_data_count decoding_writer (Lstream *stream,
2338 const unsigned char *data, Lstream_data_count size);
2339 static int decoding_rewinder (Lstream *stream);
2340 static int decoding_seekable_p (Lstream *stream);
2341 static int decoding_flusher (Lstream *stream);
2342 static int decoding_closer (Lstream *stream);
2344 static Lisp_Object decoding_marker (Lisp_Object stream);
2346 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2347 sizeof (struct decoding_stream));
2350 decoding_marker (Lisp_Object stream)
2352 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2353 Lisp_Object str_obj;
2355 /* We do not need to mark the coding systems or charsets stored
2356 within the stream because they are stored in a global list
2357 and automatically marked. */
2359 XSETLSTREAM (str_obj, str);
2360 mark_object (str_obj);
2361 if (str->imp->marker)
2362 return (str->imp->marker) (str_obj);
2367 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2368 so we read data from the other end, decode it, and store it into DATA. */
2370 static Lstream_data_count
2371 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2373 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2374 unsigned char *orig_data = data;
2375 Lstream_data_count read_size;
2376 int error_occurred = 0;
2378 /* We need to interface to mule_decode(), which expects to take some
2379 amount of data and store the result into a Dynarr. We have
2380 mule_decode() store into str->runoff, and take data from there
2383 /* We loop until we have enough data, reading chunks from the other
2384 end and decoding it. */
2387 /* Take data from the runoff if we can. Make sure to take at
2388 most SIZE bytes, and delete the data from the runoff. */
2389 if (Dynarr_length (str->runoff) > 0)
2391 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2392 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2393 Dynarr_delete_many (str->runoff, 0, chunk);
2399 break; /* No more room for data */
2401 if (str->flags & CODING_STATE_END)
2402 /* This means that on the previous iteration, we hit the EOF on
2403 the other end. We loop once more so that mule_decode() can
2404 output any final stuff it may be holding, or any "go back
2405 to a sane state" escape sequences. (This latter makes sense
2406 during encoding.) */
2409 /* Exhausted the runoff, so get some more. DATA has at least
2410 SIZE bytes left of storage in it, so it's OK to read directly
2411 into it. (We'll be overwriting above, after we've decoded it
2412 into the runoff.) */
2413 read_size = Lstream_read (str->other_end, data, size);
2420 /* There might be some more end data produced in the translation.
2421 See the comment above. */
2422 str->flags |= CODING_STATE_END;
2423 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2426 if (data - orig_data == 0)
2427 return error_occurred ? -1 : 0;
2429 return data - orig_data;
2432 static Lstream_data_count
2433 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2435 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2436 Lstream_data_count retval;
2438 /* Decode all our data into the runoff, and then attempt to write
2439 it all out to the other end. Remove whatever chunk we succeeded
2441 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2442 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2443 Dynarr_length (str->runoff));
2445 Dynarr_delete_many (str->runoff, 0, retval);
2446 /* Do NOT return retval. The return value indicates how much
2447 of the incoming data was written, not how many bytes were
2453 reset_decoding_stream (struct decoding_stream *str)
2456 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2458 Lisp_Object coding_system;
2459 XSETCODING_SYSTEM (coding_system, str->codesys);
2460 reset_iso2022 (coding_system, &str->iso2022);
2462 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2464 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2470 str->er_counter = 0;
2471 str->combined_char_count = 0;
2472 str->combining_table = Qnil;
2474 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2475 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2478 str->decst.eol_type = EOL_AUTODETECT;
2479 str->decst.mask = ~0;
2481 str->flags = str->cpos = 0;
2485 decoding_rewinder (Lstream *stream)
2487 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2488 reset_decoding_stream (str);
2489 Dynarr_reset (str->runoff);
2490 return Lstream_rewind (str->other_end);
2494 decoding_seekable_p (Lstream *stream)
2496 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2497 return Lstream_seekable_p (str->other_end);
2501 decoding_flusher (Lstream *stream)
2503 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2504 return Lstream_flush (str->other_end);
2508 decoding_closer (Lstream *stream)
2510 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2511 if (stream->flags & LSTREAM_FL_WRITE)
2513 str->flags |= CODING_STATE_END;
2514 decoding_writer (stream, 0, 0);
2516 Dynarr_free (str->runoff);
2518 #ifdef ENABLE_COMPOSITE_CHARS
2519 if (str->iso2022.composite_chars)
2520 Dynarr_free (str->iso2022.composite_chars);
2523 return Lstream_close (str->other_end);
2527 decoding_stream_coding_system (Lstream *stream)
2529 Lisp_Object coding_system;
2530 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2532 XSETCODING_SYSTEM (coding_system, str->codesys);
2533 return subsidiary_coding_system (coding_system, str->eol_type);
2537 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2539 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2540 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2542 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2543 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2544 reset_decoding_stream (str);
2547 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2548 stream for writing, no automatic code detection will be performed.
2549 The reason for this is that automatic code detection requires a
2550 seekable input. Things will also fail if you open a decoding
2551 stream for reading using a non-fully-specified coding system and
2552 a non-seekable input stream. */
2555 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2558 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2559 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2563 str->other_end = stream;
2564 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2565 str->eol_type = EOL_AUTODETECT;
2566 if (!strcmp (mode, "r")
2567 && Lstream_seekable_p (stream))
2568 /* We can determine the coding system now. */
2569 determine_real_coding_system (stream, &codesys, &str->eol_type);
2570 set_decoding_stream_coding_system (lstr, codesys);
2571 str->decst.eol_type = str->eol_type;
2572 str->decst.mask = ~0;
2573 XSETLSTREAM (obj, lstr);
2578 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2580 return make_decoding_stream_1 (stream, codesys, "r");
2584 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2586 return make_decoding_stream_1 (stream, codesys, "w");
2589 /* Note: the decode_coding_* functions all take the same
2590 arguments as mule_decode(), which is to say some SRC data of
2591 size N, which is to be stored into dynamic array DST.
2592 DECODING is the stream within which the decoding is
2593 taking place, but no data is actually read from or
2594 written to that stream; that is handled in decoding_reader()
2595 or decoding_writer(). This allows the same functions to
2596 be used for both reading and writing. */
2599 mule_decode (Lstream *decoding, const Extbyte *src,
2600 unsigned_char_dynarr *dst, Lstream_data_count n)
2602 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2604 /* If necessary, do encoding-detection now. We do this when
2605 we're a writing stream or a non-seekable reading stream,
2606 meaning that we can't just process the whole input,
2607 rewind, and start over. */
2609 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2610 str->eol_type == EOL_AUTODETECT)
2612 Lisp_Object codesys;
2614 XSETCODING_SYSTEM (codesys, str->codesys);
2615 detect_coding_type (&str->decst, src, n,
2616 CODING_SYSTEM_TYPE (str->codesys) !=
2617 CODESYS_AUTODETECT);
2618 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2619 str->decst.mask != ~0)
2620 /* #### This is cheesy. What we really ought to do is
2621 buffer up a certain amount of data so as to get a
2622 less random result. */
2623 codesys = coding_system_from_mask (str->decst.mask);
2624 str->eol_type = str->decst.eol_type;
2625 if (XCODING_SYSTEM (codesys) != str->codesys)
2627 /* Preserve the CODING_STATE_END flag in case it was set.
2628 If we erase it, bad things might happen. */
2629 int was_end = str->flags & CODING_STATE_END;
2630 set_decoding_stream_coding_system (decoding, codesys);
2632 str->flags |= CODING_STATE_END;
2636 switch (CODING_SYSTEM_TYPE (str->codesys))
2639 case CODESYS_INTERNAL:
2640 Dynarr_add_many (dst, src, n);
2643 case CODESYS_AUTODETECT:
2644 /* If we got this far and still haven't decided on the coding
2645 system, then do no conversion. */
2646 case CODESYS_NO_CONVERSION:
2647 decode_coding_no_conversion (decoding, src, dst, n);
2650 case CODESYS_SHIFT_JIS:
2651 decode_coding_sjis (decoding, src, dst, n);
2654 decode_coding_big5 (decoding, src, dst, n);
2657 decode_coding_ucs4 (decoding, src, dst, n);
2660 decode_coding_utf16 (decoding, src, dst, n);
2663 decode_coding_utf8 (decoding, src, dst, n);
2666 str->ccl.last_block = str->flags & CODING_STATE_END;
2667 /* When applying ccl program to stream, MUST NOT set NULL
2669 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2670 dst, n, 0, CCL_MODE_DECODING);
2672 case CODESYS_ISO2022:
2673 decode_coding_iso2022 (decoding, src, dst, n);
2681 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2682 Decode the text between START and END which is encoded in CODING-SYSTEM.
2683 This is useful if you've read in encoded text from a file without decoding
2684 it (e.g. you read in a JIS-formatted file but used the `binary' or
2685 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2686 Return length of decoded text.
2687 BUFFER defaults to the current buffer if unspecified.
2689 (start, end, coding_system, buffer))
2692 struct buffer *buf = decode_buffer (buffer, 0);
2693 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2694 Lstream *istr, *ostr;
2695 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2697 get_buffer_range_char (buf, start, end, &b, &e, 0);
2699 barf_if_buffer_read_only (buf, b, e);
2701 coding_system = Fget_coding_system (coding_system);
2702 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2703 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2704 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2706 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2707 Fget_coding_system (Qbinary));
2708 istr = XLSTREAM (instream);
2709 ostr = XLSTREAM (outstream);
2710 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2712 /* The chain of streams looks like this:
2714 [BUFFER] <----- send through
2715 ------> [ENCODE AS BINARY]
2716 ------> [DECODE AS SPECIFIED]
2722 char tempbuf[1024]; /* some random amount */
2723 Bufpos newpos, even_newer_pos;
2724 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2725 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2729 newpos = lisp_buffer_stream_startpos (istr);
2730 Lstream_write (ostr, tempbuf, size_in_bytes);
2731 even_newer_pos = lisp_buffer_stream_startpos (istr);
2732 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2735 Lstream_close (istr);
2736 Lstream_close (ostr);
2738 Lstream_delete (istr);
2739 Lstream_delete (ostr);
2740 Lstream_delete (XLSTREAM (de_outstream));
2741 Lstream_delete (XLSTREAM (lb_outstream));
2746 /************************************************************************/
2747 /* Converting to an external encoding ("encoding") */
2748 /************************************************************************/
2750 /* An encoding stream is an output stream. When you create the
2751 stream, you specify the coding system that governs the encoding
2752 and another stream that the resulting encoded data is to be
2753 sent to, and then start sending data to it. */
2755 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2757 struct encoding_stream
2759 /* Coding system that governs the conversion. */
2760 Lisp_Coding_System *codesys;
2762 /* Stream that we read the encoded data from or
2763 write the decoded data to. */
2766 /* If we are reading, then we can return only a fixed amount of
2767 data, so if the conversion resulted in too much data, we store it
2768 here for retrieval the next time around. */
2769 unsigned_char_dynarr *runoff;
2771 /* FLAGS holds flags indicating the current state of the encoding.
2772 Some of these flags are dependent on the coding system. */
2775 /* CH holds a partially built-up character. Since we only deal
2776 with one- and two-byte characters at the moment, we only use
2777 this to store the first byte of a two-byte character. */
2780 /* Additional information used by the ISO2022 encoder. */
2783 /* CHARSET holds the character sets currently assigned to the G0
2784 through G3 registers. It is initialized from the array
2785 INITIAL_CHARSET in CODESYS. */
2786 Lisp_Object charset[4];
2788 /* Which registers are currently invoked into the left (GL) and
2789 right (GR) halves of the 8-bit encoding space? */
2790 int register_left, register_right;
2792 /* Whether we need to explicitly designate the charset in the
2793 G? register before using it. It is initialized from the
2794 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2795 unsigned char force_charset_on_output[4];
2797 /* Other state variables that need to be preserved across
2799 Lisp_Object current_charset;
2801 int current_char_boundary;
2804 void (*encode_char) (struct encoding_stream *str, Emchar c,
2805 unsigned_char_dynarr *dst, unsigned int *flags);
2806 void (*finish) (struct encoding_stream *str,
2807 unsigned_char_dynarr *dst, unsigned int *flags);
2809 /* Additional information (the state of the running CCL program)
2810 used by the CCL encoder. */
2811 struct ccl_program ccl;
2815 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2816 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2817 Lstream_data_count size);
2818 static int encoding_rewinder (Lstream *stream);
2819 static int encoding_seekable_p (Lstream *stream);
2820 static int encoding_flusher (Lstream *stream);
2821 static int encoding_closer (Lstream *stream);
2823 static Lisp_Object encoding_marker (Lisp_Object stream);
2825 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2826 sizeof (struct encoding_stream));
2829 encoding_marker (Lisp_Object stream)
2831 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2832 Lisp_Object str_obj;
2834 /* We do not need to mark the coding systems or charsets stored
2835 within the stream because they are stored in a global list
2836 and automatically marked. */
2838 XSETLSTREAM (str_obj, str);
2839 mark_object (str_obj);
2840 if (str->imp->marker)
2841 return (str->imp->marker) (str_obj);
2846 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2847 so we read data from the other end, encode it, and store it into DATA. */
2849 static Lstream_data_count
2850 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2852 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2853 unsigned char *orig_data = data;
2854 Lstream_data_count read_size;
2855 int error_occurred = 0;
2857 /* We need to interface to mule_encode(), which expects to take some
2858 amount of data and store the result into a Dynarr. We have
2859 mule_encode() store into str->runoff, and take data from there
2862 /* We loop until we have enough data, reading chunks from the other
2863 end and encoding it. */
2866 /* Take data from the runoff if we can. Make sure to take at
2867 most SIZE bytes, and delete the data from the runoff. */
2868 if (Dynarr_length (str->runoff) > 0)
2870 int chunk = min ((int) size, Dynarr_length (str->runoff));
2871 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2872 Dynarr_delete_many (str->runoff, 0, chunk);
2878 break; /* No more room for data */
2880 if (str->flags & CODING_STATE_END)
2881 /* This means that on the previous iteration, we hit the EOF on
2882 the other end. We loop once more so that mule_encode() can
2883 output any final stuff it may be holding, or any "go back
2884 to a sane state" escape sequences. (This latter makes sense
2885 during encoding.) */
2888 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2889 left of storage in it, so it's OK to read directly into it.
2890 (We'll be overwriting above, after we've encoded it into the
2892 read_size = Lstream_read (str->other_end, data, size);
2899 /* There might be some more end data produced in the translation.
2900 See the comment above. */
2901 str->flags |= CODING_STATE_END;
2902 mule_encode (stream, data, str->runoff, read_size);
2905 if (data == orig_data)
2906 return error_occurred ? -1 : 0;
2908 return data - orig_data;
2911 static Lstream_data_count
2912 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2914 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2915 Lstream_data_count retval;
2917 /* Encode all our data into the runoff, and then attempt to write
2918 it all out to the other end. Remove whatever chunk we succeeded
2920 mule_encode (stream, data, str->runoff, size);
2921 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2922 Dynarr_length (str->runoff));
2924 Dynarr_delete_many (str->runoff, 0, retval);
2925 /* Do NOT return retval. The return value indicates how much
2926 of the incoming data was written, not how many bytes were
2932 reset_encoding_stream (struct encoding_stream *str)
2935 switch (CODING_SYSTEM_TYPE (str->codesys))
2937 case CODESYS_ISO2022:
2941 str->encode_char = &char_encode_iso2022;
2942 str->finish = &char_finish_iso2022;
2943 for (i = 0; i < 4; i++)
2945 str->iso2022.charset[i] =
2946 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2947 str->iso2022.force_charset_on_output[i] =
2948 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2950 str->iso2022.register_left = 0;
2951 str->iso2022.register_right = 1;
2952 str->iso2022.current_charset = Qnil;
2953 str->iso2022.current_half = 0;
2957 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2960 str->encode_char = &char_encode_utf8;
2961 str->finish = &char_finish_utf8;
2964 str->encode_char = &char_encode_utf16;
2965 str->finish = &char_finish_utf16;
2968 str->encode_char = &char_encode_ucs4;
2969 str->finish = &char_finish_ucs4;
2971 case CODESYS_SHIFT_JIS:
2972 str->encode_char = &char_encode_shift_jis;
2973 str->finish = &char_finish_shift_jis;
2976 str->encode_char = &char_encode_big5;
2977 str->finish = &char_finish_big5;
2983 str->iso2022.current_char_boundary = 0;
2984 str->flags = str->ch = 0;
2988 encoding_rewinder (Lstream *stream)
2990 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2991 reset_encoding_stream (str);
2992 Dynarr_reset (str->runoff);
2993 return Lstream_rewind (str->other_end);
2997 encoding_seekable_p (Lstream *stream)
2999 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3000 return Lstream_seekable_p (str->other_end);
3004 encoding_flusher (Lstream *stream)
3006 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3007 return Lstream_flush (str->other_end);
3011 encoding_closer (Lstream *stream)
3013 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3014 if (stream->flags & LSTREAM_FL_WRITE)
3016 str->flags |= CODING_STATE_END;
3017 encoding_writer (stream, 0, 0);
3019 Dynarr_free (str->runoff);
3020 return Lstream_close (str->other_end);
3024 encoding_stream_coding_system (Lstream *stream)
3026 Lisp_Object coding_system;
3027 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3029 XSETCODING_SYSTEM (coding_system, str->codesys);
3030 return coding_system;
3034 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3036 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3037 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3039 reset_encoding_stream (str);
3043 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3046 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3047 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3051 str->runoff = Dynarr_new (unsigned_char);
3052 str->other_end = stream;
3053 set_encoding_stream_coding_system (lstr, codesys);
3054 XSETLSTREAM (obj, lstr);
3059 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3061 return make_encoding_stream_1 (stream, codesys, "r");
3065 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3067 return make_encoding_stream_1 (stream, codesys, "w");
3070 /* Convert N bytes of internally-formatted data stored in SRC to an
3071 external format, according to the encoding stream ENCODING.
3072 Store the encoded data into DST. */
3075 mule_encode (Lstream *encoding, const Bufbyte *src,
3076 unsigned_char_dynarr *dst, Lstream_data_count n)
3078 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3080 switch (CODING_SYSTEM_TYPE (str->codesys))
3083 case CODESYS_INTERNAL:
3084 Dynarr_add_many (dst, src, n);
3087 case CODESYS_AUTODETECT:
3088 /* If we got this far and still haven't decided on the coding
3089 system, then do no conversion. */
3090 case CODESYS_NO_CONVERSION:
3091 encode_coding_no_conversion (encoding, src, dst, n);
3095 str->ccl.last_block = str->flags & CODING_STATE_END;
3096 /* When applying ccl program to stream, MUST NOT set NULL
3098 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3099 dst, n, 0, CCL_MODE_ENCODING);
3103 text_encode_generic (encoding, src, dst, n);
3107 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3108 Encode the text between START and END using CODING-SYSTEM.
3109 This will, for example, convert Japanese characters into stuff such as
3110 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3111 text. BUFFER defaults to the current buffer if unspecified.
3113 (start, end, coding_system, buffer))
3116 struct buffer *buf = decode_buffer (buffer, 0);
3117 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3118 Lstream *istr, *ostr;
3119 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3121 get_buffer_range_char (buf, start, end, &b, &e, 0);
3123 barf_if_buffer_read_only (buf, b, e);
3125 coding_system = Fget_coding_system (coding_system);
3126 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3127 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3128 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3129 Fget_coding_system (Qbinary));
3130 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3132 istr = XLSTREAM (instream);
3133 ostr = XLSTREAM (outstream);
3134 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3135 /* The chain of streams looks like this:
3137 [BUFFER] <----- send through
3138 ------> [ENCODE AS SPECIFIED]
3139 ------> [DECODE AS BINARY]
3144 char tempbuf[1024]; /* some random amount */
3145 Bufpos newpos, even_newer_pos;
3146 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3147 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3151 newpos = lisp_buffer_stream_startpos (istr);
3152 Lstream_write (ostr, tempbuf, size_in_bytes);
3153 even_newer_pos = lisp_buffer_stream_startpos (istr);
3154 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3160 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3161 Lstream_close (istr);
3162 Lstream_close (ostr);
3164 Lstream_delete (istr);
3165 Lstream_delete (ostr);
3166 Lstream_delete (XLSTREAM (de_outstream));
3167 Lstream_delete (XLSTREAM (lb_outstream));
3168 return make_int (retlen);
3175 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3176 unsigned_char_dynarr *dst, Lstream_data_count n)
3179 unsigned char char_boundary;
3180 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3181 unsigned int flags = str->flags;
3182 Emchar ch = str->ch;
3184 char_boundary = str->iso2022.current_char_boundary;
3190 if (char_boundary == 0)
3218 (*str->encode_char) (str, c, dst, &flags);
3220 else if (char_boundary == 1)
3222 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3228 ch = (ch << 6) | (c & 0x3f);
3233 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3235 (*str->finish) (str, dst, &flags);
3240 str->iso2022.current_char_boundary = char_boundary;
3245 /************************************************************************/
3246 /* entity reference */
3247 /************************************************************************/
3250 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3252 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3254 if ( str->er_counter > 0)
3256 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3257 str->er_counter = 0;
3261 EXFUN (Fregexp_quote, 1);
3263 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3264 unsigned_char_dynarr* dst);
3266 decode_add_er_char (struct decoding_stream *str, Emchar c,
3267 unsigned_char_dynarr* dst)
3269 if (str->er_counter == 0)
3271 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3274 str->er_buf[0] = '&';
3278 DECODE_ADD_UCS_CHAR (c, dst);
3282 Lisp_Object string = make_string (str->er_buf,
3289 Lisp_Object char_type;
3292 for ( rest = Vcoded_charset_entity_reference_alist;
3293 !NILP (rest); rest = Fcdr (rest) )
3299 char_type = XCDR (ccs);
3304 if (NILP (ccs = Ffind_charset (ccs)))
3313 pat = Fregexp_quote (pat);
3320 pat = concat3 (build_string ("^&"),
3321 pat, build_string ("\\([0-9]+\\)$"));
3324 else if (EQ (ret, Qx))
3326 pat = concat3 (build_string ("^&"),
3327 pat, build_string ("\\([0-9a-f]+\\)$"));
3330 else if (EQ (ret, QX))
3332 pat = concat3 (build_string ("^&"),
3333 pat, build_string ("\\([0-9A-F]+\\)$"));
3339 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3342 = XINT (Fstring_to_number
3343 (Fsubstring (string,
3344 Fmatch_beginning (make_int (1)),
3345 Fmatch_end (make_int (1))),
3349 ? DECODE_CHAR (ccs, code, 0)
3350 : decode_builtin_char (ccs, code);
3353 DECODE_ADD_UCS_CHAR (chr, dst);
3356 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3357 Dynarr_add (dst, ';');
3363 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3364 string, Qnil, Qnil)))
3367 = XUINT (Fstring_to_number
3368 (Fsubstring (string,
3369 Fmatch_beginning (make_int (1)),
3370 Fmatch_end (make_int (1))),
3373 DECODE_ADD_UCS_CHAR (code, dst);
3377 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3378 Dynarr_add (dst, ';');
3381 str->er_counter = 0;
3383 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3385 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3386 str->er_counter = 0;
3387 DECODE_ADD_UCS_CHAR (c, dst);
3390 str->er_buf[str->er_counter++] = c;
3393 void char_encode_as_entity_reference (Emchar ch, char* buf);
3395 char_encode_as_entity_reference (Emchar ch, char* buf)
3397 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3400 Lisp_Object char_type;
3401 int format_columns, idx;
3402 char format[ER_BUF_SIZE];
3404 while (!NILP (rest))
3410 char_type = XCDR (ccs);
3415 if (!NILP (ccs = Ffind_charset (ccs)))
3417 int code_point = charset_code_point (ccs, ch, 0);
3419 if ( (code_point >= 0)
3420 && (NILP (char_type)
3421 || DECODE_CHAR (ccs, code_point, 0) != ch) )
3427 if ( STRINGP (ret) &&
3428 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3431 strncpy (&format[1], XSTRING_DATA (ret), idx);
3441 format[idx++] = '%';
3442 format_columns = XINT (ret);
3443 if ( (2 <= format_columns) && (format_columns <= 8)
3444 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3446 format [idx++] = '0';
3447 format [idx++] = '0' + format_columns;
3456 format [idx++] = 'd';
3457 else if (EQ (ret, Qx))
3458 format [idx++] = 'x';
3459 else if (EQ (ret, QX))
3460 format [idx++] = 'X';
3463 format [idx++] = ';';
3466 sprintf (buf, format, code_point);
3473 sprintf (buf, "&MCS-%08X;", ch);
3477 /************************************************************************/
3478 /* character composition */
3479 /************************************************************************/
3480 extern Lisp_Object Qcomposition, Qrep_decomposition;
3483 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3485 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3489 for (i = 0; i < str->combined_char_count; i++)
3490 decode_add_er_char (str, str->combined_chars[i], dst);
3491 str->combined_char_count = 0;
3492 str->combining_table = Qnil;
3495 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3496 unsigned_char_dynarr* dst);
3498 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3499 Emchar character, unsigned_char_dynarr* dst)
3501 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3502 decode_add_er_char (str, character, dst);
3503 else if (!CONSP (str->combining_table))
3506 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3510 decode_add_er_char (str, character, dst);
3513 str->combined_chars[0] = character;
3514 str->combined_char_count = 1;
3515 str->combining_table = ret;
3521 = Fcdr (Fassq (make_char (character), str->combining_table));
3525 Emchar char2 = XCHARVAL (ret);
3526 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3531 decode_add_er_char (str, char2, dst);
3532 str->combined_char_count = 0;
3533 str->combining_table = Qnil;
3537 str->combined_chars[0] = char2;
3538 str->combined_char_count = 1;
3539 str->combining_table = ret2;
3544 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3547 COMPOSE_FLUSH_CHARS (str, dst);
3549 decode_add_er_char (str, character, dst);
3552 str->combined_chars[0] = character;
3553 str->combined_char_count = 1;
3554 str->combining_table = ret;
3559 #else /* not UTF2000 */
3560 #define COMPOSE_FLUSH_CHARS(str, dst)
3561 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3562 #endif /* UTF2000 */
3565 /************************************************************************/
3566 /* Shift-JIS methods */
3567 /************************************************************************/
3569 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3570 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3571 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3572 encoded by "position-code + 0x80". A character of JISX0208
3573 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3574 position-codes are divided and shifted so that it fit in the range
3577 --- CODE RANGE of Shift-JIS ---
3578 (character set) (range)
3580 JISX0201-Kana 0xA0 .. 0xDF
3581 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3582 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3583 -------------------------------
3587 /* Is this the first byte of a Shift-JIS two-byte char? */
3589 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3590 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3592 /* Is this the second byte of a Shift-JIS two-byte char? */
3594 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3595 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3597 #define BYTE_SJIS_KATAKANA_P(c) \
3598 ((c) >= 0xA1 && (c) <= 0xDF)
3601 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3605 unsigned char c = *(unsigned char *)src++;
3606 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3608 if (st->shift_jis.in_second_byte)
3610 st->shift_jis.in_second_byte = 0;
3614 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3615 st->shift_jis.in_second_byte = 1;
3617 return CODING_CATEGORY_SHIFT_JIS_MASK;
3620 /* Convert Shift-JIS data to internal format. */
3623 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3624 unsigned_char_dynarr *dst, Lstream_data_count n)
3626 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3627 unsigned int flags = str->flags;
3628 unsigned int cpos = str->cpos;
3629 eol_type_t eol_type = str->eol_type;
3633 unsigned char c = *(unsigned char *)src++;
3637 /* Previous character was first byte of Shift-JIS Kanji char. */
3638 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3640 unsigned char e1, e2;
3642 DECODE_SJIS (cpos, c, e1, e2);
3644 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3648 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3649 Dynarr_add (dst, e1);
3650 Dynarr_add (dst, e2);
3655 DECODE_ADD_BINARY_CHAR (cpos, dst);
3656 DECODE_ADD_BINARY_CHAR (c, dst);
3662 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3663 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3665 else if (BYTE_SJIS_KATAKANA_P (c))
3668 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3671 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3672 Dynarr_add (dst, c);
3677 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3681 DECODE_ADD_BINARY_CHAR (c, dst);
3683 label_continue_loop:;
3686 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3692 /* Convert internal character representation to Shift_JIS. */
3695 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3696 unsigned_char_dynarr *dst, unsigned int *flags)
3698 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3702 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3703 Dynarr_add (dst, '\r');
3704 if (eol_type != EOL_CR)
3705 Dynarr_add (dst, ch);
3709 unsigned int s1, s2;
3711 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3713 if (code_point >= 0)
3714 Dynarr_add (dst, code_point);
3715 else if ((code_point
3716 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3719 ENCODE_SJIS ((code_point >> 8) | 0x80,
3720 (code_point & 0xFF) | 0x80, s1, s2);
3721 Dynarr_add (dst, s1);
3722 Dynarr_add (dst, s2);
3724 else if ((code_point
3725 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3727 Dynarr_add (dst, code_point | 0x80);
3728 else if ((code_point
3729 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3732 ENCODE_SJIS ((code_point >> 8) | 0x80,
3733 (code_point & 0xFF) | 0x80, s1, s2);
3734 Dynarr_add (dst, s1);
3735 Dynarr_add (dst, s2);
3737 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3739 Dynarr_add (dst, code_point);
3741 Dynarr_add (dst, '?');
3743 Lisp_Object charset;
3744 unsigned int c1, c2;
3746 BREAKUP_CHAR (ch, charset, c1, c2);
3748 if (EQ(charset, Vcharset_katakana_jisx0201))
3750 Dynarr_add (dst, c1 | 0x80);
3754 Dynarr_add (dst, c1);
3756 else if (EQ(charset, Vcharset_japanese_jisx0208))
3758 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3759 Dynarr_add (dst, s1);
3760 Dynarr_add (dst, s2);
3763 Dynarr_add (dst, '?');
3769 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3770 unsigned int *flags)
3774 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3775 Decode a JISX0208 character of Shift-JIS coding-system.
3776 CODE is the character code in Shift-JIS as a cons of type bytes.
3777 Return the corresponding character.
3781 unsigned char c1, c2, s1, s2;
3784 CHECK_INT (XCAR (code));
3785 CHECK_INT (XCDR (code));
3786 s1 = XINT (XCAR (code));
3787 s2 = XINT (XCDR (code));
3788 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3789 BYTE_SJIS_TWO_BYTE_2_P (s2))
3791 DECODE_SJIS (s1, s2, c1, c2);
3792 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3793 c1 & 0x7F, c2 & 0x7F));
3799 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3800 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3801 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3805 Lisp_Object charset;
3808 CHECK_CHAR_COERCE_INT (character);
3809 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3810 if (EQ (charset, Vcharset_japanese_jisx0208))
3812 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3813 return Fcons (make_int (s1), make_int (s2));
3820 /************************************************************************/
3822 /************************************************************************/
3824 /* BIG5 is a coding system encoding two character sets: ASCII and
3825 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3826 character set and is encoded in two-byte.
3828 --- CODE RANGE of BIG5 ---
3829 (character set) (range)
3831 Big5 (1st byte) 0xA1 .. 0xFE
3832 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3833 --------------------------
3835 Since the number of characters in Big5 is larger than maximum
3836 characters in Emacs' charset (96x96), it can't be handled as one
3837 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3838 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3839 contains frequently used characters and the latter contains less
3840 frequently used characters. */
3843 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3844 ((c) >= 0x81 && (c) <= 0xFE)
3846 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3847 ((c) >= 0xA1 && (c) <= 0xFE)
3850 /* Is this the second byte of a Shift-JIS two-byte char? */
3852 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3853 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3855 /* Number of Big5 characters which have the same code in 1st byte. */
3857 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3859 /* Code conversion macros. These are macros because they are used in
3860 inner loops during code conversion.
3862 Note that temporary variables in macros introduce the classic
3863 dynamic-scoping problems with variable names. We use capital-
3864 lettered variables in the assumption that XEmacs does not use
3865 capital letters in variables except in a very formalized way
3868 /* Convert Big5 code (b1, b2) into its internal string representation
3871 /* There is a much simpler way to split the Big5 charset into two.
3872 For the moment I'm going to leave the algorithm as-is because it
3873 claims to separate out the most-used characters into a single
3874 charset, which perhaps will lead to optimizations in various
3877 The way the algorithm works is something like this:
3879 Big5 can be viewed as a 94x157 charset, where the row is
3880 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3881 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3882 the split between low and high column numbers is apparently
3883 meaningless; ascending rows produce less and less frequent chars.
3884 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3885 the first charset, and the upper half (0xC9 .. 0xFE) to the
3886 second. To do the conversion, we convert the character into
3887 a single number where 0 .. 156 is the first row, 157 .. 313
3888 is the second, etc. That way, the characters are ordered by
3889 decreasing frequency. Then we just chop the space in two
3890 and coerce the result into a 94x94 space.
3893 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3895 int B1 = b1, B2 = b2; \
3897 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3901 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3905 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3906 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3908 c1 = I / (0xFF - 0xA1) + 0xA1; \
3909 c2 = I % (0xFF - 0xA1) + 0xA1; \
3912 /* Convert the internal string representation of a Big5 character
3913 (lb, c1, c2) into Big5 code (b1, b2). */
3915 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3917 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3919 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3921 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3923 b1 = I / BIG5_SAME_ROW + 0xA1; \
3924 b2 = I % BIG5_SAME_ROW; \
3925 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3929 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3933 unsigned char c = *(unsigned char *)src++;
3934 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3936 || (c >= 0x80 && c <= 0xA0)
3940 if (st->big5.in_second_byte)
3942 st->big5.in_second_byte = 0;
3943 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3953 st->big5.in_second_byte = 1;
3955 return CODING_CATEGORY_BIG5_MASK;
3958 /* Convert Big5 data to internal format. */
3961 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3962 unsigned_char_dynarr *dst, Lstream_data_count n)
3964 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3965 unsigned int flags = str->flags;
3966 unsigned int cpos = str->cpos;
3967 eol_type_t eol_type = str->eol_type;
3970 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3971 (decoding)->codesys, 1);
3976 unsigned char c = *(unsigned char *)src++;
3979 /* Previous character was first byte of Big5 char. */
3980 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3983 int code_point = (cpos << 8) | c;
3984 Emchar char_id = decode_defined_char (ccs, code_point, 0);
3988 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3989 DECODE_ADD_UCS_CHAR (char_id, dst);
3991 unsigned char b1, b2, b3;
3992 DECODE_BIG5 (cpos, c, b1, b2, b3);
3993 Dynarr_add (dst, b1);
3994 Dynarr_add (dst, b2);
3995 Dynarr_add (dst, b3);
4000 DECODE_ADD_BINARY_CHAR (cpos, dst);
4001 DECODE_ADD_BINARY_CHAR (c, dst);
4007 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4008 if (BYTE_BIG5_TWO_BYTE_1_P (c))
4010 decode_flush_er_chars (str, dst);
4015 decode_flush_er_chars (str, dst);
4016 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4017 DECODE_ADD_BINARY_CHAR (c, dst);
4021 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4022 decode_add_er_char (str, c, dst);
4025 label_continue_loop:;
4028 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4029 if (flags & CODING_STATE_END)
4031 decode_flush_er_chars (str, dst);
4032 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4033 if (flags & CODING_STATE_CR)
4034 Dynarr_add (dst, '\r');
4041 /* Convert internally-formatted data to Big5. */
4044 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4045 unsigned_char_dynarr *dst, unsigned int *flags)
4047 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4051 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4052 Dynarr_add (dst, '\r');
4053 if (eol_type != EOL_CR)
4054 Dynarr_add (dst, ch);
4061 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4063 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4064 Dynarr_add (dst, code_point);
4065 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4067 Dynarr_add (dst, code_point >> 8);
4068 Dynarr_add (dst, code_point & 0xFF);
4070 else if ((code_point
4071 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4073 Dynarr_add (dst, code_point >> 8);
4074 Dynarr_add (dst, code_point & 0xFF);
4076 else if ((code_point
4077 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4080 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4081 + ((code_point & 0xFF) - 33);
4082 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4083 unsigned char b2 = I % BIG5_SAME_ROW;
4085 b2 += b2 < 0x3F ? 0x40 : 0x62;
4086 Dynarr_add (dst, b1);
4087 Dynarr_add (dst, b2);
4089 else if ((code_point
4090 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4093 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4094 + ((code_point & 0xFF) - 33);
4095 unsigned char b1, b2;
4097 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4098 b1 = I / BIG5_SAME_ROW + 0xA1;
4099 b2 = I % BIG5_SAME_ROW;
4100 b2 += b2 < 0x3F ? 0x40 : 0x62;
4101 Dynarr_add (dst, b1);
4102 Dynarr_add (dst, b2);
4104 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4108 char_encode_as_entity_reference (ch, buf);
4109 Dynarr_add_many (dst, buf, strlen (buf));
4112 Dynarr_add (dst, '?');
4119 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4120 unsigned int *flags)
4125 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4126 Decode a Big5 character CODE of BIG5 coding-system.
4127 CODE is the character code in BIG5, a cons of two integers.
4128 Return the corresponding character.
4132 unsigned char c1, c2, b1, b2;
4135 CHECK_INT (XCAR (code));
4136 CHECK_INT (XCDR (code));
4137 b1 = XINT (XCAR (code));
4138 b2 = XINT (XCDR (code));
4139 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4140 BYTE_BIG5_TWO_BYTE_2_P (b2))
4142 Charset_ID leading_byte;
4143 Lisp_Object charset;
4144 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4145 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4146 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4152 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4153 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4154 Return the corresponding character code in Big5.
4158 Lisp_Object charset;
4161 CHECK_CHAR_COERCE_INT (character);
4162 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4163 if (EQ (charset, Vcharset_chinese_big5_1) ||
4164 EQ (charset, Vcharset_chinese_big5_2))
4166 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4168 return Fcons (make_int (b1), make_int (b2));
4175 /************************************************************************/
4177 /************************************************************************/
4180 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4184 unsigned char c = *(unsigned char *)src++;
4185 switch (st->ucs4.in_byte)
4194 st->ucs4.in_byte = 0;
4200 return CODING_CATEGORY_UCS4_MASK;
4204 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4205 unsigned_char_dynarr *dst, Lstream_data_count n)
4207 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4208 unsigned int flags = str->flags;
4209 unsigned int cpos = str->cpos;
4210 unsigned char counter = str->counter;
4214 unsigned char c = *(unsigned char *)src++;
4222 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4227 cpos = ( cpos << 8 ) | c;
4231 if (counter & CODING_STATE_END)
4232 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4236 str->counter = counter;
4240 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4241 unsigned_char_dynarr *dst, unsigned int *flags)
4243 Dynarr_add (dst, ch >> 24);
4244 Dynarr_add (dst, ch >> 16);
4245 Dynarr_add (dst, ch >> 8);
4246 Dynarr_add (dst, ch );
4250 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4251 unsigned int *flags)
4256 /************************************************************************/
4257 /* UTF-16 methods */
4258 /************************************************************************/
4261 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4263 return CODING_CATEGORY_UTF16_MASK;
4267 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4268 unsigned_char_dynarr *dst, Lstream_data_count n)
4270 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4271 unsigned int flags = str->flags;
4272 unsigned int cpos = str->cpos;
4273 unsigned char counter = str->counter & 3;
4274 unsigned char byte_order = str->counter >> 2;
4275 eol_type_t eol_type = str->eol_type;
4279 unsigned char c = *(unsigned char *)src++;
4285 else if (counter == 1)
4289 if (byte_order == 0)
4290 code = (c << 8) | cpos;
4292 code = (cpos << 8) | c;
4295 code = ((code & 0xFF) << 8) | (code >> 8);
4296 if ( byte_order == 0 )
4301 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4312 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4313 DECODE_ADD_UCS_CHAR (code, dst);
4317 else if (counter == 2)
4319 cpos = (cpos << 8) | c;
4327 ? (c << 8) | (cpos & 0xFF)
4328 : ((cpos & 0xFF) << 8) | c;
4330 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4335 label_continue_loop:;
4337 if (counter & CODING_STATE_END)
4338 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4342 str->counter = (byte_order << 2) | counter;
4346 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4347 unsigned_char_dynarr *dst, unsigned int *flags)
4351 Dynarr_add (dst, ch);
4352 Dynarr_add (dst, ch >> 8);
4356 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4357 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4359 Dynarr_add (dst, y);
4360 Dynarr_add (dst, y >> 8);
4361 Dynarr_add (dst, z);
4362 Dynarr_add (dst, z >> 8);
4367 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4368 unsigned int *flags)
4373 /************************************************************************/
4375 /************************************************************************/
4378 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4382 unsigned char c = *(unsigned char *)src++;
4383 switch (st->utf8.in_byte)
4386 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4389 st->utf8.in_byte = 5;
4391 st->utf8.in_byte = 4;
4393 st->utf8.in_byte = 3;
4395 st->utf8.in_byte = 2;
4397 st->utf8.in_byte = 1;
4402 if ((c & 0xc0) != 0x80)
4408 return CODING_CATEGORY_UTF8_MASK;
4412 decode_output_utf8_partial_char (unsigned char counter,
4414 unsigned_char_dynarr *dst)
4417 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4418 else if (counter == 4)
4420 if (cpos < (1 << 6))
4421 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4424 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4425 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4428 else if (counter == 3)
4430 if (cpos < (1 << 6))
4431 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4432 else if (cpos < (1 << 12))
4434 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4435 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4439 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4440 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4441 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4444 else if (counter == 2)
4446 if (cpos < (1 << 6))
4447 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4448 else if (cpos < (1 << 12))
4450 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4451 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4453 else if (cpos < (1 << 18))
4455 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4456 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4457 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4461 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4462 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4463 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4464 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4469 if (cpos < (1 << 6))
4470 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4471 else if (cpos < (1 << 12))
4473 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4474 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4476 else if (cpos < (1 << 18))
4478 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4479 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4480 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4482 else if (cpos < (1 << 24))
4484 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4485 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4486 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4487 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4491 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4492 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4493 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4494 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4495 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4501 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4502 unsigned_char_dynarr *dst, Lstream_data_count n)
4504 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4505 unsigned int flags = str->flags;
4506 unsigned int cpos = str->cpos;
4507 eol_type_t eol_type = str->eol_type;
4508 unsigned char counter = str->counter;
4510 int bom_flag = str->bom_flag;
4512 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4513 (decoding)->codesys, 0);
4518 unsigned char c = *(unsigned char *)src++;
4523 COMPOSE_FLUSH_CHARS (str, dst);
4524 decode_flush_er_chars (str, dst);
4525 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4527 if ( bom_flag == 0 )
4530 DECODE_ADD_UCS_CHAR (c, dst);
4532 else if ( c < 0xC0 )
4534 if ( bom_flag == 0 )
4537 /* decode_add_er_char (str, c, dst); */
4538 COMPOSE_ADD_CHAR (str, c, dst);
4542 /* decode_flush_er_chars (str, dst); */
4548 else if ( c < 0xF0 )
4553 else if ( c < 0xF8 )
4558 else if ( c < 0xFC )
4570 else if ( (c & 0xC0) == 0x80 )
4572 cpos = ( cpos << 6 ) | ( c & 0x3f );
4577 if ( bom_flag == 0 )
4579 if ( cpos == 0xFEFF )
4590 char_id = decode_defined_char (ccs, cpos, 0);
4597 COMPOSE_ADD_CHAR (str, char_id, dst);
4607 COMPOSE_FLUSH_CHARS (str, dst);
4608 decode_flush_er_chars (str, dst);
4609 decode_output_utf8_partial_char (counter, cpos, dst);
4610 DECODE_ADD_BINARY_CHAR (c, dst);
4614 label_continue_loop:;
4617 if (flags & CODING_STATE_END)
4619 COMPOSE_FLUSH_CHARS (str, dst);
4620 decode_flush_er_chars (str, dst);
4623 decode_output_utf8_partial_char (counter, cpos, dst);
4630 str->counter = counter;
4632 str->bom_flag = bom_flag;
4637 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4638 unsigned_char_dynarr *dst, unsigned int *flags)
4640 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4644 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4645 Dynarr_add (dst, '\r');
4646 if (eol_type != EOL_CR)
4647 Dynarr_add (dst, ch);
4649 else if (ch <= 0x7f)
4651 Dynarr_add (dst, ch);
4656 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4657 int code_point = charset_code_point (ucs_ccs, ch, 0);
4659 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4661 Lisp_Object seq = Fchar_feature (make_char (ch),
4662 Qrep_decomposition, Qnil,
4664 Lisp_Object map, ret;
4668 Lisp_Object base = Fcar (seq);
4671 if ( CHARP (base) && CONSP (seq) )
4673 Lisp_Object comb = Fcar (seq);
4677 char_encode_utf8 (str, XCHAR (base), dst, flags);
4678 char_encode_utf8 (str, XCHAR (comb), dst, flags);
4684 map = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4686 && INTP (ret = Fchar_feature (make_char (ch),
4689 code_point = XINT (ret);
4690 else if ( !NILP (map =
4691 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4693 && INTP (ret = Fchar_feature (make_char (ch),
4696 code_point = XINT (ret);
4697 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4701 char_encode_as_entity_reference (ch, buf);
4702 Dynarr_add_many (dst, buf, strlen (buf));
4708 if (code_point <= 0x7ff)
4710 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4711 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4713 else if (code_point <= 0xffff)
4715 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4716 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4717 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4719 else if (code_point <= 0x1fffff)
4721 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4722 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4723 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4724 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4726 else if (code_point <= 0x3ffffff)
4728 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4729 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4730 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4731 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4732 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4736 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4737 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4738 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4739 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4740 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4741 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4747 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4748 unsigned int *flags)
4753 /************************************************************************/
4754 /* ISO2022 methods */
4755 /************************************************************************/
4757 /* The following note describes the coding system ISO2022 briefly.
4758 Since the intention of this note is to help understand the
4759 functions in this file, some parts are NOT ACCURATE or OVERLY
4760 SIMPLIFIED. For thorough understanding, please refer to the
4761 original document of ISO2022.
4763 ISO2022 provides many mechanisms to encode several character sets
4764 in 7-bit and 8-bit environments. For 7-bit environments, all text
4765 is encoded using bytes less than 128. This may make the encoded
4766 text a little bit longer, but the text passes more easily through
4767 several gateways, some of which strip off MSB (Most Signigant Bit).
4769 There are two kinds of character sets: control character set and
4770 graphic character set. The former contains control characters such
4771 as `newline' and `escape' to provide control functions (control
4772 functions are also provided by escape sequences). The latter
4773 contains graphic characters such as 'A' and '-'. Emacs recognizes
4774 two control character sets and many graphic character sets.
4776 Graphic character sets are classified into one of the following
4777 four classes, according to the number of bytes (DIMENSION) and
4778 number of characters in one dimension (CHARS) of the set:
4779 - DIMENSION1_CHARS94
4780 - DIMENSION1_CHARS96
4781 - DIMENSION2_CHARS94
4782 - DIMENSION2_CHARS96
4784 In addition, each character set is assigned an identification tag,
4785 unique for each set, called "final character" (denoted as <F>
4786 hereafter). The <F> of each character set is decided by ECMA(*)
4787 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4788 (0x30..0x3F are for private use only).
4790 Note (*): ECMA = European Computer Manufacturers Association
4792 Here are examples of graphic character set [NAME(<F>)]:
4793 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4794 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4795 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4796 o DIMENSION2_CHARS96 -- none for the moment
4798 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4799 C0 [0x00..0x1F] -- control character plane 0
4800 GL [0x20..0x7F] -- graphic character plane 0
4801 C1 [0x80..0x9F] -- control character plane 1
4802 GR [0xA0..0xFF] -- graphic character plane 1
4804 A control character set is directly designated and invoked to C0 or
4805 C1 by an escape sequence. The most common case is that:
4806 - ISO646's control character set is designated/invoked to C0, and
4807 - ISO6429's control character set is designated/invoked to C1,
4808 and usually these designations/invocations are omitted in encoded
4809 text. In a 7-bit environment, only C0 can be used, and a control
4810 character for C1 is encoded by an appropriate escape sequence to
4811 fit into the environment. All control characters for C1 are
4812 defined to have corresponding escape sequences.
4814 A graphic character set is at first designated to one of four
4815 graphic registers (G0 through G3), then these graphic registers are
4816 invoked to GL or GR. These designations and invocations can be
4817 done independently. The most common case is that G0 is invoked to
4818 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4819 these invocations and designations are omitted in encoded text.
4820 In a 7-bit environment, only GL can be used.
4822 When a graphic character set of CHARS94 is invoked to GL, codes
4823 0x20 and 0x7F of the GL area work as control characters SPACE and
4824 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4827 There are two ways of invocation: locking-shift and single-shift.
4828 With locking-shift, the invocation lasts until the next different
4829 invocation, whereas with single-shift, the invocation affects the
4830 following character only and doesn't affect the locking-shift
4831 state. Invocations are done by the following control characters or
4834 ----------------------------------------------------------------------
4835 abbrev function cntrl escape seq description
4836 ----------------------------------------------------------------------
4837 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4838 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4839 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4840 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4841 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4842 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4843 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4844 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4845 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4846 ----------------------------------------------------------------------
4847 (*) These are not used by any known coding system.
4849 Control characters for these functions are defined by macros
4850 ISO_CODE_XXX in `coding.h'.
4852 Designations are done by the following escape sequences:
4853 ----------------------------------------------------------------------
4854 escape sequence description
4855 ----------------------------------------------------------------------
4856 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4857 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4858 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4859 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4860 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4861 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4862 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4863 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4864 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4865 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4866 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4867 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4868 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4869 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4870 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4871 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4872 ----------------------------------------------------------------------
4874 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4875 of dimension 1, chars 94, and final character <F>, etc...
4877 Note (*): Although these designations are not allowed in ISO2022,
4878 Emacs accepts them on decoding, and produces them on encoding
4879 CHARS96 character sets in a coding system which is characterized as
4880 7-bit environment, non-locking-shift, and non-single-shift.
4882 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4883 '(' can be omitted. We refer to this as "short-form" hereafter.
4885 Now you may notice that there are a lot of ways for encoding the
4886 same multilingual text in ISO2022. Actually, there exist many
4887 coding systems such as Compound Text (used in X11's inter client
4888 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4889 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4890 localized platforms), and all of these are variants of ISO2022.
4892 In addition to the above, Emacs handles two more kinds of escape
4893 sequences: ISO6429's direction specification and Emacs' private
4894 sequence for specifying character composition.
4896 ISO6429's direction specification takes the following form:
4897 o CSI ']' -- end of the current direction
4898 o CSI '0' ']' -- end of the current direction
4899 o CSI '1' ']' -- start of left-to-right text
4900 o CSI '2' ']' -- start of right-to-left text
4901 The control character CSI (0x9B: control sequence introducer) is
4902 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4904 Character composition specification takes the following form:
4905 o ESC '0' -- start character composition
4906 o ESC '1' -- end character composition
4907 Since these are not standard escape sequences of any ISO standard,
4908 their use with these meanings is restricted to Emacs only. */
4911 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4915 for (i = 0; i < 4; i++)
4917 if (!NILP (coding_system))
4919 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4921 iso->charset[i] = Qt;
4922 iso->invalid_designated[i] = 0;
4924 iso->esc = ISO_ESC_NOTHING;
4925 iso->esc_bytes_index = 0;
4926 iso->register_left = 0;
4927 iso->register_right = 1;
4928 iso->switched_dir_and_no_valid_charset_yet = 0;
4929 iso->invalid_switch_dir = 0;
4930 iso->output_direction_sequence = 0;
4931 iso->output_literally = 0;
4932 #ifdef ENABLE_COMPOSITE_CHARS
4933 if (iso->composite_chars)
4934 Dynarr_reset (iso->composite_chars);
4939 fit_to_be_escape_quoted (unsigned char c)
4956 /* Parse one byte of an ISO2022 escape sequence.
4957 If the result is an invalid escape sequence, return 0 and
4958 do not change anything in STR. Otherwise, if the result is
4959 an incomplete escape sequence, update ISO2022.ESC and
4960 ISO2022.ESC_BYTES and return -1. Otherwise, update
4961 all the state variables (but not ISO2022.ESC_BYTES) and
4964 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4965 or invocation of an invalid character set and treat that as
4966 an unrecognized escape sequence. */
4969 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4970 unsigned char c, unsigned int *flags,
4971 int check_invalid_charsets)
4973 /* (1) If we're at the end of a designation sequence, CS is the
4974 charset being designated and REG is the register to designate
4977 (2) If we're at the end of a locking-shift sequence, REG is
4978 the register to invoke and HALF (0 == left, 1 == right) is
4979 the half to invoke it into.
4981 (3) If we're at the end of a single-shift sequence, REG is
4982 the register to invoke. */
4983 Lisp_Object cs = Qnil;
4986 /* NOTE: This code does goto's all over the fucking place.
4987 The reason for this is that we're basically implementing
4988 a state machine here, and hierarchical languages like C
4989 don't really provide a clean way of doing this. */
4991 if (! (*flags & CODING_STATE_ESCAPE))
4992 /* At beginning of escape sequence; we need to reset our
4993 escape-state variables. */
4994 iso->esc = ISO_ESC_NOTHING;
4996 iso->output_literally = 0;
4997 iso->output_direction_sequence = 0;
5001 case ISO_ESC_NOTHING:
5002 iso->esc_bytes_index = 0;
5005 case ISO_CODE_ESC: /* Start escape sequence */
5006 *flags |= CODING_STATE_ESCAPE;
5010 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
5011 *flags |= CODING_STATE_ESCAPE;
5012 iso->esc = ISO_ESC_5_11;
5015 case ISO_CODE_SO: /* locking shift 1 */
5018 case ISO_CODE_SI: /* locking shift 0 */
5022 case ISO_CODE_SS2: /* single shift */
5025 case ISO_CODE_SS3: /* single shift */
5029 default: /* Other control characters */
5036 /**** single shift ****/
5038 case 'N': /* single shift 2 */
5041 case 'O': /* single shift 3 */
5045 /**** locking shift ****/
5047 case '~': /* locking shift 1 right */
5050 case 'n': /* locking shift 2 */
5053 case '}': /* locking shift 2 right */
5056 case 'o': /* locking shift 3 */
5059 case '|': /* locking shift 3 right */
5063 #ifdef ENABLE_COMPOSITE_CHARS
5064 /**** composite ****/
5067 iso->esc = ISO_ESC_START_COMPOSITE;
5068 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5069 CODING_STATE_COMPOSITE;
5073 iso->esc = ISO_ESC_END_COMPOSITE;
5074 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5075 ~CODING_STATE_COMPOSITE;
5077 #endif /* ENABLE_COMPOSITE_CHARS */
5079 /**** directionality ****/
5082 iso->esc = ISO_ESC_5_11;
5085 /**** designation ****/
5087 case '$': /* multibyte charset prefix */
5088 iso->esc = ISO_ESC_2_4;
5092 if (0x28 <= c && c <= 0x2F)
5094 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5098 /* This function is called with CODESYS equal to nil when
5099 doing coding-system detection. */
5101 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5102 && fit_to_be_escape_quoted (c))
5104 iso->esc = ISO_ESC_LITERAL;
5105 *flags &= CODING_STATE_ISO2022_LOCK;
5115 /**** directionality ****/
5117 case ISO_ESC_5_11: /* ISO6429 direction control */
5120 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5121 goto directionality;
5123 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5124 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5125 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5129 case ISO_ESC_5_11_0:
5132 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5133 goto directionality;
5137 case ISO_ESC_5_11_1:
5140 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5141 goto directionality;
5145 case ISO_ESC_5_11_2:
5148 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5149 goto directionality;
5154 iso->esc = ISO_ESC_DIRECTIONALITY;
5155 /* Various junk here to attempt to preserve the direction sequences
5156 literally in the text if they would otherwise be swallowed due
5157 to invalid designations that don't show up as actual charset
5158 changes in the text. */
5159 if (iso->invalid_switch_dir)
5161 /* We already inserted a direction switch literally into the
5162 text. We assume (#### this may not be right) that the
5163 next direction switch is the one going the other way,
5164 and we need to output that literally as well. */
5165 iso->output_literally = 1;
5166 iso->invalid_switch_dir = 0;
5172 /* If we are in the thrall of an invalid designation,
5173 then stick the directionality sequence literally into the
5174 output stream so it ends up in the original text again. */
5175 for (jj = 0; jj < 4; jj++)
5176 if (iso->invalid_designated[jj])
5180 iso->output_literally = 1;
5181 iso->invalid_switch_dir = 1;
5184 /* Indicate that we haven't yet seen a valid designation,
5185 so that if a switch-dir is directly followed by an
5186 invalid designation, both get inserted literally. */
5187 iso->switched_dir_and_no_valid_charset_yet = 1;
5192 /**** designation ****/
5195 if (0x28 <= c && c <= 0x2F)
5197 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5200 if (0x40 <= c && c <= 0x42)
5203 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5204 *flags & CODING_STATE_R2L ?
5205 CHARSET_RIGHT_TO_LEFT :
5206 CHARSET_LEFT_TO_RIGHT);
5217 if (c < '0' || c > '~')
5218 return 0; /* bad final byte */
5220 if (iso->esc >= ISO_ESC_2_8 &&
5221 iso->esc <= ISO_ESC_2_15)
5223 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5224 single = 1; /* single-byte */
5225 reg = (iso->esc - ISO_ESC_2_8) & 3;
5227 else if (iso->esc >= ISO_ESC_2_4_8 &&
5228 iso->esc <= ISO_ESC_2_4_15)
5230 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5231 single = -1; /* multi-byte */
5232 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5236 /* Can this ever be reached? -slb */
5240 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5241 *flags & CODING_STATE_R2L ?
5242 CHARSET_RIGHT_TO_LEFT :
5243 CHARSET_LEFT_TO_RIGHT);
5249 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5253 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5254 /* can't invoke something that ain't there. */
5256 iso->esc = ISO_ESC_SINGLE_SHIFT;
5257 *flags &= CODING_STATE_ISO2022_LOCK;
5259 *flags |= CODING_STATE_SS2;
5261 *flags |= CODING_STATE_SS3;
5265 if (check_invalid_charsets &&
5266 !CHARSETP (iso->charset[reg]))
5267 /* can't invoke something that ain't there. */
5270 iso->register_right = reg;
5272 iso->register_left = reg;
5273 *flags &= CODING_STATE_ISO2022_LOCK;
5274 iso->esc = ISO_ESC_LOCKING_SHIFT;
5278 if (NILP (cs) && check_invalid_charsets)
5280 iso->invalid_designated[reg] = 1;
5281 iso->charset[reg] = Vcharset_ascii;
5282 iso->esc = ISO_ESC_DESIGNATE;
5283 *flags &= CODING_STATE_ISO2022_LOCK;
5284 iso->output_literally = 1;
5285 if (iso->switched_dir_and_no_valid_charset_yet)
5287 /* We encountered a switch-direction followed by an
5288 invalid designation. Ensure that the switch-direction
5289 gets outputted; otherwise it will probably get eaten
5290 when the text is written out again. */
5291 iso->switched_dir_and_no_valid_charset_yet = 0;
5292 iso->output_direction_sequence = 1;
5293 /* And make sure that the switch-dir going the other
5294 way gets outputted, as well. */
5295 iso->invalid_switch_dir = 1;
5299 /* This function is called with CODESYS equal to nil when
5300 doing coding-system detection. */
5301 if (!NILP (codesys))
5303 charset_conversion_spec_dynarr *dyn =
5304 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5310 for (i = 0; i < Dynarr_length (dyn); i++)
5312 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5313 if (EQ (cs, spec->from_charset))
5314 cs = spec->to_charset;
5319 iso->charset[reg] = cs;
5320 iso->esc = ISO_ESC_DESIGNATE;
5321 *flags &= CODING_STATE_ISO2022_LOCK;
5322 if (iso->invalid_designated[reg])
5324 iso->invalid_designated[reg] = 0;
5325 iso->output_literally = 1;
5327 if (iso->switched_dir_and_no_valid_charset_yet)
5328 iso->switched_dir_and_no_valid_charset_yet = 0;
5333 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5337 /* #### There are serious deficiencies in the recognition mechanism
5338 here. This needs to be much smarter if it's going to cut it.
5339 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5340 it should be detected as Latin-1.
5341 All the ISO2022 stuff in this file should be synced up with the
5342 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5343 Perhaps we should wait till R2L works in FSF Emacs? */
5345 if (!st->iso2022.initted)
5347 reset_iso2022 (Qnil, &st->iso2022.iso);
5348 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5349 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5350 CODING_CATEGORY_ISO_8_1_MASK |
5351 CODING_CATEGORY_ISO_8_2_MASK |
5352 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5353 st->iso2022.flags = 0;
5354 st->iso2022.high_byte_count = 0;
5355 st->iso2022.saw_single_shift = 0;
5356 st->iso2022.initted = 1;
5359 mask = st->iso2022.mask;
5363 unsigned char c = *(unsigned char *)src++;
5366 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5367 st->iso2022.high_byte_count++;
5371 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5373 if (st->iso2022.high_byte_count & 1)
5374 /* odd number of high bytes; assume not iso-8-2 */
5375 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5377 st->iso2022.high_byte_count = 0;
5378 st->iso2022.saw_single_shift = 0;
5380 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5382 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5383 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5384 { /* control chars */
5387 /* Allow and ignore control characters that you might
5388 reasonably see in a text file */
5393 case 8: /* backspace */
5394 case 11: /* vertical tab */
5395 case 12: /* form feed */
5396 case 26: /* MS-DOS C-z junk */
5397 case 31: /* '^_' -- for info */
5398 goto label_continue_loop;
5405 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5408 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5409 &st->iso2022.flags, 0))
5411 switch (st->iso2022.iso.esc)
5413 case ISO_ESC_DESIGNATE:
5414 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5415 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5417 case ISO_ESC_LOCKING_SHIFT:
5418 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5419 goto ran_out_of_chars;
5420 case ISO_ESC_SINGLE_SHIFT:
5421 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5422 st->iso2022.saw_single_shift = 1;
5431 goto ran_out_of_chars;
5434 label_continue_loop:;
5443 postprocess_iso2022_mask (int mask)
5445 /* #### kind of cheesy */
5446 /* If seven-bit ISO is allowed, then assume that the encoding is
5447 entirely seven-bit and turn off the eight-bit ones. */
5448 if (mask & CODING_CATEGORY_ISO_7_MASK)
5449 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5450 CODING_CATEGORY_ISO_8_1_MASK |
5451 CODING_CATEGORY_ISO_8_2_MASK);
5455 /* If FLAGS is a null pointer or specifies right-to-left motion,
5456 output a switch-dir-to-left-to-right sequence to DST.
5457 Also update FLAGS if it is not a null pointer.
5458 If INTERNAL_P is set, we are outputting in internal format and
5459 need to handle the CSI differently. */
5462 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5463 unsigned_char_dynarr *dst,
5464 unsigned int *flags,
5467 if (!flags || (*flags & CODING_STATE_R2L))
5469 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5471 Dynarr_add (dst, ISO_CODE_ESC);
5472 Dynarr_add (dst, '[');
5474 else if (internal_p)
5475 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5477 Dynarr_add (dst, ISO_CODE_CSI);
5478 Dynarr_add (dst, '0');
5479 Dynarr_add (dst, ']');
5481 *flags &= ~CODING_STATE_R2L;
5485 /* If FLAGS is a null pointer or specifies a direction different from
5486 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5487 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5488 sequence to DST. Also update FLAGS if it is not a null pointer.
5489 If INTERNAL_P is set, we are outputting in internal format and
5490 need to handle the CSI differently. */
5493 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5494 unsigned_char_dynarr *dst, unsigned int *flags,
5497 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5498 direction == CHARSET_LEFT_TO_RIGHT)
5499 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5500 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5501 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5502 direction == CHARSET_RIGHT_TO_LEFT)
5504 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5506 Dynarr_add (dst, ISO_CODE_ESC);
5507 Dynarr_add (dst, '[');
5509 else if (internal_p)
5510 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5512 Dynarr_add (dst, ISO_CODE_CSI);
5513 Dynarr_add (dst, '2');
5514 Dynarr_add (dst, ']');
5516 *flags |= CODING_STATE_R2L;
5520 /* Convert ISO2022-format data to internal format. */
5523 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5524 unsigned_char_dynarr *dst, Lstream_data_count n)
5526 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5527 unsigned int flags = str->flags;
5528 unsigned int cpos = str->cpos;
5529 unsigned char counter = str->counter;
5530 eol_type_t eol_type = str->eol_type;
5531 #ifdef ENABLE_COMPOSITE_CHARS
5532 unsigned_char_dynarr *real_dst = dst;
5534 Lisp_Object coding_system;
5536 XSETCODING_SYSTEM (coding_system, str->codesys);
5538 #ifdef ENABLE_COMPOSITE_CHARS
5539 if (flags & CODING_STATE_COMPOSITE)
5540 dst = str->iso2022.composite_chars;
5541 #endif /* ENABLE_COMPOSITE_CHARS */
5545 unsigned char c = *(unsigned char *)src++;
5546 if (flags & CODING_STATE_ESCAPE)
5547 { /* Within ESC sequence */
5548 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5553 switch (str->iso2022.esc)
5555 #ifdef ENABLE_COMPOSITE_CHARS
5556 case ISO_ESC_START_COMPOSITE:
5557 if (str->iso2022.composite_chars)
5558 Dynarr_reset (str->iso2022.composite_chars);
5560 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5561 dst = str->iso2022.composite_chars;
5563 case ISO_ESC_END_COMPOSITE:
5565 Bufbyte comstr[MAX_EMCHAR_LEN];
5567 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5568 Dynarr_length (dst));
5570 len = set_charptr_emchar (comstr, emch);
5571 Dynarr_add_many (dst, comstr, len);
5574 #endif /* ENABLE_COMPOSITE_CHARS */
5576 case ISO_ESC_LITERAL:
5577 COMPOSE_FLUSH_CHARS (str, dst);
5578 decode_flush_er_chars (str, dst);
5579 DECODE_ADD_BINARY_CHAR (c, dst);
5583 /* Everything else handled already */
5588 /* Attempted error recovery. */
5589 if (str->iso2022.output_direction_sequence)
5590 ensure_correct_direction (flags & CODING_STATE_R2L ?
5591 CHARSET_RIGHT_TO_LEFT :
5592 CHARSET_LEFT_TO_RIGHT,
5593 str->codesys, dst, 0, 1);
5594 /* More error recovery. */
5595 if (!retval || str->iso2022.output_literally)
5597 /* Output the (possibly invalid) sequence */
5599 COMPOSE_FLUSH_CHARS (str, dst);
5600 decode_flush_er_chars (str, dst);
5601 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5602 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5603 flags &= CODING_STATE_ISO2022_LOCK;
5605 n++, src--;/* Repeat the loop with the same character. */
5608 /* No sense in reprocessing the final byte of the
5609 escape sequence; it could mess things up anyway.
5611 COMPOSE_FLUSH_CHARS (str, dst);
5612 decode_flush_er_chars (str, dst);
5613 DECODE_ADD_BINARY_CHAR (c, dst);
5619 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5620 { /* Control characters */
5622 /***** Error-handling *****/
5624 /* If we were in the middle of a character, dump out the
5625 partial character. */
5628 COMPOSE_FLUSH_CHARS (str, dst);
5629 decode_flush_er_chars (str, dst);
5633 DECODE_ADD_BINARY_CHAR
5634 ((unsigned char)(cpos >> (counter * 8)), dst);
5639 /* If we just saw a single-shift character, dump it out.
5640 This may dump out the wrong sort of single-shift character,
5641 but least it will give an indication that something went
5643 if (flags & CODING_STATE_SS2)
5645 COMPOSE_FLUSH_CHARS (str, dst);
5646 decode_flush_er_chars (str, dst);
5647 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5648 flags &= ~CODING_STATE_SS2;
5650 if (flags & CODING_STATE_SS3)
5652 COMPOSE_FLUSH_CHARS (str, dst);
5653 decode_flush_er_chars (str, dst);
5654 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5655 flags &= ~CODING_STATE_SS3;
5658 /***** Now handle the control characters. *****/
5664 COMPOSE_FLUSH_CHARS (str, dst);
5665 decode_flush_er_chars (str, dst);
5666 if (eol_type == EOL_CR)
5667 Dynarr_add (dst, '\n');
5668 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5669 Dynarr_add (dst, c);
5671 flags |= CODING_STATE_CR;
5672 goto label_continue_loop;
5674 else if (flags & CODING_STATE_CR)
5675 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5677 Dynarr_add (dst, '\r');
5678 flags &= ~CODING_STATE_CR;
5681 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5684 flags &= CODING_STATE_ISO2022_LOCK;
5686 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5688 COMPOSE_FLUSH_CHARS (str, dst);
5689 decode_flush_er_chars (str, dst);
5690 DECODE_ADD_BINARY_CHAR (c, dst);
5694 { /* Graphic characters */
5695 Lisp_Object charset;
5704 COMPOSE_FLUSH_CHARS (str, dst);
5705 decode_flush_er_chars (str, dst);
5706 if (eol_type == EOL_CR)
5707 Dynarr_add (dst, '\n');
5708 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5709 Dynarr_add (dst, c);
5711 flags |= CODING_STATE_CR;
5712 goto label_continue_loop;
5714 else if (flags & CODING_STATE_CR)
5715 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5717 Dynarr_add (dst, '\r');
5718 flags &= ~CODING_STATE_CR;
5721 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5724 /* Now determine the charset. */
5725 reg = ((flags & CODING_STATE_SS2) ? 2
5726 : (flags & CODING_STATE_SS3) ? 3
5727 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5728 : str->iso2022.register_left);
5729 charset = str->iso2022.charset[reg];
5731 /* Error checking: */
5732 if (! CHARSETP (charset)
5733 || str->iso2022.invalid_designated[reg]
5734 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5735 && XCHARSET_CHARS (charset) == 94))
5736 /* Mrmph. We are trying to invoke a register that has no
5737 or an invalid charset in it, or trying to add a character
5738 outside the range of the charset. Insert that char literally
5739 to preserve it for the output. */
5741 COMPOSE_FLUSH_CHARS (str, dst);
5742 decode_flush_er_chars (str, dst);
5746 DECODE_ADD_BINARY_CHAR
5747 ((unsigned char)(cpos >> (counter * 8)), dst);
5750 DECODE_ADD_BINARY_CHAR (c, dst);
5755 /* Things are probably hunky-dorey. */
5757 /* Fetch reverse charset, maybe. */
5758 if (((flags & CODING_STATE_R2L) &&
5759 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5761 (!(flags & CODING_STATE_R2L) &&
5762 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5764 Lisp_Object new_charset =
5765 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5766 if (!NILP (new_charset))
5767 charset = new_charset;
5772 if (XCHARSET_DIMENSION (charset) == counter)
5774 COMPOSE_ADD_CHAR (str,
5775 DECODE_CHAR (charset,
5776 ((cpos & 0x7F7F7F) << 8)
5783 cpos = (cpos << 8) | c;
5785 lb = XCHARSET_LEADING_BYTE (charset);
5786 switch (XCHARSET_REP_BYTES (charset))
5789 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5790 Dynarr_add (dst, c & 0x7F);
5793 case 2: /* one-byte official */
5794 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5795 Dynarr_add (dst, lb);
5796 Dynarr_add (dst, c | 0x80);
5799 case 3: /* one-byte private or two-byte official */
5800 if (XCHARSET_PRIVATE_P (charset))
5802 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5803 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5804 Dynarr_add (dst, lb);
5805 Dynarr_add (dst, c | 0x80);
5811 Dynarr_add (dst, lb);
5812 Dynarr_add (dst, ch | 0x80);
5813 Dynarr_add (dst, c | 0x80);
5821 default: /* two-byte private */
5824 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5825 Dynarr_add (dst, lb);
5826 Dynarr_add (dst, ch | 0x80);
5827 Dynarr_add (dst, c | 0x80);
5837 flags &= CODING_STATE_ISO2022_LOCK;
5840 label_continue_loop:;
5843 if (flags & CODING_STATE_END)
5845 COMPOSE_FLUSH_CHARS (str, dst);
5846 decode_flush_er_chars (str, dst);
5847 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5851 str->counter = counter;
5855 /***** ISO2022 encoder *****/
5857 /* Designate CHARSET into register REG. */
5860 iso2022_designate (Lisp_Object charset, unsigned char reg,
5861 struct encoding_stream *str, unsigned_char_dynarr *dst)
5863 static const char inter94[] = "()*+";
5864 static const char inter96[] = ",-./";
5865 unsigned short chars;
5866 unsigned char dimension;
5867 unsigned char final;
5868 Lisp_Object old_charset = str->iso2022.charset[reg];
5870 str->iso2022.charset[reg] = charset;
5871 if (!CHARSETP (charset))
5872 /* charset might be an initial nil or t. */
5874 chars = XCHARSET_CHARS (charset);
5875 dimension = XCHARSET_DIMENSION (charset);
5876 final = XCHARSET_FINAL (charset);
5877 if (!str->iso2022.force_charset_on_output[reg] &&
5878 CHARSETP (old_charset) &&
5879 XCHARSET_CHARS (old_charset) == chars &&
5880 XCHARSET_DIMENSION (old_charset) == dimension &&
5881 XCHARSET_FINAL (old_charset) == final)
5884 str->iso2022.force_charset_on_output[reg] = 0;
5887 charset_conversion_spec_dynarr *dyn =
5888 str->codesys->iso2022.output_conv;
5894 for (i = 0; i < Dynarr_length (dyn); i++)
5896 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5897 if (EQ (charset, spec->from_charset))
5898 charset = spec->to_charset;
5903 Dynarr_add (dst, ISO_CODE_ESC);
5908 Dynarr_add (dst, inter94[reg]);
5911 Dynarr_add (dst, '$');
5913 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5916 Dynarr_add (dst, inter94[reg]);
5921 Dynarr_add (dst, inter96[reg]);
5924 Dynarr_add (dst, '$');
5925 Dynarr_add (dst, inter96[reg]);
5929 Dynarr_add (dst, final);
5933 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5935 if (str->iso2022.register_left != 0)
5937 Dynarr_add (dst, ISO_CODE_SI);
5938 str->iso2022.register_left = 0;
5943 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5945 if (str->iso2022.register_left != 1)
5947 Dynarr_add (dst, ISO_CODE_SO);
5948 str->iso2022.register_left = 1;
5953 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5954 unsigned_char_dynarr *dst, unsigned int *flags)
5956 unsigned char charmask;
5957 Lisp_Coding_System* codesys = str->codesys;
5958 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5960 Lisp_Object charset = str->iso2022.current_charset;
5961 int half = str->iso2022.current_half;
5962 int code_point = -1;
5966 restore_left_to_right_direction (codesys, dst, flags, 0);
5968 /* Make sure G0 contains ASCII */
5969 if ((ch > ' ' && ch < ISO_CODE_DEL)
5970 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5972 ensure_normal_shift (str, dst);
5973 iso2022_designate (Vcharset_ascii, 0, str, dst);
5976 /* If necessary, restore everything to the default state
5978 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5980 restore_left_to_right_direction (codesys, dst, flags, 0);
5982 ensure_normal_shift (str, dst);
5984 for (i = 0; i < 4; i++)
5986 Lisp_Object initial_charset =
5987 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5988 iso2022_designate (initial_charset, i, str, dst);
5993 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5994 Dynarr_add (dst, '\r');
5995 if (eol_type != EOL_CR)
5996 Dynarr_add (dst, ch);
6000 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6001 && fit_to_be_escape_quoted (ch))
6002 Dynarr_add (dst, ISO_CODE_ESC);
6003 Dynarr_add (dst, ch);
6006 else if ( (0x80 <= ch) && (ch <= 0x9f) )
6008 charmask = (half == 0 ? 0x00 : 0x80);
6010 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6011 && fit_to_be_escape_quoted (ch))
6012 Dynarr_add (dst, ISO_CODE_ESC);
6013 /* you asked for it ... */
6014 Dynarr_add (dst, ch);
6020 /* Now determine which register to use. */
6022 for (i = 0; i < 4; i++)
6024 if ((CHARSETP (charset = str->iso2022.charset[i])
6025 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6029 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6030 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6038 Lisp_Object original_default_coded_charset_priority_list
6039 = Vdefault_coded_charset_priority_list;
6040 Vdefault_coded_charset_priority_list
6041 = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6042 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6044 code_point = ENCODE_CHAR (ch, charset);
6045 if (XCHARSET_FINAL (charset))
6047 Vdefault_coded_charset_priority_list
6048 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6049 Vdefault_coded_charset_priority_list));
6051 Vdefault_coded_charset_priority_list
6052 = original_default_coded_charset_priority_list;
6053 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6055 code_point = ENCODE_CHAR (ch, charset);
6056 if (XCHARSET_FINAL (charset))
6058 Vdefault_coded_charset_priority_list
6059 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6060 Vdefault_coded_charset_priority_list));
6062 code_point = ENCODE_CHAR (ch, charset);
6063 if (!XCHARSET_FINAL (charset))
6065 charset = Vcharset_ascii;
6069 Vdefault_coded_charset_priority_list
6070 = original_default_coded_charset_priority_list;
6072 ensure_correct_direction (XCHARSET_DIRECTION (charset),
6073 codesys, dst, flags, 0);
6077 if (XCHARSET_GRAPHIC (charset) != 0)
6079 if (!NILP (str->iso2022.charset[1]) &&
6080 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6081 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6083 else if (!NILP (str->iso2022.charset[2]))
6085 else if (!NILP (str->iso2022.charset[3]))
6094 iso2022_designate (charset, reg, str, dst);
6096 /* Now invoke that register. */
6100 ensure_normal_shift (str, dst);
6104 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6106 ensure_shift_out (str, dst);
6113 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6115 Dynarr_add (dst, ISO_CODE_ESC);
6116 Dynarr_add (dst, 'N');
6121 Dynarr_add (dst, ISO_CODE_SS2);
6126 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6128 Dynarr_add (dst, ISO_CODE_ESC);
6129 Dynarr_add (dst, 'O');
6134 Dynarr_add (dst, ISO_CODE_SS3);
6142 charmask = (half == 0 ? 0x00 : 0x80);
6144 switch (XCHARSET_DIMENSION (charset))
6147 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6150 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6151 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6154 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6155 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6156 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6159 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6160 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6161 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6162 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6168 str->iso2022.current_charset = charset;
6169 str->iso2022.current_half = half;
6173 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6174 unsigned int *flags)
6176 Lisp_Coding_System* codesys = str->codesys;
6179 restore_left_to_right_direction (codesys, dst, flags, 0);
6180 ensure_normal_shift (str, dst);
6181 for (i = 0; i < 4; i++)
6183 Lisp_Object initial_charset
6184 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6185 iso2022_designate (initial_charset, i, str, dst);
6190 /************************************************************************/
6191 /* No-conversion methods */
6192 /************************************************************************/
6194 /* This is used when reading in "binary" files -- i.e. files that may
6195 contain all 256 possible byte values and that are not to be
6196 interpreted as being in any particular decoding. */
6198 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6199 unsigned_char_dynarr *dst, Lstream_data_count n)
6201 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6202 unsigned int flags = str->flags;
6203 unsigned int cpos = str->cpos;
6204 eol_type_t eol_type = str->eol_type;
6208 unsigned char c = *(unsigned char *)src++;
6210 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6211 DECODE_ADD_BINARY_CHAR (c, dst);
6212 label_continue_loop:;
6215 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6222 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6223 unsigned_char_dynarr *dst, Lstream_data_count n)
6226 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6227 unsigned int flags = str->flags;
6228 unsigned int ch = str->ch;
6229 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6231 unsigned char char_boundary = str->iso2022.current_char_boundary;
6238 if (char_boundary == 0)
6244 else if ( c >= 0xf8 )
6249 else if ( c >= 0xf0 )
6254 else if ( c >= 0xe0 )
6259 else if ( c >= 0xc0 )
6269 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6270 Dynarr_add (dst, '\r');
6271 if (eol_type != EOL_CR)
6272 Dynarr_add (dst, c);
6275 Dynarr_add (dst, c);
6278 else if (char_boundary == 1)
6280 ch = ( ch << 6 ) | ( c & 0x3f );
6281 Dynarr_add (dst, ch & 0xff);
6286 ch = ( ch << 6 ) | ( c & 0x3f );
6289 #else /* not UTF2000 */
6292 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6293 Dynarr_add (dst, '\r');
6294 if (eol_type != EOL_CR)
6295 Dynarr_add (dst, '\n');
6298 else if (BYTE_ASCII_P (c))
6301 Dynarr_add (dst, c);
6303 else if (BUFBYTE_LEADING_BYTE_P (c))
6306 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6307 c == LEADING_BYTE_CONTROL_1)
6310 Dynarr_add (dst, '~'); /* untranslatable character */
6314 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6315 Dynarr_add (dst, c);
6316 else if (ch == LEADING_BYTE_CONTROL_1)
6319 Dynarr_add (dst, c - 0x20);
6321 /* else it should be the second or third byte of an
6322 untranslatable character, so ignore it */
6325 #endif /* not UTF2000 */
6331 str->iso2022.current_char_boundary = char_boundary;
6337 /************************************************************************/
6338 /* Initialization */
6339 /************************************************************************/
6342 syms_of_file_coding (void)
6344 INIT_LRECORD_IMPLEMENTATION (coding_system);
6346 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6348 DEFSUBR (Fcoding_system_p);
6349 DEFSUBR (Ffind_coding_system);
6350 DEFSUBR (Fget_coding_system);
6351 DEFSUBR (Fcoding_system_list);
6352 DEFSUBR (Fcoding_system_name);
6353 DEFSUBR (Fmake_coding_system);
6354 DEFSUBR (Fcopy_coding_system);
6355 DEFSUBR (Fcoding_system_canonical_name_p);
6356 DEFSUBR (Fcoding_system_alias_p);
6357 DEFSUBR (Fcoding_system_aliasee);
6358 DEFSUBR (Fdefine_coding_system_alias);
6359 DEFSUBR (Fsubsidiary_coding_system);
6361 DEFSUBR (Fcoding_system_type);
6362 DEFSUBR (Fcoding_system_doc_string);
6364 DEFSUBR (Fcoding_system_charset);
6366 DEFSUBR (Fcoding_system_property);
6368 DEFSUBR (Fcoding_category_list);
6369 DEFSUBR (Fset_coding_priority_list);
6370 DEFSUBR (Fcoding_priority_list);
6371 DEFSUBR (Fset_coding_category_system);
6372 DEFSUBR (Fcoding_category_system);
6374 DEFSUBR (Fdetect_coding_region);
6375 DEFSUBR (Fdecode_coding_region);
6376 DEFSUBR (Fencode_coding_region);
6378 DEFSUBR (Fdecode_shift_jis_char);
6379 DEFSUBR (Fencode_shift_jis_char);
6380 DEFSUBR (Fdecode_big5_char);
6381 DEFSUBR (Fencode_big5_char);
6383 defsymbol (&Qcoding_systemp, "coding-system-p");
6384 defsymbol (&Qno_conversion, "no-conversion");
6385 defsymbol (&Qraw_text, "raw-text");
6387 defsymbol (&Qbig5, "big5");
6388 defsymbol (&Qshift_jis, "shift-jis");
6389 defsymbol (&Qucs4, "ucs-4");
6390 defsymbol (&Qutf8, "utf-8");
6391 defsymbol (&Qutf16, "utf-16");
6392 defsymbol (&Qccl, "ccl");
6393 defsymbol (&Qiso2022, "iso2022");
6395 defsymbol (&Qmnemonic, "mnemonic");
6396 defsymbol (&Qeol_type, "eol-type");
6397 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6398 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6400 defsymbol (&Qcr, "cr");
6401 defsymbol (&Qlf, "lf");
6402 defsymbol (&Qcrlf, "crlf");
6403 defsymbol (&Qeol_cr, "eol-cr");
6404 defsymbol (&Qeol_lf, "eol-lf");
6405 defsymbol (&Qeol_crlf, "eol-crlf");
6407 defsymbol (&Qcharset_g0, "charset-g0");
6408 defsymbol (&Qcharset_g1, "charset-g1");
6409 defsymbol (&Qcharset_g2, "charset-g2");
6410 defsymbol (&Qcharset_g3, "charset-g3");
6411 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6412 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6413 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6414 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6415 defsymbol (&Qno_iso6429, "no-iso6429");
6416 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6417 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6419 defsymbol (&Qshort, "short");
6420 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6421 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6422 defsymbol (&Qseven, "seven");
6423 defsymbol (&Qlock_shift, "lock-shift");
6424 defsymbol (&Qescape_quoted, "escape-quoted");
6427 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6428 defsymbol (&Qdisable_composition, "disable-composition");
6429 defsymbol (&Qccs_priority_list, "ccs-priority-list");
6430 defsymbol (&Quse_entity_reference, "use-entity-reference");
6431 defsymbol (&Qd, "d");
6432 defsymbol (&Qx, "x");
6433 defsymbol (&QX, "X");
6435 defsymbol (&Qencode, "encode");
6436 defsymbol (&Qdecode, "decode");
6439 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6441 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6443 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6445 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6447 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6449 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6451 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6453 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6455 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6457 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6460 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6465 lstream_type_create_file_coding (void)
6467 LSTREAM_HAS_METHOD (decoding, reader);
6468 LSTREAM_HAS_METHOD (decoding, writer);
6469 LSTREAM_HAS_METHOD (decoding, rewinder);
6470 LSTREAM_HAS_METHOD (decoding, seekable_p);
6471 LSTREAM_HAS_METHOD (decoding, flusher);
6472 LSTREAM_HAS_METHOD (decoding, closer);
6473 LSTREAM_HAS_METHOD (decoding, marker);
6475 LSTREAM_HAS_METHOD (encoding, reader);
6476 LSTREAM_HAS_METHOD (encoding, writer);
6477 LSTREAM_HAS_METHOD (encoding, rewinder);
6478 LSTREAM_HAS_METHOD (encoding, seekable_p);
6479 LSTREAM_HAS_METHOD (encoding, flusher);
6480 LSTREAM_HAS_METHOD (encoding, closer);
6481 LSTREAM_HAS_METHOD (encoding, marker);
6485 vars_of_file_coding (void)
6489 fcd = xnew (struct file_coding_dump);
6490 dump_add_root_struct_ptr (&fcd, &fcd_description);
6492 /* Initialize to something reasonable ... */
6493 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6495 fcd->coding_category_system[i] = Qnil;
6496 fcd->coding_category_by_priority[i] = i;
6499 Fprovide (intern ("file-coding"));
6501 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6502 Coding system used for TTY keyboard input.
6503 Not used under a windowing system.
6505 Vkeyboard_coding_system = Qnil;
6507 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6508 Coding system used for TTY display output.
6509 Not used under a windowing system.
6511 Vterminal_coding_system = Qnil;
6513 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6514 Overriding coding system used when reading from a file or process.
6515 You should bind this variable with `let', but do not set it globally.
6516 If this is non-nil, it specifies the coding system that will be used
6517 to decode input on read operations, such as from a file or process.
6518 It overrides `buffer-file-coding-system-for-read',
6519 `insert-file-contents-pre-hook', etc. Use those variables instead of
6520 this one for permanent changes to the environment. */ );
6521 Vcoding_system_for_read = Qnil;
6523 DEFVAR_LISP ("coding-system-for-write",
6524 &Vcoding_system_for_write /*
6525 Overriding coding system used when writing to a file or process.
6526 You should bind this variable with `let', but do not set it globally.
6527 If this is non-nil, it specifies the coding system that will be used
6528 to encode output for write operations, such as to a file or process.
6529 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6530 Use those variables instead of this one for permanent changes to the
6532 Vcoding_system_for_write = Qnil;
6534 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6535 Coding system used to convert pathnames when accessing files.
6537 Vfile_name_coding_system = Qnil;
6539 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6540 &Vcoded_charset_entity_reference_alist /*
6541 Alist of coded-charset vs corresponding entity-reference.
6542 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6543 CCS is coded-charset.
6544 CODE-COLUMNS is columns of code-point of entity-reference.
6545 CODE-TYPE is format type of code-point of entity-reference.
6546 `d' means decimal value and `x' means hexadecimal value.
6548 Vcoded_charset_entity_reference_alist = Qnil;
6550 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6551 Non-nil means the buffer contents are regarded as multi-byte form
6552 of characters, not a binary code. This affects the display, file I/O,
6553 and behaviors of various editing commands.
6555 Setting this to nil does not do anything.
6557 enable_multibyte_characters = 1;
6561 complex_vars_of_file_coding (void)
6563 staticpro (&Vcoding_system_hash_table);
6564 Vcoding_system_hash_table =
6565 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6567 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6568 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6570 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6572 struct codesys_prop csp; \
6574 csp.prop_type = (Prop_Type); \
6575 Dynarr_add (the_codesys_prop_dynarr, csp); \
6578 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6579 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6580 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6581 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6582 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6583 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6584 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6586 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6587 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6588 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6589 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6590 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6591 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6592 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6593 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6594 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6595 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6596 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6597 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6598 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6599 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6600 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6601 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6602 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6604 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6607 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6608 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6610 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6611 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6614 /* Need to create this here or we're really screwed. */
6616 (Qraw_text, Qno_conversion,
6617 build_string ("Raw text, which means it converts only line-break-codes."),
6618 list2 (Qmnemonic, build_string ("Raw")));
6621 (Qbinary, Qno_conversion,
6622 build_string ("Binary, which means it does not convert anything."),
6623 list4 (Qeol_type, Qlf,
6624 Qmnemonic, build_string ("Binary")));
6630 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6631 list2 (Qmnemonic, build_string ("MTF8")));
6634 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6636 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6638 Fdefine_coding_system_alias (Qterminal, Qbinary);
6639 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6641 /* Need this for bootstrapping */
6642 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6643 Fget_coding_system (Qraw_text);
6646 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6647 = Fget_coding_system (Qutf_8_mcs);
6650 #if defined(MULE) && !defined(UTF2000)
6654 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6655 fcd->ucs_to_mule_table[i] = Qnil;
6657 staticpro (&mule_to_ucs_table);
6658 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6659 #endif /* defined(MULE) && !defined(UTF2000) */